1
1

refresh ROMIO based on v3.2a2-84-gef1cf14

Этот коммит содержится в:
Gilles Gouaillardet 2015-01-06 19:11:28 +09:00
родитель c857cc926c
Коммит 0914de9eae
385 изменённых файлов: 14451 добавлений и 21371 удалений

13
ompi/mca/io/romio/romio/.gitignore поставляемый Обычный файл
Просмотреть файл

@ -0,0 +1,13 @@
/Makefile
/.deps
/*.bb
/*.bbg
/*.gcda
/*.gcno
/.libs
/.libstamp*
/*.lo
/.*-cache
.state-cache
version.m4
confdb/config.rpath

Просмотреть файл

@ -5,15 +5,15 @@
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
# (C) 2011 by Argonne National Laboratory.
@ -33,11 +33,11 @@ include $(top_srcdir)/Makefile.options
ACLOCAL_AMFLAGS = -I confdb
# empty variable initializations so that later code can append (+=)
include_HEADERS =
include_HEADERS =
nodist_include_HEADERS =
noinst_HEADERS =
noinst_HEADERS =
EXTRA_DIST =
SUFFIXES =
SUFFIXES =
doc1_src_txt =
# ------------------------------------------------------------------------
@ -47,14 +47,14 @@ doc1_src_txt =
# In MPICH these will have an MPI_ and a PMPI_ version. Other implementations
# (like OMPI) only want these to be MPI_ routines, possibly with some
# name-shifting prefix.
romio_mpi_sources =
romio_mpi_sources =
# regular old source files that implement ROMIO, such as ADIO code
romio_other_sources =
# code that may need to be "up" called from the MPI library and/or is
# MPI-implementation-specific in some way
glue_sources =
glue_sources =
# ------------------------------------------------------------------------
# when building under MPICH we must be able to find mpi.h
@ -88,25 +88,30 @@ libromio_dist_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_so
## NOTE: ROMIO's old build system builds a bunch of _foo.o objects that contain
## PMPI_ implementations as well as calls to only other PMPI routines. In
## MPICH, these are the objects that need to go into libmpich, while the foo.o
## objects should go into libpmpich. Furthermore, the -D option for ROMIO's
## MPICH, these are the objects that need to go into libmpi, while the foo.o
## objects should go into libpmpi. Furthermore, the -D option for ROMIO's
## source files is different and inverted (in the boolean sense) compared with
## MPICH's defintion. And ROMIO was dumping all of the symbols into the main
## libmpich library, regardless of the separate profiling library's existence.
## libmpi library, regardless of the separate profiling library's existence.
##
## Annoying, right?
if BUILD_PROFILING_LIB
# The current best strategy for now is to build the PMPI symbols as a separate
# convenience lib to permit adding the special "-D..." argument for all objects.
# MPICH will then link in both convenience library into libmpich, since it
# MPICH will then link in both convenience library into libmpi, since it
# won't work very well the other way around.
noinst_LTLIBRARIES += libpromio.la
libpromio_la_SOURCES = $(romio_mpi_sources)
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
endif BUILD_PROFILING_LIB
else !BUILD_ROMIO_EMBEDDED
## TODO build a libromio.la (non-convenience) and possibly a libglue.la or something?
lib_LTLIBRARIES = libromio.la
libromio_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_sources)
if BUILD_PROFILING_LIB
libpromio_la_SOURCES = $(romio_mpi_sources)
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
endif BUILD_PROFILING_LIB
endif
@ -147,20 +152,27 @@ mandoc_path3=$(abs_top_builddir)/man/man3
htmldoc_path1=$(abs_top_builddir)/www/www1
htmldoc_path3=$(abs_top_builddir)/www/www3
doctext_docnotes=
# Provide an easily replaced url root for the generated index file.
# You can override this with URL desired in the index file generated by doctext.
# You can ignore this if you don't use mapnames or tohtml to add links
# to the MPI manual pages to documents.
htmldoc_root3="--your-url-here--"
.c.man-phony:
$(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path3) -ext 3 \
-heading MPI -quotefmt $(doctext_docnotes) $<
-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
.c.html-phony:
$(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path3) \
-heading MPI -quotefmt $(doctext_docnotes) $<
-heading MPI -quotefmt -nolocation \
-index $(htmldoc_path3)/mpi.cit -indexdir $(htmldoc_root3) \
$(doctext_docnotes) $<
.txt.man1-phony:
$(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path1) -ext 1 \
-heading MPI -quotefmt $(doctext_docnotes) $<
-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
.txt.html1-phony:
$(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path1) \
-heading MPI -quotefmt $(doctext_docnotes) $<
-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
# use mandoc-local target to force directory creation before running DOCTEXT
mandoc:

Просмотреть файл

@ -6,14 +6,14 @@
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#

Просмотреть файл

@ -492,7 +492,7 @@ to include the file mpio.h for C or mpiof.h for Fortran in your MPI-IO
program.
Note that on HP machines running HPUX and on NEC SX-4, you need to
compile Fortran programs with mpif90, because the f77 compilers on
compile Fortran programs with mpifort, because the f77 compilers on
these machines don't support 8-byte integers.
With MPICH, HP MPI, or NEC MPI, you can compile MPI-IO programs as
@ -500,9 +500,9 @@ With MPICH, HP MPI, or NEC MPI, you can compile MPI-IO programs as
or
mpif77 foo.f
or
mpif90 foo.f
mpifort foo.f
As mentioned above, mpif90 is preferred over mpif77 on HPUX and NEC
As mentioned above, mpifort is preferred over mpif77 on HPUX and NEC
because the f77 compilers on those machines do not support 8-byte integers.
With SGI MPI, you can compile MPI-IO programs as
@ -566,7 +566,7 @@ systems because they don't support fcntl file locks, and ROMIO uses
that feature to implement shared file pointers.
* On HP machines running HPUX and on NEC SX-4, you need to compile
Fortran programs with mpif90 instead of mpif77, because the f77
Fortran programs with mpifort instead of mpif77, because the f77
compilers on these machines don't support 8-byte integers.
* The file-open mode MPI_MODE_EXCL does not work on Intel PFS file system,

Просмотреть файл

@ -1,28 +1,11 @@
Please note that this is *NOT* a vanilla MPICH 3.0.4 distribution of the
ROMIO package from Argonne National Labs. Various customizations had
to be applied to the configuration process. More to the point -- if
replace this copy of ROMIO with a newer version, it will likely not
work. :-(
Please note that this is *NOT* a vanilla MPICH v3.2a2-84-gef1cf14
distribution of the ROMIO package from Argonne National Labs.
Various customizations had to be applied to the configuration process.
More to the point -- if replace this copy of ROMIO with a newer version,
it will likely not work. :-(
- The Open MPI Team
-----------------------------------------------------------------------------
Local modifications:
====================
- Moved aclocal.m4 -> acinclude.m4
- Bunches of changes in acinclude.m4 to make it work with modern
versions of the GNU auto tools -- see comments in file.
- Bunches of changes in configure.ac to make it work with modern
versions of the GNU auto tools -- see comments in file.
- We define MPI_MAX_DATAREP_STRING, therefore
protect the redefinition in include/mpio.h
- Not all systems have snprintf(); include "opal/util/printf.h" in
test/noncontig_coll.c
Patches past 3.0.4: (Update whenever ROMIO is updated)
- Deal with endless ESTALE cases:
http://git.mpich.org/mpich.git/commit/b250d338e66667a8a1071a5f73a4151fd59f83b2
- Fix compile error with Lustre 2.4
http://trac.mpich.org/projects/mpich/changeset/a0c4278f1400a73eb63c5106e2bd3b1a6565ad5a
Local modifications are in ompi.patch

Просмотреть файл

@ -19,13 +19,11 @@ noinst_HEADERS += \
adio/include/mpio_error.h \
adio/include/mpipr.h \
adio/include/mpiu_greq.h \
adio/include/nopackage.h \
adio/include/mpiu_external32.h \
adio/include/romioconf-undefs.h
adio/include/nopackage.h
include $(top_srcdir)/adio/ad_bg/Makefile.mk
include $(top_srcdir)/adio/ad_bgl/Makefile.mk
include $(top_srcdir)/adio/ad_bglockless/Makefile.mk
include $(top_srcdir)/adio/ad_gpfs/Makefile.mk
include $(top_srcdir)/adio/ad_gpfs/bg/Makefile.mk
include $(top_srcdir)/adio/ad_gpfs/pe/Makefile.mk
include $(top_srcdir)/adio/ad_gridftp/Makefile.mk
include $(top_srcdir)/adio/ad_hfs/Makefile.mk
include $(top_srcdir)/adio/ad_lustre/Makefile.mk

Просмотреть файл

@ -1,35 +0,0 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
if BUILD_AD_BG
AM_CPPFLAGS += -DBGL_OPTIM_STEP1_2=1 -DBGL_OPTIM_STEP1_1=1
noinst_HEADERS += \
adio/ad_bg/ad_bg_aggrs.h \
adio/ad_bg/ad_bg.h \
adio/ad_bg/ad_bg_pset.h \
adio/ad_bg/ad_bg_tuning.h
romio_other_sources += \
adio/ad_bg/ad_bg_aggrs.c \
adio/ad_bg/ad_bg_close.c \
adio/ad_bg/ad_bg_flush.c \
adio/ad_bg/ad_bg_hints.c \
adio/ad_bg/ad_bg_pset.c \
adio/ad_bg/ad_bg_read.c \
adio/ad_bg/ad_bg_tuning.c \
adio/ad_bg/ad_bg_write.c \
adio/ad_bg/ad_bg.c \
adio/ad_bg/ad_bg_fcntl.c \
adio/ad_bg/ad_bg_getsh.c \
adio/ad_bg/ad_bg_open.c \
adio/ad_bg/ad_bg_rdcoll.c \
adio/ad_bg/ad_bg_setsh.c \
adio/ad_bg/ad_bg_wrcoll.c
endif BUILD_AD_BG

Просмотреть файл

@ -1,51 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#define BG_OPTIM_STEP1_1 1
#include "ad_bg.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
struct ADIOI_Fns_struct ADIO_BG_operations = {
ADIOI_BG_Open, /* Open */
ADIOI_GEN_OpenColl, /* Collective open */
ADIOI_BG_ReadContig, /* ReadContig */
ADIOI_BG_WriteContig, /* WriteContig */
ADIOI_BG_ReadStridedColl, /* ReadStridedColl */
ADIOI_BG_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
ADIOI_BG_Fcntl, /* Fcntl */
ADIOI_BG_SetInfo, /* SetInfo */
ADIOI_BG_ReadStrided, /* ReadStrided */
ADIOI_BG_WriteStrided, /* WriteStrided */
ADIOI_BG_Close, /* Close */
#ifdef ROMIO_HAVE_WORKING_AIO
#warning Consider BG support for NFS before enabling this.
ADIOI_GEN_IreadContig, /* IreadContig */
ADIOI_GEN_IwriteContig, /* IwriteContig */
#else
ADIOI_FAKE_IreadContig, /* IreadContig */
ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif
ADIOI_GEN_IODone, /* ReadDone */
ADIOI_GEN_IODone, /* WriteDone */
ADIOI_GEN_IOComplete, /* ReadComplete */
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_BG_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
};

Просмотреть файл

@ -1,97 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg.h
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_BG_INCLUDE
#define AD_BG_INCLUDE
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_AIO_H
#include <aio.h>
#endif
#if 0
int ADIOI_BG_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
int wr, void *handle);
#endif
void ADIOI_BG_Open(ADIO_File fd, int *error_code);
void ADIOI_BG_Close(ADIO_File fd, int *error_code);
void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
#if 0
void ADIOI_BG_IwriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int
*error_code);
void ADIOI_BG_IreadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int
*error_code);
int ADIOI_BG_ReadDone(ADIO_Request *request, ADIO_Status *status, int
*error_code);
int ADIOI_BG_WriteDone(ADIO_Request *request, ADIO_Status *status, int
*error_code);
void ADIOI_BG_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
*error_code);
void ADIOI_BG_WriteComplete(ADIO_Request *request, ADIO_Status *status,
int *error_code);
#endif
void ADIOI_BG_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
*error_code);
void ADIOI_BG_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
void ADIOI_BG_WriteStrided(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_BG_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_BG_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
void ADIOI_BG_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_BG_Flush(ADIO_File fd, int *error_code);
#include "ad_bg_tuning.h"
#endif

Просмотреть файл

@ -1,53 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_close.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
#include "ad_bg_aggrs.h"
void ADIOI_BG_Close(ADIO_File fd, int *error_code)
{
int err, derr=0;
static char myname[] = "ADIOI_BG_CLOSE";
#ifdef PROFILE
MPE_Log_event(9, 0, "start close");
#endif
err = close(fd->fd_sys);
if (fd->fd_direct >= 0)
{
derr = close(fd->fd_direct);
}
#ifdef PROFILE
MPE_Log_event(10, 0, "end close");
#endif
/* FPRINTF(stderr,"%s(%d):'%s'. Free %#X\n",myname,__LINE__,fd->filename,(int)fd->fs_ptr);*/
if (fd->fs_ptr != NULL) {
ADIOI_Free(fd->fs_ptr);
fd->fs_ptr = NULL;
}
fd->fd_sys = -1;
fd->fd_direct = -1;
if (err == -1 || derr == -1)
{
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
myname, __LINE__, MPI_ERR_IO,
"**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,58 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_fcntl.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
#include "adio_extern.h"
/* #ifdef MPISGI
#include "mpisgi2.h"
#endif */
void ADIOI_BG_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
int *error_code)
{
static char myname[] = "ADIOI_BG_FCNTL";
switch(flag) {
case ADIO_FCNTL_GET_FSIZE:
fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
if (fd->fp_sys_posn != -1)
lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
if (fcntl_struct->fsize == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
break;
case ADIO_FCNTL_SET_DISKSPACE:
ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
break;
case ADIO_FCNTL_SET_ATOMICITY:
fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
*error_code = MPI_SUCCESS;
break;
/* --BEGIN ERROR HANDLING-- */
default:
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_ARG,
"**flag", "**flag %d", flag);
/* --END ERROR HANDLING-- */
}
}

Просмотреть файл

@ -1,90 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_flush.c
* \brief Scalable flush based on underlying filesystem and psets
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
#include "ad_bg_aggrs.h"
void ADIOI_BG_Flush(ADIO_File fd, int *error_code)
{
int err=0;
static char myname[] = "ADIOI_BG_FLUSH";
if(((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BG_FSYNC_AGGREGATION_ENABLED)
{
int rank;
/* Barrier so we can collectively do fewer fsync's */
MPI_Barrier(fd->comm);
MPI_Comm_rank(fd->comm, &rank);
/* All ranks marked as "fsync aggregators" should fsync.
(We currently only do one fsync on rank 0 but this is general
enough to support >1 aggregator using allreduce to get the
results instead of simply bcast'ing the results from rank 0.)*/
if(((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BG_FSYNC_AGGREGATOR)
{
err = fsync(fd->fd_sys);
DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
/* We want errno, not the return code if it failed */
if (err == -1) err = errno;
else err = 0;
}
/* Just pick an errno (using unsigned MPI_MAX) from any failures */
MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&err, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);
if (err) /* if it's non-zero, it must be an errno */
{
errno = err;
err = -1;
}
}
else /* Non-aggregated fsync */
{
#ifdef USE_DBG_LOGGING
int rank;
#endif
err = fsync(fd->fd_sys);
#ifdef USE_DBG_LOGGING
MPI_Comm_rank(fd->comm, &rank);
if(rank == 0)
{
DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
}
else
{
DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
}
#endif
}
/* --BEGIN ERROR HANDLING-- */
if (err == -1)
{
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
myname, __LINE__, MPI_ERR_IO,
"**io",
"**io %s", strerror(errno));
DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
return;
}
/* --END ERROR HANDLING-- */
*error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,84 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_getsh.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
/* returns the current location of the shared_fp in terms of the
no. of etypes relative to the current view, and also increments the
shared_fp by the number of etypes to be accessed (incr) in the read
or write following this function. */
void ADIOI_BG_Get_shared_fp(ADIO_File fd, int incr, ADIO_Offset *shared_fp,
int *error_code)
{
ADIO_Offset new_fp;
int err;
MPI_Comm dupcommself;
static char myname[] = "ADIOI_BG_GET_SHARED_FP";
if (fd->shared_fp_fd == ADIO_FILE_NULL) {
MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF,
dupcommself,
fd->shared_fp_fname,
fd->file_system,
fd->fns,
ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
0,
MPI_BYTE,
MPI_BYTE,
MPI_INFO_NULL,
ADIO_PERM_NULL,
error_code);
if (*error_code != MPI_SUCCESS) return;
*shared_fp = 0;
ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
/* if the file is empty, the above read may return error
(reading beyond end of file). In that case, shared_fp = 0,
set above, is the correct value. */
}
else {
ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
if (err == 0) {
err = read(fd->shared_fp_fd->fd_sys, shared_fp,
sizeof(ADIO_Offset));
}
if (err == -1) {
ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
return;
}
}
new_fp = *shared_fp + incr;
err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
if (err == 0) {
err = write(fd->shared_fp_fd->fd_sys, &new_fp, sizeof(ADIO_Offset));
}
ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
myname, __LINE__, MPI_ERR_IO,
"**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,542 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_hints.c
* \brief BlueGene hint processing
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "ad_bg.h"
#include "ad_bg_pset.h"
#include "ad_bg_aggrs.h"
#define ADIOI_BG_CB_BUFFER_SIZE_DFLT "16777216"
#define ADIOI_BG_IND_RD_BUFFER_SIZE_DFLT "4194304"
#define ADIOI_BG_IND_WR_BUFFER_SIZE_DFLT "4194304"
#define ADIOI_BG_NAGG_IN_PSET_HINT_NAME "bg_nodes_pset"
/** \page mpiio_vars MPIIO Configuration
*
* BlueGene MPIIO configuration and performance tuning. Used by ad_bg and ad_bglockless ADIO's.
*
* \section hint_sec Hints
* - bg_nodes_pset - Specify how many aggregators to use per pset.
* This hint will override the cb_nodes hint based on BlueGene psets.
* - N - Use N nodes per pset as aggregators.
* - Default is based on partition configuration and cb_nodes.
*
* The following default key/value pairs may differ from other platform defaults.
*
* - key = cb_buffer_size value = 16777216
* - key = romio_cb_read value = enable
* - key = romio_cb_write value = enable
* - key = ind_rd_buffer_size value = 4194304
* - key = ind_wr_buffer_size value = 4194304
*/
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
extern int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
void ADIOI_BG_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
Initialize fd->info to default values.
Initialize fd->hints to default values.
Examine the info object passed by the user. If it contains values that
ROMIO understands, override the default. */
MPI_Info info;
char *value;
int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0;
static char myname[] = "ADIOI_BG_SETINFO";
int did_anything = 0;
if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
info = fd->info;
/* Note that fd->hints is allocated at file open time; thus it is
* not necessary to allocate it, or check for allocation, here.
*/
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
ADIOI_BG_assert ((value != NULL));
/* initialize info and hints to default values if they haven't been
* previously initialized
*/
if (!fd->hints->initialized) {
did_anything = 1;
/* buffer size for collective I/O */
ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BG_CB_BUFFER_SIZE_DFLT);
fd->hints->cb_buffer_size = atoi(ADIOI_BG_CB_BUFFER_SIZE_DFLT);
/* default is to let romio automatically decide when to use
* collective buffering
*/
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = ADIOI_HINT_ENABLE;
ADIOI_Info_set(info, "romio_cb_write", "enable");
fd->hints->cb_write = ADIOI_HINT_ENABLE;
if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
fd->hints->cb_config_list = NULL;
/* number of processes that perform I/O in collective I/O */
MPI_Comm_size(fd->comm, &nprocs);
nprocs_is_valid = 1;
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
ADIOI_Info_set(info, "cb_nodes", value);
fd->hints->cb_nodes = -1;
/* hint indicating that no indep. I/O will be performed on this file */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
/* bg is not implementing file realms (ADIOI_IOStridedColl),
initialize to disabled it. */
/* hint instructing the use of persistent file realms */
ADIOI_Info_set(info, "romio_cb_pfr", "disable");
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
/* hint guiding the assignment of persistent file realms */
ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
fd->hints->cb_fr_type = ADIOI_FR_AAR;
/* hint to align file realms with a certain byte value */
ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
fd->hints->cb_fr_alignment = 1;
/* hint to set a threshold percentage for a datatype's size/extent at
* which data sieving should be done in collective I/O */
ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
fd->hints->cb_ds_threshold = 0;
/* hint to switch between point-to-point or all-to-all for two-phase */
ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
/* deferred_open derived from no_indep_rw and cb_{read,write} */
fd->hints->deferred_open = 0;
/* buffer size for data sieving in independent reads */
ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BG_IND_RD_BUFFER_SIZE_DFLT);
fd->hints->ind_rd_buffer_size = atoi(ADIOI_BG_IND_RD_BUFFER_SIZE_DFLT);
/* buffer size for data sieving in independent writes */
ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BG_IND_WR_BUFFER_SIZE_DFLT);
fd->hints->ind_wr_buffer_size = atoi(ADIOI_BG_IND_WR_BUFFER_SIZE_DFLT);
if(fd->file_system == ADIO_UFS)
{
/* default for ufs/pvfs is to disable data sieving */
ADIOI_Info_set(info, "romio_ds_read", "disable");
fd->hints->ds_read = ADIOI_HINT_DISABLE;
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
else
{
/* default is to let romio automatically decide when to use data
* sieving
*/
ADIOI_Info_set(info, "romio_ds_read", "automatic");
fd->hints->ds_read = ADIOI_HINT_AUTO;
ADIOI_Info_set(info, "romio_ds_write", "automatic");
fd->hints->ds_write = ADIOI_HINT_AUTO;
}
/* still to do: tune this a bit for a variety of file systems. there's
* no good default value so just leave it unset */
fd->hints->min_fdomain_size = 0;
fd->hints->striping_unit = 0;
fd->hints->initialized = 1;
}
/* add in user's info if supplied */
if (users_info != MPI_INFO_NULL) {
ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"cb_buffer_size",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "cb_buffer_size", value);
fd->hints->cb_buffer_size = intval;
}
#if 0
/* bg is not implementing file realms (ADIOI_IOStridedColl) ... */
/* aligning file realms to certain sizes (e.g. stripe sizes)
* may benefit I/O performance */
ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_alignment",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_alignment", value);
fd->hints->cb_fr_alignment = intval;
}
/* for collective I/O, try to be smarter about when to do data sieving
* using a specific threshold for the datatype size/extent
* (percentage 0-100%) */
ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_ds_threshold",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_ds_threshold", value);
fd->hints->cb_ds_threshold = intval;
}
ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_alltoall;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_alltoall) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_alltoall",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
#endif
/* new hints for enabling/disabling coll. buffering on
* reads/writes
*/
ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
/* romio_cb_read overrides no_indep_rw */
ADIOI_Info_set(info, "romio_cb_read", value);
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_read = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_read;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_read",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{
/* romio_cb_write overrides no_indep_rw, too */
ADIOI_Info_set(info, "romio_cb_write", value);
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_write = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") ||
!strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_write;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_write",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
#if 0
/* bg is not implementing file realms (ADIOI_IOStridedColl) ... */
/* enable/disable persistent file realms for collective I/O */
/* may want to check for no_indep_rdwr hint as well */
ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_pfr;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_pfr) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_pfr",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
/* file realm assignment types ADIOI_FR_AAR(0),
ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
a regular fr size in bytes. probably not the best way... */
ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) >= -2)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_type",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_type", value);
fd->hints->cb_fr_type = intval;
}
#endif
/* new hint for specifying no indep. read/write will be performed */
ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
/* if 'no_indep_rw' set, also hint that we will do
* collective buffering: if we aren't doing independent io,
* then we have to do collective */
ADIOI_Info_set(info, "romio_no_indep_rw", value);
ADIOI_Info_set(info, "romio_cb_write", "enable");
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->no_indep_rw = 1;
fd->hints->cb_read = 1;
fd->hints->cb_write = 1;
tmp_val = 1;
}
else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
ADIOI_Info_set(info, "romio_no_indep_rw", value);
fd->hints->no_indep_rw = 0;
tmp_val = 0;
}
else {
/* default is above */
tmp_val = 0;
}
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->no_indep_rw) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_no_indep_rw",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
/* new hints for enabling/disabling data sieving on
* reads/writes
*/
ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_AUTO;
}
/* otherwise ignore */
}
ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_AUTO;
}
/* otherwise ignore */
}
ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
ADIOI_Info_set(info, "ind_wr_buffer_size", value);
fd->hints->ind_wr_buffer_size = intval;
}
ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
ADIOI_Info_set(info, "ind_rd_buffer_size", value);
fd->hints->ind_rd_buffer_size = intval;
}
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "romio_min_fdomain_size", value);
fd->hints->min_fdomain_size = intval;
}
/* Now we use striping unit in common code so we should
process hints for it. */
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "striping_unit", value);
fd->hints->striping_unit = intval;
}
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
did_anything = 1;
ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
fd->hints->cb_nodes = intval;
}
}
/* associate CB aggregators to certain CNs in every involved PSET */
if (did_anything) {
ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
}
/* ignore defered open hints and do not enable it for bluegene: need all
* processors in the open path so we can stat-and-broadcast the blocksize
*/
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
fd->hints->deferred_open = 0;
/* BobC commented this out, but since hint processing runs on both bg and
* bglockless, we need to keep DS writes enabled on gpfs and disabled on
* PVFS */
if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
/* disable data sieving for fs that do not
support file locking */
ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
/* get rid of this value if it is set */
ADIOI_Info_delete(info, "ind_wr_buffer_size");
}
/* note: leave ind_wr_buffer_size alone; used for other cases
* as well. -- Rob Ross, 04/22/2003
*/
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
ADIOI_Free(value);
*error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,307 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_open.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
#include "ad_bg_aggrs.h"
#include <sys/statfs.h>
#include <sys/vfs.h>
/* COPIED FROM ad_fstype.c since it is static in that file
ADIO_FileSysType_parentdir - determines a string pathname for the
parent directory of a given filename.
Input Parameters:
. filename - pointer to file name character array
Output Parameters:
. dirnamep - pointer to location in which to store a pointer to a string
Note that the caller should free the memory located at the pointer returned
after the string is no longer needed.
*/
#ifndef PATH_MAX
#define PATH_MAX 65535
#endif
/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that
here. We assume that S_ISLNK is *always* defined as a macro. If
that is not universally true, then add a test to the romio
configure that trys to link a program that references S_ISLNK */
#if !defined(S_ISLNK)
# if defined(S_IFLNK)
/* Check for the link bit */
# define S_ISLNK(mode) ((mode) & S_IFLNK)
# else
/* no way to check if it is a link, so say false */
# define S_ISLNK(mode) 0
# endif
#endif /* !(S_ISLNK) */
/* ADIO_FileSysType_parentdir
*
* Returns pointer to string in dirnamep; that string is allocated with
* strdup and must be free()'d.
*/
/* ADIO_FileSysType_parentdir - determine the parent directory of a file.
 *
 * filename: path whose parent directory is wanted.
 * dirnamep: receives a newly allocated string (ADIOI_Strdup); the caller
 *           must release it.
 *
 * If filename is a symlink, the link target is read first so that the
 * parent of the *target* is returned (the caller has typically already
 * found the file itself missing, i.e. a dangling link).
 */
static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
{
    int err;
    char *dir = NULL, *slash;
    struct stat statbuf;

    err = lstat(filename, &statbuf);

    if (err || (!S_ISLNK(statbuf.st_mode))) {
        /* no such file, or file is not a link; these are the "normal"
         * cases where we can just return the parent directory.
         */
        dir = ADIOI_Strdup(filename);
    }
    else {
        /* filename is a symlink.  we've presumably already tried
         * to stat it and found it to be missing (dangling link),
         * but this code doesn't care if the target is really there
         * or not.
         */
        ssize_t namelen;
        char *linkbuf;

        linkbuf = ADIOI_Malloc(PATH_MAX+1);
        /* BUGFIX: read at most PATH_MAX bytes so the NUL terminator
         * stored below always fits in the PATH_MAX+1 buffer.  The old
         * code passed PATH_MAX+1 here, and a maximal-length target made
         * linkbuf[namelen] write one byte past the allocation. */
        namelen = readlink(filename, linkbuf, PATH_MAX);
        if (namelen == -1) {
            /* something strange has happened between the time that
             * we determined that this was a link and the time that
             * we attempted to read it; punt and use the old name.
             */
            dir = ADIOI_Strdup(filename);
        }
        else {
            /* successfully read the link */
            linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
            dir = ADIOI_Strdup(linkbuf);
        }
        /* BUGFIX: free linkbuf on both paths; it used to leak when
         * readlink() failed. */
        ADIOI_Free(linkbuf);
    }

    slash = strrchr(dir, '/');
    if (!slash) ADIOI_Strncpy(dir, ".", 2);  /* no '/': parent is cwd */
    else {
        if (slash == dir) *(dir + 1) = '\0'; /* parent is the root dir */
        else *slash = '\0';                  /* chop the last component */
    }

    *dirnamep = dir;
    return;
}
/* scaleable_stat - stat the file once on rank 0 and broadcast the results.
 *
 * Rank 0 performs stat64() (for the underlying file system block size) and
 * statfs() (for the file system type magic, used below to plan the fsync
 * aggregation strategy); both values are then MPI_Bcast to every process so
 * that N processes do not all hammer the file system with metadata calls.
 *
 * Results are stored in the ADIOI_BG_fs structure hanging off fd->fs_ptr.
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bg_stat;
    struct statfs bg_statfs;
    int rank, rc;
    char * dir;
    /* buf[0] = block size, buf[1] = fs type magic.  BUGFIX: pre-load
     * defaults (1M matches the ADIOI_BG_Open default; -1 is a bogus magic)
     * so that a failed stat()/statfs() on rank 0 broadcasts well-defined
     * values instead of uninitialized stack contents. */
    long buf[2] = { 1048576, -1 };

    MPI_Comm_rank(fd->comm, &rank);

    if (rank == 0) {
        /* Get the (real) underlying file system block size */
        rc = stat64(fd->filename, &bg_stat);
        if (rc >= 0)
        {
            buf[0] = bg_stat.st_blksize;
            DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                         fd->filename,bg_stat.st_blksize);
        }
        else
        {
            /* keep the default block size in buf[0] */
            DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);
        }

        /* Get the (real) underlying file system type so we can
         * plan our fsync scaling strategy */
        rc = statfs(fd->filename,&bg_statfs);
        if (rc >= 0)
        {
            DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",
                         fd->filename,bg_statfs.f_type);
            buf[1] = bg_statfs.f_type;
        }
        else
        {
            DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);
            /* the file may not exist yet (e.g. being created): fall back
             * to statfs of its parent directory */
            ADIO_FileSysType_parentdir(fd->filename, &dir);
            rc = statfs(dir,&bg_statfs);
            if (rc >= 0)
            {
                DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",dir,bg_statfs.f_type);
                buf[1] = bg_statfs.f_type;
            }
            else
            {
                /* Hmm.  Guess we'll assume the worst-case, that it's not
                 * GPFS or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below;
                 * buf[1] already holds the bogus magic number -1. */
                DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
            }
            /* BUGFIX: dir was allocated via ADIOI_Strdup inside
             * ADIO_FileSysType_parentdir; release it with the matching
             * ADIOI_Free rather than plain free(). */
            ADIOI_Free(dir);
        }
    }

    /* now we can broadcast the stat/statfs data to everyone else */
    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->comm);
    bg_stat.st_blksize = buf[0];
    bg_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = bg_stat.st_blksize;

    /* data from statfs: enable fsync aggregation only on file systems
     * known to need a single fsync (GPFS or the bglockless target) */
    if ((bg_statfs.f_type == GPFS_SUPER_MAGIC) ||
        (bg_statfs.f_type == bglocklessmpio_f_type))
    {
        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BG_FSYNC_AGGREGATION_ENABLED;

        /* Only one rank is an "fsync aggregator" because only one
         * fsync is needed */
        if (rank == 0)
        {
            ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr |=
                ADIOI_BG_FSYNC_AGGREGATOR;
            DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
        }
        else
            ; /* aggregation enabled but this rank is not an aggregator*/
    }
    else
        ; /* Other filesystems default to no fsync aggregation */
}
/* ADIOI_BG_Open - open fd->filename with open(2) and initialize the
 * BG-specific per-file state.
 *
 * On success: fd->fd_sys holds the POSIX descriptor, fd->fs_ptr points to a
 * freshly allocated ADIOI_BG_fs (block size and fsync-aggregation flags are
 * filled in by scaleable_stat), and *error_code is MPI_SUCCESS.  On failure,
 * errno is mapped to an appropriate MPI error class in *error_code.
 */
void ADIOI_BG_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode;
    static char myname[] = "ADIOI_BG_OPEN";

    /* set internal variables for tuning environment variables */
    ad_bg_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* no explicit permissions requested: derive them from the process
         * umask.  umask() can only be read by setting it, hence the
         * set-then-restore pair. */
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    /* translate ADIO access-mode flags into open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
    fd->fd_direct = -1;  /* direct I/O descriptor unused here */

    /* in append mode, position both the individual and system file
     * pointers at the current end of file */
    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {
        /* Initialize the ad_bg file system specific information */
        ADIOI_BG_assert(fd->fs_ptr == NULL);
        fd->fs_ptr = (ADIOI_BG_fs*) ADIOI_Malloc(sizeof(ADIOI_BG_fs));

        ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */

        /* default is no fsync aggregation */
        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BG_FSYNC_AGGREGATION_DISABLED;

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        /* rank 0 stats the file and broadcasts blocksize/fs-type;
         * overrides the defaults set just above on success */
        scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
    }

    /* map errno from a failed open(2) to an MPI error class */
    if (fd->fd_sys == -1) {
        if (errno == ENAMETOOLONG)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_BAD_FILE,
                                               "**filenamelong",
                                               "**filenamelong %s %d",
                                               fd->filename,
                                               strlen(fd->filename));
        else if (errno == ENOENT)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_NO_SUCH_FILE,
                                               "**filenoexist",
                                               "**filenoexist %s",
                                               fd->filename);
        else if (errno == ENOTDIR || errno == ELOOP)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_BAD_FILE,
                                               "**filenamedir",
                                               "**filenamedir %s",
                                               fd->filename);
        else if (errno == EACCES) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_ACCESS,
                                               "**fileaccess",
                                               "**fileaccess %s",
                                               fd->filename );
        }
        else if (errno == EROFS) {
            /* Read only file or file system and write access requested */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_READ_ONLY,
                                               "**ioneedrd", 0 );
        }
        else {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
    }
    else *error_code = MPI_SUCCESS;
}
/*
*vim: ts=8 sts=4 sw=4 noexpandtab
*/

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,558 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_read.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
#include "adio_extern.h"
#include "ad_bg_tuning.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
/* ADIOI_BG_ReadContig - contiguous read: lseek(2) + read(2) on fd->fd_sys.
 *
 * file_ptr_type selects either an explicit offset or the individual file
 * pointer (which is advanced on completion).  Byte-range locks are taken
 * around the read: a write (exclusive) lock under atomic mode, a read lock
 * otherwise.  *error_code receives MPI_SUCCESS or an MPI_ERR_IO code; on
 * success the byte count is recorded in *status when supported.
 */
void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count,
                         MPI_Datatype datatype, int file_ptr_type,
                         ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int err=-1, datatype_size;
    ADIO_Offset len;
    static char myname[] = "ADIOI_BG_READCONTIG";
#if BG_PROFILE
    /* timing */
    double io_time, io_time2;
#endif

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5034, 0, NULL);
#endif

    /* BUGFIX: compute the transfer length before any profiling code uses
     * it.  The BG_PROFILE block below previously accumulated 'len' into
     * bgmpio_prof_cr while 'len' was still uninitialized. */
    MPI_Type_size(datatype, &datatype_size);
    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
    ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */

#if BG_PROFILE
    if (bgmpio_timing) {
        io_time = MPI_Wtime();
        bgmpio_prof_cr[ BGMPIO_CIO_DATA_SIZE ] += len;
    }
#endif

#if BG_PROFILE

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        if (fd->fp_sys_posn != offset)
            lseek(fd->fd_sys, offset, SEEK_SET);
        if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        err = read(fd->fd_sys, buf, (unsigned int)len);
        if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else {  /* read from curr. location of ind. file pointer */
        offset = fd->fp_ind;
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        err = read(fd->fd_sys, buf, (unsigned int)len);
        if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }

#else   /* BG_PROFILE */

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        if (fd->fp_sys_posn != offset)
            lseek(fd->fd_sys, offset, SEEK_SET);
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
        err = read(fd->fd_sys, buf, (unsigned int)len);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else {  /* read from curr. location of ind. file pointer */
        offset = fd->fp_ind;
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
        err = read(fd->fd_sys, buf, (unsigned int)len);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }

#endif  /* BG_PROFILE */

#if BG_PROFILE
    if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io", "**io %s", strerror(errno));
        return;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5035, 0, NULL);
#endif
}
/* ADIOI_BUFFERED_READ - data-sieving helper used by ADIOI_BG_ReadStrided.
 *
 * Copies req_len bytes at file offset req_off into (char*)buf + userbuf_off,
 * satisfying the request from the sieving buffer 'readbuf' (covering file
 * range [readbuf_off, readbuf_off + readbuf_len)) and refilling it with
 * read(2) as needed.  If the request only partially overlaps the buffer,
 * the unread tail is preserved and the buffer is extended/refilled.
 *
 * This is a statement macro, not a function: it reads and updates the
 * caller's locals req_off, req_len, userbuf_off, readbuf, readbuf_off,
 * readbuf_len, end_offset, max_bufsize, partial_read, tmp_buf, err and
 * err_flag (err_flag is set to 1 on any read(2) failure), and takes
 * byte-range read locks unless fd->atomicity already holds a write lock.
 */
#define ADIOI_BUFFERED_READ \
{ \
    if (req_off >= readbuf_off + readbuf_len) { \
        readbuf_off = req_off; \
        readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
        lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
        err = read(fd->fd_sys, readbuf, readbuf_len);\
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
        if (err == -1) err_flag = 1; \
    } \
    while (req_len > readbuf_off + readbuf_len - req_off) { \
        ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off));\
        partial_read = (int) (readbuf_off + readbuf_len - req_off); \
        tmp_buf = (char *) ADIOI_Malloc(partial_read); \
        memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
        ADIOI_Free(readbuf); \
        readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
        memcpy(readbuf, tmp_buf, partial_read); \
        ADIOI_Free(tmp_buf); \
        readbuf_off += readbuf_len-partial_read; \
        readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
                                                           end_offset-readbuf_off+1)); \
        lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
        err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read);\
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
        if (err == -1) err_flag = 1; \
    } \
    ADIOI_Assert(req_len == (size_t)req_len); \
    memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
}
/* ADIOI_BG_ReadStrided - noncontiguous read with data sieving.
 *
 * Handles the three noncontiguous combinations (noncontig memory / contig
 * file, contig memory / noncontig file, noncontig / noncontig) by walking
 * the flattened buffer/file datatypes and servicing each contiguous piece
 * through the ADIOI_BUFFERED_READ sieving macro.  If the user disabled data
 * sieving via the romio_ds_read hint, falls back to the naive strided
 * implementation.  offset is in units of etype relative to the filetype.
 */
void ADIOI_BG_ReadStrided(ADIO_File fd, void *buf, int count,
                          MPI_Datatype datatype, int file_ptr_type,
                          ADIO_Offset offset, ADIO_Status *status, int
                          *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, new_brd_size, brd_size, size;
    int i, j, k, err=-1, st_index=0;
    ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
    unsigned num, bufsize;
    int n_etypes_in_filetype;
    ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size, partial_read;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off, req_len, sum;
    ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
    char *readbuf, *tmp_buf, *value;
    int err_flag=0, info_flag;
    unsigned max_bufsize, readbuf_len;
    static char myname[] = "ADIOI_BG_READSTRIDED";

    if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
        /* if user has disabled data sieving on reads, use naive
         * approach instead.
         */
        /*FPRINTF(stderr, "ADIOI_GEN_ReadStrided_naive(%d):\n", __LINE__);*/
        ADIOI_GEN_ReadStrided_naive(fd,
                                    buf,
                                    count,
                                    datatype,
                                    file_ptr_type,
                                    offset,
                                    status,
                                    error_code);
        return;
    }

    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* zero-size filetype: nothing to read */
    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    /* total number of bytes requested (must fit in unsigned) */
    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get max_bufsize (the sieving buffer size) from the info object. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);

    if (!buftype_is_contig && filetype_is_contig) {
        /* noncontiguous in memory, contiguous in file. */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
              fd->disp + (ADIO_Offset)etype_size * offset;

        start_off = off;
        end_offset = off + bufsize - 1;
        readbuf_off = off;
        readbuf = (char *) ADIOI_Malloc(max_bufsize);
        readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        /* prime the sieving buffer with the first chunk */
        lseek(fd->fd_sys, readbuf_off, SEEK_SET);
        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
        err = read(fd->fd_sys, readbuf, readbuf_len);
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);

        if (err == -1) err_flag = 1;

        /* copy each contiguous memory piece of each datatype replica */
        for (j=0; j<count; j++)
        {
            int i;
            for (i=0; i<flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_READ
                off += flat_buf->blocklens[i];
            }
        }

        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        ADIOI_Free(readbuf); /* malloced in the buffered_read macro */

        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    }

    else {  /* noncontiguous in file */

        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset = fd->fp_ind - disp;
            n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* frd_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset = flat_file->indices[i];
                    frd_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    frd_size = dist;
                    break;
                }
            }
            st_index = i; /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            /* explicit offset: convert the etype offset into a file
             * position within the flattened filetype */
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    frd_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                     abs_off_in_filetype;
        }

        start_off = offset;

        /* Wei-keng Liao: read request is within a single flat_file contig
         * block e.g. with subarray types that actually describe the whole
         * array */
        if (buftype_is_contig && bufsize <= frd_size) {
            ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                            offset, status, error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte that
                 * can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == frd_size) {
                    /* consumed the whole block: advance to the next
                     * nonzero-length block (may roll into the next
                     * filetype replica) */
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                               + n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            return;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

        st_frd_size = frd_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        frd_size = ADIOI_MIN(st_frd_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += frd_size;
            end_offset = off + frd_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }

            off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
            frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
        }

        /* if atomicity is true, lock (exclusive) the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        /* initial read into readbuf */
        readbuf_off = offset;
        readbuf = (char *) ADIOI_Malloc(max_bufsize);
        readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));

        lseek(fd->fd_sys, offset, SEEK_SET);
        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
        err = read(fd->fd_sys, readbuf, readbuf_len);
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);

        if (err == -1) err_flag = 1;

        if (buftype_is_contig && !filetype_is_contig) {
            /* contiguous in memory, noncontiguous in file. should be the most
               common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            frd_size = ADIOI_MIN(st_frd_size, bufsize);
            while (i_offset < bufsize) {
                if (frd_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       frd_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/

                    req_off = off;
                    req_len = frd_size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_READ
                }
                i_offset += frd_size;

                if (off + frd_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
                    off += frd_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by frd_size. */
                else {
                    /* move to the next nonzero-length file block */
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                          n_filetypes*(ADIO_Offset)filetype_extent;
                    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
                }
            }
        }
        else {
            /* noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            i_offset = flat_buf->indices[0];
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            frd_size = st_frd_size;
            brd_size = flat_buf->blocklens[0];

            /* walk both flattened lists in lockstep, transferring the
             * overlap of the current file block and memory block each
             * iteration */
            while (num < bufsize) {
                size = ADIOI_MIN(frd_size, brd_size);
                if (size) {
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = read(fd->fd_sys, ((char *) buf) + i, size); */

                    req_off = off;
                    req_len = size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_READ
                }

                new_frd_size = frd_size;
                new_brd_size = brd_size;

                if (size == frd_size) {
                    /* reached end of contiguous block in file */
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }

                    off = disp + flat_file->indices[j] +
                          n_filetypes*(ADIO_Offset)filetype_extent;

                    new_frd_size = flat_file->blocklens[j];
                    if (size != brd_size) {
                        i_offset += size;
                        new_brd_size -= size;
                    }
                }

                if (size == brd_size) {
                    /* reached end of contiguous block in memory */

                    k = (k + 1)%flat_buf->count;
                    buf_count++;
                    i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
                                flat_buf->indices[k]);
                    new_brd_size = flat_buf->blocklens[k];
                    if (size != frd_size) {
                        off += size;
                        new_frd_size -= size;
                    }
                }
                ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
                num += size;
                frd_size = new_frd_size;
                brd_size = new_brd_size;
            }
        }

        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

        ADIOI_Free(readbuf); /* malloced in the buffered_read macro */

        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually read and placed in buf
       by ADIOI_BUFFERED_READ. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -1,68 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_setsh.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
/* set the shared file pointer to "offset" etypes relative to the current
view */
/*
This looks very similar to ADIOI_GEN_Set_shared_fp, except this
function avoids locking the file twice. The generic version does
Write lock
ADIO_WriteContig
Unlock
For BG, ADIOI_BG_WriteContig does a lock before writing to disable
caching. To avoid the lock being called twice, this version for BG does
Write lock
Lseek
Write
Unlock
*/
/* ADIOI_BG_Set_shared_fp - store "offset" (in etypes relative to the
 * current view) as the shared file pointer value.
 *
 * Same contract as ADIOI_GEN_Set_shared_fp, but performs the lock / lseek /
 * write / unlock sequence by hand so the shared-fp file is locked only once
 * (ADIOI_BG_WriteContig would otherwise take a second lock of its own).
 */
void ADIOI_BG_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
{
    static char myname[] = "ADIOI_BG_SET_SHARED_FP";
    MPI_Comm dupcommself;
    int nbytes;

    /* lazily create the hidden shared-fp file on first use */
    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
        MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
                                     fd->shared_fp_fname,
                                     fd->file_system, fd->fns,
                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
                                     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
                                     ADIO_PERM_NULL, error_code);
    }

    if (*error_code != MPI_SUCCESS) return;

    /* exclusive access while the pointer value is rewritten at offset 0 */
    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
    nbytes = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

    *error_code = (nbytes == -1)
        ? MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                               myname, __LINE__, MPI_ERR_IO,
                               "**io",
                               "**io %s", strerror(errno))
        : MPI_SUCCESS;
}

Просмотреть файл

@ -1,164 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_tuning.c
* \brief Defines ad_bg performance tuning
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*---------------------------------------------------------------------
* ad_bg_tuning.c
*
* defines global variables and functions for performance tuning and
* functional debugging.
*---------------------------------------------------------------------*/
#include "ad_bg_tuning.h"
#include "mpi.h"
/* Fallback definition of the PVFS2 filesystem magic number, used below as the
 * default for bglocklessmpio_f_type when the system headers don't provide it. */
#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif

/* Tuning knobs set from the environment by ad_bg_get_env_vars(); see the
 * \page block below for the meaning and defaults of each variable. */
int bgmpio_timing;
int bgmpio_timing2;
int bgmpio_comm;
int bgmpio_tunegather;
int bgmpio_tuneblocking;
long bglocklessmpio_f_type;

/* Per-phase profiling accumulators for collective writes (cw) and reads (cr),
 * indexed by the BGMPIO_CIO_* enum. */
double bgmpio_prof_cw [BGMPIO_CIO_LAST];
double bgmpio_prof_cr [BGMPIO_CIO_LAST];
/* set internal variables for tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
\section env_sec Environment Variables
* - BGMPIO_COMM - Define how data is exchanged on collective
* reads and writes. Possible values:
* - 0 - Use MPI_Alltoallv.
* - 1 - Use MPI_Isend/MPI_Irecv.
* - Default is 0.
*
* - BGMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
* Must also compile the library with BG_PROFILE defined. Possible values:
* - 0 - Do not collect/report timing.
* - 1 - Collect/report timing.
* - Default is 0.
*
* - BGMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
* Must also compile the library with BG_PROFILE defined. Possible values:
* - 0 - Do not collect/report averages.
* - 1 - Collect/report averages.
* - Default is 0.
*
* - BGMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
* for aggregator collective i/o. Possible values:
* - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
* - Default is 1.
*
* - BGMPIO_TUNEBLOCKING - Tune how aggregate file domains are
* calculated (block size). Possible values:
* - 0 - Evenly calculate file domains across aggregators. Also use
* MPI_Isend/MPI_Irecv to exchange domain information.
* - 1 - Align file domains with the underlying file system's block size. Also use
* MPI_Alltoallv to exchange domain information.
* - Default is 1.
*
* - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
* the ad_bglockless driver. NOTE: Using romio prefixes (such as
* "bg:" or "bglockless:") on a file name will override this environment
* variable. Possible values:
* - 0xnnnnnnnn - Any valid file system type (or "magic number") from
* statfs() field f_type.
* - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
*
*/
/* Load the BGMPIO_* / BGLOCKLESSMPIO_F_TYPE environment variables into the
 * corresponding tuning globals.  Each global is first given its built-in
 * default and then overridden when the matching variable is set. */
void ad_bg_get_env_vars() {
    const char *envval;
    char *endptr;

    /* defaults first ... */
    bgmpio_comm           = 0;
    bgmpio_timing         = 0;
    bgmpio_timing2        = 0;
    bgmpio_tunegather     = 1;
    bgmpio_tuneblocking   = 1;
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;

    /* ... then let the environment override them */
    if ((envval = getenv("BGMPIO_COMM")) != NULL)
        bgmpio_comm = atoi(envval);
    if ((envval = getenv("BGMPIO_TIMING")) != NULL)
        bgmpio_timing = atoi(envval);
    if ((envval = getenv("BGMPIO_TIMING2")) != NULL)
        bgmpio_timing2 = atoi(envval);
    if ((envval = getenv("BGMPIO_TUNEGATHER")) != NULL)
        bgmpio_tunegather = atoi(envval);
    if ((envval = getenv("BGMPIO_TUNEBLOCKING")) != NULL)
        bgmpio_tuneblocking = atoi(envval);
    /* base 0 lets users write the f_type as decimal, octal or 0x-hex */
    if ((envval = getenv("BGLOCKLESSMPIO_F_TYPE")) != NULL)
        bglocklessmpio_f_type = strtol(envval, &endptr, 0);

    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
                bglocklessmpio_f_type,bglocklessmpio_f_type);
}
/* report timing breakdown for MPI I/O collective call */
/* Report the timing breakdown for an MPI I/O collective call.
 *
 * Reduces the per-rank profiling counters (bgmpio_prof_cw for writes when
 * rw != 0, bgmpio_prof_cr for reads) to rank 0 as both a sum (averaged) and
 * a max, then prints a single-line summary from rank 0.  Bandwidth entries
 * are derived from DATA_SIZE and the max elapsed times; the B_POSI_RW and
 * B_MPIO_RW entries are only computed when bgmpio_timing2 is set.
 *
 * Parameters:
 *   rw     - nonzero selects the write counters, zero the read counters
 *   fd     - file handle; fd->comm is the communicator reduced over
 *   myrank - caller's rank in fd->comm (only rank 0 prints)
 *   nprocs - size of fd->comm, used for averaging
 */
void ad_bg_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (bgmpio_timing) {
        /* pick read or write accumulators */
        double *bgmpio_prof_org = bgmpio_prof_cr;
        if (rw) bgmpio_prof_org = bgmpio_prof_cw;

        double bgmpio_prof_avg[ BGMPIO_CIO_LAST ];
        double bgmpio_prof_max[ BGMPIO_CIO_LAST ];

        MPI_Reduce( bgmpio_prof_org, bgmpio_prof_avg, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, fd->comm );
        MPI_Reduce( bgmpio_prof_org, bgmpio_prof_max, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, fd->comm );

        if (myrank == 0) {
            /* turn the SUM reduction into per-rank averages */
            for (i=0; i<BGMPIO_CIO_LAST; i++) bgmpio_prof_avg[i] /= nprocs;

            /* aggregate bandwidths: total bytes / slowest rank's time */
            if (bgmpio_timing2) {
                bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW ] = bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs /
                    bgmpio_prof_max[ BGMPIO_CIO_T_POSI_RW ];
                bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW ] = bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs /
                    bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_RW ];
            } else {
                bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW ] = 0;
                bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW ] = 0;
            }
            bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] = bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs /
                bgmpio_prof_max[ BGMPIO_CIO_T_MPIO_CRW ];

            /* -a suffixes are averages, -m suffixes are maxima.
             * NOTE(review): no trailing "\n" is printed anywhere in this
             * report, so the whole thing comes out as one long line. */
            printf("\tTIMING-1 %1s , ", (rw ? "W" : "R") );
            printf( "SZ: %12.4f , ", bgmpio_prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs );
            printf( "SK-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_SEEK ] );
            printf( "SK-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_SEEK ] );
            printf( "LC-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_LCOMP ] );
            printf( "GA-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_GATHER ] );
            printf( "AN-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_PATANA ] );
            printf( "FD-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_FD_PART ] );
            printf( "MY-a: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_MYREQ ] );
            printf( "OT-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_OTHREQ ] );
            printf( "EX-m: %10.3f , ", bgmpio_prof_max[ BGMPIO_CIO_T_DEXCH ] );
            printf("\tTIMING-2 %1s , ", (rw ? "W" : "R") );
            printf( "PXT-m: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_POSI_RW ] );
            printf( "MPT-m: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_RW ] );
            printf("MPTC-m: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_T_MPIO_CRW ] );
            printf( "PXB: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_B_POSI_RW ] );
            printf( "MPB: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_RW ] );
            printf( "MPBC: %10.3f , ", bgmpio_prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] );
        }
    }
}

Просмотреть файл

@ -1,96 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_tuning.h
* \brief ???
*/
/*---------------------------------------------------------------------
* ad_bg_tuning.h
*
* declares global variables and macros for performance tuning and
* functional debugging.
*---------------------------------------------------------------------*/
#ifndef AD_BG_TUNING_H_
#define AD_BG_TUNING_H_

#include "adio.h"

/* Hard assertion: print file/line to stderr and abort the whole job. */
#define ADIOI_BG_assert( a ) if (!(a)) { \
    fprintf( stderr, "AD_BG_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
    MPI_Abort( MPI_COMM_WORLD, 1 ); \
    }

/*-----------------------------------------
 * Global variables for the control of
 * 1. timing
 * 2. select specific optimizations
 *-----------------------------------------*/

/* timing fields */
/* Indices into bgmpio_prof_cw / bgmpio_prof_cr.  T_* entries hold elapsed
 * times, B_* entries hold derived bandwidths, DATA_SIZE holds bytes moved. */
enum {
    BGMPIO_CIO_DATA_SIZE=0,
    BGMPIO_CIO_T_SEEK,
    BGMPIO_CIO_T_LCOMP,	/* time for ADIOI_Calc_my_off_len(), local */
    BGMPIO_CIO_T_GATHER,	/* time for previous MPI_Allgather, now Allreduce */
    BGMPIO_CIO_T_PATANA,	/* time for a quick test if access is contiguous or not, local */
    BGMPIO_CIO_T_FD_PART,	/* time for file domain partitioning, local */
    BGMPIO_CIO_T_MYREQ,	/* time for ADIOI_BG_Calc_my_req(), local */
    BGMPIO_CIO_T_OTHREQ,	/* time for ADIOI_Calc_others_req(), short Alltoall */
    BGMPIO_CIO_T_DEXCH,	/* time for I/O data exchange */
    BGMPIO_CIO_T_POSI_RW,
    BGMPIO_CIO_B_POSI_RW,
    BGMPIO_CIO_T_MPIO_RW,	/* time for ADIOI_BG_WriteContig() */
    BGMPIO_CIO_B_MPIO_RW,
    BGMPIO_CIO_T_MPIO_CRW,	/* time for ADIOI_BG_WriteStridedColl() */
    BGMPIO_CIO_B_MPIO_CRW,
    BGMPIO_CIO_LAST
};

/* profiling counters, defined in ad_bg_tuning.c */
extern double 	bgmpio_prof_cw    [BGMPIO_CIO_LAST];
extern double 	bgmpio_prof_cr    [BGMPIO_CIO_LAST];

/* corresponds to environment variables to select optimizations and timing level */
extern int 	bgmpio_timing;
extern int 	bgmpio_timing2;
extern int 	bgmpio_comm;
extern int 	bgmpio_tunegather;
extern int 	bgmpio_tuneblocking;
extern long 	bglocklessmpio_f_type;

/* set internal variables for tuning environment variables */
void ad_bg_get_env_vars();

/* report timing breakdown for MPI I/O collective call */
/* NOTE(review): the implementation file defines ad_bg_wr_timing_report(),
 * not ad_bg_timing_crw_report() — this declaration appears stale; confirm
 * no caller uses it. */
void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );

/* note:
 *   T := timing;
 * CIO := collective I/O
 */
/* NOTE(review): the three macros below test `bgmpio_timing_cw_level` and
 * iterate to `BGMPIO_T_LAST`, neither of which is declared in this header
 * (only bgmpio_timing/bgmpio_timing2 and BGMPIO_CIO_LAST exist above).
 * They look like dead code unless those names are defined elsewhere —
 * confirm before use. */
#define BGMPIO_T_CIO_RESET( LEVEL, RW ) \
	if (bgmpio_timing_cw_level >= LEVEL) { \
	  int i; \
	  for ( i = 0; i < BGMPIO_T_LAST; i ++ ) \
	    bgmpio_prof_c##RW [ i ] = 0; \
	}

#define BGMPIO_T_CIO_REPORT( LEVEL, RW, FD, MYRANK, NPROCS ) \
	if (bgmpio_timing_cw_level >= LEVEL) { \
	  ad_bg_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
	}

#define BGMPIO_T_CIO_SET_GET( LEVEL, RW, DOBAR, ISSET, ISGET, VAR1, VAR2 ) \
	if (bgmpio_timing_cw_level >= LEVEL) { \
	  if ( DOBAR ) MPI_Barrier( fd->comm ); \
	  double temp = MPI_Wtime(); \
	  if ( ISSET ) bgmpio_prof_c##RW [ VAR1 ] = temp; \
	  if ( ISGET ) bgmpio_prof_c##RW [ VAR2 ] = temp - bgmpio_prof_c##RW [ VAR2 ] ; \
	}

#endif  /* AD_BG_TUNING_H_ */

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,611 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_write.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bg.h"
#include "adio_extern.h"
#include "ad_bg_tuning.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
/* Contiguous write for the BG ADIO driver.
 *
 * Writes count elements of datatype from buf at either the explicit offset
 * (ADIO_EXPLICIT_OFFSET) or the individual file pointer (ADIO_INDIVIDUAL),
 * taking a byte-range write lock around the write() to disable client-side
 * caching.  Updates fd->fp_sys_posn (and fd->fp_ind for the individual case),
 * fills *status when HAVE_STATUS_SET_BYTES, and sets *error_code.
 *
 * BUGFIX: the BG_PROFILE branch previously accumulated `len` into
 * bgmpio_prof_cw[BGMPIO_CIO_DATA_SIZE] *before* `len` was computed from the
 * datatype size, charging an uninitialized value to the profile counter.
 * The size computation now happens first.
 */
void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count,
                          MPI_Datatype datatype, int file_ptr_type,
                          ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int err=-1, datatype_size;
    ADIO_Offset len;
    static char myname[] = "ADIOI_BG_WRITECONTIG";
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5036, 0, NULL);
#endif
#if BG_PROFILE
    /* timing */
    double io_time, io_time2;
#endif

    /* compute the request size in bytes before anything profiles it */
    MPI_Type_size(datatype, &datatype_size);
    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
    ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */

#if BG_PROFILE
    if (bgmpio_timing) {
        io_time = MPI_Wtime();
        bgmpio_prof_cw[ BGMPIO_CIO_DATA_SIZE ] += len; /* len is now defined */
    }

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        if (fd->fp_sys_posn != offset)
            lseek(fd->fd_sys, offset, SEEK_SET);
        if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
        /* lock disables client-side caching on BG */
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        err = write(fd->fd_sys, buf, (unsigned int)len);
        if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else { /* write from curr. location of ind. file pointer */
        offset = fd->fp_ind;
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        if (bgmpio_timing2) io_time2 = MPI_Wtime();
        err = write(fd->fd_sys, buf, (unsigned int)len);
        if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }
#else /* BG_PROFILE */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        if (fd->fp_sys_posn != offset)
            lseek(fd->fd_sys, offset, SEEK_SET);
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        err = write(fd->fd_sys, buf, (unsigned int)len);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else { /* write from curr. location of ind. file pointer */
        offset = fd->fp_ind;
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        err = write(fd->fd_sys, buf, (unsigned int)len);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }
#endif /* BG_PROFILE */

#if BG_PROFILE
    if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(errno));
        return;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5037, 0, NULL);
#endif
}
#define ADIOI_BUFFERED_WRITE \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
if (err == -1) { \
*error_code = MPIO_Err_create_code(MPI_SUCCESS, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
return; \
} \
} \
write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
err = read(fd->fd_sys, writebuf, writebuf_len); \
if (err == -1) { \
*error_code = MPIO_Err_create_code(MPI_SUCCESS, \
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
return; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \
}
/* this macro is used when filetype is contig and buftype is not contig.
it does not do a read-modify-write and does not lock*/
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
if (req_off >= writebuf_off + writebuf_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
writebuf_off = req_off; \
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
} \
write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
while (write_sz != req_len) { \
lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
err = write(fd->fd_sys, writebuf, writebuf_len); \
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
if (err == -1) err_flag = 1; \
req_len -= write_sz; \
userbuf_off += write_sz; \
writebuf_off += writebuf_len; \
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
} \
}
/* Strided (noncontiguous) write via data sieving for the BG driver.
 *
 * Handles three cases: (1) noncontiguous memory / contiguous file,
 * (2) contiguous memory / noncontiguous file, (3) noncontiguous in both.
 * Cases 2 and 3 perform data sieving through the ADIOI_BUFFERED_WRITE
 * read-modify-write macro; case 1 uses the no-read variant.  If the
 * "ds_write" hint disables data sieving, the generic naive path is used.
 *
 * Parameters mirror ADIO_WriteContig; offset is in etypes relative to the
 * filetype and is ignored for ADIO_INDIVIDUAL (fd->fp_ind is used instead).
 */
void ADIOI_BG_WriteStrided(ADIO_File fd, const void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
/* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, err=-1, st_index=0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
    char *writebuf, *value;
    unsigned bufsize, writebuf_len, max_bufsize, write_sz;
    int err_flag=0, info_flag;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
    static char myname[] = "ADIOI_BG_WRITESTRIDED";

    /* data sieving disabled by hint: fall back to the naive generic path */
    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
	/* if user has disabled data sieving on reads, use naive
	 * approach instead.
	 */
	/*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/
	ADIOI_GEN_WriteStrided_naive(fd,
				     buf,
				     count,
				     datatype,
				     file_ptr_type,
				     offset,
				     status,
				     error_code);
	return;
    }

    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    MPI_Type_size(fd->filetype, &filetype_size);
    /* zero-size filetype: nothing to write */
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS;
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get max_bufsize from the info object. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);

    if (!buftype_is_contig && filetype_is_contig) {

/* noncontiguous in memory, contiguous in file. */

	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
	while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
	    fd->disp + etype_size * offset;

	start_off = off;
	end_offset = off + bufsize - 1;
	writebuf_off = off;
	writebuf = (char *) ADIOI_Malloc(max_bufsize);
	writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));

/* if atomicity is true, lock the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	/* copy each flattened memory block into the sieve buffer; the file
	 * region is contiguous so no read-back is needed */
	for (j=0; j<count; j++)
	{
	    int i;
	    for (i=0; i<flat_buf->count; i++) {
		userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
		req_off = off;
		req_len = flat_buf->blocklens[i];
		ADIOI_BUFFERED_WRITE_WITHOUT_READ
		off += flat_buf->blocklens[i];
	    }
	}

	/* write the buffer out finally */
	lseek(fd->fd_sys, writebuf_off, SEEK_SET);
	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
	err = write(fd->fd_sys, writebuf, writebuf_len);
	if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
	if (err == -1) err_flag = 1;

	if (fd->atomicity)
	    ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	ADIOI_Free(writebuf); /* malloced in the buffered_write macro */

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    else {  /* noncontiguous in file */

/* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
	    /* Wei-keng reworked type processing to be a bit more efficient */
	    offset = fd->fp_ind - disp;
	    n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
	    offset -= (ADIO_Offset)n_filetypes * filetype_extent;
	    /* now offset is local to this extent */

	    /* find the block where offset is located, skip blocklens[i]==0 */
	    for (i=0; i<flat_file->count; i++) {
		ADIO_Offset dist;
		if (flat_file->blocklens[i] == 0) continue;
		dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
		/* fwr_size is from offset to the end of block i */
		if (dist == 0) {
		    i++;
		    offset = flat_file->indices[i];
		    fwr_size = flat_file->blocklens[i];
		    break;
		}
		if (dist > 0) {
		    fwr_size = dist;
		    break;
		}
	    }
	    st_index = i;  /* starting index in flat_file->indices[] */
	    offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
	}
	else {
	    /* explicit offset: count whole filetypes, then locate the etype
	     * offset within the flattened filetype */
	    int i;
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = offset / n_etypes_in_filetype;
	    etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;

	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    fwr_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
		abs_off_in_filetype;
	}

	start_off = offset;

	/* Wei-keng Liao:write request is within single flat_file contig block*/
	/* this could happen, for example, with subarray types that are
	 * actually fairly contiguous */
	if (buftype_is_contig && bufsize <= fwr_size) {
	    ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
			     offset, status, error_code);

	    if (file_ptr_type == ADIO_INDIVIDUAL) {
		/* update MPI-IO file pointer to point to the first byte
		 * that can be accessed in the fileview. */
		fd->fp_ind = offset + bufsize;
		if (bufsize == fwr_size) {
		    do {
			st_index++;
			if (st_index == flat_file->count) {
			    st_index = 0;
			    n_filetypes++;
			}
		    } while (flat_file->blocklens[st_index] == 0);
		    fd->fp_ind = disp + flat_file->indices[st_index]
			+ (ADIO_Offset)n_filetypes*filetype_extent;
		}
	    }
	    fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
	    MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
	    return;
	}

	/* Calculate end_offset, the last byte-offset that will be accessed.
	   e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/

	st_fwr_size = fwr_size;
	st_n_filetypes = n_filetypes;
	i_offset = 0;
	j = st_index;
	off = offset;
	fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
	while (i_offset < bufsize) {
	    i_offset += fwr_size;
	    end_offset = off + fwr_size - 1;

	    j = (j+1) % flat_file->count;
	    n_filetypes += (j == 0) ? 1 : 0;
	    while (flat_file->blocklens[j]==0) {
		j = (j+1) % flat_file->count;
		n_filetypes += (j == 0) ? 1 : 0;
	    }

	    off = disp + flat_file->indices[j] +
		n_filetypes*(ADIO_Offset)filetype_extent;
	    fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	}

/* if atomicity is true, lock the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	/* initial read for the read-modify-write */
	writebuf_off = offset;
	writebuf = (char *) ADIOI_Malloc(max_bufsize);
	writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
	lseek(fd->fd_sys, writebuf_off, SEEK_SET);
	err = read(fd->fd_sys, writebuf, writebuf_len);
	/* NOTE(review): this error path returns without freeing writebuf or
	 * releasing the lock taken just above (and the same pattern exists
	 * inside ADIOI_BUFFERED_WRITE) — leak/lock-retention on failure. */
	if (err == -1) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE,
					       myname, __LINE__,
					       MPI_ERR_IO,
					       "ADIOI_BG_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
	    return;
	}

	if (buftype_is_contig && !filetype_is_contig) {

/* contiguous in memory, noncontiguous in file. should be the most
   common case. */

	    i_offset = 0;
	    j = st_index;
	    off = offset;
	    n_filetypes = st_n_filetypes;
	    fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
	    while (i_offset < bufsize) {
		if (fwr_size) {
		    /* TYPE_UB and TYPE_LB can result in
		       fwr_size = 0. save system call in such cases */
		    /* lseek(fd->fd_sys, off, SEEK_SET);
		       err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

		    req_off = off;
		    req_len = fwr_size;
		    userbuf_off = i_offset;
		    ADIOI_BUFFERED_WRITE
		}
		i_offset += fwr_size;

		if (off + fwr_size < disp + flat_file->indices[j] +
		    flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
		    off += fwr_size;
		/* did not reach end of contiguous block in filetype.
		   no more I/O needed. off is incremented by fwr_size. */
		else {
		    /* advance to the next nonzero file block */
		    j = (j+1) % flat_file->count;
		    n_filetypes += (j == 0) ? 1 : 0;
		    while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
			n_filetypes += (j == 0) ? 1 : 0;
		    }
		    off = disp + flat_file->indices[j] +
			n_filetypes*(ADIO_Offset)filetype_extent;
		    fwr_size = ADIOI_MIN(flat_file->blocklens[j],
					 bufsize-i_offset);
		}
	    }
	}
	else {
/* noncontiguous in memory as well as in file */

	    ADIOI_Flatten_datatype(datatype);
	    flat_buf = ADIOI_Flatlist;
	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	    k = num = buf_count = 0;
	    i_offset = flat_buf->indices[0];
	    j = st_index;
	    off = offset;
	    n_filetypes = st_n_filetypes;
	    fwr_size = st_fwr_size;
	    bwr_size = flat_buf->blocklens[0];

	    /* walk both flattened lists in lockstep, writing the overlap of
	     * the current memory block and the current file block each pass */
	    while (num < bufsize) {
		size = ADIOI_MIN(fwr_size, bwr_size);
		if (size) {
		    /* lseek(fd->fd_sys, off, SEEK_SET);
		       err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */

		    req_off = off;
		    req_len = size;
		    userbuf_off = i_offset;
		    ADIOI_BUFFERED_WRITE
		}

		new_fwr_size = fwr_size;
		new_bwr_size = bwr_size;

		if (size == fwr_size) {
/* reached end of contiguous block in file */
		    j = (j+1) % flat_file->count;
		    n_filetypes += (j == 0) ? 1 : 0;
		    while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
			n_filetypes += (j == 0) ? 1 : 0;
		    }

		    off = disp + flat_file->indices[j] +
			n_filetypes*(ADIO_Offset)filetype_extent;

		    new_fwr_size = flat_file->blocklens[j];
		    if (size != bwr_size) {
			i_offset += size;
			new_bwr_size -= size;
		    }
		}

		if (size == bwr_size) {
/* reached end of contiguous block in memory */

		    k = (k + 1)%flat_buf->count;
		    buf_count++;
		    i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
			flat_buf->indices[k];
		    new_bwr_size = flat_buf->blocklens[k];
		    if (size != fwr_size) {
			off += size;
			new_fwr_size -= size;
		    }
		}
		num += size;
		fwr_size = new_fwr_size;
		bwr_size = new_bwr_size;
	    }
	}

	/* write the buffer out finally */
	lseek(fd->fd_sys, writebuf_off, SEEK_SET);
	if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
	err = write(fd->fd_sys, writebuf, writebuf_len);
	if (!(fd->atomicity))
	    ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
	else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	if (err == -1) err_flag = 1;

	ADIOI_Free(writebuf); /* malloced in the buffered_write macro */

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
   keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -1,34 +0,0 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
##     See COPYRIGHT in top-level directory.
##

## Fold the Blue Gene/L ADIO driver into the ROMIO build.  BUILD_AD_BGL is
## an automake conditional (defined by configure); when it is false none of
## these headers or sources are added to the build.
if BUILD_AD_BGL

## driver-private headers (installed nowhere: noinst_)
noinst_HEADERS +=              \
    adio/ad_bgl/ad_bgl.h       \
    adio/ad_bgl/ad_bgl_aggrs.h \
    adio/ad_bgl/ad_bgl_pset.h  \
    adio/ad_bgl/ad_bgl_tuning.h

## implementation sources appended to the ROMIO-wide source list
romio_other_sources +=            \
    adio/ad_bgl/ad_bgl_open.c     \
    adio/ad_bgl/ad_bgl_close.c    \
    adio/ad_bgl/ad_bgl_fcntl.c    \
    adio/ad_bgl/ad_bgl_flush.c    \
    adio/ad_bgl/ad_bgl_read.c     \
    adio/ad_bgl/ad_bgl_write.c    \
    adio/ad_bgl/ad_bgl_getsh.c    \
    adio/ad_bgl/ad_bgl_setsh.c    \
    adio/ad_bgl/ad_bgl.c          \
    adio/ad_bgl/ad_bgl_aggrs.c    \
    adio/ad_bgl/ad_bgl_pset.c     \
    adio/ad_bgl/ad_bgl_hints.c    \
    adio/ad_bgl/ad_bgl_rdcoll.c   \
    adio/ad_bgl/ad_bgl_wrcoll.c   \
    adio/ad_bgl/ad_bgl_tuning.c

endif BUILD_AD_BGL

Просмотреть файл

@ -1,97 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl.h
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_BGL_INCLUDE
#define AD_BGL_INCLUDE

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_AIO_H
#include <aio.h>
#endif

/* Async I/O entry point — currently disabled for the BGL driver. */
#if 0
int ADIOI_BGL_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
		  int wr, void *handle);
#endif

/* open/close */
void ADIOI_BGL_Open(ADIO_File fd, int *error_code);
void ADIOI_BGL_Close(ADIO_File fd, int *error_code);

/* contiguous blocking I/O */
void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
			  MPI_Datatype datatype, int file_ptr_type,
			  ADIO_Offset offset, ADIO_Status *status, int
			  *error_code);
void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
			   MPI_Datatype datatype, int file_ptr_type,
			   ADIO_Offset offset, ADIO_Status *status, int
			   *error_code);

/* nonblocking/split I/O prototypes — also disabled for BGL. */
#if 0
void ADIOI_BGL_IwriteContig(ADIO_File fd, void *buf, int count,
			    MPI_Datatype datatype, int file_ptr_type,
			    ADIO_Offset offset, ADIO_Request *request, int
			    *error_code);
void ADIOI_BGL_IreadContig(ADIO_File fd, void *buf, int count,
			   MPI_Datatype datatype, int file_ptr_type,
			   ADIO_Offset offset, ADIO_Request *request, int
			   *error_code);
int ADIOI_BGL_ReadDone(ADIO_Request *request, ADIO_Status *status, int
		       *error_code);
int ADIOI_BGL_WriteDone(ADIO_Request *request, ADIO_Status *status, int
			*error_code);
void ADIOI_BGL_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
			    *error_code);
void ADIOI_BGL_WriteComplete(ADIO_Request *request, ADIO_Status *status,
			     int *error_code);
#endif

/* control operations */
void ADIOI_BGL_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
		     *error_code);
void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);

/* strided (data-sieving) and collective I/O */
void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
			    MPI_Datatype datatype, int file_ptr_type,
			    ADIO_Offset offset, ADIO_Status *status, int
			    *error_code);
void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
			   MPI_Datatype datatype, int file_ptr_type,
			   ADIO_Offset offset, ADIO_Status *status, int
			   *error_code);
void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
			       MPI_Datatype datatype, int file_ptr_type,
			       ADIO_Offset offset, ADIO_Status *status, int
			       *error_code);
void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
				MPI_Datatype datatype, int file_ptr_type,
				ADIO_Offset offset, ADIO_Status *status, int
				*error_code);

/* shared file pointer and flush */
void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code);

#include "ad_bgl_tuning.h"

#endif

Просмотреть файл

@ -1,966 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_aggrs.c
* \brief The externally used function from this file is is declared in ad_bgl_aggrs.h
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_cb_config_list.h"
#include "ad_bgl.h"
#include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
/* File-scope scratch buffer holding the per-PSET aggregator counts
 * produced by ADIOI_BGL_gen_agg_ranklist().  Grown on demand to the
 * largest communicator size seen and reused across file opens. */
static int aggrsInPsetSize=0;
static int *aggrsInPset=NULL;
/* Comments copied from common:
* This file contains four functions:
*
* ADIOI_Calc_aggregator()
* ADIOI_Calc_file_domains()
* ADIOI_Calc_my_req()
* ADIOI_Calc_others_req()
*
* The last three of these were originally in ad_read_coll.c, but they are
* also shared with ad_write_coll.c. I felt that they were better kept with
* the rest of the shared aggregation code.
*/
/* Discussion of values available from above:
*
* ADIO_Offset st_offsets[0..nprocs-1]
* ADIO_Offset end_offsets[0..nprocs-1]
* These contain a list of start and end offsets for each process in
* the communicator. For example, an access at loc 10, size 10 would
* have a start offset of 10 and end offset of 19.
* int nprocs
* number of processors in the collective I/O communicator
* ADIO_Offset min_st_offset
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
* starting location of "file domain"; region that a given process will
* perform aggregation for (i.e. actually do I/O)
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
* start + size - 1 roughly, but it can be less, or 0, in the case of
* uneven distributions
*/
/* forward declaration -- defined below, used by ADIOI_BGL_gen_agg_ranklist */
static void
ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
                                        const ADIOI_BGL_ConfInfo_t *confInfo,
                                        ADIOI_BGL_ProcInfo_t *all_procInfo,
                                        int *aggrsInPset );
/*
 * Compute the aggregator-related parameters that are required in 2-phase
 * collective IO of ADIO:
 *  . the number of aggregators (proxies) : fd->hints->cb_nodes
 *  . the ranks of the aggregators       : fd->hints->ranklist
 * By computing these two parameters in a BGL-PSET-aware way, the default
 * 2-phase collective IO of ADIO can work more efficiently.
 *
 * This routine is collective over fd->comm: every rank contributes its
 * personality information, rank 0 selects the aggregators, and the result
 * is broadcast back to all ranks.  Always returns 0.
 */
int
ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
{
    int r, s;
    ADIOI_BGL_ProcInfo_t *procInfo, *all_procInfo;
    ADIOI_BGL_ConfInfo_t *confInfo;

    MPI_Comm_size( fd->comm, &s );
    MPI_Comm_rank( fd->comm, &r );

    /* Collect individual BGL personality information */
    confInfo = ADIOI_BGL_ConfInfo_new ();
    procInfo = ADIOI_BGL_ProcInfo_new ();
    ADIOI_BGL_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset );

    /* Gather BGL personality information onto process 0.  Note the
     * receive buffer is allocated on every rank, not just the root. */
    /* if (r == 0) */
    all_procInfo = ADIOI_BGL_ProcInfo_new_n (s);

    /* Grow the file-scope aggrsInPset cache if this communicator is
     * larger than anything seen so far. */
    if(s > aggrsInPsetSize)
    {
        if(aggrsInPset) ADIOI_Free(aggrsInPset);
        aggrsInPset = (int *) ADIOI_Malloc (s *sizeof(int));
        aggrsInPsetSize = s;
    }

    MPI_Gather( (void *)procInfo, sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE,
                (void *)all_procInfo, sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE,
                0,
                fd->comm );

    /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
    if (r == 0) {
        ADIOI_BGL_compute_agg_ranklist_serial (fd, confInfo, all_procInfo, aggrsInPset);
        /* ADIOI_BGL_ProcInfo_free (all_procInfo); */
    }
    ADIOI_BGL_ProcInfo_free (all_procInfo);

    /* Send the info of IO proxy CN to all processes and keep the info in
       fd->hints struct.  Declared in adio_cb_config_list.h */
    ADIOI_cb_bcast_rank_map(fd);

    /* Broadcast the BGL-GPFS related file domain info.
     * NOTE(review): cb_nodes*sizeof(int) bytes are broadcast, while rank 0
     * filled n_psets+1 entries of aggrsInPset -- this assumes cb_nodes is
     * always at least that large; confirm against the rank-map setup. */
    MPI_Bcast( (void *)aggrsInPset,
               fd->hints->cb_nodes * sizeof(int), MPI_BYTE,
               0,
               fd->comm );

    ADIOI_BGL_persInfo_free( confInfo, procInfo );
    return 0;
}
/*
 * Choose which compute nodes of a single PSET act as I/O aggregators and
 * record their global ranks in tmp_ranklist.  The selection criterion is
 * deliberately isolated here so alternative policies are easy to try.
 *
 * Returns the number of aggregators for this PSET (proportional to the
 * number of CNs in it, but never below ADIOI_BGL_NAGG_PSET_MIN).
 */
static int
ADIOI_BGL_select_agg_in_pset (const ADIOI_BGL_ConfInfo_t *confInfo,
                              ADIOI_BGL_ProcInfo_t *pset_procInfo,
                              int nCN_in_pset,
                              int *tmp_ranklist)
{
    /* Aggregator count scales with the CN population of this PSET,
     * clamped below by the configured minimum. */
    int nAggrs = nCN_in_pset * confInfo->aggRatio;
    if (nAggrs < ADIOI_BGL_NAGG_PSET_MIN)
        nAggrs = ADIOI_BGL_NAGG_PSET_MIN;

    if (!confInfo->isVNM) {
        /* Coprocessor mode: take the first nAggrs entries, i.e. pick by
         * global-rank order. */
        int idx;
        for (idx = 0; idx < nAggrs; idx++)
            tmp_ranklist[idx] = pset_procInfo[idx].rank;
    }
    else {
        /* Virtual-node mode: prefer CPU 0 on each node, then CPU 1, and
         * so on, until enough aggregators have been picked. */
        int picked = 0;
        int cpu;
        for (cpu = 0; cpu < confInfo->cpuidSize && picked < nAggrs; cpu++) {
            int idx;
            for (idx = 0; idx < nCN_in_pset; idx++) {
                if (pset_procInfo[idx].cpuid == cpu)
                    tmp_ranklist[picked++] = pset_procInfo[idx].rank;
                if (picked >= nAggrs)
                    break;
            }
        }
    }

    return nAggrs;
}
/*
 * Pick IO aggregators based on the underlying PSET organization and store
 * the ranks of the proxy CNs in tmp_ranklist.
 * The first order of tmp_ranklist is : PSET number
 * The secondary order of the list is determined in
 * ADIOI_BGL_select_agg_in_pset() and thus adjustable.
 *
 * Runs only on rank 0, over the gathered per-process info.  On return
 * aggrsInPset[0] holds the number of PSETs and aggrsInPset[1..n_psets]
 * the per-PSET aggregator counts.  Returns the total aggregator count.
 */
static int
ADIOI_BGL_compute_agg_ranklist_serial_do (const ADIOI_BGL_ConfInfo_t *confInfo,
                                          ADIOI_BGL_ProcInfo_t *all_procInfo,
                                          int *aggrsInPset,
                                          int *tmp_ranklist)
{
    int i, j;

    /* a list of the numbers of all the PSETS */
    int *psetNumList = (int *) ADIOI_Malloc ( confInfo->nProcs * sizeof(int) );

    /* sweep through all processes' records, collect the numbers of all the
     * PSETS.  The reason for not doing MIN, MAX is that the owned PSETs may
     * not have contiguous numbers */
    int n_psets=0;
    for (i=0; i<confInfo->nProcs; i++) {
        ADIOI_BGL_ProcInfo_t *info_p = all_procInfo+i;
        int exist = 0;
        /* linear scan of the PSET numbers collected so far */
        for (j=n_psets-1; j>=0; j--)
            if (info_p->psetNum == psetNumList[j]) { exist=1; break; }
        if (!exist) {
            psetNumList [n_psets] = info_p->psetNum;
            n_psets ++;
        }
    }

    /* bucket sort: put the CN nodes into ordered buckets, each of which
     * represents a PSET */

    /* bucket space for bucket sort; each bucket is virtualPsetSize wide */
    ADIOI_BGL_ProcInfo_t *sorted_procInfo = ADIOI_BGL_ProcInfo_new_n ( n_psets * confInfo->virtualPsetSize );
    int *PsetIdx = (int *) ADIOI_Malloc ( n_psets * sizeof(int) );
    AD_BGL_assert ( (PsetIdx != NULL) );

    /* initialize bucket pointer */
    for (i=0; i<n_psets; i++) {
        PsetIdx[i] = i*confInfo->virtualPsetSize;
    }

    /* sort -- PsetIdx[j] advances as bucket j fills, so afterwards it
     * points one past the last occupied slot of its bucket */
    for (i=0; i<confInfo->nProcs; i++) {
        int pset_id = all_procInfo[i].psetNum;
        for (j=n_psets-1; j>=0; j--) if (pset_id == psetNumList[j]) break;
        AD_BGL_assert ( (j >= 0) );   /* got to find a PSET bucket */
        sorted_procInfo[ PsetIdx[j] ++ ] = all_procInfo[i];
    }
    ADIOI_Free(psetNumList);

    /* select a number of CN aggregators from each Pset */
    int naggs = 0;
    for (i=0; i<n_psets; i++) {
        /* the number of CN in this PSET -- may not be a full PSET */
        int nCN_in_pset = PsetIdx[i] - i*confInfo->virtualPsetSize;

        /* select aggregators and put them into tmp_ranklist contiguously. */
        int local_naggs = ADIOI_BGL_select_agg_in_pset( confInfo,
                                                        sorted_procInfo + i*confInfo->virtualPsetSize,
                                                        nCN_in_pset,
                                                        tmp_ranklist + naggs);
        /* slot 0 is reserved for the PSET count, filled in below */
        aggrsInPset[i+1] = local_naggs;
        naggs += local_naggs;
    }
    aggrsInPset[0] = n_psets;

    /* leave */
    ADIOI_Free ( PsetIdx );
    ADIOI_BGL_ProcInfo_free ( sorted_procInfo );
    return naggs;
}
/*
 * Compute the aggregator ranklist (rank-0 side) and install it into the
 * hints structure: fd->hints->cb_nodes receives the aggregator count and
 * fd->hints->ranklist the chosen ranks, replacing any previous list.
 */
static void
ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
                                        const ADIOI_BGL_ConfInfo_t *confInfo,
                                        ADIOI_BGL_ProcInfo_t *all_procInfo,
                                        int *aggrsInPset )
{
#   if AGG_DEBUG
    int i;
#   endif
    int naggs;
    int *selected;

    /* scratch ranklist, large enough for one entry per process */
    selected = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));

#   if AGG_DEBUG
    for (i=0; i<confInfo->nProcs; i++) {
        DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
    }
#   endif

    /* do the actual PSET-aware selection */
    naggs = ADIOI_BGL_compute_agg_ranklist_serial_do (confInfo, all_procInfo, aggrsInPset, selected);

#   define VERIFY 0
#   if VERIFY
    DBG_FPRINTF(stderr, "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n",
            confInfo->PsetSize        ,
            confInfo->numPsets        ,
            confInfo->isVNM           ,
            confInfo->virtualPsetSize ,
            confInfo->nProcs          ,
            confInfo->nAggrs          ,
            confInfo->aggRatio        ,
            naggs );
#   endif

#   if AGG_DEBUG
    for (i=0; i<naggs; i++) {
        DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, selected[i] );
    }
#   endif

    /* publish the result: drop any stale ranklist, then copy ours in */
    if (fd->hints->ranklist != NULL)
        ADIOI_Free (fd->hints->ranklist);
    fd->hints->cb_nodes = naggs;
    fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
    memcpy( fd->hints->ranklist, selected, naggs*sizeof(int) );

    ADIOI_Free( selected );
    return;
}
/* Description from common/ad_aggregate.c. (Does it completely apply to bgl?)
* ADIOI_Calc_aggregator()
*
* The intention here is to implement a function which provides basically
* the same functionality as in Rajeev's original version of
* ADIOI_Calc_my_req(). He used a ceiling division approach to assign the
* file domains, and we use the same approach here when calculating the
* location of an offset/len in a specific file domain. Further we assume
* this same distribution when calculating the rank_index, which is later
* used to map to a specific process rank in charge of the file domain.
*
* A better (i.e. more general) approach would be to use the list of file
* domains only. This would be slower in the case where the
* original ceiling division was used, but it would allow for arbitrary
* distributions of regions to aggregators. We'd need to know the
* nprocs_for_coll in that case though, which we don't have now.
*
* Note a significant difference between this function and Rajeev's old code:
* this code doesn't necessarily return a rank in the range
* 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a
* result of the rank mapping; any set of ranks in the communicator could be
* used now.
*
* Returns an integer representing a rank in the collective I/O communicator.
*
* The "len" parameter is also modified to indicate the amount of data
* actually available in this file domain.
*/
/*
 * This is a more general aggregator search function which does not rely on
 * the assumption that each aggregator hosts a file domain of the same size.
 *
 * Maps file offset 'off' to the aggregator whose file domain contains it,
 * via binary search over fd_start[]/fd_end[].  *len is reduced to the
 * number of bytes actually available from that domain starting at 'off'.
 * Returns the aggregator's rank in the communicator (via
 * fd->hints->ranklist), which need not lie in 0..nprocs_for_coll.
 */
int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
                              ADIO_Offset off,
                              ADIO_Offset min_off,
                              ADIO_Offset *len,
                              ADIO_Offset fd_size,
                              ADIO_Offset *fd_start,
                              ADIO_Offset *fd_end)
{
    int rank_index, rank;
    ADIO_Offset avail_bytes;

    /* off must fall inside the overall file-domain range */
    AD_BGL_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );

    /* binary search --> rank_index is returned */
    int ub = fd->hints->cb_nodes;
    int lb = 0;

    /* get an index into our array of aggregators */
    /* Common code for striping - bgl doesn't use it but it's
       here to make diff'ing easier.
    rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);

    if (fd->hints->striping_unit > 0) {
        * wkliao: implementation for file domain alignment
          fd_start[] and fd_end[] have been aligned with file lock
          boundaries when returned from ADIOI_Calc_file_domains() so cannot
          just use simple arithmatic as above *
        rank_index = 0;
        while (off > fd_end[rank_index]) rank_index++;
    }
    bgl does it's own striping below
    */

    /* NOTE(review): this search assumes fd_start/fd_end partition the
     * range with no gaps between consecutive domains; confirm that
     * zero-sized domains cannot make the loop oscillate. */
    rank_index = fd->hints->cb_nodes / 2;
    while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
        if ( off > fd_end [rank_index] ) {
            lb = rank_index;
            rank_index = (rank_index + ub) / 2;
        }
        else
        if ( off < fd_start[rank_index] ) {
            ub = rank_index;
            rank_index = (rank_index + lb) / 2;
        }
    }

    /* we index into fd_end with rank_index, and fd_end was allocated to be no
     * bigger than fd->hins->cb_nodes.   If we ever violate that, we're
     * overrunning arrays.  Obviously, we should never ever hit this abort */
    if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
        FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
                rank_index,fd->hints->cb_nodes,fd_size,off);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    /* DBG_FPRINTF ("ADIOI_BGL_Calc_aggregator: rank_index = %d\n", rank_index ); */

    /*
     * remember here that even in Rajeev's original code it was the case that
     * different aggregators could end up with different amounts of data to
     * aggregate.  here we use fd_end[] to make sure that we know how much
     * data this aggregator is working with.
     *
     * the +1 is to take into account the end vs. length issue.
     */
    avail_bytes = fd_end[rank_index] + 1 - off;
    if (avail_bytes < *len && avail_bytes > 0) {
        /* this file domain only has part of the requested contig. region */
        *len = avail_bytes;
    }

    /* map our index to a rank */
    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
    rank = fd->hints->ranklist[rank_index];

    return rank;
}
/*
 * Compute a dynamic access-range-based file domain partition among the I/O
 * aggregators, aligned to the GPFS block size.
 *
 * Divide the I/O workload among "nprocs_for_coll" processes.  This is
 * done by (logically) dividing the file into file domains (FDs); each
 * process may directly access only its own file domain.
 * Additional effort is made so that each I/O aggregator gets a file
 * domain that aligns to the GPFS block size -- then there is no false
 * sharing of GPFS file blocks among multiple I/O nodes.
 *
 * The common version of this now accepts a min_fd_size and striping_unit.
 * It doesn't seem necessary here (using GPFS block sizes) but keep it in
 * mind (e.g. we could pass striping unit instead of using fs_ptr->blksize).
 *
 * Outputs: *fd_start_ptr/*fd_end_ptr (malloc'd, caller frees) describe
 * each aggregator's domain; *min_st_offset_ptr is the global minimum
 * start offset; *fd_size_ptr returns only fd_size[0], the size of the
 * first (smallest) domain -- domain sizes are not all equal here.
 */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int nprocs,
                                      int nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int i, aggr;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
            myname,__LINE__,nprocs_for_coll);
#   endif

    /* GPFS block size; fall back to 1 MiB when unavailable */
    __blksize_t blksize = 1048576; /* default to 1M */
    if(fs_ptr && ((ADIOI_BGL_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
        blksize = ((ADIOI_BGL_fs*)fs_ptr)->blksize;
#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
#   endif

    /* find min of start offsets and max of end offsets of all processes */
    min_st_offset = st_offsets [0];
    max_end_offset = end_offsets[0];
    for (i=1; i<nprocs; i++) {
        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
    }

    /* determine the "file domain (FD)" of each process, i.e., the portion of
       the file that will be "owned" by each process */

    /* block-aligned overall bounds, plus the round-off at each end that
     * must be shaved back off the first/last domain below */
    ADIO_Offset gpfs_ub = (max_end_offset +blksize-1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = (max_end_offset +blksize-1) / blksize * blksize - 1 - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - min_st_offset / blksize * blksize;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int naggs = nprocs_for_coll;

    /* Tweak the file domains so that no fd is smaller than a threshold.  We
     * have to strike a balance between efficency and parallelism: somewhere
     * between 10k processes sending 32-byte requests and one process sending a
     * 320k request is a (system-dependent) sweet spot

    This is from the common code - the new min_fd_size parm that we didn't implement.
    (And common code uses a different declaration of fd_size so beware) */

    /* this is not entirely sufficient on BlueGene: we must be mindful of
     * imbalance over psets.  the hint processing code has already picked, say,
     * 8 processors per pset, so if we go increasing fd_size we'll end up with
     * some psets with 8 processors and some psets with none.  */
    /*
    if (fd_size < min_fd_size)
        fd_size = min_fd_size;
    */
    fd_size = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start = *fd_start_ptr;
    fd_end = *fd_end_ptr;

    /* split the whole-block count across aggregators: naggs_small of them
     * get nb_cn_small blocks, the remaining naggs_large get one extra */
    ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
    ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
    ADIO_Offset naggs_small = naggs - naggs_large;

    /* nb_cn_small * blksize: evenly split file domain among processors:
     *      equivalent to fd_gpfs_rnage/naggs
     * (nb_cn_small+1) * blksize: keeps file domain at least 'blksize' big
     */
    for (i=0; i<naggs; i++)
        if (i < naggs_small) fd_size[i] = nb_cn_small * blksize;
        else fd_size[i] = (nb_cn_small+1) * blksize;

    /*potential optimization: if n_gpfs_blk smalller than
     * naggs, slip in some zero-sized file
     * domains to spread the work across all psets. */

#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): "
                "gpfs_ub       %llu, "
                "gpfs_lb       %llu, "
                "gpfs_ub_rdoff %llu, "
                "gpfs_lb_rdoff %llu, "
                "fd_gpfs_range %llu, "
                "n_gpfs_blk    %llu, "
                "nb_cn_small   %llu, "
                "naggs_large   %llu, "
                "naggs_small   %llu, "
                "\n",
                myname,__LINE__,
                gpfs_ub      ,
                gpfs_lb      ,
                gpfs_ub_rdoff,
                gpfs_lb_rdoff,
                fd_gpfs_range,
                n_gpfs_blk   ,
                nb_cn_small  ,
                naggs_large  ,
                naggs_small
                );
#   endif

    /* shave the end round-offs back off the first and last domains so the
     * union of domains is exactly [min_st_offset, max_end_offset] */
    fd_size[0] -= gpfs_lb_rdoff;
    fd_size[naggs-1] -= gpfs_ub_rdoff;

    /* compute the file domain for each aggr */
    ADIO_Offset offset = min_st_offset;
    for (aggr=0; aggr<naggs; aggr++) {
        fd_start[aggr] = offset;
        fd_end [aggr] = offset + fd_size[aggr] - 1;
        offset += fd_size[aggr];
    }

    *fd_size_ptr = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
}
/*
 * Return this rank's position in the aggregator ranklist, or -1 when the
 * calling rank is not an I/O aggregator.  (Debugging utility.)
 */
int ADIOI_BGL_Aggrs_index( ADIO_File fd, int myrank )
{
    int idx = 0;
    while (idx < fd->hints->cb_nodes) {
        if (fd->hints->ranklist[idx] == myrank)
            return idx;
        idx++;
    }
    return -1;
}
/*
 * ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req; the default
 * implementation is specific to static file domain partitioning.
 *
 * ADIOI_Calc_my_req() - calculate what portions of the access requests
 * of this process are located in the file domains of various processes
 * (including this one).
 *
 * Two passes over the (offset_list, len_list) pairs: the first counts how
 * many contiguous pieces land in each aggregator's domain (to size the
 * allocations), the second records the actual offsets/lengths.  Outputs
 * (all malloc'd here, caller frees): count_my_req_per_proc[],
 * my_req[] (per-aggregator offset/length lists) and buf_idx[] (index into
 * the user buffer where data exchanged with each process starts, only
 * meaningful for contiguous buffer types).
 */
void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
                           int contig_access_count, ADIO_Offset
                           min_st_offset, ADIO_Offset *fd_start,
                           ADIO_Offset *fd_end, ADIO_Offset fd_size,
                           int nprocs,
                           int *count_my_req_procs_ptr,
                           int **count_my_req_per_proc_ptr,
                           ADIOI_Access **my_req_ptr,
                           int **buf_idx_ptr)
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
   They are used as memory buffer indices so it seems like the 2G limit is in effect */
{
    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
    int i, l, proc;
    ADIO_Offset fd_len, rem_len, curr_idx, off;
    ADIOI_Access *my_req;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5024, 0, NULL);
#endif

    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
    count_my_req_per_proc = *count_my_req_per_proc_ptr;
    /* count_my_req_per_proc[i] gives the no. of contig. requests of this
       process in process i's file domain. calloc initializes to zero.
       I'm allocating memory of size nprocs, so that I can do an
       MPI_Alltoall later on.*/

    buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
    /* buf_idx is relevant only if buftype_is_contig.
       buf_idx[i] gives the index into user_buf where data received
       from proc. i should be placed. This allows receives to be done
       without extra buffer. This can't be done if buftype is not contig. */

    /* initialize buf_idx to -1 */
    for (i=0; i < nprocs; i++) buf_idx[i] = -1;

    /* one pass just to calculate how much space to allocate for my_req;
     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
     */
    for (i=0; i < contig_access_count; i++) {
        /* short circuit offset/len processing if len == 0
         *      (zero-byte  read/write */
        if (len_list[i] == 0)
            continue;
        off = offset_list[i];
        fd_len = len_list[i];
        /* note: we set fd_len to be the total size of the access.  then
         * ADIOI_Calc_aggregator() will modify the value to return the
         * amount that was available from the file domain that holds the
         * first part of the access.
         */
        proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
                                         fd_start, fd_end);
        count_my_req_per_proc[proc]++;

        /* figure out how much data is remaining in the access (i.e. wasn't
         * part of the file domain that had the starting byte); we'll take
         * care of this data (if there is any) in the while loop below.
         */
        rem_len = len_list[i] - fd_len;

        while (rem_len > 0) {
            off += fd_len; /* point to first remaining byte */
            fd_len = rem_len; /* save remaining size, pass to calc */
            proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
                                             fd_size, fd_start, fd_end);
            count_my_req_per_proc[proc]++;
            rem_len -= fd_len; /* reduce remaining length by amount from fd */
        }
    }

    /* now allocate space for my_req, offset, and len */
    *my_req_ptr = (ADIOI_Access *)
        ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
    my_req = *my_req_ptr;

    count_my_req_procs = 0;
    for (i=0; i < nprocs; i++) {
        if (count_my_req_per_proc[i]) {
            my_req[i].offsets = (ADIO_Offset *)
                ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
            my_req[i].lens = (int *)
                ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
            count_my_req_procs++;
        }
        my_req[i].count = 0;  /* will be incremented where needed
                                 later */
    }

    /* now fill in my_req */
    curr_idx = 0;
    for (i=0; i<contig_access_count; i++) {
        /* short circuit offset/len processing if len == 0
         *      (zero-byte  read/write */
        if (len_list[i] == 0)
            continue;
        off = offset_list[i];
        fd_len = len_list[i];
        proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
                                         fd_start, fd_end);

        /* for each separate contiguous access from this process */
        if (buf_idx[proc] == -1)
        {
            ADIOI_Assert(curr_idx == (int) curr_idx);
            buf_idx[proc] = (int) curr_idx;
        }

        l = my_req[proc].count;
        curr_idx += fd_len;

        rem_len = len_list[i] - fd_len;

        /* store the proc, offset, and len information in an array
         * of structures, my_req. Each structure contains the
         * offsets and lengths located in that process's FD,
         * and the associated count.
         */
        my_req[proc].offsets[l] = off;
        ADIOI_Assert(fd_len == (int) fd_len);
        my_req[proc].lens[l] = (int) fd_len;
        my_req[proc].count++;

        while (rem_len > 0) {
            off += fd_len;
            fd_len = rem_len;
            proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
                                             fd_size, fd_start, fd_end);

            if (buf_idx[proc] == -1)
            {
                ADIOI_Assert(curr_idx == (int) curr_idx);
                buf_idx[proc] = (int) curr_idx;
            }

            l = my_req[proc].count;
            curr_idx += fd_len;
            rem_len -= fd_len;

            my_req[proc].offsets[l] = off;
            ADIOI_Assert(fd_len == (int) fd_len);
            my_req[proc].lens[l] = (int) fd_len;
            my_req[proc].count++;
        }
    }

#ifdef AGG_DEBUG
    for (i=0; i<nprocs; i++) {
        if (count_my_req_per_proc[i] > 0) {
            DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
                        my_req[i].count);
            for (l=0; l < my_req[i].count; l++) {
                DBG_FPRINTF(stderr, "   off[%d] = %lld, len[%d] = %d\n", l,
                            my_req[i].offsets[l], l, my_req[i].lens[l]);
            }
        }
        DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
    }
#endif

    *count_my_req_procs_ptr = count_my_req_procs;
    *buf_idx_ptr = buf_idx;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5025, 0, NULL);
#endif
}
/*
 * ADIOI_Calc_others_req (copied to bgl and switched to all-to-all for
 * performance)
 *
 * param[in]  count_my_req_procs        Number of processes whose file domain my
 *                                      request touches.
 * param[in]  count_my_req_per_proc     count_my_req_per_proc[i] gives the no. of
 *                                      contig. requests of this process in
 *                                      process i's file domain.
 * param[in]  my_req                    A structure defining my request
 * param[in]  nprocs                    Number of nodes in the block
 * param[in]  myrank                    Rank of this node
 * param[out] count_others_req_proc_ptr Number of processes whose requests lie in
 *                                      my process's file domain (including my
 *                                      process itself)
 * param[out] others_req_ptr            Array of other process' requests that lie
 *                                      in my process's file domain
 */
void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
                               int *count_my_req_per_proc,
                               ADIOI_Access *my_req,
                               int nprocs, int myrank,
                               int *count_others_req_procs_ptr,
                               ADIOI_Access **others_req_ptr)
{
    /* determine what requests of other processes lie in this process's
       file domain */

    /* count_others_req_procs = number of processes whose requests lie in
       this process's file domain (including this process itself)
       count_others_req_per_proc[i] indicates how many separate contiguous
       requests of proc. i lie in this process's file domain. */

    int *count_others_req_per_proc, count_others_req_procs;
    int i;
    ADIOI_Access *others_req;

    /* Parameters for MPI_Alltoallv */
    int *scounts, *sdispls, *rcounts, *rdispls;

    /* Parameters for MPI_Alltoallv.  These are the buffers, which
     * are later computed to be the lowest address of all buffers
     * to be sent/received for offsets and lengths.  Initialize to
     * the highest possible address which is the current minimum.
     *
     * NOTE(review): the (void*)0xFFFFFFFF sentinel assumes heap addresses
     * fit in 32 bits (true on BG/L); on a 64-bit build an allocation could
     * exceed it -- confirm before reusing this code elsewhere. */
    void *sendBufForOffsets=(void*)0xFFFFFFFF,
         *sendBufForLens =(void*)0xFFFFFFFF,
         *recvBufForOffsets=(void*)0xFFFFFFFF,
         *recvBufForLens =(void*)0xFFFFFFFF;

    /* first find out how much to send/recv and from/to whom */
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5026, 0, NULL);
#endif
    /* Send 1 int to each process.  count_my_req_per_proc[i] is the number of
     * requests that my process will do to the file domain owned by process[i].
     * Receive 1 int from each process.  count_others_req_per_proc[i] is the
     * number of requests that process[i] will do to the file domain owned by
     * my process.
     */
    count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
/*     cora2a1=timebase(); */
    MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
                 count_others_req_per_proc, 1, MPI_INT, fd->comm);

/*     total_cora2a+=timebase()-cora2a1; */

    /* Allocate storage for an array of other nodes' accesses of our
     * node's file domain.  Also allocate storage for the alltoallv
     * parameters.
     */
    *others_req_ptr = (ADIOI_Access *)
        ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
    others_req = *others_req_ptr;

    scounts = ADIOI_Malloc(nprocs*sizeof(int));
    sdispls = ADIOI_Malloc(nprocs*sizeof(int));
    rcounts = ADIOI_Malloc(nprocs*sizeof(int));
    rdispls = ADIOI_Malloc(nprocs*sizeof(int));

    /* If process[i] has any requests in my file domain,
     *   initialize an ADIOI_Access structure that will describe each request
     *   from process[i].  The offsets, lengths, and buffer pointers still need
     *   to be obtained to complete the setting of this structure.
     */
    count_others_req_procs = 0;
    for (i=0; i<nprocs; i++) {
        if (count_others_req_per_proc[i]) {
            others_req[i].count = count_others_req_per_proc[i];

            others_req[i].offsets = (ADIO_Offset *)
                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
            others_req[i].lens = (int *)
                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int));

            /* track the lowest receive-buffer address for the Alltoallv
             * displacement computation below */
            if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
                recvBufForOffsets = others_req[i].offsets;
            if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
                recvBufForLens = others_req[i].lens;

            others_req[i].mem_ptrs = (MPI_Aint *)
                ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint));

            count_others_req_procs++;
        }
        else
        {
            others_req[i].count = 0;
            others_req[i].offsets = NULL;
            others_req[i].lens = NULL;
        }
    }
    /* If no recv buffer was allocated in the loop above, make it NULL */
    if ( recvBufForOffsets == (void*)0xFFFFFFFF) recvBufForOffsets = NULL;
    if ( recvBufForLens == (void*)0xFFFFFFFF) recvBufForLens = NULL;

    /* Now send the calculated offsets and lengths to respective processes */

    /************************/
    /* Exchange the offsets */
    /************************/

    /* Determine the lowest sendBufForOffsets/Lens */
    for (i=0; i<nprocs; i++)
    {
        if ( (my_req[i].count) &&
             ((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) )
            sendBufForOffsets = my_req[i].offsets;

        if ( (my_req[i].count) &&
             ((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) )
            sendBufForLens = my_req[i].lens;
    }

    /* If no send buffer was found in the loop above, make it NULL */
    if ( sendBufForOffsets == (void*)0xFFFFFFFF) sendBufForOffsets = NULL;
    if ( sendBufForLens == (void*)0xFFFFFFFF) sendBufForLens = NULL;

    /* Calculate the displacements from the sendBufForOffsets/Lens */
    for (i=0; i<nprocs; i++)
    {
        /* Send these offsets to process i. */
        scounts[i] = count_my_req_per_proc[i];
        if ( scounts[i] == 0 )
            sdispls[i] = 0;
        else
            sdispls[i] = (int)
                         ( ( (MPIR_Upint)my_req[i].offsets -
                             (MPIR_Upint)sendBufForOffsets ) /
                           (MPIR_Upint)sizeof(ADIO_Offset) );

        /* Receive these offsets from process i. */
        rcounts[i] = count_others_req_per_proc[i];
        if ( rcounts[i] == 0 )
            rdispls[i] = 0;
        else
            rdispls[i] = (int)
                         ( ( (MPIR_Upint)others_req[i].offsets -
                             (MPIR_Upint)recvBufForOffsets ) /
                           (MPIR_Upint)sizeof(ADIO_Offset) );
    }

    /* Exchange the offsets */
    MPI_Alltoallv(sendBufForOffsets,
                  scounts, sdispls, ADIO_OFFSET,
                  recvBufForOffsets,
                  rcounts, rdispls, ADIO_OFFSET,
                  fd->comm);

    /************************/
    /* Exchange the lengths */
    /************************/

    for (i=0; i<nprocs; i++)
    {
        /* Send these lengths to process i. */
        scounts[i] = count_my_req_per_proc[i];
        if ( scounts[i] == 0 )
            sdispls[i] = 0;
        else
            sdispls[i] = (int)
                         ( ( (MPIR_Upint)my_req[i].lens -
                             (MPIR_Upint)sendBufForLens ) /
                           (MPIR_Upint) sizeof(int) );

        /* Receive these lengths from process i. */
        rcounts[i] = count_others_req_per_proc[i];
        if ( rcounts[i] == 0 )
            rdispls[i] = 0;
        else
            rdispls[i] = (int)
                         ( ( (MPIR_Upint)others_req[i].lens -
                             (MPIR_Upint)recvBufForLens ) /
                           (MPIR_Upint) sizeof(int) );
    }

    /* Exchange the lengths */
    MPI_Alltoallv(sendBufForLens,
                  scounts, sdispls, MPI_INT,
                  recvBufForLens,
                  rcounts, rdispls, MPI_INT,
                  fd->comm);

    /* Clean up */
    ADIOI_Free(count_others_req_per_proc);
    ADIOI_Free (scounts);
    ADIOI_Free (sdispls);
    ADIOI_Free (rcounts);
    ADIOI_Free (rdispls);

    *count_others_req_procs_ptr = count_others_req_procs;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5027, 0, NULL);
#endif
}

Просмотреть файл

@ -1,108 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/* ---------------------------------------------------------------- */
/**
 * \file ad_bgl_aggrs.h
 * \brief Declarations of the BG/L-GPFS overrides of the common ADIO
 *        aggregator-selection and file-domain partitioning routines.
 */

/*
 * File: ad_bgl_aggrs.h
 *
 * Declares functions specific for BG/L - GPFS parallel I/O solution. The implemented optimizations are:
 *  . Aligned file-domain partitioning, integrated in 7/28/2005
 *
 * In addition, following optimizations are planned:
 *  . Integrating multiple file-domain partitioning schemes
 *    (corresponding to Alok Chouhdary's persistent file domain work).
 */

#ifndef AD_BGL_AGGRS_H_
#define AD_BGL_AGGRS_H_

#include "adio.h"
#include <sys/stat.h>

/* magic numbers used to recognize the file system type at open time */
#if !defined(GPFS_SUPER_MAGIC)
  #define GPFS_SUPER_MAGIC (0x47504653)
#endif
#if !defined(PVFS2_SUPER_MAGIC)
  #define PVFS2_SUPER_MAGIC (0x20030528)
#endif

/* File system (BGL) specific information -
     hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
typedef struct ADIOI_BGL_fs_s {
  /* file system block size, used to align file domains */
  __blksize_t blksize;
  /* "fsync aggregation" flags (below) */
  int fsync_aggr;
#define ADIOI_BGL_FSYNC_AGGREGATION_DISABLED 0x00
#define ADIOI_BGL_FSYNC_AGGREGATION_ENABLED  0x01
#define ADIOI_BGL_FSYNC_AGGREGATOR           0x10 /* This rank is an aggregator */
} ADIOI_BGL_fs;

/* generate a list of I/O aggregators that utilizes BGL-PSET orginization. */
int ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);

/* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int nprocs,
                                      int nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void *fs_ptr);

/* a utilitiy function for debugging */
int ADIOI_BGL_Aggrs_index(ADIO_File fd, int myrank );

/* overriding ADIOI_Calc_aggregator() for the default implementation is specific for
   static file domain partitioning */
int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
                              ADIO_Offset off,
                              ADIO_Offset min_off,
                              ADIO_Offset *len,
                              ADIO_Offset fd_size,
                              ADIO_Offset *fd_start,
                              ADIO_Offset *fd_end);

/* overriding ADIOI_Calc_my_req for the default implementation is specific for
   static file domain partitioning */
void ADIOI_BGL_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
                             int contig_access_count, ADIO_Offset
                             min_st_offset, ADIO_Offset *fd_start,
                             ADIO_Offset *fd_end, ADIO_Offset fd_size,
                             int nprocs,
                             int *count_my_req_procs_ptr,
                             int **count_my_req_per_proc_ptr,
                             ADIOI_Access **my_req_ptr,
                             int **buf_idx_ptr);

/*
 * ADIOI_Calc_others_req
 *
 * param[in]  count_my_req_procs        Number of processes whose file domain my
 *                                      request touches.
 * param[in]  count_my_req_per_proc     count_my_req_per_proc[i] gives the no. of
 *                                      contig. requests of this process in
 *                                      process i's file domain.
 * param[in]  my_req                    A structure defining my request
 * param[in]  nprocs                    Number of nodes in the block
 * param[in]  myrank                    Rank of this node
 * param[out] count_others_req_proc_ptr Number of processes whose requests lie in
 *                                      my process's file domain (including my
 *                                      process itself)
 * param[out] others_req_ptr            Array of other process' requests that lie
 *                                      in my process's file domain
 */
void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
                               int *count_my_req_per_proc,
                               ADIOI_Access *my_req,
                               int nprocs, int myrank,
                               int *count_others_req_procs_ptr,
                               ADIOI_Access **others_req_ptr);

#endif  /* AD_BGL_AGGRS_H_ */

Просмотреть файл

@ -1,58 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_fcntl.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "adio_extern.h"
/* #ifdef MPISGI
#include "mpisgi2.h"
#endif */
/*
 * ADIOI_BGL_Fcntl - service an ADIO fcntl-style control request.
 *
 * fd           - open ADIO file handle
 * flag         - which operation to perform (ADIO_FCNTL_* constant)
 * fcntl_struct - in/out operation parameters (fsize, diskspace, atomicity)
 * error_code   - set to MPI_SUCCESS or an MPI error code
 */
void ADIOI_BGL_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
                     int *error_code)
{
    static char myname[] = "ADIOI_BGL_FCNTL";

    if (flag == ADIO_FCNTL_GET_FSIZE) {
        /* file size is the offset of the end of the file */
        fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
        /* restore the system file pointer if we were tracking one */
        if (fd->fp_sys_posn != -1)
            lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
        if (fcntl_struct->fsize == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else {
            *error_code = MPI_SUCCESS;
        }
    }
    else if (flag == ADIO_FCNTL_SET_DISKSPACE) {
        /* preallocation is handled by the generic implementation */
        ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
    }
    else if (flag == ADIO_FCNTL_SET_ATOMICITY) {
        /* normalize any non-zero request to 1 */
        fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
        *error_code = MPI_SUCCESS;
    }
    else {
        /* --BEGIN ERROR HANDLING-- */
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_ARG,
                                           "**flag", "**flag %d", flag);
        /* --END ERROR HANDLING-- */
    }
}

Просмотреть файл

@ -1,90 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_flush.c
* \brief Scalable flush based on underlying filesystem and psets
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "ad_bgl_aggrs.h"
/*
 * ADIOI_BGL_Flush - flush a file to storage, using "fsync aggregation"
 * when the flags stored in fd->fs_ptr (set up at open time) allow it:
 * only the rank(s) marked ADIOI_BGL_FSYNC_AGGREGATOR call fsync(), and
 * the resulting errno is agreed on via MPI_Allreduce.
 *
 * fd         - open ADIO file handle; fd->fs_ptr is an ADIOI_BGL_fs
 * error_code - set to MPI_SUCCESS or an MPI error code built from errno
 *
 * NOTE: in the aggregated path this routine is collective (barrier +
 * allreduce on fd->comm), so every rank in the communicator must call it.
 */
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code)
{
int err=0;
static char myname[] = "ADIOI_BGL_FLUSH";
if(((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATION_ENABLED)
{
int rank;
/* Barrier so we can collectively do fewer fsync's */
MPI_Barrier(fd->comm);
MPI_Comm_rank(fd->comm, &rank);
/* All ranks marked as "fsync aggregators" should fsync.
(We currently only do one fsync on rank 0 but this is general
enough to support >1 aggregator using allreduce to get the
results instead of simply bcast'ing the results from rank 0.)*/
if(((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATOR)
{
err = fsync(fd->fd_sys);
DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
/* We want errno, not the return code if it failed */
if (err == -1) err = errno;
else err = 0;
}
/* Just pick an errno (using unsigned MPI_MAX) from any failures;
non-aggregators contribute their initial err == 0 */
MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&err, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);
if (err) /* if it's non-zero, it must be an errno */
{
/* restore the fsync()-style convention (err == -1, errno set)
expected by the shared error-handling code below */
errno = err;
err = -1;
}
}
else /* Non-aggregated fsync: every rank fsync's independently */
{
#ifdef USE_DBG_LOGGING
int rank;
#endif
err = fsync(fd->fd_sys);
#ifdef USE_DBG_LOGGING
MPI_Comm_rank(fd->comm, &rank);
if(rank == 0)
{
DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
}
else
{
DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
}
#endif
}
/* --BEGIN ERROR HANDLING-- */
if (err == -1)
{
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
myname, __LINE__, MPI_ERR_IO,
"**io",
"**io %s", strerror(errno));
DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
return;
}
/* --END ERROR HANDLING-- */
*error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,84 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_getsh.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
/* returns the current location of the shared_fp in terms of the
no. of etypes relative to the current view, and also increments the
shared_fp by the number of etypes to be accessed (incr) in the read
or write following this function. */
/*
 * The shared file pointer is persisted in a separate hidden file
 * (fd->shared_fp_fname), opened lazily on first use with
 * ADIO_DELETE_ON_CLOSE. The read-then-update sequence is protected by
 * a write lock on the first sizeof(ADIO_Offset) bytes of that file, so
 * the ADIOI_WRITE_LOCK / ADIOI_UNLOCK pairing below must be preserved
 * on every exit path.
 *
 * fd         - open ADIO file handle
 * incr       - number of etypes the caller is about to access
 * shared_fp  - out: shared file pointer value before the increment
 * error_code - MPI_SUCCESS or an MPI error code built from errno
 */
void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int incr, ADIO_Offset *shared_fp,
int *error_code)
{
ADIO_Offset new_fp;
int err;
MPI_Comm dupcommself;
static char myname[] = "ADIOI_BGL_GET_SHARED_FP";
if (fd->shared_fp_fd == ADIO_FILE_NULL) {
/* first use: open (and, if needed, create) the hidden file that
holds the shared file pointer */
MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF,
dupcommself,
fd->shared_fp_fname,
fd->file_system,
fd->fns,
ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
0,
MPI_BYTE,
MPI_BYTE,
MPI_INFO_NULL,
ADIO_PERM_NULL,
error_code);
if (*error_code != MPI_SUCCESS) return;
*shared_fp = 0;
ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
err = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
/* if the file is empty, the above read may return error
(reading beyond end of file). In that case, shared_fp = 0,
set above, is the correct value. */
}
else {
ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
if (err == 0) {
err = read(fd->shared_fp_fd->fd_sys, shared_fp,
sizeof(ADIO_Offset));
}
if (err == -1) {
/* release the lock before the error return */
ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
return;
}
}
/* write back the incremented pointer, still under the lock */
new_fp = *shared_fp + incr;
err = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
if (err == 0) {
err = write(fd->shared_fp_fd->fd_sys, &new_fp, sizeof(ADIO_Offset));
}
ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
if (err == -1) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
myname, __LINE__, MPI_ERR_IO,
"**io",
"**io %s", strerror(errno));
}
else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,542 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_hints.c
* \brief BlueGene hint processing
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "ad_bgl.h"
#include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h"
#define ADIOI_BGL_CB_BUFFER_SIZE_DFLT "16777216"
#define ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT "4194304"
#define ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT "4194304"
#define ADIOI_BGL_NAGG_IN_PSET_HINT_NAME "bgl_nodes_pset"
/** \page mpiio_vars MPIIO Configuration
*
* BlueGene MPIIO configuration and performance tuning. Used by ad_bgl and ad_bglockless ADIO's.
*
* \section hint_sec Hints
* - bgl_nodes_pset - Specify how many aggregators to use per pset.
* This hint will override the cb_nodes hint based on BlueGene psets.
* - N - Use N nodes per pset as aggregators.
* - Default is based on partition configuration and cb_nodes.
*
* The following default key/value pairs may differ from other platform defaults.
*
* - key = cb_buffer_size value = 16777216
* - key = romio_cb_read value = enable
* - key = romio_cb_write value = enable
* - key = ind_rd_buffer_size value = 4194304
* - key = ind_wr_buffer_size value = 4194304
*/
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
extern int
ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
/*
 * ADIOI_BGL_SetInfo - install default BG/L MPI-IO hints and merge in any
 * user-supplied hints.
 *
 * fd         - open ADIO file handle; fd->info and fd->hints are updated
 * users_info - user's MPI_Info object, or MPI_INFO_NULL
 * error_code - MPI_SUCCESS, or an "info not same on all ranks" error
 *
 * NOTE: hint values that must be identical across the communicator are
 * checked with an MPI_Bcast from rank 0 followed by a comparison, so the
 * early `return`s inside those checks fire on every rank together.
 */
void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
Initialize fd->info to default values.
Initialize fd->hints to default values.
Examine the info object passed by the user. If it contains values that
ROMIO understands, override the default. */
MPI_Info info;
char *value;
/* NOTE(review): nprocs_is_valid is set but never read in this routine */
int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0;
static char myname[] = "ADIOI_BGL_SETINFO";
int did_anything = 0;
if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
info = fd->info;
/* Note that fd->hints is allocated at file open time; thus it is
* not necessary to allocate it, or check for allocation, here.
*/
/* scratch buffer for MPI_Info_get values; freed at the bottom */
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
AD_BGL_assert ((value != NULL));
/* initialize info and hints to default values if they haven't been
* previously initialized
*/
if (!fd->hints->initialized) {
did_anything = 1;
/* buffer size for collective I/O */
ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
/* default is to let romio automatically decide when to use
* collective buffering
*/
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = ADIOI_HINT_ENABLE;
ADIOI_Info_set(info, "romio_cb_write", "enable");
fd->hints->cb_write = ADIOI_HINT_ENABLE;
if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
fd->hints->cb_config_list = NULL;
/* number of processes that perform I/O in collective I/O */
MPI_Comm_size(fd->comm, &nprocs);
nprocs_is_valid = 1;
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
ADIOI_Info_set(info, "cb_nodes", value);
/* -1 means "not set by the user"; gen_agg_ranklist picks a default */
fd->hints->cb_nodes = -1;
/* hint indicating that no indep. I/O will be performed on this file */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
/* bgl is not implementing file realms (ADIOI_IOStridedColl),
initialize to disabled it. */
/* hint instructing the use of persistent file realms */
ADIOI_Info_set(info, "romio_cb_pfr", "disable");
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
/* hint guiding the assignment of persistent file realms */
ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
fd->hints->cb_fr_type = ADIOI_FR_AAR;
/* hint to align file realms with a certain byte value */
ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
fd->hints->cb_fr_alignment = 1;
/* hint to set a threshold percentage for a datatype's size/extent at
* which data sieving should be done in collective I/O */
ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
fd->hints->cb_ds_threshold = 0;
/* hint to switch between point-to-point or all-to-all for two-phase */
ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
/* deferred_open derived from no_indep_rw and cb_{read,write} */
fd->hints->deferred_open = 0;
/* buffer size for data sieving in independent reads */
ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
/* buffer size for data sieving in independent writes */
ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
if(fd->file_system == ADIO_UFS)
{
/* default for ufs/pvfs is to disable data sieving */
ADIOI_Info_set(info, "romio_ds_read", "disable");
fd->hints->ds_read = ADIOI_HINT_DISABLE;
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
else
{
/* default is to let romio automatically decide when to use data
* sieving
*/
ADIOI_Info_set(info, "romio_ds_read", "automatic");
fd->hints->ds_read = ADIOI_HINT_AUTO;
ADIOI_Info_set(info, "romio_ds_write", "automatic");
fd->hints->ds_write = ADIOI_HINT_AUTO;
}
/* still to do: tune this a bit for a variety of file systems. there's
* no good default value so just leave it unset */
fd->hints->min_fdomain_size = 0;
fd->hints->striping_unit = 0;
fd->hints->initialized = 1;
}
/* add in user's info if supplied */
if (users_info != MPI_INFO_NULL) {
ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"cb_buffer_size",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "cb_buffer_size", value);
fd->hints->cb_buffer_size = intval;
}
#if 0
/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
/* aligning file realms to certain sizes (e.g. stripe sizes)
* may benefit I/O performance */
ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_alignment",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_alignment", value);
fd->hints->cb_fr_alignment = intval;
}
/* for collective I/O, try to be smarter about when to do data sieving
* using a specific threshold for the datatype size/extent
* (percentage 0-100%) */
ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) > 0)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_ds_threshold",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_ds_threshold", value);
fd->hints->cb_ds_threshold = intval;
}
/* NOTE(review): this disabled block stores the romio_cb_alltoall
* setting into fd->hints->cb_read rather than cb_alltoall, yet the
* consistency check below compares cb_alltoall -- fix before ever
* re-enabling this code. */
ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_alltoall", value);
fd->hints->cb_read = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_alltoall;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_alltoall) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_alltoall",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
#endif
/* new hints for enabling/disabling coll. buffering on
* reads/writes
*/
ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
/* romio_cb_read overrides no_indep_rw */
ADIOI_Info_set(info, "romio_cb_read", value);
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_read = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_read", value);
fd->hints->cb_read = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_read;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_read) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_read",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE"))
{
/* romio_cb_write overrides no_indep_rw, too */
ADIOI_Info_set(info, "romio_cb_write", value);
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->cb_write = ADIOI_HINT_DISABLE;
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") ||
!strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_write", value);
fd->hints->cb_write = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_write;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_write) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_write",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
#if 0
/* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
/* enable/disable persistent file realms for collective I/O */
/* may want to check for no_indep_rdwr hint as well */
ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_cb_pfr", value);
fd->hints->cb_pfr = ADIOI_HINT_AUTO;
}
tmp_val = fd->hints->cb_pfr;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->cb_pfr) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_pfr",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
/* file realm assignment types ADIOI_FR_AAR(0),
ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
a regular fr size in bytes. probably not the best way... */
ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval=atoi(value)) >= -2)) {
tmp_val = intval;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != intval) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_cb_fr_type",
error_code);
return;
}
/* --END ERROR HANDLING-- */
ADIOI_Info_set(info, "romio_cb_fr_type", value);
fd->hints->cb_fr_type = intval;
}
#endif
/* new hint for specifying no indep. read/write will be performed */
ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
/* if 'no_indep_rw' set, also hint that we will do
* collective buffering: if we aren't doing independent io,
* then we have to do collective */
ADIOI_Info_set(info, "romio_no_indep_rw", value);
ADIOI_Info_set(info, "romio_cb_write", "enable");
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->no_indep_rw = 1;
fd->hints->cb_read = 1;
fd->hints->cb_write = 1;
tmp_val = 1;
}
else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
ADIOI_Info_set(info, "romio_no_indep_rw", value);
fd->hints->no_indep_rw = 0;
tmp_val = 0;
}
else {
/* default is above */
tmp_val = 0;
}
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
/* --BEGIN ERROR HANDLING-- */
if (tmp_val != fd->hints->no_indep_rw) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_no_indep_rw",
error_code);
return;
}
/* --END ERROR HANDLING-- */
}
/* new hints for enabling/disabling data sieving on
* reads/writes
*/
ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_ds_read", value);
fd->hints->ds_read = ADIOI_HINT_AUTO;
}
/* otherwise ignore */
}
ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
&flag);
if (flag) {
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_ENABLE;
}
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
{
ADIOI_Info_set(info, "romio_ds_write", value);
fd->hints->ds_write = ADIOI_HINT_AUTO;
}
/* otherwise ignore */
}
ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
ADIOI_Info_set(info, "ind_wr_buffer_size", value);
fd->hints->ind_wr_buffer_size = intval;
}
ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
ADIOI_Info_set(info, "ind_rd_buffer_size", value);
fd->hints->ind_rd_buffer_size = intval;
}
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "romio_min_fdomain_size", value);
fd->hints->min_fdomain_size = intval;
}
/* Now we use striping unit in common code so we should
process hints for it. */
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "striping_unit", value);
fd->hints->striping_unit = intval;
}
memset( value, 0, MPI_MAX_INFO_VAL+1 );
/* bgl_nodes_pset: number of aggregators per pset (overrides cb_nodes) */
ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
did_anything = 1;
ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value);
fd->hints->cb_nodes = intval;
}
}
/* associate CB aggregators to certain CNs in every involved PSET */
if (did_anything) {
ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes);
}
/* ignore defered open hints and do not enable it for bluegene: need all
* processors in the open path so we can stat-and-broadcast the blocksize
*/
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
fd->hints->deferred_open = 0;
/* BobC commented this out, but since hint processing runs on both bgl and
* bglockless, we need to keep DS writes enabled on gpfs and disabled on
* PVFS */
if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
/* disable data sieving for fs that do not
support file locking */
ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
/* get rid of this value if it is set */
ADIOI_Info_delete(info, "ind_wr_buffer_size");
}
/* note: leave ind_wr_buffer_size alone; used for other cases
* as well. -- Rob Ross, 04/22/2003
*/
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
ADIOI_Free(value);
*error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,304 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_open.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "ad_bgl_aggrs.h"
#include <sys/statfs.h>
#include <sys/vfs.h>
/* COPIED FROM ad_fstype.c since it is static in that file
ADIO_FileSysType_parentdir - determines a string pathname for the
parent directory of a given filename.
Input Parameters:
. filename - pointer to file name character array
Output Parameters:
. dirnamep - pointer to location in which to store a pointer to a string
Note that the caller should free the memory located at the pointer returned
after the string is no longer needed.
*/
#ifndef PATH_MAX
#define PATH_MAX 65535
#endif
/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that
here. We assume that S_ISLNK is *always* defined as a macro. If
that is not universally true, then add a test to the romio
configure that trys to link a program that references S_ISLNK */
#if !defined(S_ISLNK)
# if defined(S_IFLNK)
/* Check for the link bit */
# define S_ISLNK(mode) ((mode) & S_IFLNK)
# else
/* no way to check if it is a link, so say false */
# define S_ISLNK(mode) 0
# endif
#endif /* !(S_ISLNK) */
/* ADIO_FileSysType_parentdir
 *
 * Determine the parent directory of `filename`, resolving one level of
 * symlink so a dangling link's *target* directory is examined.
 *
 * filename - pathname to examine
 * dirnamep - out: newly allocated string (via ADIOI_Strdup); the caller
 *            owns it and must release it when done
 */
static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
{
    int err;
    char *dir = NULL, *slash;
    struct stat statbuf;

    err = lstat(filename, &statbuf);

    if (err || (!S_ISLNK(statbuf.st_mode))) {
        /* no such file, or file is not a link; these are the "normal"
         * cases where we can just return the parent directory.
         */
        dir = ADIOI_Strdup(filename);
    }
    else {
        /* filename is a symlink. we've presumably already tried
         * to stat it and found it to be missing (dangling link),
         * but this code doesn't care if the target is really there
         * or not.
         */
        int namelen;
        char *linkbuf;

        linkbuf = ADIOI_Malloc(PATH_MAX+1);
        /* read at most PATH_MAX bytes so there is always room for the
         * terminating NUL: readlink() does not terminate its output, and
         * the previous code asked for PATH_MAX+1 bytes and could then
         * write the terminator one byte past the end of the buffer. */
        namelen = readlink(filename, linkbuf, PATH_MAX);
        if (namelen == -1) {
            /* something strange has happened between the time that
             * we determined that this was a link and the time that
             * we attempted to read it; punt and use the old name.
             */
            dir = ADIOI_Strdup(filename);
        }
        else {
            /* successfully read the link */
            linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
            dir = ADIOI_Strdup(linkbuf);
        }
        /* release the scratch buffer on both paths (it previously
         * leaked when readlink() failed) */
        ADIOI_Free(linkbuf);
    }

    /* trim the copy back to the parent directory */
    slash = strrchr(dir, '/');
    if (!slash) ADIOI_Strncpy(dir, ".", 2);     /* no slash: parent is "." */
    else {
        if (slash == dir) *(dir + 1) = '\0';    /* "/file": parent is "/" */
        else *slash = '\0';
    }

    *dirnamep = dir;
    return;
}
/*
 * scaleable_stat - have rank 0 alone stat64()/statfs() the file (falling
 * back to statfs of the parent directory if needed) and broadcast the two
 * results, so a large job does not hammer the metadata server at open time.
 *
 * Side effects on every rank:
 *  - fd->fs_ptr->blksize is set from the stat64 result;
 *  - if the statfs f_type magic identifies GPFS or the configured
 *    bglockless type, fsync aggregation is enabled, with rank 0 marked
 *    as the single fsync aggregator.
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bgl_stat;
    struct statfs bgl_statfs;
    int rank, rc;
    char * dir;
    long buf[2];

    MPI_Comm_rank(fd->comm, &rank);
    if (rank == 0) {
        /* Get the (real) underlying file system block size */
        rc = stat64(fd->filename, &bgl_stat);
        if (rc >= 0)
        {
            buf[0] = bgl_stat.st_blksize;
            DBGV_FPRINTF(stderr,"Successful stat '%s'. Blocksize=%ld\n",
                    fd->filename,bgl_stat.st_blksize);
        }
        else
        {
            DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                    fd->filename,rc,errno);
        }

        /* Get the (real) underlying file system type so we can
         * plan our fsync scaling strategy */
        rc = statfs(fd->filename,&bgl_statfs);
        if (rc >= 0)
        {
            DBGV_FPRINTF(stderr,"Successful statfs '%s'. Magic number=%#X\n",
                    fd->filename,bgl_statfs.f_type);
            buf[1] = bgl_statfs.f_type;
        }
        else
        {
            DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
                    fd->filename,rc,errno);
            /* the file may not exist yet (ADIO_CREATE); try its parent */
            ADIO_FileSysType_parentdir(fd->filename, &dir);
            rc = statfs(dir,&bgl_statfs);
            if (rc >= 0)
            {
                DBGV_FPRINTF(stderr,"Successful statfs '%s'. Magic number=%#X\n",dir,bgl_statfs.f_type);
                buf[1] = bgl_statfs.f_type;
            }
            else
            {
                /* Hmm. Guess we'll assume the worst-case, that it's not GPFS
                 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
                buf[1] = -1; /* bogus magic number */
                DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
            }
            /* dir was allocated by ADIO_FileSysType_parentdir via
             * ADIOI_Strdup, so release it with the matching ADIOI_Free
             * (the previous code used libc free(), mismatching ROMIO's
             * allocator) */
            ADIOI_Free(dir);
        }
    }
    /* now we can broadcast the stat/statfs data to everyone else */
    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->comm);
    bgl_stat.st_blksize = buf[0];
    bgl_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = bgl_stat.st_blksize;

    /* data from statfs */
    if ((bgl_statfs.f_type == GPFS_SUPER_MAGIC) ||
        (bgl_statfs.f_type == bglocklessmpio_f_type))
    {
        ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BGL_FSYNC_AGGREGATION_ENABLED;

        /* Only one rank is an "fsync aggregator" because only one
         * fsync is needed */
        if (rank == 0)
        {
            ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr |=
                ADIOI_BGL_FSYNC_AGGREGATOR;
            DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
        }
        else ; /* aggregation enabled but this rank is not an aggregator*/
    }
    else; /* Other filesystems default to no fsync aggregation */
}
/*
 * ADIOI_BGL_Open - open fd->filename with POSIX open(), set up the BG/L
 * per-file state (fd->fs_ptr) and collect file system information via
 * scaleable_stat(), then map any open() failure to an MPI error code.
 *
 * fd         - ADIO file handle carrying filename, access_mode and perm
 * error_code - MPI_SUCCESS or an MPI error code derived from errno
 */
void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
{
int perm, old_mask, amode;
static char myname[] = "ADIOI_BGL_OPEN";
/* set internal variables for tuning environment variables */
ad_bgl_get_env_vars();
if (fd->perm == ADIO_PERM_NULL) {
/* read the umask non-destructively (set it, then restore it) */
old_mask = umask(022);
umask(old_mask);
/* NOTE(review): other ADIO drivers compute perm as ~old_mask & 0666;
XOR differs if the umask ever sets bits outside 0666 -- confirm */
perm = old_mask ^ 0666;
}
else perm = fd->perm;
/* translate ADIO access-mode bits to POSIX open() flags */
amode = 0;
if (fd->access_mode & ADIO_CREATE)
amode = amode | O_CREAT;
if (fd->access_mode & ADIO_RDONLY)
amode = amode | O_RDONLY;
if (fd->access_mode & ADIO_WRONLY)
amode = amode | O_WRONLY;
if (fd->access_mode & ADIO_RDWR)
amode = amode | O_RDWR;
if (fd->access_mode & ADIO_EXCL)
amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
fd->fd_direct = -1;
/* for append mode, position both file pointers at end-of-file */
if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
if(fd->fd_sys != -1)
{
/* Initialize the ad_bgl file system specific information */
AD_BGL_assert(fd->fs_ptr == NULL);
fd->fs_ptr = (ADIOI_BGL_fs*) ADIOI_Malloc(sizeof(ADIOI_BGL_fs));
((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */
/* default is no fsync aggregation */
((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
ADIOI_BGL_FSYNC_AGGREGATION_DISABLED;
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
/* overwrite the defaults above with real stat/statfs data (rank 0
stats; everyone gets the broadcast result) */
scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
}
/* map the open() errno to the matching MPI error class */
if (fd->fd_sys == -1) {
if (errno == ENAMETOOLONG)
/* NOTE(review): strlen() returns size_t but the "%d" conversion
expects int -- harmless for sane name lengths, but worth fixing */
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_BAD_FILE,
"**filenamelong",
"**filenamelong %s %d",
fd->filename,
strlen(fd->filename));
else if (errno == ENOENT)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_NO_SUCH_FILE,
"**filenoexist",
"**filenoexist %s",
fd->filename);
else if (errno == ENOTDIR || errno == ELOOP)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_BAD_FILE,
"**filenamedir",
"**filenamedir %s",
fd->filename);
else if (errno == EACCES) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_ACCESS,
"**fileaccess",
"**fileaccess %s",
fd->filename );
}
else if (errno == EROFS) {
/* Read only file or file system and write access requested */
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_READ_ONLY,
"**ioneedrd", 0 );
}
else {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
}
else *error_code = MPI_SUCCESS;
}
/*
*vim: ts=8 sts=4 sw=4 noexpandtab
*/

Просмотреть файл

@ -1,109 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_pset.c
* \brief Definition of functions associated to structs ADIOI_BGL_ProcInfo_t and ADIOI_BGL_ConfInfo_t
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include <stdlib.h>
#include "ad_bgl.h"
#include "ad_bgl_pset.h"
#include "mpidimpl.h"
/* Allocate a single ADIOI_BGL_ProcInfo_t record.
 * Aborts via AD_BGL_assert on allocation failure, so the returned
 * pointer is always valid. */
ADIOI_BGL_ProcInfo_t *
ADIOI_BGL_ProcInfo_new()
{
    ADIOI_BGL_ProcInfo_t *info;

    info = (ADIOI_BGL_ProcInfo_t *) ADIOI_Malloc(sizeof(ADIOI_BGL_ProcInfo_t));
    AD_BGL_assert((info != NULL));
    return info;
}
/* Allocate an array of n ADIOI_BGL_ProcInfo_t records.
 * Aborts via AD_BGL_assert on allocation failure, so the returned
 * pointer is always valid. */
ADIOI_BGL_ProcInfo_t *
ADIOI_BGL_ProcInfo_new_n( int n )
{
    ADIOI_BGL_ProcInfo_t *table;

    table = (ADIOI_BGL_ProcInfo_t *) ADIOI_Malloc(n * sizeof(ADIOI_BGL_ProcInfo_t));
    AD_BGL_assert((table != NULL));
    return table;
}
/* Release a ProcInfo allocation; a NULL argument is a no-op. */
void
ADIOI_BGL_ProcInfo_free( ADIOI_BGL_ProcInfo_t *info )
{
    if (info == NULL)
        return;
    ADIOI_Free(info);
}
/* Fill one ProcInfo record from the per-node hardware descriptor.
 * "r" is this process's rank, stored verbatim alongside the PSET
 * identity and coordinates read from *hw.  All stores are independent,
 * so the grouping below is purely for readability. */
static
void
ADIOI_BGL_ProcInfo_set(ADIOI_BGL_ProcInfo_t *info, const DCMF_Hardware_t *hw, int r)
{
    /* identity */
    info->rank       = r;
    info->psetNum    = hw->idOfPset;
    info->rankInPset = hw->rankInPset;

    /* position of this node inside its PSET */
    info->xInPset    = hw->xCoord;
    info->yInPset    = hw->yCoord;
    info->zInPset    = hw->zCoord;

    /* t coordinate -- distinguishes CPUs in virtual node mode */
    info->cpuid      = hw->tCoord;
}
/* Allocate a single ADIOI_BGL_ConfInfo_t record.
 * Aborts via AD_BGL_assert on allocation failure, so the returned
 * pointer is always valid. */
ADIOI_BGL_ConfInfo_t *
ADIOI_BGL_ConfInfo_new ()
{
    ADIOI_BGL_ConfInfo_t *conf;

    conf = (ADIOI_BGL_ConfInfo_t *) ADIOI_Malloc(sizeof(ADIOI_BGL_ConfInfo_t));
    AD_BGL_assert((conf != NULL));
    return conf;
}
/* Fill the communicator-wide configuration record (kept on rank 0 only)
 * from the hardware descriptor.
 *   s       -- communicator size (number of MPI processes)
 *   n_aggrs -- requested aggregator count; <= 0 selects the default.
 * NOTE(review): numPsets assumes xSize*ySize*zSize is an exact multiple
 * of sizeOfPset -- TODO confirm for all partition shapes. */
static
void
ADIOI_BGL_ConfInfo_set(ADIOI_BGL_ConfInfo_t *info, const DCMF_Hardware_t *hw, int s, int n_aggrs)
{
    info->PsetSize        = hw->sizeOfPset;
    info->numPsets        = (hw->xSize * hw->ySize *
                             hw->zSize) / hw->sizeOfPset;
    /* tSize > 1 means virtual node mode: multiple ranks per compute node */
    info->isVNM           = (hw->tSize != 1);
    info->cpuidSize       = hw->tSize;
    /* number of MPI ranks one PSET can host (nodes * ranks-per-node) */
    info->virtualPsetSize = hw->sizeOfPset * hw->tSize;
    info->nProcs          = s;

    /* More complicated logic maybe needed for nAggrs specification */
    info->nAggrs          = n_aggrs;
    /* fall back to the default when the request is non-positive or larger
     * than either the job size or the virtual PSET can supply */
    if ( info->nAggrs <=0 || MIN(info->nProcs, info->virtualPsetSize) < info->nAggrs )
        info->nAggrs      = ADIOI_BGL_NAGG_PSET_DFLT;
    /* never exceed the number of slots in a virtual PSET */
    if ( info->nAggrs > info->virtualPsetSize ) info->nAggrs = info->virtualPsetSize;

    /* fraction of a virtual PSET acting as aggregators, clamped to 1.0 */
    info->aggRatio        = 1. * info->nAggrs / info->virtualPsetSize;
    if (info->aggRatio > 1) info->aggRatio = 1.;
}
/* Release a ConfInfo allocation; a NULL argument is a no-op. */
void
ADIOI_BGL_ConfInfo_free( ADIOI_BGL_ConfInfo_t *info )
{
    if (info == NULL)
        return;
    ADIOI_Free(info);
}
/* Query the BG/L hardware personality once and use it to populate both
 * the communicator-wide ConfInfo record and this rank's ProcInfo record.
 *   s = communicator size, r = my rank, n_aggrs = requested aggregators */
void
ADIOI_BGL_persInfo_init(ADIOI_BGL_ConfInfo_t *conf,
			ADIOI_BGL_ProcInfo_t *proc,
			int s, int r, int n_aggrs)
{
    DCMF_Hardware_t hwinfo;

    DCMF_Hardware(&hwinfo);
    ADIOI_BGL_ConfInfo_set(conf, &hwinfo, s, n_aggrs);
    ADIOI_BGL_ProcInfo_set(proc, &hwinfo, r);
}
/* Dispose of the ConfInfo/ProcInfo pair created by
 * ADIOI_BGL_persInfo_init.  The two frees are independent. */
void
ADIOI_BGL_persInfo_free( ADIOI_BGL_ConfInfo_t *conf, ADIOI_BGL_ProcInfo_t *proc )
{
    ADIOI_BGL_ProcInfo_free(proc);
    ADIOI_BGL_ConfInfo_free(conf);
}

Просмотреть файл

@ -1,82 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/* ---------------------------------------------------------------- */
/**
 * \file ad_bgl_pset.h
 * \brief BG/L PSET description structures used by the ad_bgl driver.
 */

/* File: ad_bgl_pset.h
 *
 * Defines two structures that keep BG/L PSET specific information and their public interfaces:
 *   . ADIOI_BGL_ProcInfo_t object keeps specific information to each process
 *   . ADIOI_BGL_ConfInfo_t object keeps general information for the whole communicator, only kept
 *     on process 0.
 */

#ifndef AD_BGL_PSET_H_
#define AD_BGL_PSET_H_

/* Keeps specific information to each process, will be exchanged among processes */
typedef struct {
    int psetNum;      /* which PSET I am in */
    int rank;         /* my rank */
    int xInPset;      /* my relative coordinates in my PSET */
    int yInPset;
    int zInPset;
    int cpuid;        /* my CPU id -- for virtual node mode (t coord) */
    int rankInPset;   /* my relative rank in my PSET */
    int __pad;        /* pad to 16 byte alignment */
} ADIOI_BGL_ProcInfo_t __attribute__((aligned(16)));

/* Keeps general information for the whole communicator, only on process 0 */
typedef struct {
    int PsetSize;
    int nAggrs;
    int numPsets;
    int isVNM;
    int virtualPsetSize;
    int nProcs;
    float aggRatio;
    int cpuidSize;    /* how many cpu ids? (t size) */
} ADIOI_BGL_ConfInfo_t __attribute__((aligned(16)));


#undef MIN
/* BUGFIX: both arguments and the whole expansion are now fully
 * parenthesized.  The previous form ((a<b ? a : b)) misparsed compound
 * arguments such as MIN(x, c ? p : q) because of operator precedence. */
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

/* Default is to choose 8 aggregator nodes in each 32 CN pset.
   Also defines default ratio of aggregator nodes in each a pset.
   For Virtual Node Mode, the ratio is 8/64 */
#define ADIOI_BGL_NAGG_PSET_MIN  1
#define ADIOI_BGL_NAGG_PSET_DFLT 8
#define ADIOI_BGL_PSET_SIZE_DFLT 32


/* public funcs for ADIOI_BGL_ProcInfo_t objects */
ADIOI_BGL_ProcInfo_t * ADIOI_BGL_ProcInfo_new();
ADIOI_BGL_ProcInfo_t * ADIOI_BGL_ProcInfo_new_n( int n );
void ADIOI_BGL_ProcInfo_free( ADIOI_BGL_ProcInfo_t *info );

/* public funcs for ADIOI_BGL_ConfInfo_t objects */
ADIOI_BGL_ConfInfo_t * ADIOI_BGL_ConfInfo_new ();
void ADIOI_BGL_ConfInfo_free( ADIOI_BGL_ConfInfo_t *info );

/* public funcs for a pair of ADIOI_BGL_ConfInfo_t and ADIOI_BGL_ProcInfo_t objects */
void ADIOI_BGL_persInfo_init( ADIOI_BGL_ConfInfo_t *conf,
			      ADIOI_BGL_ProcInfo_t *proc,
			      int s, int r, int n_aggrs );
void ADIOI_BGL_persInfo_free( ADIOI_BGL_ConfInfo_t *conf,
			      ADIOI_BGL_ProcInfo_t *proc );

#endif  /* AD_BGL_PSET_H_ */

Просмотреть файл

@ -1,549 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_read.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "adio_extern.h"
#include "ad_bgl_tuning.h"
/* ADIOI_BGL_ReadContig - contiguous read for the BG/L ADIO driver.
 *
 * Reads count elements of datatype from fd into buf, either at the
 * explicit byte offset "offset" (ADIO_EXPLICIT_OFFSET) or at the
 * individual file pointer, which is then advanced by the number of
 * bytes actually read.  Takes an exclusive lock when fd->atomicity is
 * set, otherwise a shared read lock, around the read() call.  On
 * success *error_code is MPI_SUCCESS and (when available) the byte
 * count is recorded in *status.
 */
void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
		     MPI_Datatype datatype, int file_ptr_type,
		     ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int err=-1, datatype_size;
    ADIO_Offset len;
    static char myname[] = "ADIOI_BGL_READCONTIG";
#if BGL_PROFILE
    /* timing */
    double io_time, io_time2;
#endif

    MPI_Type_size(datatype, &datatype_size);
    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
    ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */

#if BGL_PROFILE
    /* BUGFIX: this accounting previously ran before "len" was computed,
     * accumulating an uninitialized value into the data-size counter. */
    if (bglmpio_timing) {
	io_time = MPI_Wtime();
	bglmpio_prof_cr[ BGLMPIO_CIO_DATA_SIZE ] += len;
    }

    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	if (bglmpio_timing2) io_time2 = MPI_Wtime();
	if (fd->fp_sys_posn != offset)
	    lseek(fd->fd_sys, offset, SEEK_SET);
	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
	if (bglmpio_timing2) io_time2 = MPI_Wtime();
	err = read(fd->fd_sys, buf, (unsigned int)len);
	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	/* NOTE(review): on a failed read (err == -1) this records
	 * offset-1; callers appear to rely only on error_code -- confirm */
	fd->fp_sys_posn = offset + err;
	/* individual file pointer not updated */
    }
    else { /* read from curr. location of ind. file pointer */
	offset = fd->fp_ind;
	if (bglmpio_timing2) io_time2 = MPI_Wtime();
	if (fd->fp_sys_posn != fd->fp_ind)
	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
	if (bglmpio_timing2) io_time2 = MPI_Wtime();
	err = read(fd->fd_sys, buf, (unsigned int)len);
	if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	fd->fp_ind += err;
	fd->fp_sys_posn = fd->fp_ind;
    }
#else /* BGL_PROFILE */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
	if (fd->fp_sys_posn != offset)
	    lseek(fd->fd_sys, offset, SEEK_SET);
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
	err = read(fd->fd_sys, buf, (unsigned int)len);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	fd->fp_sys_posn = offset + err;
	/* individual file pointer not updated */
    }
    else { /* read from curr. location of ind. file pointer */
	offset = fd->fp_ind;
	if (fd->fp_sys_posn != fd->fp_ind)
	    lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
	else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
	err = read(fd->fd_sys, buf, (unsigned int)len);
	ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
	fd->fp_ind += err;
	fd->fp_sys_posn = fd->fp_ind;
    }
#endif /* BGL_PROFILE */

#if BGL_PROFILE
    if (bglmpio_timing) bglmpio_prof_cr[ BGLMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO,
					   "**io", "**io %s", strerror(errno));
	return;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
}
#define ADIOI_BUFFERED_READ \
{ \
if (req_off >= readbuf_off + readbuf_len) { \
readbuf_off = req_off; \
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
err = read(fd->fd_sys, readbuf, readbuf_len);\
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
if (err == -1) err_flag = 1; \
} \
while (req_len > readbuf_off + readbuf_len - req_off) { \
ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off));\
partial_read = (int) (readbuf_off + readbuf_len - req_off); \
tmp_buf = (char *) ADIOI_Malloc(partial_read); \
memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
ADIOI_Free(readbuf); \
readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
memcpy(readbuf, tmp_buf, partial_read); \
ADIOI_Free(tmp_buf); \
readbuf_off += readbuf_len-partial_read; \
readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
end_offset-readbuf_off+1)); \
lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read);\
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
if (err == -1) err_flag = 1; \
} \
ADIOI_Assert(req_len == (size_t)req_len); \
memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
}
/* ADIOI_BGL_ReadStrided - data-sieving read of a noncontiguous request.
 *
 * Reads count elements of datatype into buf according to the file view
 * (fd->filetype / displacement), starting at "offset" etypes relative
 * to the filetype (or at the individual file pointer).  Three cases:
 *   1. noncontiguous in memory, contiguous in file;
 *   2. contiguous in memory, noncontiguous in file (most common);
 *   3. noncontiguous in both.
 * Cases 2/3 walk the flattened filetype and sieve data through a
 * buffer sized by the "ind_rd_buffer_size" hint via the
 * ADIOI_BUFFERED_READ macro.  Falls back to the naive generic reader
 * when the ds_read hint disables data sieving. */
void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
		       MPI_Datatype datatype, int file_ptr_type,
		       ADIO_Offset offset, ADIO_Status *status, int
		       *error_code)
{
/* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, new_brd_size, brd_size, size;
    int i, j, k, err=-1, st_index=0;
    ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
    unsigned num, bufsize;
    int n_etypes_in_filetype;
    ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size, partial_read;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off, req_len, sum;
    ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
    char *readbuf, *tmp_buf, *value;
    int err_flag=0, info_flag;
    unsigned max_bufsize, readbuf_len;
    static char myname[] = "ADIOI_BGL_READSTRIDED";

    if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
	/* if user has disabled data sieving on reads, use naive
	 * approach instead.
	 */
	/*FPRINTF(stderr, "ADIOI_GEN_ReadStrided_naive(%d):\n", __LINE__);*/
	ADIOI_GEN_ReadStrided_naive(fd,
				    buf,
				    count,
				    datatype,
				    file_ptr_type,
				    offset,
				    status,
				    error_code);
	return;
    }

    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);

    /* zero-size filetype: nothing to transfer */
    MPI_Type_size(fd->filetype, &filetype_size);
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
	MPIR_Status_set_bytes(status, datatype, 0);
#endif
	*error_code = MPI_SUCCESS;
	return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get max_bufsize from the info object. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
		   &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);

    if (!buftype_is_contig && filetype_is_contig) {
	/* noncontiguous in memory, contiguous in file: read sequential
	 * file chunks into the sieve buffer and scatter them into the
	 * flattened user buffer */
	ADIOI_Flatten_datatype(datatype);
	flat_buf = ADIOI_Flatlist;
	while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
	    fd->disp + (ADIO_Offset)etype_size * offset;

	start_off = off;
	end_offset = off + bufsize - 1;
	readbuf_off = off;
	readbuf = (char *) ADIOI_Malloc(max_bufsize);
	readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));

	/* if atomicity is true, lock (exclusive) the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	/* prime the sieve buffer */
	lseek(fd->fd_sys, readbuf_off, SEEK_SET);
	if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
	err = read(fd->fd_sys, readbuf, readbuf_len);
	if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);

	if (err == -1) err_flag = 1;

	/* copy each flattened buffer segment, one datatype copy at a time */
	for (j=0; j<count; j++)
	{
	    int i;
	    for (i=0; i<flat_buf->count; i++) {
		userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
		req_off = off;
		req_len = flat_buf->blocklens[i];
		ADIOI_BUFFERED_READ
		off += flat_buf->blocklens[i];
	    }
	}

	if (fd->atomicity)
	    ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */

	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    else {  /* noncontiguous in file */

	/* filetype already flattened in ADIO_Open */
	flat_file = ADIOI_Flatlist;
	while (flat_file->type != fd->filetype) flat_file = flat_file->next;
	disp = fd->disp;

	if (file_ptr_type == ADIO_INDIVIDUAL) {
	    /* Wei-keng reworked type processing to be a bit more efficient */
	    offset = fd->fp_ind - disp;
	    n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
	    offset -= (ADIO_Offset)n_filetypes * filetype_extent;
	    /* now offset is local to this extent */

	    /* find the block where offset is located, skip blocklens[i]==0 */
	    for (i=0; i<flat_file->count; i++) {
		ADIO_Offset dist;
		if (flat_file->blocklens[i] == 0) continue;
		dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
		/* frd_size is from offset to the end of block i */
		if (dist == 0) {
		    /* offset sits exactly at the end of block i: start at
		     * the beginning of the next block */
		    i++;
		    offset = flat_file->indices[i];
		    frd_size = flat_file->blocklens[i];
		    break;
		}
		if (dist > 0) {
		    frd_size = dist;
		    break;
		}
	    }
	    st_index = i;  /* starting index in flat_file->indices[] */
	    offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
	}
	else {
	    /* explicit offset: convert the etype offset into a filetype
	     * repetition count plus a byte offset inside the filetype */
	    n_etypes_in_filetype = filetype_size/etype_size;
	    n_filetypes = offset / n_etypes_in_filetype;
	    etype_in_filetype = offset % n_etypes_in_filetype;
	    size_in_filetype = etype_in_filetype * etype_size;

	    sum = 0;
	    for (i=0; i<flat_file->count; i++) {
		sum += flat_file->blocklens[i];
		if (sum > size_in_filetype) {
		    st_index = i;
		    frd_size = sum - size_in_filetype;
		    abs_off_in_filetype = flat_file->indices[i] +
			size_in_filetype - (sum - flat_file->blocklens[i]);
		    break;
		}
	    }

	    /* abs. offset in bytes in the file */
	    offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
		abs_off_in_filetype;
	}

	start_off = offset;

	/* Wei-keng Liao: read request is within a single flat_file contig
	 * block e.g. with subarray types that actually describe the whole
	 * array */
	if (buftype_is_contig && bufsize <= frd_size) {
	    ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
			    offset, status, error_code);

	    if (file_ptr_type == ADIO_INDIVIDUAL) {
		/* update MPI-IO file pointer to point to the first byte that
		 * can be accessed in the fileview. */
		fd->fp_ind = offset + bufsize;
		if (bufsize == frd_size) {
		    /* consumed the whole block: advance to the next nonzero
		     * block, wrapping into the next filetype repetition */
		    do {
			st_index++;
			if (st_index == flat_file->count) {
			    st_index = 0;
			    n_filetypes++;
			}
		    } while (flat_file->blocklens[st_index] == 0);
		    fd->fp_ind = disp + flat_file->indices[st_index]
			+ n_filetypes*filetype_extent;
		}
	    }
	    fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
	    MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
	    return;
	}

	/* Calculate end_offset, the last byte-offset that will be accessed.
	   e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/

	st_frd_size = frd_size;
	st_n_filetypes = n_filetypes;
	i_offset = 0;
	j = st_index;
	off = offset;
	frd_size = ADIOI_MIN(st_frd_size, bufsize);
	while (i_offset < bufsize) {
	    i_offset += frd_size;
	    end_offset = off + frd_size - 1;

	    j = (j+1) % flat_file->count;
	    n_filetypes += (j == 0) ? 1 : 0;
	    while (flat_file->blocklens[j]==0) {
		j = (j+1) % flat_file->count;
		n_filetypes += (j == 0) ? 1 : 0;
	    }

	    off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
	    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
	}

	/* if atomicity is true, lock (exclusive) the region to be accessed */
	if (fd->atomicity)
	    ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	/* initial read into readbuf */
	readbuf_off = offset;
	readbuf = (char *) ADIOI_Malloc(max_bufsize);
	readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));

	lseek(fd->fd_sys, offset, SEEK_SET);
	if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
	err = read(fd->fd_sys, readbuf, readbuf_len);
	if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);

	if (err == -1) err_flag = 1;

	if (buftype_is_contig && !filetype_is_contig) {
	    /* contiguous in memory, noncontiguous in file. should be the most
	       common case. */

	    i_offset = 0;
	    j = st_index;
	    off = offset;
	    n_filetypes = st_n_filetypes;
	    frd_size = ADIOI_MIN(st_frd_size, bufsize);
	    while (i_offset < bufsize) {
		if (frd_size) {
		    /* TYPE_UB and TYPE_LB can result in
		       frd_size = 0. save system call in such cases */
		    /* lseek(fd->fd_sys, off, SEEK_SET);
		       err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/

		    req_off = off;
		    req_len = frd_size;
		    userbuf_off = i_offset;
		    ADIOI_BUFFERED_READ
		}
		i_offset += frd_size;

		if (off + frd_size < disp + flat_file->indices[j] +
		    flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
		    off += frd_size;
		/* did not reach end of contiguous block in filetype.
		   no more I/O needed. off is incremented by frd_size. */
		else {
		    /* advance to the next nonzero filetype block, wrapping
		     * into the next repetition when j returns to 0 */
		    j = (j+1) % flat_file->count;
		    n_filetypes += (j == 0) ? 1 : 0;
		    while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
			n_filetypes += (j == 0) ? 1 : 0;
		    }
		    off = disp + flat_file->indices[j] +
			n_filetypes*(ADIO_Offset)filetype_extent;
		    frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
		}
	    }
	}
	else {
	    /* noncontiguous in memory as well as in file: walk the two
	     * flattened lists in lock step, transferring the overlap of the
	     * current file block and the current buffer block each round */

	    ADIOI_Flatten_datatype(datatype);
	    flat_buf = ADIOI_Flatlist;
	    while (flat_buf->type != datatype) flat_buf = flat_buf->next;

	    k = num = buf_count = 0;
	    i_offset = flat_buf->indices[0];
	    j = st_index;
	    off = offset;
	    n_filetypes = st_n_filetypes;
	    frd_size = st_frd_size;
	    brd_size = flat_buf->blocklens[0];

	    while (num < bufsize) {
		size = ADIOI_MIN(frd_size, brd_size);
		if (size) {
		    /* lseek(fd->fd_sys, off, SEEK_SET);
		       err = read(fd->fd_sys, ((char *) buf) + i, size); */

		    req_off = off;
		    req_len = size;
		    userbuf_off = i_offset;
		    ADIOI_BUFFERED_READ
		}

		new_frd_size = frd_size;
		new_brd_size = brd_size;

		if (size == frd_size) {
		    /* reached end of contiguous block in file */
		    j = (j+1) % flat_file->count;
		    n_filetypes += (j == 0) ? 1 : 0;
		    while (flat_file->blocklens[j]==0) {
			j = (j+1) % flat_file->count;
			n_filetypes += (j == 0) ? 1 : 0;
		    }

		    off = disp + flat_file->indices[j] +
			n_filetypes*(ADIO_Offset)filetype_extent;

		    new_frd_size = flat_file->blocklens[j];
		    if (size != brd_size) {
			i_offset += size;
			new_brd_size -= size;
		    }
		}

		if (size == brd_size) {
		    /* reached end of contiguous block in memory */

		    k = (k + 1)%flat_buf->count;
		    buf_count++;
		    i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
			flat_buf->indices[k]);
		    new_brd_size = flat_buf->blocklens[k];
		    if (size != frd_size) {
			off += size;
			new_frd_size -= size;
		    }
		}
		ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
		num += size;
		frd_size = new_frd_size;
		brd_size = new_brd_size;
	    }
	}

	if (fd->atomicity)
	    ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

	if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;

	ADIOI_Free(readbuf); /* malloced in the buffered_read macro */

	if (err_flag) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
					       MPIR_ERR_RECOVERABLE, myname,
					       __LINE__, MPI_ERR_IO, "**io",
					       "**io %s", strerror(errno));
	}
	else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
   keep track of how much data was actually read and placed in buf
   by ADIOI_BUFFERED_READ. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -1,68 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_setsh.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
/* set the shared file pointer to "offset" etypes relative to the current
view */
/*
This looks very similar to ADIOI_GEN_Set_shared_fp, except this
function avoids locking the file twice. The generic version does
Write lock
ADIO_WriteContig
Unlock
For BGL, ADIOI_BGL_WriteContig does a lock before writing to disable
caching. To avoid the lock being called twice, this version for BGL does
Write lock
Lseek
Write
Unlock
*/
/* Store "offset" (etypes relative to the current view) as the shared
 * file pointer.  Lazily opens the hidden shared-fp file on first use,
 * then overwrites the ADIO_Offset at its start.  The write-lock /
 * lseek / write / unlock sequence is inlined here (instead of calling
 * ADIO_WriteContig) so the file is locked exactly once -- see the
 * comment above this function. */
void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
{
    int err;
    MPI_Comm dupcommself;
    static char myname[] = "ADIOI_BGL_SET_SHARED_FP";

    /* first use: create/open the hidden shared-fp file; it is removed
     * automatically when closed */
    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
	MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
	fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
				     fd->shared_fp_fname,
				     fd->file_system, fd->fns,
				     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
				     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
				     ADIO_PERM_NULL, error_code);
    }

    /* NOTE(review): when the file was already open, this reads whatever
     * value the caller left in *error_code -- callers presumably
     * initialize it; confirm. */
    if (*error_code != MPI_SUCCESS) return;

    /* lock, rewind to the start, store the new pointer value, unlock */
    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
    err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

    if (err == -1) {
	*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
					   myname, __LINE__, MPI_ERR_IO,
					   "**io",
					   "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -1,163 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_tuning.c
* \brief defines ad_bgl performance tuning
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*---------------------------------------------------------------------
* ad_bgl_tuning.c
*
* defines global variables and functions for performance tuning and
* functional debugging.
*---------------------------------------------------------------------*/
#include "ad_bgl_tuning.h"
#include "mpi.h"
/* Fallback definition of PVFS2's statfs f_type "magic number"; used as
 * the default for bglocklessmpio_f_type below. */
#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif

/* Tuning knobs populated from environment variables by
 * ad_bgl_get_env_vars(); see the \page documentation below for the
 * meaning and defaults of each. */
int 	bglmpio_timing;        /* BGLMPIO_TIMING */
int 	bglmpio_timing2;       /* BGLMPIO_TIMING2 */
int 	bglmpio_comm;          /* BGLMPIO_COMM */
int 	bglmpio_tunegather;    /* BGLMPIO_TUNEGATHER */
int 	bglmpio_tuneblocking;  /* BGLMPIO_TUNEBLOCKING */
long    bglocklessmpio_f_type; /* BGLOCKLESSMPIO_F_TYPE */

/* per-call timing accumulators for collective writes (cw) and reads (cr),
 * indexed by the BGLMPIO_CIO_* enum */
double	bglmpio_prof_cw    [BGLMPIO_CIO_LAST];
double	bglmpio_prof_cr    [BGLMPIO_CIO_LAST];
/* set internal variables for tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
\section env_sec Environment Variables
* - BGLMPIO_COMM - Define how data is exchanged on collective
* reads and writes. Possible values:
* - 0 - Use MPI_Alltoallv.
* - 1 - Use MPI_Isend/MPI_Irecv.
* - Default is 0.
*
* - BGLMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
* Must also compile the library with BGL_PROFILE defined. Possible values:
* - 0 - Do not collect/report timing.
* - 1 - Collect/report timing.
* - Default is 0.
*
* - BGLMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
* Must also compile the library with BGL_PROFILE defined. Possible values:
* - 0 - Do not collect/report averages.
* - 1 - Collect/report averages.
* - Default is 0.
*
* - BGLMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
* for aggregator collective i/o. Possible values:
* - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
* - Default is 1.
*
* - BGLMPIO_TUNEBLOCKING - Tune how aggregate file domains are
* calculated (block size). Possible values:
* - 0 - Evenly calculate file domains across aggregators. Also use
* MPI_Isend/MPI_Irecv to exchange domain information.
* - 1 - Align file domains with the underlying file system's block size. Also use
* MPI_Alltoallv to exchange domain information.
* - Default is 1.
*
* - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
* the ad_bglockless driver. NOTE: Using romio prefixes (such as
* "bgl:" or "bglockless:") on a file name will override this environment
* variable. Possible values:
* - 0xnnnnnnnn - Any valid file system type (or "magic number") from
* statfs() field f_type.
* - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
*
*/
/* Read the BGLMPIO_* / BGLOCKLESSMPIO_F_TYPE environment variables and
 * store their values in the module-wide tuning globals.  Variables that
 * are unset keep the documented defaults (see the \page block above). */
void ad_bgl_get_env_vars() {
    const char *envval;
    char       *end;

    /* documented defaults, applied before consulting the environment */
    bglmpio_comm          = 0;
    bglmpio_timing        = 0;
    bglmpio_timing2       = 0;
    bglmpio_tunegather    = 1;
    bglmpio_tuneblocking  = 1;
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;

    envval = getenv( "BGLMPIO_COMM" );
    if (envval) bglmpio_comm = atoi(envval);

    envval = getenv( "BGLMPIO_TIMING" );
    if (envval) bglmpio_timing = atoi(envval);

    envval = getenv( "BGLMPIO_TIMING2" );
    if (envval) bglmpio_timing2 = atoi(envval);

    envval = getenv( "BGLMPIO_TUNEGATHER" );
    if (envval) bglmpio_tunegather = atoi(envval);

    envval = getenv( "BGLMPIO_TUNEBLOCKING" );
    if (envval) bglmpio_tuneblocking = atoi(envval);

    /* base 0 lets users write the magic number as decimal, 0x… or 0… */
    envval = getenv( "BGLOCKLESSMPIO_F_TYPE" );
    if (envval) bglocklessmpio_f_type = strtol(envval,&end,0);

    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
		bglocklessmpio_f_type,bglocklessmpio_f_type);
}
/* report timing breakdown for MPI I/O collective call */
/* rw == 0 reports the collective-read counters, nonzero the
 * collective-write counters.  Reduces every process's profile array
 * onto rank 0 (per-process average via SUM/nprocs, and MAX) and prints
 * a one-line summary there.  Active only when bglmpio_timing is set.
 * NOTE(review): the header declares ad_bgl_timing_crw_report(), but
 * this definition is named ad_bgl_wr_timing_report -- confirm which
 * name is current. */
void ad_bgl_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (bglmpio_timing) {
	/* select the read or write counter array */
	double *bglmpio_prof_org = bglmpio_prof_cr;
	if (rw) bglmpio_prof_org = bglmpio_prof_cw;

	double bglmpio_prof_avg[ BGLMPIO_CIO_LAST ];
	double bglmpio_prof_max[ BGLMPIO_CIO_LAST ];

	MPI_Reduce( bglmpio_prof_org, bglmpio_prof_avg, BGLMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, fd->comm );
	MPI_Reduce( bglmpio_prof_org, bglmpio_prof_max, BGLMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, fd->comm );

	if (myrank == 0) {
	    /* convert the SUM reduction into a per-process average */
	    for (i=0; i<BGLMPIO_CIO_LAST; i++) bglmpio_prof_avg[i] /= nprocs;

	    if (bglmpio_timing2) {
		/* aggregate bandwidth: total bytes / slowest process's time */
		bglmpio_prof_avg[ BGLMPIO_CIO_B_POSI_RW  ] = bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs /
		    bglmpio_prof_max[ BGLMPIO_CIO_T_POSI_RW  ];
		bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_RW  ] = bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs /
		    bglmpio_prof_max[ BGLMPIO_CIO_T_MPIO_RW  ];
	    } else {
		bglmpio_prof_avg[ BGLMPIO_CIO_B_POSI_RW  ] = 0;
		bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_RW  ] = 0;
	    }
	    bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_CRW ] = bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs /
		bglmpio_prof_max[ BGLMPIO_CIO_T_MPIO_CRW ];

	    /* NOTE(review): the "-m" suffixes on PXT-m / MPT-m / MPTC-m
	     * suggest maxima, but the averaged values are printed --
	     * confirm whether label or value is intended. */
	    printf("\tTIMING-1 %1s , ", (rw ? "W" : "R") );
	    printf(  "SZ: %12.4f , ", bglmpio_prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs );
	    printf("SK-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_SEEK     ] );
	    printf("SK-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_SEEK     ] );
	    printf("LC-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_LCOMP    ] );
	    printf("GA-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_GATHER   ] );
	    printf("AN-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_PATANA   ] );
	    printf("FD-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_FD_PART  ] );
	    printf("MY-a: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_MYREQ    ] );
	    printf("OT-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_OTHREQ   ] );
	    printf("EX-m: %10.3f , ", bglmpio_prof_max[ BGLMPIO_CIO_T_DEXCH    ] );
	    printf("\tTIMING-2 %1s , ", (rw ? "W" : "R") );
	    printf("PXT-m: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_POSI_RW  ] );
	    printf("MPT-m: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_MPIO_RW  ] );
	    printf("MPTC-m: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_T_MPIO_CRW ] );
	    printf(  "PXB: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_B_POSI_RW  ] );
	    printf(  "MPB: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_RW  ] );
	    printf( "MPBC: %10.3f , ", bglmpio_prof_avg[ BGLMPIO_CIO_B_MPIO_CRW ] );
	}
    }
}

Просмотреть файл

@ -1,95 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_tuning.h
* \brief ???
*/
/*---------------------------------------------------------------------
* ad_bgl_tuning.h
*
* declares global variables and macros for performance tuning and
* functional debugging.
*---------------------------------------------------------------------*/
#ifndef AD_BGL_TUNING_H_
#define AD_BGL_TUNING_H_
#include "adio.h"
#define AD_BGL_assert( a ) if (!(a)) { \
fprintf( stderr, "AD_BGL_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
MPI_Abort( MPI_COMM_WORLD, 1 ); \
}
/*-----------------------------------------
* Global variables for the control of
* 1. timing
* 2. select specific optimizations
*-----------------------------------------*/
/* timing fields */
enum {
BGLMPIO_CIO_DATA_SIZE=0,
BGLMPIO_CIO_T_SEEK,
BGLMPIO_CIO_T_LCOMP, /* time for ADIOI_Calc_my_off_len(), local */
BGLMPIO_CIO_T_GATHER, /* time for previous MPI_Allgather, now Allreduce */
BGLMPIO_CIO_T_PATANA, /* time for a quick test if access is contiguous or not, local */
BGLMPIO_CIO_T_FD_PART, /* time for file domain partitioning, local */
BGLMPIO_CIO_T_MYREQ, /* time for ADIOI_BGL_Calc_my_req(), local */
BGLMPIO_CIO_T_OTHREQ, /* time for ADIOI_Calc_others_req(), short Alltoall */
BGLMPIO_CIO_T_DEXCH, /* time for I/O data exchange */
BGLMPIO_CIO_T_POSI_RW,
BGLMPIO_CIO_B_POSI_RW,
BGLMPIO_CIO_T_MPIO_RW, /* time for ADIOI_BGL_WriteContig() */
BGLMPIO_CIO_B_MPIO_RW,
BGLMPIO_CIO_T_MPIO_CRW, /* time for ADIOI_BGL_WriteStridedColl() */
BGLMPIO_CIO_B_MPIO_CRW,
BGLMPIO_CIO_LAST
};
extern double bglmpio_prof_cw [BGLMPIO_CIO_LAST];
extern double bglmpio_prof_cr [BGLMPIO_CIO_LAST];
/* corresponds to environment variables to select optimizations and timing level */
extern int bglmpio_timing;
extern int bglmpio_timing2;
extern int bglmpio_comm;
extern int bglmpio_tunegather;
extern int bglmpio_tuneblocking;
extern long bglocklessmpio_f_type;
/* set internal variables for tuning environment variables */
void ad_bgl_get_env_vars();
/* report timing breakdown for MPI I/O collective call */
void ad_bgl_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
/* note:
* T := timing;
* CIO := collective I/O
*/
#define BGLMPIO_T_CIO_RESET( LEVEL, RW ) \
if (bglmpio_timing_cw_level >= LEVEL) { \
int i; \
for ( i = 0; i < BGLMPIO_T_LAST; i ++ ) \
bglmpio_prof_c##RW [ i ] = 0; \
}
#define BGLMPIO_T_CIO_REPORT( LEVEL, RW, FD, MYRANK, NPROCS ) \
if (bglmpio_timing_cw_level >= LEVEL) { \
ad_bgl_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
}
/* Timestamp bookkeeping for one profiling phase:
 *  - optionally barrier first (DOBAR) so all ranks time the same interval,
 *  - if ISSET, store the current wall clock into slot VAR1 (phase start),
 *  - if ISGET, turn slot VAR2 from a stored start time into an elapsed time.
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * `temp` stays scoped to the macro body (safe before an `else`).
 * NOTE(review): the body references `fd` from the caller's scope, and
 * bglmpio_timing_cw_level is not declared in this header -- confirm usage. */
#define BGLMPIO_T_CIO_SET_GET( LEVEL, RW, DOBAR, ISSET, ISGET, VAR1, VAR2 ) \
    do { \
        if (bglmpio_timing_cw_level >= LEVEL) { \
            if ( DOBAR ) MPI_Barrier( fd->comm ); \
            double temp = MPI_Wtime(); \
            if ( ISSET ) bglmpio_prof_c##RW [ VAR1 ] = temp; \
            if ( ISGET ) bglmpio_prof_c##RW [ VAR2 ] = temp - bglmpio_prof_c##RW [ VAR2 ] ; \
        } \
    } while (0)
#endif /* AD_BGL_TUNING_H_ */

Просмотреть файл

@ -1,611 +0,0 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_write.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "adio_extern.h"
#include "ad_bgl_tuning.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
/* ADIOI_BGL_WriteContig: contiguous write path for the Blue Gene/L driver.
 *
 * Writes `count` elements of `datatype` from `buf`, either at the explicit
 * byte `offset` (ADIO_EXPLICIT_OFFSET) or at the individual file pointer
 * (ADIO_INDIVIDUAL), using lseek()+write() on fd->fd_sys with a byte-range
 * lock held around the write.  On success *error_code is MPI_SUCCESS and the
 * byte count is recorded in `status`; a failed write() maps errno to
 * MPI_ERR_IO.  The BGL_PROFILE sections accumulate timing/byte counters in
 * bglmpio_prof_cw[].
 *
 * Fix over the original: the profiling block read `len` for the
 * BGLMPIO_CIO_DATA_SIZE accounting *before* `len` was computed (use of an
 * uninitialized variable).  The datatype size is now computed first.
 */
void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
    int err=-1, datatype_size;
    ADIO_Offset len;
    static char myname[] = "ADIOI_BGL_WRITECONTIG";

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5036, 0, NULL);
#endif

    /* compute the transfer length before any profiling uses it */
    MPI_Type_size(datatype, &datatype_size);
    len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
    ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */

#if BGL_PROFILE
    /* timing */
    double io_time, io_time2;

    if (bglmpio_timing) {
        io_time = MPI_Wtime();
        bglmpio_prof_cw[ BGLMPIO_CIO_DATA_SIZE ] += len;  /* len is valid here now */
    }
#endif

#if BGL_PROFILE
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        if (bglmpio_timing2) io_time2 = MPI_Wtime();
        if (fd->fp_sys_posn != offset)
            lseek(fd->fd_sys, offset, SEEK_SET);
        if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        if (bglmpio_timing2) io_time2 = MPI_Wtime();
        err = write(fd->fd_sys, buf, (unsigned int)len);
        if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else { /* write from curr. location of ind. file pointer */
        offset = fd->fp_ind;
        if (bglmpio_timing2) io_time2 = MPI_Wtime();
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        if (bglmpio_timing2) io_time2 = MPI_Wtime();
        err = write(fd->fd_sys, buf, (unsigned int)len);
        if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }
#else /* BGL_PROFILE */
    if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
        if (fd->fp_sys_posn != offset)
            lseek(fd->fd_sys, offset, SEEK_SET);
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        err = write(fd->fd_sys, buf, (unsigned int)len);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_sys_posn = offset + err;
        /* individual file pointer not updated */
    }
    else { /* write from curr. location of ind. file pointer */
        offset = fd->fp_ind;
        if (fd->fp_sys_posn != fd->fp_ind)
            lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
        ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
        err = write(fd->fd_sys, buf, (unsigned int)len);
        ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
        fd->fp_ind += err;
        fd->fp_sys_posn = fd->fp_ind;
    }
#endif /* BGL_PROFILE */

#if BGL_PROFILE
    if (bglmpio_timing) bglmpio_prof_cw[ BGLMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
#endif

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(errno));
        return;
    }
    /* --END ERROR HANDLING-- */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, err);
#endif

    *error_code = MPI_SUCCESS;
#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5037, 0, NULL);
#endif
}
/* Data-sieving read-modify-write step, expanded inline in the strided write
 * path.  Operates entirely on locals of the enclosing function: copies the
 * current request (req_off/req_len bytes of `buf` at userbuf_off) into the
 * sieve buffer `writebuf`; when the request falls past the current window it
 * flushes the window with write(), advances writebuf_off/writebuf_len, and
 * re-reads the new window (the RMW read) so unwritten bytes are preserved.
 * Locks are taken per-window unless fd->atomicity already holds one big lock.
 * A failed write only sets err_flag (reported by the caller); a failed RMW
 * read returns from the enclosing function immediately.
 * NOTE(review): the early-return on a failed read leaks `writebuf` and, when
 * locks are held, leaves the range locked -- confirm before reusing. */
#define ADIOI_BUFFERED_WRITE \
{ \
    if (req_off >= writebuf_off + writebuf_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        writebuf_off = req_off; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = read(fd->fd_sys, writebuf, writebuf_len); \
        if (err == -1) { \
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
                                               MPIR_ERR_RECOVERABLE, myname, \
                                               __LINE__, MPI_ERR_IO, \
                                               "**ioRMWrdwr", 0); \
            return; \
        } \
    } \
    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
    while (write_sz != req_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        req_len -= write_sz; \
        userbuf_off += write_sz; \
        writebuf_off += writebuf_len; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = read(fd->fd_sys, writebuf, writebuf_len); \
        if (err == -1) { \
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
                                               MPIR_ERR_RECOVERABLE, myname, \
                                               __LINE__, MPI_ERR_IO, \
                                               "**ioRMWrdwr", 0); \
            return; \
        } \
        write_sz = ADIOI_MIN(req_len, writebuf_len); \
        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
    } \
}
/* this macro is used when filetype is contig and buftype is not contig.
   it does not do a read-modify-write and does not lock */
/* Same sieve-buffer staging as ADIOI_BUFFERED_WRITE, but because the file
 * region being written is contiguous there are no holes to preserve: when
 * the window advances it is simply flushed and repositioned, with no
 * read-back.  Uses the same caller locals (writebuf, writebuf_off,
 * writebuf_len, req_off, req_len, userbuf_off, write_sz, err, err_flag). */
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
    if (req_off >= writebuf_off + writebuf_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        writebuf_off = req_off; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
    } \
    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
    while (write_sz != req_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        req_len -= write_sz; \
        userbuf_off += write_sz; \
        writebuf_off += writebuf_len; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        write_sz = ADIOI_MIN(req_len, writebuf_len); \
        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
    } \
}
/* ADIOI_BGL_WriteStrided: independent (non-collective) strided write using
 * data sieving.  Handles the three noncontiguous cases:
 *   1. noncontiguous buffer, contiguous file  -- stage pieces through a
 *      sieve buffer, no read-modify-write needed;
 *   2. contiguous buffer, noncontiguous file  -- read-modify-write through
 *      the sieve buffer (ADIOI_BUFFERED_WRITE);
 *   3. noncontiguous buffer and file          -- walk both flattened type
 *      lists in lockstep, read-modify-write through the sieve buffer.
 * Falls back to ADIOI_GEN_WriteStrided_naive when the ds_write hint disables
 * data sieving.  `offset` is in etype units relative to the filetype when an
 * explicit offset is used. */
void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
                       MPI_Datatype datatype, int file_ptr_type,
                       ADIO_Offset offset, ADIO_Status *status, int
                       *error_code)
{
    /* offset is in units of etype relative to the filetype. */
    ADIOI_Flatlist_node *flat_buf, *flat_file;
    ADIO_Offset i_offset, sum, size_in_filetype;
    int i, j, k, err=-1, st_index=0;
    int n_etypes_in_filetype;
    ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
    ADIO_Offset abs_off_in_filetype=0;
    int filetype_size, etype_size, buftype_size;
    MPI_Aint filetype_extent, buftype_extent;
    int buf_count, buftype_is_contig, filetype_is_contig;
    ADIO_Offset userbuf_off;
    ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
    char *writebuf, *value;
    unsigned bufsize, writebuf_len, max_bufsize, write_sz;
    int err_flag=0, info_flag;
    ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
    static char myname[] = "ADIOI_BGL_WRITESTRIDED";

    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
        /* if user has disabled data sieving on writes, use naive
         * approach instead.
         */
        /*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/
        ADIOI_GEN_WriteStrided_naive(fd,
                                     buf,
                                     count,
                                     datatype,
                                     file_ptr_type,
                                     offset,
                                     status,
                                     error_code);
        return;
    }

    /*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
    MPI_Type_size(fd->filetype, &filetype_size);

    /* zero-size filetype: nothing to transfer */
    if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
        MPIR_Status_set_bytes(status, datatype, 0);
#endif
        *error_code = MPI_SUCCESS;
        return;
    }

    MPI_Type_extent(fd->filetype, &filetype_extent);
    MPI_Type_size(datatype, &buftype_size);
    MPI_Type_extent(datatype, &buftype_extent);
    etype_size = fd->etype_size;

    ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
    bufsize = buftype_size * count;

    /* get max_bufsize from the info object. */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
                   &info_flag);
    max_bufsize = atoi(value);
    ADIOI_Free(value);

    if (!buftype_is_contig && filetype_is_contig) {
        /* Case 1: noncontiguous in memory, contiguous in file. */
        ADIOI_Flatten_datatype(datatype);
        flat_buf = ADIOI_Flatlist;
        while (flat_buf->type != datatype) flat_buf = flat_buf->next;

        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
              fd->disp + etype_size * offset;

        start_off = off;
        end_offset = off + bufsize - 1;
        writebuf_off = off;
        writebuf = (char *) ADIOI_Malloc(max_bufsize);
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        /* stage each flattened buffer piece through the sieve buffer;
         * the file region is contiguous, so no read-back is needed */
        for (j=0; j<count; j++)
        {
            int i;
            for (i=0; i<flat_buf->count; i++) {
                userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
                req_off = off;
                req_len = flat_buf->blocklens[i];
                ADIOI_BUFFERED_WRITE_WITHOUT_READ
                off += flat_buf->blocklens[i];
            }
        }

        /* write the buffer out finally */
        lseek(fd->fd_sys, writebuf_off, SEEK_SET);
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
        err = write(fd->fd_sys, writebuf, writebuf_len);
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
        if (err == -1) err_flag = 1;

        if (fd->atomicity)
            ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        ADIOI_Free(writebuf); /* malloced in the buffered_write macro */

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    }

    else {  /* noncontiguous in file */

        /* filetype already flattened in ADIO_Open */
        flat_file = ADIOI_Flatlist;
        while (flat_file->type != fd->filetype) flat_file = flat_file->next;
        disp = fd->disp;

        if (file_ptr_type == ADIO_INDIVIDUAL) {
            /* Wei-keng reworked type processing to be a bit more efficient */
            offset = fd->fp_ind - disp;
            n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
            offset -= (ADIO_Offset)n_filetypes * filetype_extent;
            /* now offset is local to this extent */

            /* find the block where offset is located, skip blocklens[i]==0 */
            for (i=0; i<flat_file->count; i++) {
                ADIO_Offset dist;
                if (flat_file->blocklens[i] == 0) continue;
                dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
                /* fwr_size is from offset to the end of block i */
                if (dist == 0) {
                    i++;
                    offset = flat_file->indices[i];
                    fwr_size = flat_file->blocklens[i];
                    break;
                }
                if (dist > 0) {
                    fwr_size = dist;
                    break;
                }
            }
            st_index = i;  /* starting index in flat_file->indices[] */
            offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
        }
        else {
            /* explicit offset: convert the etype offset into an absolute
             * byte offset plus a starting block within the filetype */
            int i;
            n_etypes_in_filetype = filetype_size/etype_size;
            n_filetypes = offset / n_etypes_in_filetype;
            etype_in_filetype = offset % n_etypes_in_filetype;
            size_in_filetype = etype_in_filetype * etype_size;

            sum = 0;
            for (i=0; i<flat_file->count; i++) {
                sum += flat_file->blocklens[i];
                if (sum > size_in_filetype) {
                    st_index = i;
                    fwr_size = sum - size_in_filetype;
                    abs_off_in_filetype = flat_file->indices[i] +
                        size_in_filetype - (sum - flat_file->blocklens[i]);
                    break;
                }
            }

            /* abs. offset in bytes in the file */
            offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
                     abs_off_in_filetype;
        }

        start_off = offset;

        /* Wei-keng Liao:write request is within single flat_file contig block*/
        /* this could happen, for example, with subarray types that are
         * actually fairly contiguous */
        if (buftype_is_contig && bufsize <= fwr_size) {
            ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
                             offset, status, error_code);

            if (file_ptr_type == ADIO_INDIVIDUAL) {
                /* update MPI-IO file pointer to point to the first byte
                 * that can be accessed in the fileview. */
                fd->fp_ind = offset + bufsize;
                if (bufsize == fwr_size) {
                    /* consumed this block exactly: advance to the next
                     * non-empty block, wrapping into the next filetype */
                    do {
                        st_index++;
                        if (st_index == flat_file->count) {
                            st_index = 0;
                            n_filetypes++;
                        }
                    } while (flat_file->blocklens[st_index] == 0);
                    fd->fp_ind = disp + flat_file->indices[st_index]
                               + (ADIO_Offset)n_filetypes*filetype_extent;
                }
            }
            fd->fp_sys_posn = -1; /* set it to null. */
#ifdef HAVE_STATUS_SET_BYTES
            MPIR_Status_set_bytes(status, datatype, bufsize);
#endif
            return;
        }

        /* Calculate end_offset, the last byte-offset that will be accessed.
           e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/

        st_fwr_size = fwr_size;
        st_n_filetypes = n_filetypes;
        i_offset = 0;
        j = st_index;
        off = offset;
        fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
        while (i_offset < bufsize) {
            i_offset += fwr_size;
            end_offset = off + fwr_size - 1;

            j = (j+1) % flat_file->count;
            n_filetypes += (j == 0) ? 1 : 0;
            while (flat_file->blocklens[j]==0) {
                j = (j+1) % flat_file->count;
                n_filetypes += (j == 0) ? 1 : 0;
            }

            off = disp + flat_file->indices[j] +
                  n_filetypes*(ADIO_Offset)filetype_extent;
            fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
        }

        /* if atomicity is true, lock the region to be accessed */
        if (fd->atomicity)
            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);

        /* initial read for the read-modify-write */

        writebuf_off = offset;
        writebuf = (char *) ADIOI_Malloc(max_bufsize);
        writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
        lseek(fd->fd_sys, writebuf_off, SEEK_SET);
        err = read(fd->fd_sys, writebuf, writebuf_len);
        /* NOTE(review): this early return leaks writebuf and leaves the
         * byte range locked -- confirm before reusing this pattern. */
        if (err == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_IO,
                                               "ADIOI_BGL_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
            return;
        }

        if (buftype_is_contig && !filetype_is_contig) {
            /* Case 2: contiguous in memory, noncontiguous in file. should be
               the most common case. */

            i_offset = 0;
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
            while (i_offset < bufsize) {
                if (fwr_size) {
                    /* TYPE_UB and TYPE_LB can result in
                       fwr_size = 0. save system call in such cases */
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/

                    req_off = off;
                    req_len = fwr_size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }
                i_offset += fwr_size;

                if (off + fwr_size < disp + flat_file->indices[j] +
                    flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
                    off += fwr_size;
                /* did not reach end of contiguous block in filetype.
                   no more I/O needed. off is incremented by fwr_size. */
                else {
                    /* advance to the next non-empty filetype block */
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }
                    off = disp + flat_file->indices[j] +
                          n_filetypes*(ADIO_Offset)filetype_extent;
                    fwr_size = ADIOI_MIN(flat_file->blocklens[j],
                                         bufsize-i_offset);
                }
            }
        }
        else {
            /* Case 3: noncontiguous in memory as well as in file */

            ADIOI_Flatten_datatype(datatype);
            flat_buf = ADIOI_Flatlist;
            while (flat_buf->type != datatype) flat_buf = flat_buf->next;

            k = num = buf_count = 0;
            i_offset = flat_buf->indices[0];
            j = st_index;
            off = offset;
            n_filetypes = st_n_filetypes;
            fwr_size = st_fwr_size;
            bwr_size = flat_buf->blocklens[0];

            /* walk the buffer and file block lists in lockstep, writing
             * min(remaining-buffer-block, remaining-file-block) each pass */
            while (num < bufsize) {
                size = ADIOI_MIN(fwr_size, bwr_size);
                if (size) {
                    /* lseek(fd->fd_sys, off, SEEK_SET);
                       err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */

                    req_off = off;
                    req_len = size;
                    userbuf_off = i_offset;
                    ADIOI_BUFFERED_WRITE
                }

                new_fwr_size = fwr_size;
                new_bwr_size = bwr_size;

                if (size == fwr_size) {
                    /* reached end of contiguous block in file */
                    j = (j+1) % flat_file->count;
                    n_filetypes += (j == 0) ? 1 : 0;
                    while (flat_file->blocklens[j]==0) {
                        j = (j+1) % flat_file->count;
                        n_filetypes += (j == 0) ? 1 : 0;
                    }

                    off = disp + flat_file->indices[j] +
                          n_filetypes*(ADIO_Offset)filetype_extent;

                    new_fwr_size = flat_file->blocklens[j];
                    if (size != bwr_size) {
                        i_offset += size;
                        new_bwr_size -= size;
                    }
                }

                if (size == bwr_size) {
                    /* reached end of contiguous block in memory */

                    k = (k + 1)%flat_buf->count;
                    buf_count++;
                    i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
                        flat_buf->indices[k];
                    new_bwr_size = flat_buf->blocklens[k];
                    if (size != fwr_size) {
                        off += size;
                        new_fwr_size -= size;
                    }
                }
                num += size;
                fwr_size = new_fwr_size;
                bwr_size = new_bwr_size;
            }
        }

        /* write the buffer out finally */
        lseek(fd->fd_sys, writebuf_off, SEEK_SET);
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
        err = write(fd->fd_sys, writebuf, writebuf_len);
        if (!(fd->atomicity))
            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
        else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
        if (err == -1) err_flag = 1;

        ADIOI_Free(writebuf); /* malloced in the buffered_write macro */

        if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
        if (err_flag) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
        }
        else *error_code = MPI_SUCCESS;
    }

    fd->fp_sys_posn = -1;   /* set it to null. */

#ifdef HAVE_STATUS_SET_BYTES
    MPIR_Status_set_bytes(status, datatype, bufsize);
    /* This is a temporary way of filling in status. The right way is to
       keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
#endif

    if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
}

Просмотреть файл

@ -1,17 +0,0 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2011 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
## Compile the "bglockless" ADIO driver only when configure enabled the
## BUILD_AD_BGLOCKLESS automake conditional; the sources are appended to the
## aggregate lists declared at the top of the build.
if BUILD_AD_BGLOCKLESS

noinst_HEADERS += adio/ad_bglockless/ad_bglockless.h

romio_other_sources += \
    adio/ad_bglockless/ad_bglockless.c \
    adio/ad_bglockless/ad_bglockless_features.c

endif BUILD_AD_BGLOCKLESS

Просмотреть файл

@ -1,44 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "../ad_bg/ad_bg.h"
#include "ad_bglockless.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
/* Function-pointer dispatch table for the "bglockless" driver: Blue Gene
 * open/hints/collective routines combined with the generic lock-free
 * (ADIOI_NOLOCK_*) independent write path.  The initializer is positional
 * against ADIOI_Fns_struct -- do not reorder entries. */
struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = {
    ADIOI_BG_Open, /* Open */
    ADIOI_GEN_OpenColl, /* Collective open */
    ADIOI_GEN_ReadContig, /* ReadContig */
    ADIOI_GEN_WriteContig, /* WriteContig */
    ADIOI_BG_ReadStridedColl, /* ReadStridedColl */
    ADIOI_BG_WriteStridedColl, /* WriteStridedColl */
    ADIOI_GEN_SeekIndividual, /* SeekIndividual */
    ADIOI_GEN_Fcntl, /* Fcntl */
    ADIOI_BG_SetInfo, /* SetInfo */
    ADIOI_GEN_ReadStrided, /* ReadStrided */
    ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
    ADIOI_BG_Close, /* Close */
#ifdef ROMIO_HAVE_WORKING_AIO
    ADIOI_GEN_IreadContig, /* IreadContig */
    ADIOI_GEN_IwriteContig, /* IwriteContig */
#else
    ADIOI_FAKE_IreadContig, /* IreadContig */
    ADIOI_FAKE_IwriteContig, /* IwriteContig */
#endif
    ADIOI_GEN_IODone, /* ReadDone */
    ADIOI_GEN_IODone, /* WriteDone */
    ADIOI_GEN_IOComplete, /* ReadComplete */
    ADIOI_GEN_IOComplete, /* WriteComplete */
    ADIOI_GEN_IreadStrided, /* IreadStrided */
    ADIOI_GEN_IwriteStrided, /* IwriteStrided */
    ADIOI_BG_Flush, /* Flush */
    ADIOI_GEN_Resize, /* Resize */
    ADIOI_GEN_Delete, /* Delete */
    ADIOI_BGLOCKLESS_Feature /* Features */
};

Просмотреть файл

@ -1,14 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* Copyright (C) 2008 Uchicago Argonne LLC
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_BGLOCKLESS_INCLUDE
/* Fix: the guard macro defined here must match the one tested above.  The
 * original defined AD_PVFS2_INCLUDE (copy-paste from the PVFS2 driver), so
 * this header was never actually protected against multiple inclusion. */
#define AD_BGLOCKLESS_INCLUDE

/* Feature-query entry point for the bglockless driver (see
 * ad_bglockless_features.c). */
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag);

#endif

Просмотреть файл

@ -1,15 +0,0 @@
#include "adio.h"
/* Report which optional ADIO features the bglockless driver supports.
 * Only scalable open is available; shared file pointers, byte-range locks,
 * sequential mode, data-sieving writes and anything else are reported as
 * unsupported (0).  `fd` is unused. */
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag)
{
    if (flag == ADIO_SCALABLE_OPEN)
        return 1;

    /* ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL,
     * ADIO_DATA_SIEVING_WRITES and all other flags */
    return 0;
}

Просмотреть файл

Просмотреть файл

@ -0,0 +1,26 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2012 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
## Compile the GPFS ADIO driver only when configure enabled the
## BUILD_AD_GPFS automake conditional; headers and sources are appended to
## the aggregate lists declared at the top of the build.
if BUILD_AD_GPFS

noinst_HEADERS += \
    adio/ad_gpfs/ad_gpfs_aggrs.h \
    adio/ad_gpfs/ad_gpfs.h \
    adio/ad_gpfs/ad_gpfs_tuning.h

romio_other_sources += \
    adio/ad_gpfs/ad_gpfs_aggrs.c \
    adio/ad_gpfs/ad_gpfs_close.c \
    adio/ad_gpfs/ad_gpfs_flush.c \
    adio/ad_gpfs/ad_gpfs_tuning.c \
    adio/ad_gpfs/ad_gpfs.c \
    adio/ad_gpfs/ad_gpfs_open.c \
    adio/ad_gpfs/ad_gpfs_hints.c \
    adio/ad_gpfs/ad_gpfs_rdcoll.c \
    adio/ad_gpfs/ad_gpfs_wrcoll.c

endif BUILD_AD_GPFS

Просмотреть файл

@ -2,7 +2,7 @@
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl.c
* \file ad_gpfs.c
* \brief ???
*/
@ -11,34 +11,28 @@
* Copyright (C) 2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "ad_gpfs.h"
/* adioi.h has the ADIOI_Fns_struct define */
#include "adioi.h"
struct ADIOI_Fns_struct ADIO_BGL_operations = {
ADIOI_BGL_Open, /* Open */
struct ADIOI_Fns_struct ADIO_GPFS_operations = {
ADIOI_GPFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* Collective open */
ADIOI_BGL_ReadContig, /* ReadContig */
ADIOI_BGL_WriteContig, /* WriteContig */
#if BGL_OPTIM_STEP1_2
ADIOI_BGL_ReadStridedColl, /* ReadStridedColl */
ADIOI_BGL_WriteStridedColl, /* WriteStridedColl */
#else
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
#endif
ADIOI_GEN_ReadContig, /* ReadContig */
ADIOI_GEN_WriteContig, /* WriteContig */
ADIOI_GPFS_ReadStridedColl, /* ReadStridedColl */
ADIOI_GPFS_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
ADIOI_BGL_Fcntl, /* Fcntl */
#if BGL_OPTIM_STEP1_1
ADIOI_BGL_SetInfo, /* SetInfo */
ADIOI_GEN_Fcntl, /* Fcntl */
#if defined(BGQPLATFORM) || defined(PEPLATFORM)
ADIOI_GPFS_SetInfo, /* SetInfo for BlueGene or PE */
#else
ADIOI_GEN_SetInfo, /* SetInfo */
ADIOI_GEN_SetInfo, /* SetInfo for any platform besides BlueGene or PE */
#endif
ADIOI_BGL_ReadStrided, /* ReadStrided */
ADIOI_BGL_WriteStrided, /* WriteStrided */
ADIOI_BGL_Close, /* Close */
ADIOI_GEN_ReadStrided, /* ReadStrided */
ADIOI_GEN_WriteStrided, /* WriteStrided */
ADIOI_GPFS_Close, /* Close */
#ifdef ROMIO_HAVE_WORKING_AIO
#warning Consider BG support for NFS before enabling this.
ADIOI_GEN_IreadContig, /* IreadContig */
@ -53,8 +47,17 @@ struct ADIOI_Fns_struct ADIO_BGL_operations = {
ADIOI_GEN_IOComplete, /* WriteComplete */
ADIOI_GEN_IreadStrided, /* IreadStrided */
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
ADIOI_BGL_Flush, /* Flush */
ADIOI_GPFS_Flush, /* Flush */
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
#ifdef BGQPLATFORM
"GPFS+BGQ: IBM GPFS for Blue Gene",
#elif PEPLATFORM
"GPFS+PE: IBM GPFS for PE",
#else
"GPFS: IBM GPFS",
#endif
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
};

Просмотреть файл

@ -0,0 +1,71 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs.h
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#ifndef AD_GPFS_INCLUDE
#define AD_GPFS_INCLUDE

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_AIO_H
#include <aio.h>
#endif

/* GPFS driver entry points referenced from the ADIO_GPFS_operations
 * dispatch table in ad_gpfs.c.  All follow the common ADIO calling
 * convention: file_ptr_type selects explicit-offset vs individual file
 * pointer, and status/error_code are the usual MPI-IO outputs. */

/* open / close the underlying GPFS file */
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code);
void ADIOI_GPFS_Close(ADIO_File fd, int *error_code);

/* independent contiguous transfers */
void ADIOI_GPFS_ReadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status, int
                           *error_code);
void ADIOI_GPFS_WriteContig(ADIO_File fd, const void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status, int
                            *error_code);

/* hint processing (cb_* and GPFS-specific hints) */
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);

/* independent strided transfers */
void ADIOI_GPFS_WriteStrided(ADIO_File fd, const void *buf, int count,
                             MPI_Datatype datatype, int file_ptr_type,
                             ADIO_Offset offset, ADIO_Status *status, int
                             *error_code);
void ADIOI_GPFS_ReadStrided(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status, int
                            *error_code);

/* two-phase collective transfers */
void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Status *status, int
                                *error_code);
void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
                                 MPI_Datatype datatype, int file_ptr_type,
                                 ADIO_Offset offset, ADIO_Status *status, int
                                 *error_code);

/* flush dirty data to stable storage */
void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code);

#include "ad_gpfs_tuning.h"

#endif

Просмотреть файл

@ -2,8 +2,8 @@
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_aggrs.c
* \brief The externally used function from this file is is declared in ad_bg_aggrs.h
* \file ad_gpfs_aggrs.c
* \brief The externally used function from this file is is declared in ad_gpfs_aggrs.h
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
@ -12,25 +12,24 @@
* See COPYRIGHT notice in top-level directory.
*/
/*#define TRACE_ON */
#include "adio.h"
#include "adio_cb_config_list.h"
#include "ad_bg.h"
#include "ad_bg_pset.h"
#include "ad_bg_aggrs.h"
#include "ad_gpfs.h"
#include "ad_gpfs_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#include "mpidi_macros.h"
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
static int aggrsInPsetSize=0;
static int *aggrsInPset=NULL;
#ifndef TRACE_ERR
# define TRACE_ERR(format...)
#endif
/* Comments copied from common:
* This file contains four functions:
@ -63,260 +62,6 @@ static int *aggrsInPset=NULL;
* uneven distributions
*/
/* forward declaration */
static void
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
const ADIOI_BG_ConfInfo_t *confInfo,
ADIOI_BG_ProcInfo_t *all_procInfo,
int *aggrsInPset );
/*
 * Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
 * The parameters are
 *   . the number of aggregators (proxies) : fd->hints->cb_nodes
 *   . the ranks of the aggregators        : fd->hints->ranklist
 * By computing these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
 * ADIO can work more efficiently.
 *
 * Collective over fd->comm (Gather to rank 0, then broadcasts); every rank
 * of the communicator must call it.  Always returns 0.
 */
int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
{
    int r, s;
    ADIOI_BG_ProcInfo_t  *procInfo, *all_procInfo;
    ADIOI_BG_ConfInfo_t  *confInfo;
    TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");

    MPI_Comm_size( fd->comm, &s );
    MPI_Comm_rank( fd->comm, &r );

    /* Collect individual BG personality information */
    confInfo = ADIOI_BG_ConfInfo_new ();
    procInfo = ADIOI_BG_ProcInfo_new ();
    ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);

    /* Gather BG personality information onto process 0 */
    /* if (r == 0) */
    all_procInfo = ADIOI_BG_ProcInfo_new_n (s);

    /* Grow the file-scope aggregator cache when this communicator is larger
     * than any seen before.
     * NOTE(review): aggrsInPset/aggrsInPsetSize are static globals, so this
     * routine is not thread-safe -- confirm callers serialize access. */
    if(s > aggrsInPsetSize)
    {
      if(aggrsInPset) ADIOI_Free(aggrsInPset);
      aggrsInPset = (int *) ADIOI_Malloc (s *sizeof(int));
      aggrsInPsetSize = s;
    }
    MPI_Gather( (void *)procInfo,     sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
                (void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
                0,
                fd->comm );

    /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
    if (r == 0) {
        ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo, aggrsInPset);
        /* ADIOI_BG_ProcInfo_free (all_procInfo);*/
    }
    ADIOI_BG_ProcInfo_free (all_procInfo);

    /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
       Declared in adio_cb_config_list.h */
    ADIOI_cb_bcast_rank_map(fd);

    /* Broadcast the BG-GPFS related file domain info
     * (cb_nodes ints, sent as raw bytes) */
    MPI_Bcast( (void *)aggrsInPset,
               fd->hints->cb_nodes * sizeof(int), MPI_BYTE,
               0,
               fd->comm );

    ADIOI_BG_persInfo_free( confInfo, procInfo );
    TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
    return 0;
}
/* There are some number of bridge nodes (randomly) distributed through the job
* We need to split the nodes among the bridge nodes */
/* Maybe find which bridge node is closer (manhattan distance) and try to
* distribute evenly.
*/
/*
* Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
* The first order of tmp_ranklist is : PSET number
* The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
*/
/* Pairing of an MPI rank with the rank of the bridge node it routes I/O
 * through; used only to sort ranks so that ranks sharing a bridge node
 * become adjacent. */
typedef struct
{
  int rank;
  int bridge;
} sortstruct;

/* qsort comparator: order sortstructs by bridge rank, ascending.
 * Fix over the original: the subtraction idiom (i1->bridge - i2->bridge)
 * can overflow for extreme int values; the comparator contract only
 * requires the sign, so use explicit comparisons instead. */
static int intsort(const void *p1, const void *p2)
{
  const sortstruct *i1 = (const sortstruct *)p1;
  const sortstruct *i2 = (const sortstruct *)p2;

  return (i1->bridge > i2->bridge) - (i1->bridge < i2->bridge);
}
/* Serial helper (runs on rank 0 only): choose the I/O aggregator ranks and
 * store them in tmp_ranklist.  Ranks are sorted by the bridge node they
 * route through; each bridge node itself becomes an aggregator, plus
 * numAggs evenly spaced ranks from the group sharing that bridge node.
 * Returns the number of aggregators written (aggTotal). */
static int
ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
                                         ADIOI_BG_ProcInfo_t       *all_procInfo,
                                         int *aggrsInPset,
                                         int *tmp_ranklist)
{
    TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
    /* BES: This should be done in the init routines probably. */
    int i, j;
    int aggTotal;
    int distance, numAggs;
    int *aggList;

    /* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
     * bridge node */

    sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
    for(i=0; i < confInfo->nProcs; i++)
    {
      bridgelist[i].bridge = all_procInfo[i].bridgeRank;
      bridgelist[i].rank = i;
      TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
    }

    /* This list contains rank->bridge info. Now, we need to sort this list. */
    qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);

    /* In this array, we can pick an appropriate number of midpoints based on
     * our bridgenode index and the number of aggregators */
    numAggs = confInfo->aggRatio * confInfo->ioMaxSize /*virtualPsetSize*/;
    if(numAggs == 1)
      aggTotal = 1;
    else
    /* the number of aggregators is (numAggs per bridgenode) plus each
     * bridge node is an aggregator */
      aggTotal = confInfo->numBridgeRanks * (numAggs+1);

    distance = (confInfo->ioMaxSize /*virtualPsetSize*/ / numAggs);
    TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d numAggs: %d distance: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, distance, aggTotal);
    aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));

    /* For each bridge node, determine who the aggregators will be */
    /* basically, the n*distance and bridge node */
    if(aggTotal == 1) /* special case when we only have one bridge node */
      aggList[0] = bridgelist[0].bridge;
    else
    {
      for(i=0; i < confInfo->numBridgeRanks; i++)
      {
        /* the bridge node itself is the first aggregator for its group */
        aggList[i]=bridgelist[i*confInfo->ioMaxSize /*virtualPsetSize*/].bridge;
        TRACE_ERR("aggList[%d]: %d\n", i, aggList[i]);
        for(j = 0; j < numAggs; j++)
        {
          /* Sets up a list of nodes which will act as aggregators. numAggs
           * per bridge node total. The list of aggregators is
           * bridgeNodes
           * bridgeNode[0]aggr[0]
           * bridgeNode[0]aggr[1]...
           * bridgeNode[0]aggr[N]...
           * ...
           * bridgeNode[N]aggr[0]..
           * bridgeNode[N]aggr[N]
           */
          aggList[i*numAggs+j+confInfo->numBridgeRanks] = bridgelist[i*confInfo->ioMaxSize /*virtualPsetSize*/ + j*distance+1].rank;
          TRACE_ERR("(post bridge) agglist[%d] -> %d\n", confInfo->numBridgeRanks +i*numAggs+j, aggList[i*numAggs+j+confInfo->numBridgeRanks]);
        }
      }
    }

    /* NOTE(review): this copies numAggs*numBridgeRanks+numAggs entries,
     * while aggList holds aggTotal = numBridgeRanks*(numAggs+1); the two
     * agree only when numAggs == numBridgeRanks -- verify intent. */
    memcpy(tmp_ranklist, aggList, (numAggs*confInfo->numBridgeRanks+numAggs)*sizeof(int));
    for(i=0;i<aggTotal;i++)
    {
      TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
    }

    ADIOI_Free (bridgelist);
    ADIOI_Free (aggList);

    TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
    return aggTotal;
}
/*
* compute aggregators ranklist and put it into fd->hints struct
*/
/*
 * Compute the I/O-aggregator rank list and install it into fd->hints.
 *
 * Runs the serial Blue Gene aggregator-selection algorithm
 * (ADIOI_BG_compute_agg_ranklist_serial_do) over the gathered per-rank
 * placement data, clamps any rank that is not a member of this file's
 * communicator, and copies the result into fd->hints->ranklist /
 * fd->hints->cb_nodes.
 *
 * fd           - open ADIO file handle; hints are updated in place
 * confInfo     - partition configuration (pset sizes, aggregator ratio, ...)
 * all_procInfo - per-rank placement info gathered across the job
 * aggrsInPset  - out: number of aggregators chosen per pset
 */
static void
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
                                       const ADIOI_BG_ConfInfo_t *confInfo,
                                       ADIOI_BG_ProcInfo_t *all_procInfo,
                                       int *aggrsInPset )
{
    TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
    int i;
    int naggs;
    int size;
    int *tmp_ranklist;

    /* compute the ranklist of IO aggregators and put into tmp_ranklist */
    tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
#   if AGG_DEBUG
    for (i=0; i<confInfo->nProcs; i++) {
        DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].coreID, all_procInfo[i].rank );
    }
#   endif

    naggs=
    ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, aggrsInPset, tmp_ranklist);

#   define VERIFY 1
#   if VERIFY
    DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, tsize: %3d, ratio: %.4f; naggs = %d\n",
                confInfo->ioMinSize        ,
                confInfo->ioMaxSize        ,
                confInfo->nAggrs           ,
                confInfo->numBridgeRanks   ,
                confInfo->nProcs           ,
                confInfo->ioMaxSize /*virtualPsetSize*/ ,
                confInfo->cpuIDsize,
                confInfo->aggRatio         ,
                naggs );
#   endif
    MPI_Comm_size( fd->comm, &size );
    /* This fix is for when the bridgenode rnk is not part of the particular
     * subcomm associated with this MPI File operation. I don't know if
     * this is the best/right answer but it passes the test cases at least.
     * I don't know how common file IO in subcomms is anyway... */
    for(i=0;i<naggs;i++)
    {
        /* Valid ranks in fd->comm are 0..size-1, so rank == size is out of
         * range as well: use >= (the original `>` let rank==size through,
         * an off-by-one). */
        if(tmp_ranklist[i] >= size)
        {
            TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
                      i, tmp_ranklist[i], fd->comm);
            tmp_ranklist[i] = 0;
        }
    }

#   if AGG_DEBUG
    for (i=0; i<naggs; i++) {
        DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
    }
#   endif

    /* copy the ranklist of IO aggregators to fd->hints; release any list
     * installed by an earlier call before overwriting it */
    if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);

    fd->hints->cb_nodes = naggs;
    fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
    memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );

    /* */
    ADIOI_Free( tmp_ranklist );
    TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
    return;
}
/* Description from common/ad_aggregate.c. (Does it completely apply to bg?)
* ADIOI_Calc_aggregator()
*
@ -349,7 +94,7 @@ ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
* This is more general aggregator search function which does not base on the assumption
* that each aggregator hosts the file domain with the same size
*/
int ADIOI_BG_Calc_aggregator(ADIO_File fd,
int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
ADIO_Offset off,
ADIO_Offset min_off,
ADIO_Offset *len,
@ -359,9 +104,9 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
{
int rank_index, rank;
ADIO_Offset avail_bytes;
TRACE_ERR("Entering ADIOI_BG_Calc_aggregator\n");
TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");
ADIOI_BG_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
ADIOI_Assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
/* binary search --> rank_index is returned */
int ub = fd->hints->cb_nodes;
@ -401,7 +146,7 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
rank_index,fd->hints->cb_nodes,fd_size,off);
MPI_Abort(MPI_COMM_WORLD, 1);
}
/* DBG_FPRINTF ("ADIOI_BG_Calc_aggregator: rank_index = %d\n",
/* DBG_FPRINTF ("ADIOI_GPFS_Calc_aggregator: rank_index = %d\n",
rank_index ); */
/*
@ -422,7 +167,7 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
/* map our index to a rank */
/* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
rank = fd->hints->ranklist[rank_index];
TRACE_ERR("Leaving ADIOI_BG_Calc_aggregator\n");
TRACE_ERR("Leaving ADIOI_GPFS_Calc_aggregator\n");
return rank;
}
@ -441,7 +186,8 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
* It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
* (e.g. we could pass striping unit instead of using fs_ptr->blksize).
*/
void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
@ -453,20 +199,23 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
{
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
int i, aggr;
TRACE_ERR("Entering ADIOI_BG_GPFS_Calc_file_domains\n");
TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
blksize_t blksize;
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5004, 0, NULL);
#endif
# if AGG_DEBUG
static char myname[] = "ADIOI_BG_GPFS_Calc_file_domains";
static char myname[] = "ADIOI_GPFS_Calc_file_domains";
DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
myname,__LINE__,nprocs_for_coll);
# endif
__blksize_t blksize = 1048576; /* default to 1M */
if(fs_ptr && ((ADIOI_BG_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
blksize = ((ADIOI_BG_fs*)fs_ptr)->blksize;
if (fd->blksize <= 0)
/* default to 1M if blksize unset */
fd->blksize = 1048576;
blksize = fd->blksize;
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
# endif
@ -509,14 +258,144 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
fd_start = *fd_start_ptr;
fd_end = *fd_end_ptr;
/* each process will have a file domain of some number of gpfs blocks, but
* the division of blocks is not likely to be even. Some file domains will
* be "large" and others "small"
*
* Example: consider 17 blocks distributed over 3 aggregators.
* nb_cn_small = 17/3 = 5
* naggs_large = 17 - 3*(17/3) = 17 - 15 = 2
* naggs_small = 3 - 2 = 1
*
* and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
*
* what about (relatively) small files? say, a file of 1000 blocks
* distributed over 2064 aggregators:
* nb_cn_small = 1000/2064 = 0
* naggs_large = 1000 - 2064*(1000/2064) = 1000
* naggs_small = 2064 - 1000 = 1064
* and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
*
* it might be a good idea instead of having all the zeros up front, to
* "mix" those zeros into the fd_size array. that way, no pset/bridge-set
* is left with zero work. In fact, even if the small file domains aren't
* zero, it's probably still a good idea to mix the "small" file domains
* across the fd_size array to keep the io nodes in balance */
ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
ADIO_Offset naggs_small = naggs - naggs_large;
for (i=0; i<naggs; i++)
if (i < naggs_small) fd_size[i] = nb_cn_small * blksize;
else fd_size[i] = (nb_cn_small+1) * blksize;
#ifdef BGQPLATFORM
if (gpfsmpio_balancecontig == 1) {
/* File domains blocks are assigned to aggregators in a breadth-first
* fashion relative to the ions - additionally, file domains on the
* aggregators sharing the same bridgeset and ion have contiguous
* offsets. */
// initialize everything to small
for (i=0; i<naggs; i++)
fd_size[i] = nb_cn_small * blksize;
// go thru and distribute the large across the bridges
/* bridelistoffset: agg rank list offsets using the bridgelist - each
* entry is created by adding up the indexes for the aggs from all
* previous bridges */
int *bridgelistoffset =
(int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
* decremented to keep track of bridge assignments during the actual
* large block assignments to the agg rank list*/
int *tmpbridgelistnum =
(int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
int j;
for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
int k, bridgerankoffset = 0;
for (k=0;k<j;k++) {
bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
}
bridgelistoffset[j] = bridgerankoffset;
}
for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
int bridgeiter = 0;
/* distribute the large blocks across the aggs going breadth-first
* across the bridgelist - this distributes the fd sizes across the
* ions, so later in the file domain assignment when it iterates thru
* the ranklist the offsets will be contiguous within the bridge and
* ion as well */
for (j=0;j<naggs_large;j++) {
int foundbridge = 0;
int numbridgelistpasses = 0;
while (!foundbridge) {
if (tmpbridgelistnum[bridgeiter] > 0) {
foundbridge = 1;
/*
printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
*/
int currentbridgelistnum =
(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-
tmpbridgelistnum[bridgeiter]);
int currentfdsizeindex = bridgelistoffset[bridgeiter] +
currentbridgelistnum;
fd_size[currentfdsizeindex] = (nb_cn_small+1) * blksize;
tmpbridgelistnum[bridgeiter]--;
}
if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1)) {
/* guard against infinite loop - should only ever make 1 pass
* thru bridgelist */
ADIOI_Assert(numbridgelistpasses == 0);
numbridgelistpasses++;
bridgeiter = 0;
}
else
bridgeiter++;
}
}
ADIOI_Free(tmpbridgelistnum);
ADIOI_Free(bridgelistoffset);
} else {
/* BG/L- and BG/P-style distribution of file domains: simple allocation of
* file domins to each aggregator */
for (i=0; i<naggs; i++) {
if (i < naggs_large) {
fd_size[i] = (nb_cn_small+1) * blksize;
} else {
fd_size[i] = nb_cn_small * blksize;
}
}
}
#ifdef balancecontigtrace
int myrank;
MPI_Comm_rank(fd->comm,&myrank);
if (myrank == 0) {
fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
for (i=0; i<naggs; i++) {
fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
}
}
#endif
#else // not BGQ platform
for (i=0; i<naggs; i++) {
if (i < naggs_large) {
fd_size[i] = (nb_cn_small+1) * blksize;
} else {
fd_size[i] = nb_cn_small * blksize;
}
}
#endif
# if AGG_DEBUG
DBG_FPRINTF(stderr,"%s(%d): "
@ -561,30 +440,18 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
MPE_Log_event (5005, 0, NULL);
#endif
ADIOI_Free (fd_size);
TRACE_ERR("Leaving ADIOI_BG_GPFS_Calc_file_domains\n");
TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
}
/*
* When a process is an IO aggregator, this will return its index in the aggrs list.
* Otherwise, this will return -1
*/
int ADIOI_BG_Aggrs_index( ADIO_File fd, int myrank )
{
int i;
for (i=0; i<fd->hints->cb_nodes; i++)
if (fd->hints->ranklist[i] == myrank) return i;
return -1;
}
/*
* ADIOI_BG_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
* ADIOI_GPFS_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
* is specific for static file domain partitioning.
*
* ADIOI_Calc_my_req() - calculate what portions of the access requests
* of this process are located in the file domains of various processes
* (including this one)
*/
void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
void ADIOI_GPFS_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size,
@ -600,12 +467,11 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
int i, l, proc;
ADIO_Offset fd_len, rem_len, curr_idx, off;
ADIOI_Access *my_req;
TRACE_ERR("Entering ADIOI_BG_Calc_my_req\n");
TRACE_ERR("Entering ADIOI_GPFS_Calc_my_req\n");
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5024, 0, NULL);
#endif
*count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
count_my_req_per_proc = *count_my_req_per_proc_ptr;
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
@ -638,7 +504,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
* first part of the access.
*/
/* BES */
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
fd_start, fd_end);
count_my_req_per_proc[proc]++;
@ -651,7 +517,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
while (rem_len > 0) {
off += fd_len; /* point to first remaining byte */
fd_len = rem_len; /* save remaining size, pass to calc */
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len,
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
fd_size, fd_start, fd_end);
count_my_req_per_proc[proc]++;
@ -670,8 +536,8 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
if (count_my_req_per_proc[i]) {
my_req[i].offsets = (ADIO_Offset *)
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
my_req[i].lens = (int *)
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
my_req[i].lens =
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
count_my_req_procs++;
}
my_req[i].count = 0; /* will be incremented where needed
@ -687,7 +553,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
continue;
off = offset_list[i];
fd_len = len_list[i];
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
fd_start, fd_end);
/* for each separate contiguous access from this process */
@ -708,14 +574,13 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
* and the associated count.
*/
my_req[proc].offsets[l] = off;
ADIOI_Assert(fd_len == (int) fd_len);
my_req[proc].lens[l] = (int) fd_len;
my_req[proc].lens[l] = fd_len;
my_req[proc].count++;
while (rem_len > 0) {
off += fd_len;
fd_len = rem_len;
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len,
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
fd_size, fd_start, fd_end);
if (buf_idx[proc] == -1)
@ -729,8 +594,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
rem_len -= fd_len;
my_req[proc].offsets[l] = off;
ADIOI_Assert(fd_len == (int) fd_len);
my_req[proc].lens[l] = (int) fd_len;
my_req[proc].lens[l] = fd_len;
my_req[proc].count++;
}
}
@ -743,7 +607,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
my_req[i].count);
for (l=0; l < my_req[i].count; l++) {
DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %d\n", l,
DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %lld\n", l,
my_req[i].offsets[l], l, my_req[i].lens[l]);
}
}
@ -756,7 +620,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5025, 0, NULL);
#endif
TRACE_ERR("Leaving ADIOI_BG_Calc_my_req\n");
TRACE_ERR("Leaving ADIOI_GPFS_Calc_my_req\n");
}
/*
@ -776,14 +640,14 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
* param[out] others_req_ptr Array of other process' requests that lie
* in my process's file domain
*/
void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs,
void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
int *count_my_req_per_proc,
ADIOI_Access *my_req,
int nprocs, int myrank,
int *count_others_req_procs_ptr,
ADIOI_Access **others_req_ptr)
{
TRACE_ERR("Entering ADIOI_BG_Calc_others_req\n");
TRACE_ERR("Entering ADIOI_GPFS_Calc_others_req\n");
/* determine what requests of other processes lie in this process's
file domain */
@ -820,7 +684,7 @@ void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs,
*/
count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
/* cora2a1=timebase(); */
for(i=0;i<nprocs;i++)
/*for(i=0;i<nprocs;i++) ?*/
MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
count_others_req_per_proc, 1, MPI_INT, fd->comm);
@ -852,8 +716,8 @@ for(i=0;i<nprocs;i++)
others_req[i].offsets = (ADIO_Offset *)
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
others_req[i].lens = (int *)
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int));
others_req[i].lens =
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
recvBufForOffsets = others_req[i].offsets;
@ -903,7 +767,6 @@ for(i=0;i<nprocs;i++)
if ( sendBufForLens == (void*)0xFFFFFFFFFFFFFFFF) sendBufForLens = NULL;
/* Calculate the displacements from the sendBufForOffsets/Lens */
MPI_Barrier(fd->comm);
for (i=0; i<nprocs; i++)
{
/* Send these offsets to process i.*/
@ -911,7 +774,7 @@ for(i=0;i<nprocs;i++)
if ( scounts[i] == 0 )
sdispls[i] = 0;
else
sdispls[i] = (int)
sdispls[i] = (int)
( ( (MPIR_Upint)my_req[i].offsets -
(MPIR_Upint)sendBufForOffsets ) /
(MPIR_Upint)sizeof(ADIO_Offset) );
@ -948,7 +811,7 @@ for(i=0;i<nprocs;i++)
sdispls[i] = (int)
( ( (MPIR_Upint)my_req[i].lens -
(MPIR_Upint)sendBufForLens ) /
(MPIR_Upint) sizeof(int) );
(MPIR_Upint) sizeof(ADIO_Offset) );
/* Receive these offsets from process i. */
rcounts[i] = count_others_req_per_proc[i];
@ -958,14 +821,14 @@ for(i=0;i<nprocs;i++)
rdispls[i] = (int)
( ( (MPIR_Upint)others_req[i].lens -
(MPIR_Upint)recvBufForLens ) /
(MPIR_Upint) sizeof(int) );
(MPIR_Upint) sizeof(ADIO_Offset) );
}
/* Exchange the lengths */
MPI_Alltoallv(sendBufForLens,
scounts, sdispls, MPI_INT,
scounts, sdispls, ADIO_OFFSET,
recvBufForLens,
rcounts, rdispls, MPI_INT,
rcounts, rdispls, ADIO_OFFSET,
fd->comm);
/* Clean up */
@ -979,5 +842,5 @@ for(i=0;i<nprocs;i++)
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5027, 0, NULL);
#endif
TRACE_ERR("Leaving ADIOI_BG_Calc_others_req\n");
TRACE_ERR("Leaving ADIOI_GPFS_Calc_others_req\n");
}

Просмотреть файл

@ -1,104 +1,86 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_aggrs.h
* \brief ???
*/
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_aggrs.h
* \brief ???
*/
/*
* File: ad_gpfs_aggrs.h
*
* Declares functions optimized specifically for GPFS parallel I/O solution.
*
*/
#ifndef AD_GPFS_AGGRS_H_
#define AD_GPFS_AGGRS_H_
#include "adio.h"
#include <sys/stat.h>
/*
* File: ad_bg_aggrs.h
*
* Declares functions specific for BG/L - GPFS parallel I/O solution. The implemented optimizations are:
* . Aligned file-domain partitioning, integrated in 7/28/2005
*
* In addition, following optimizations are planned:
* . Integrating multiple file-domain partitioning schemes
* (corresponding to Alok Chouhdary's persistent file domain work).
*/
#ifndef AD_BG_AGGRS_H_
#define AD_BG_AGGRS_H_
#include "adio.h"
#include <sys/stat.h>
#if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#ifdef HAVE_GPFS_H
#include <gpfs.h>
#endif
/* File system (BG) specific information -
hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
typedef struct ADIOI_BG_fs_s {
__blksize_t blksize;
int fsync_aggr; /* "fsync aggregation" flags (below) */
#define ADIOI_BG_FSYNC_AGGREGATION_DISABLED 0x00
#define ADIOI_BG_FSYNC_AGGREGATION_ENABLED 0x01
#define ADIOI_BG_FSYNC_AGGREGATOR 0x10 /* This rank is an aggregator */
} ADIOI_BG_fs;
/* generate a list of I/O aggregators that utilizes BG-PSET orginization. */
int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
/* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr);
/* a utilitiy function for debugging */
int ADIOI_BG_Aggrs_index(ADIO_File fd, int myrank );
/* overriding ADIOI_Calc_aggregator() for the default implementation is specific for
static file domain partitioning */
int ADIOI_BG_Calc_aggregator(ADIO_File fd,
ADIO_Offset off,
ADIO_Offset min_off,
ADIO_Offset *len,
ADIO_Offset fd_size,
ADIO_Offset *fd_start,
ADIO_Offset *fd_end);
/* overriding ADIOI_Calc_my_req for the default implementation is specific for
static file domain partitioning */
void ADIOI_BG_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size,
int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int **buf_idx_ptr);
/*
* ADIOI_Calc_others_req
*
* param[in] count_my_req_procs Number of processes whose file domain my
* request touches.
* param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
* contig. requests of this process in
* process i's file domain.
* param[in] my_req A structure defining my request
* param[in] nprocs Number of nodes in the block
* param[in] myrank Rank of this node
* param[out] count_others_req_proc_ptr Number of processes whose requests lie in
* my process's file domain (including my
* process itself)
* param[out] others_req_ptr Array of other process' requests that lie
* in my process's file domain
*/
void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs,
int *count_my_req_per_proc,
ADIOI_Access *my_req,
int nprocs, int myrank,
int *count_others_req_procs_ptr,
ADIOI_Access **others_req_ptr);
#endif /* AD_BG_AGGRS_H_ */
/* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
ADIO_Offset *st_offsets,
ADIO_Offset *end_offsets,
int nprocs,
int nprocs_for_coll,
ADIO_Offset *min_st_offset_ptr,
ADIO_Offset **fd_start_ptr,
ADIO_Offset **fd_end_ptr,
ADIO_Offset *fd_size_ptr,
void *fs_ptr);
/* overriding ADIOI_Calc_aggregator() for the default implementation is specific for
static file domain partitioning */
int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
ADIO_Offset off,
ADIO_Offset min_off,
ADIO_Offset *len,
ADIO_Offset fd_size,
ADIO_Offset *fd_start,
ADIO_Offset *fd_end);
/* overriding ADIOI_Calc_my_req for the default implementation is specific for
static file domain partitioning */
void ADIOI_GPFS_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
int contig_access_count, ADIO_Offset
min_st_offset, ADIO_Offset *fd_start,
ADIO_Offset *fd_end, ADIO_Offset fd_size,
int nprocs,
int *count_my_req_procs_ptr,
int **count_my_req_per_proc_ptr,
ADIOI_Access **my_req_ptr,
int **buf_idx_ptr);
/*
* ADIOI_Calc_others_req
*
* param[in] count_my_req_procs Number of processes whose file domain my
* request touches.
* param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
* contig. requests of this process in
* process i's file domain.
* param[in] my_req A structure defining my request
* param[in] nprocs Number of nodes in the block
* param[in] myrank Rank of this node
* param[out] count_others_req_proc_ptr Number of processes whose requests lie in
* my process's file domain (including my
* process itself)
* param[out] others_req_ptr Array of other process' requests that lie
* in my process's file domain
*/
void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
int *count_my_req_per_proc,
ADIOI_Access *my_req,
int nprocs, int myrank,
int *count_others_req_procs_ptr,
ADIOI_Access **others_req_ptr);
#endif /* AD_GPFS_AGGRS_H_ */

Просмотреть файл

@ -2,7 +2,7 @@
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_close.c
* \file ad_gpfs_close.c
* \brief ???
*/
@ -12,18 +12,22 @@
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_bgl.h"
#include "ad_bgl_aggrs.h"
#include "ad_gpfs.h"
#include "ad_gpfs_tuning.h"
#include <unistd.h>
void ADIOI_BGL_Close(ADIO_File fd, int *error_code)
void ADIOI_GPFS_Close(ADIO_File fd, int *error_code)
{
int err, derr=0;
static char myname[] = "ADIOI_BGL_CLOSE";
static char myname[] = "ADIOI_GPFS_CLOSE";
#ifdef PROFILE
MPE_Log_event(9, 0, "start close");
#endif
if (fd->null_fd >= 0)
close(fd->null_fd);
err = close(fd->fd_sys);
if (fd->fd_direct >= 0)
{

Просмотреть файл

@ -0,0 +1,68 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_flush.c
* \brief Scalable flush for GPFS
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gpfs.h"
/*
 * ADIOI_GPFS_Flush - scalable collective flush for GPFS.
 *
 * Exactly one process (the first I/O aggregator, fd->hints->ranklist[0])
 * calls fsync() after a barrier guarantees all writers are done; the
 * resulting errno (or 0 on success) is broadcast so every rank reports the
 * same outcome through *error_code.
 *
 * fd         - open ADIO file handle
 * error_code - out: MPI_SUCCESS, or an "**io" error built from errno
 */
void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code)
{
    int err=0;
    static char myname[] = "ADIOI_GPFS_FLUSH";

    int rank;

    MPI_Comm_rank(fd->comm, &rank);

    /* the old logic about who is an fsync aggregator and who is not fell down
     * when deferred open was enabled. Instead, make this look more like
     * ad_pvfs2_flush. If one day the I/O aggregators have something they need
     * to flush, we can consult the 'fd->hints->ranklist[]' array. For now, a
     * flush from one process should suffice */

    /* ensure all other proceses are done writing. On many platforms MPI_Reduce
     * is fastest because it has the lightest constraints. On Blue Gene, BARRIER
     * is optimized  */
    MPI_Barrier(fd->comm);

    if (rank == fd->hints->ranklist[0]) {
        err = fsync(fd->fd_sys);
        DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
        /* We want errno, not the return code if it failed */
        if (err == -1) err = errno;
        else err = 0;
    }

    /* 'err' is a signed C int (0 or an errno value); broadcast it with
     * MPI_INT -- the previous MPI_UNSIGNED was a datatype mismatch for a
     * signed int buffer (MPI_UNSIGNED corresponds to C 'unsigned int'). */
    MPI_Bcast(&err, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
    DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);

    if (err) /* if it's non-zero, it must be an errno */
    {
        errno = err;
        err = -1;
    }

    /* --BEGIN ERROR HANDLING-- */
    if (err == -1)
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                myname, __LINE__, MPI_ERR_IO,
                "**io",
                "**io %s", strerror(errno));
        DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,288 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_hints.c
* \brief GPFS hint processing - for now, only used for BlueGene and PE platforms
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "adio.h"
#include "adio_extern.h"
#include "hint_fns.h"
#include "ad_gpfs.h"
#define ADIOI_GPFS_CB_BUFFER_SIZE_DFLT "16777216"
#define ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT "4194304"
#define ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT "4194304"
#ifdef BGQPLATFORM
#define ADIOI_BG_NAGG_IN_PSET_HINT_NAME "bg_nodes_pset"
#endif
/** \page mpiio_vars MPIIO Configuration
*
* GPFS MPIIO configuration and performance tuning. Used by ad_gpfs ADIO.
*
* Used for BlueGene and PE platforms, which each have their own aggregator selection
* algorithms that ignore user provided cb_config_list.
*
* \section hint_sec Hints
* - bg_nodes_pset - BlueGene only - specify how many aggregators to use per pset.
* This hint will override the cb_nodes hint based on BlueGene psets.
* - N - Use N nodes per pset as aggregators.
* - Default is based on partition configuration and cb_nodes.
*
* The following default key/value pairs may differ from other platform defaults.
*
* - key = cb_buffer_size value = 16777216
* - key = romio_cb_read value = enable
* - key = romio_cb_write value = enable
* - key = ind_rd_buffer_size value = 4194304
* - key = ind_wr_buffer_size value = 4194304
*/
#ifdef BGQPLATFORM
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
extern int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
#elif PEPLATFORM
extern int
ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
#endif
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
Initialize fd->info to default values.
Initialize fd->hints to default values.
Examine the info object passed by the user. If it contains values that
ROMIO understands, override the default. */
MPI_Info info;
char *value;
int flag, intval, nprocs=0, nprocs_is_valid = 0;
static char myname[] = "ADIOI_GPFS_SETINFO";
int did_anything = 0;
if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
info = fd->info;
/* Note that fd->hints is allocated at file open time; thus it is
* not necessary to allocate it, or check for allocation, here.
*/
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
ADIOI_Assert ((value != NULL));
/* initialize info and hints to default values if they haven't been
* previously initialized
*/
if (!fd->hints->initialized) {
ad_gpfs_get_env_vars();
did_anything = 1;
/* buffer size for collective I/O */
ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
/* default is to let romio automatically decide when to use
* collective buffering
*/
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = ADIOI_HINT_ENABLE;
ADIOI_Info_set(info, "romio_cb_write", "enable");
fd->hints->cb_write = ADIOI_HINT_ENABLE;
if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
fd->hints->cb_config_list = NULL;
/* number of processes that perform I/O in collective I/O */
MPI_Comm_size(fd->comm, &nprocs);
nprocs_is_valid = 1;
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
ADIOI_Info_set(info, "cb_nodes", value);
fd->hints->cb_nodes = -1;
/* hint indicating that no indep. I/O will be performed on this file */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
/* gpfs is not implementing file realms (ADIOI_IOStridedColl),
initialize to disabled it. */
/* hint instructing the use of persistent file realms */
ADIOI_Info_set(info, "romio_cb_pfr", "disable");
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
/* hint guiding the assignment of persistent file realms */
ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
fd->hints->cb_fr_type = ADIOI_FR_AAR;
/* hint to align file realms with a certain byte value */
ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
fd->hints->cb_fr_alignment = 1;
/* hint to set a threshold percentage for a datatype's size/extent at
* which data sieving should be done in collective I/O */
ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
fd->hints->cb_ds_threshold = 0;
/* hint to switch between point-to-point or all-to-all for two-phase */
ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
/* deferred_open derived from no_indep_rw and cb_{read,write} */
fd->hints->deferred_open = 0;
/* buffer size for data sieving in independent reads */
ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
/* buffer size for data sieving in independent writes */
ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
ADIOI_Info_set(info, "romio_ds_read", "automatic");
fd->hints->ds_read = ADIOI_HINT_AUTO;
ADIOI_Info_set(info, "romio_ds_write", "automatic");
fd->hints->ds_write = ADIOI_HINT_AUTO;
/* still to do: tune this a bit for a variety of file systems. there's
* no good default value so just leave it unset */
fd->hints->min_fdomain_size = 0;
fd->hints->striping_unit = 0;
fd->hints->initialized = 1;
}
/* add in user's info if supplied */
if (users_info != MPI_INFO_NULL) {
ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
&(fd->hints->cb_buffer_size), myname, error_code);
/* new hints for enabling/disabling coll. buffering on
* reads/writes
*/
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
&(fd->hints->cb_read), myname, error_code);
if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
/* romio_cb_read overrides no_indep_rw */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
&(fd->hints->cb_write), myname, error_code);
if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
/* romio_cb_write overrides no_indep_rw */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
}
/* Has the user indicated all I/O will be done collectively? */
ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
&(fd->hints->no_indep_rw), myname, error_code);
if (fd->hints->no_indep_rw == 1) {
/* if 'no_indep_rw' set, also hint that we will do
* collective buffering: if we aren't doing independent io,
* then we have to do collective */
ADIOI_Info_set(info, "romio_cb_write", "enable");
ADIOI_Info_set(info, "romio_cb_read", "enable");
fd->hints->cb_read = 1;
fd->hints->cb_write = 1;
}
/* new hints for enabling/disabling data sieving on
* reads/writes
*/
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
&(fd->hints->ds_read), myname, error_code);
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
&(fd->hints->ds_write), myname, error_code);
ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
&(fd->hints->ind_wr_buffer_size), myname, error_code);
ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
&(fd->hints->ind_rd_buffer_size), myname, error_code);
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
value, &flag);
if ( flag && ((intval = atoi(value)) > 0) ) {
ADIOI_Info_set(info, "romio_min_fdomain_size", value);
fd->hints->min_fdomain_size = intval;
}
/* Now we use striping unit in common code so we should
process hints for it. */
ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
&(fd->hints->striping_unit), myname, error_code);
#ifdef BGQPLATFORM
memset( value, 0, MPI_MAX_INFO_VAL+1 );
ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
value, &flag);
if (flag && ((intval = atoi(value)) > 0)) {
did_anything = 1;
ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
fd->hints->cb_nodes = intval;
}
#endif
}
/* special CB aggregator assignment */
if (did_anything) {
#ifdef BGQPLATFORM
ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
ADIOI_PE_gen_agg_ranklist(fd);
#endif
}
/* deferred_open won't be set by callers, but if the user doesn't
 * explicitly disable collective buffering (two-phase) and does hint that
 * io w/o independent io is going on, we'll set this internal hint as a
 * convenience */
if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE) \
&& (fd->hints->cb_write != ADIOI_HINT_DISABLE)\
&& fd->hints->no_indep_rw ) ) {
fd->hints->deferred_open = 1;
} else {
/* setting romio_no_indep_rw enable and romio_cb_{read,write}
* disable at the same time doesn't make sense. honor
* romio_cb_{read,write} and force the no_indep_rw hint to
* 'disable' */
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
fd->hints->no_indep_rw = 0;
fd->hints->deferred_open = 0;
}
/* BobC commented this out, but since hint processing runs on both bg and
* bglockless, we need to keep DS writes enabled on gpfs and disabled on
* PVFS */
if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
/* disable data sieving for fs that do not
support file locking */
ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
/* get rid of this value if it is set */
ADIOI_Info_delete(info, "ind_wr_buffer_size");
}
/* note: leave ind_wr_buffer_size alone; used for other cases
* as well. -- Rob Ross, 04/22/2003
*/
ADIOI_Info_set(info, "romio_ds_write", "disable");
fd->hints->ds_write = ADIOI_HINT_DISABLE;
}
ADIOI_Free(value);
*error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,156 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_open.c
* \brief ???
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
#include "ad_gpfs.h"
#include "ad_gpfs_tuning.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_GPFS_H
#include <gpfs.h>
#endif
#ifdef HAVE_GPFS_FCNTL_H
#include <gpfs_fcntl.h>
#endif
#ifdef HAVE_GPFS_FCNTL_H
/* Ask GPFS to relinquish every byte-range token this process holds on the
 * open descriptor 'fd'.  A free-range request with start == 0 and
 * length == 0 is the "release everything" form of gpfs_fcntl(); failure is
 * only logged, never fatal. */
static void gpfs_free_all_locks(int fd)
{
    struct {
        gpfsFcntlHeader_t header;
        gpfsFreeRange_t release;
    } req;
    int ret;

    req.header.totalLength  = sizeof(req);
    req.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
    req.header.fcntlReserved = 0;

    req.release.structLen  = sizeof(req.release);
    req.release.structType = GPFS_FREE_RANGE;
    req.release.start      = 0;
    req.release.length     = 0;

    ret = gpfs_fcntl(fd, &req);
    if (ret != 0) {
        DBGV_FPRINTF(stderr,"GPFS fcntl release failed with rc=%d, errno=%d\n",
                ret,errno);
    }
}
#endif
/* ADIOI_GPFS_Open: open fd->filename with POSIX open(2), translating the
 * ADIO access-mode bits into O_* flags, then (on the designated rank) stat
 * the file to learn the real file system block size.
 *
 * fd         - ADIO file handle; reads fd->perm, fd->access_mode,
 *              fd->filename, fd->hints->ranklist[0]; writes fd->fd_sys,
 *              fd->fd_direct, fd->null_fd, fd->blksize, fd->fp_ind,
 *              fd->fp_sys_posn
 * error_code - set to MPI_SUCCESS or an ADIOI error code on open failure
 */
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, rank, rc;
    static char myname[] = "ADIOI_GPFS_OPEN";

    /* set internal variables for tuning environment variables */
    ad_gpfs_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* read the process umask without changing it: set a dummy value,
         * then immediately restore what was there before */
        old_mask = umask(022);
        umask(old_mask);
        /* Fix: was (old_mask ^ 0666).  XOR sets spurious permission bits
         * whenever the umask contains bits outside 0666 — e.g. umask 077
         * yielded 0611 (granting execute) instead of 0600.  AND with the
         * complement clears exactly the bits the umask forbids. */
        perm = ~old_mask & 0666;
    }
    else perm = fd->perm;

    /* map ADIO access-mode flags onto POSIX open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;

#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,errno);
    fd->fd_direct = -1;

    /* GPFSMPIO_DEVNULLIO=1 redirects the real i/o to /dev/null so tuning
     * experiments can take the file system out of the picture */
    if (gpfsmpio_devnullio == 1) {
        fd->null_fd = open("/dev/null", O_RDWR);
    } else {
        fd->null_fd = -1;
    }

    /* append mode: position both the individual and system file pointers
     * at the current end of file */
    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {
        fd->blksize = 1048576; /* default to 1M */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        /* in this fs-specific routine, we might not be called over entire
         * communicator (deferred open). Collect statistics on one process.
         * ADIOI_GEN_Opencoll (common-code caller) will take care of the
         * broadcast */

        MPI_Comm_rank(fd->comm, &rank);
        if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
            struct stat64 gpfs_statbuf;
            /* Get the (real) underlying file system block size */
            rc = stat64(fd->filename, &gpfs_statbuf);
            if (rc >= 0)
            {
                fd->blksize = gpfs_statbuf.st_blksize;
                DBGV_FPRINTF(stderr,"Successful stat '%s'. Blocksize=%ld\n",
                        fd->filename,gpfs_statbuf.st_blksize);
            }
            else
            {
                DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                        fd->filename,rc,errno);
            }
        }
        /* all other ranks have incorrect fd->blocksize, but ADIOI_GEN_Opencoll
         * will take care of that in both standard and deferred-open case */
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif

#ifdef HAVE_GPFS_FCNTL_H
        /* in parallel workload, might be helpful to immediately release block
         * tokens. Or, system call overhead will outweigh any benefits... */
        if (getenv("ROMIO_GPFS_FREE_LOCKS")!=NULL)
            gpfs_free_all_locks(fd->fd_sys);
#endif
    }

    if (fd->fd_sys == -1) {
        *error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
    }
    else *error_code = MPI_SUCCESS;
}
/*
*vim: ts=8 sts=4 sw=4 noexpandtab
*/

Просмотреть файл

@ -2,7 +2,7 @@
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_rdcoll.c
* \file ad_gpfs_rdcoll.c
* \brief ???
*/
@ -15,9 +15,8 @@
#include "adio.h"
#include "adio_extern.h"
#include "ad_bgl.h"
#include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h"
#include "ad_gpfs.h"
#include "ad_gpfs_aggrs.h"
#ifdef PROFILE
#include "mpe.h"
@ -87,7 +86,9 @@ extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
ADIO_Offset *end_offset_ptr, int
*contig_access_count_ptr);
void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code)
@ -112,20 +113,19 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
*fd_end = NULL, *end_offsets = NULL;
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
int ii;
ADIO_Offset *len_list = NULL;
int *buf_idx = NULL;
#if BGL_PROFILE
BGLMPIO_T_CIO_RESET( 0, r )
#endif
GPFSMPIO_T_CIO_RESET( r);
#ifdef HAVE_STATUS_SET_BYTES
int bufsize, size;
MPI_Count bufsize, size;
#endif
#if 0
/* From common code - not implemented for bgl. */
/* From common code - not implemented for bg. */
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype,
file_ptr_type, offset, status, error_code);
@ -143,9 +143,8 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
nprocs_for_coll = fd->hints->cb_nodes;
orig_fp = fd->fp_ind;
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 0, BGLMPIO_CIO_LCOMP, BGLMPIO_CIO_LAST )
#endif
GPFSMPIO_T_CIO_SET_GET( r, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST)
GPFSMPIO_T_CIO_SET_GET( r, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST )
/* only check for interleaving if cb_read isn't disabled */
if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
@ -157,11 +156,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
&offset_list, &len_list, &start_offset,
&end_offset, &contig_access_count);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
#endif
&end_offset, &contig_access_count);
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP )
#ifdef RDCOLL_DEBUG
for (i=0; i<contig_access_count; i++) {
@ -177,24 +174,24 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
if (bglmpio_tunegather) {
bgl_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
bgl_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
if (gpfsmpio_tunegather) {
gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
gpfs_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
for (ii=0; ii<nprocs; ii++) {
bgl_offsets0[ii*2] = 0;
bgl_offsets0[ii*2+1] = 0;
gpfs_offsets0[ii*2] = 0;
gpfs_offsets0[ii*2+1] = 0;
}
bgl_offsets0[myrank*2] = start_offset;
bgl_offsets0[myrank*2+1] = end_offset;
gpfs_offsets0[myrank*2] = start_offset;
gpfs_offsets0[myrank*2+1] = end_offset;
MPI_Allreduce( bgl_offsets0, bgl_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
MPI_Allreduce( gpfs_offsets0, gpfs_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
for (ii=0; ii<nprocs; ii++) {
st_offsets [ii] = bgl_offsets[ii*2] ;
end_offsets[ii] = bgl_offsets[ii*2+1];
st_offsets [ii] = gpfs_offsets[ii*2] ;
end_offsets[ii] = gpfs_offsets[ii*2+1];
}
ADIOI_Free( bgl_offsets0 );
ADIOI_Free( bgl_offsets );
ADIOI_Free( gpfs_offsets0 );
ADIOI_Free( gpfs_offsets );
} else {
MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
ADIO_OFFSET, fd->comm);
@ -202,9 +199,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
ADIO_OFFSET, fd->comm);
}
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_PATANA, BGLMPIO_CIO_GATHER )
#endif
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER )
/* are the accesses of different processes interleaved? */
for (i=1; i<nprocs; i++)
@ -246,9 +241,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
return;
}
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_FD_PART, BGLMPIO_CIO_PATANA )
#endif
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA )
/* We're going to perform aggregation of I/O. Here we call
* ADIOI_Calc_file_domains() to determine what processes will handle I/O
@ -266,8 +259,8 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
* needs to be mapped to an actual rank in the communicator later.
*
*/
if (bglmpio_tuneblocking)
ADIOI_BGL_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
if (gpfsmpio_tuneblocking)
ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
nprocs_for_coll, &min_st_offset,
&fd_start, &fd_end, &fd_size, fd->fs_ptr);
else
@ -277,9 +270,39 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
fd->hints->min_fdomain_size, &fd_size,
fd->hints->striping_unit);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
#endif
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
if (gpfsmpio_p2pcontig==1) {
/* For some simple yet common(?) workloads, full-on two-phase I/O is
* overkill. We can establish sub-groups of processes and their
* aggregator, and then these sub-groups will carry out a simplified
* two-phase over that sub-group.
*
* First verify that the filetype is contig and the offsets are
* increasing in rank order*/
int x, inOrderAndNoGaps = 1;
for (x=0;x<(nprocs-1);x++) {
if (end_offsets[x] != (st_offsets[x+1]-1))
inOrderAndNoGaps = 0;
}
if (inOrderAndNoGaps && buftype_is_contig) {
/* if these conditions exist then execute the P2PContig code else
* execute the original code */
ADIOI_P2PContigReadAggregation(fd, buf,
error_code, st_offsets, end_offsets, fd_start, fd_end);
/* NOTE: we are skipping the rest of two-phase in this path */
GPFSMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
ADIOI_Free(offset_list);
ADIOI_Free(len_list);
ADIOI_Free(st_offsets);
ADIOI_Free(end_offsets);
ADIOI_Free(fd_start);
ADIOI_Free(fd_end);
goto fn_exit;
}
}
/* calculate where the portions of the access requests of this process
* are located in terms of the file domains. this could be on the same
@ -293,8 +316,8 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
* buf_idx[] - array of locations into which data can be directly moved;
* this is only valid for contiguous buffer case
*/
if (bglmpio_tuneblocking)
ADIOI_BGL_Calc_my_req(fd, offset_list, len_list, contig_access_count,
if (gpfsmpio_tuneblocking)
ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
min_st_offset, fd_start, fd_end, fd_size,
nprocs, &count_my_req_procs,
&count_my_req_per_proc, &my_req,
@ -306,9 +329,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
&count_my_req_per_proc, &my_req,
&buf_idx);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_OTHREQ, BGLMPIO_CIO_MYREQ )
#endif
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ )
/* perform a collective communication in order to distribute the
* data calculated above. fills in the following:
@ -317,11 +338,11 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
* count_others_req_per_proc[] - number of separate contiguous
* requests from proc i lie in this process's file domain.
*/
if (bglmpio_tuneblocking)
ADIOI_BGL_Calc_others_req(fd, count_my_req_procs,
count_my_req_per_proc, my_req,
nprocs, myrank, &count_others_req_procs,
&others_req);
if (gpfsmpio_tuneblocking)
ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
count_my_req_per_proc, my_req,
nprocs, myrank, &count_others_req_procs,
&others_req);
else
ADIOI_Calc_others_req(fd, count_my_req_procs,
@ -329,9 +350,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
nprocs, myrank, &count_others_req_procs,
&others_req);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_DEXCH, BGLMPIO_CIO_OTHREQ )
#endif
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ )
/* my_req[] and count_my_req_per_proc aren't needed at this point, so
* let's free the memory
@ -354,12 +373,10 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
len_list, contig_access_count, min_st_offset,
fd_size, fd_start, fd_end, buf_idx, error_code);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_DEXCH )
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_MPIO_CRW )
GPFSMPIO_T_CIO_SET_GET( r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH )
GPFSMPIO_T_CIO_SET_GET( r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW )
BGLMPIO_T_CIO_REPORT( 0, r, fd, myrank )
#endif
GPFSMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
@ -381,8 +398,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
ADIOI_Free(fd_start);
ADIOI_Free(fd_end);
fn_exit:
#ifdef HAVE_STATUS_SET_BYTES
MPI_Type_size(datatype, &size);
MPI_Type_size_x(datatype, &size);
bufsize = size * count;
MPIR_Status_set_bytes(status, datatype, bufsize);
/* This is a temporary way of filling in status. The right way is to
@ -470,7 +488,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);
if (ntimes) read_buf = (char *) ADIOI_Malloc(coll_bufsize);
read_buf = fd->io_buf;
curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
/* its use is explained below. calloc initializes to 0. */
@ -627,6 +645,9 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
MPE_Log_event(14, 0, "end computation");
#endif
if (flag) {
char round[50];
sprintf(round, "two-phase-round=%d", m);
setenv("LIBIOLOG_EXTRA_INFO", round, 1);
ADIOI_Assert(size == (int)size);
ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
ADIO_EXPLICIT_OFFSET, off, &status, error_code);
@ -644,17 +665,17 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
#ifdef PROFILE
MPE_Log_event(7, 0, "start communication");
#endif
if (bglmpio_comm == 1)
if (gpfsmpio_comm == 1)
ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
send_size, recv_size, count,
start_pos, partial_send, recd_from_proc, nprocs,
myrank,
buftype_is_contig, contig_access_count,
min_st_offset, fd_size, fd_start, fd_end,
others_req,
m, buftype_extent, buf_idx);
else
if (bglmpio_comm == 0) {
others_req,
m, buftype_extent, buf_idx);
else
if (gpfsmpio_comm == 0) {
ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
send_size, recv_size, count,
start_pos, partial_send, recd_from_proc, nprocs,
@ -675,9 +696,10 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
ADIOI_Free(read_buf);
read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
memcpy(read_buf, tmp_buf, for_next_iter);
ADIOI_Free(fd->io_buf);
fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
memcpy(fd->io_buf, tmp_buf, for_next_iter);
read_buf = fd->io_buf;
ADIOI_Free(tmp_buf);
}
@ -692,7 +714,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
for (m=ntimes; m<max_ntimes; m++)
/* nothing to send, but check for recv. */
if (bglmpio_comm == 1)
if (gpfsmpio_comm == 1)
ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
send_size, recv_size, count,
start_pos, partial_send, recd_from_proc, nprocs,
@ -702,7 +724,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
others_req, m,
buftype_extent, buf_idx);
else /* strncmp( env_switch, "alltoall", 8 ) == 0 */
if (bglmpio_comm == 0)
if (gpfsmpio_comm == 0)
ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
send_size, recv_size, count,
start_pos, partial_send, recd_from_proc, nprocs,
@ -716,7 +738,6 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
MPE_Log_event(8, 0, "end communication");
#endif
if (ntimes) ADIOI_Free(read_buf);
ADIOI_Free(curr_offlen_ptr);
ADIOI_Free(count);
ADIOI_Free(partial_send);
@ -724,6 +745,8 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
ADIOI_Free(recv_size);
ADIOI_Free(recd_from_proc);
ADIOI_Free(start_pos);
unsetenv("LIBIOLOG_EXTRA_INFO");
}
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
@ -807,8 +830,8 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
tmp = others_req[i].lens[k];
others_req[i].lens[k] = partial_send[i];
}
MPI_Type_hindexed(count[i],
&(others_req[i].lens[start_pos[i]]),
ADIOI_Type_create_hindexed_x(count[i],
&(others_req[i].lens[start_pos[i]]),
&(others_req[i].mem_ptrs[start_pos[i]]),
MPI_BYTE, &send_type);
/* absolute displacement; use MPI_BOTTOM in send */
@ -968,7 +991,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
* longer than the single region that processor "p" is responsible
* for.
*/
p = ADIOI_BGL_Calc_aggregator(fd,
p = ADIOI_GPFS_Calc_aggregator(fd,
off,
min_st_offset,
&len,
@ -1101,7 +1124,8 @@ static void ADIOI_R_Exchange_data_alltoallv(
DBG_FPRINTF(stderr, "\ttails = %4d, %4d\n", stail, rtail );
if (nprocs_send) {
DBG_FPRINTF(stderr, "\tall_send_buf = [%d]%2d,",0,all_send_buf[0]);
for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf [i*131072] ); }
/* someone at some point found it useful to look at the 128th kilobyte of data from each processor, but this segfaults in many situations if "all debugging" enabled */
//for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf [i*131072] ); }
}
#endif

Просмотреть файл

@ -0,0 +1,277 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_tuning.c
* \brief Defines ad_gpfs performance tuning
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 2008 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*---------------------------------------------------------------------
* ad_gpfs_tuning.c
*
* defines global variables and functions for performance tuning and
* functional debugging.
*---------------------------------------------------------------------*/
#include "ad_gpfs_tuning.h"
#include "mpi.h"
#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif
int gpfsmpio_timing;
int gpfsmpio_timing2;
int gpfsmpio_timing_cw_level;
int gpfsmpio_comm;
int gpfsmpio_tunegather;
int gpfsmpio_tuneblocking;
long bglocklessmpio_f_type;
int gpfsmpio_bg_nagg_pset;
int gpfsmpio_pthreadio;
int gpfsmpio_p2pcontig;
int gpfsmpio_balancecontig;
int gpfsmpio_devnullio;
int gpfsmpio_bridgeringagg;
double gpfsmpio_prof_cw [GPFSMPIO_CIO_LAST+1];
double gpfsmpio_prof_cr [GPFSMPIO_CIO_LAST+1];
/* set internal variables for tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
\section env_sec Environment Variables
* - GPFSMPIO_COMM - Define how data is exchanged on collective
* reads and writes. Possible values:
* - 0 - Use MPI_Alltoallv.
* - 1 - Use MPI_Isend/MPI_Irecv.
* - Default is 0.
*
* - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
* Possible values:
* - 0 - Do not collect/report timing.
* - 1 - Collect/report timing.
* - Default is 0.
*
* - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
* for aggregator collective i/o. Possible values:
* - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
* - Default is 1.
*
* - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
* calculated (block size). Possible values:
* - 0 - Evenly calculate file domains across aggregators. Also use
* MPI_Isend/MPI_Irecv to exchange domain information.
* - 1 - Align file domains with the underlying file system's block size. Also use
* MPI_Alltoallv to exchange domain information.
* - Default is 1.
*
* - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
* the ad_bglockless driver. NOTE: Using romio prefixes (such as
* "bg:" or "bglockless:") on a file name will override this environment
* variable. Possible values:
* - 0xnnnnnnnn - Any valid file system type (or "magic number") from
* statfs() field f_type.
* - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
*
* - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
* compute group (compute nodes + i/o nodes). Possible values:
* - any integer
* - Default is 8
*
 * - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous io where a
 * pthread is spawned to do the posix writes while the main thread does the
 * data aggregation - useful for large files where multiple rounds are
 * required (more than the cb_buffer_size of data per aggregator). User
 * must ensure there is hw resource available for the thread to run. I
 * am sure there is a better way to do this involving comm threads - this is
 * just a start. NOTE: For some reason the stats collected when this is
 * enabled miss some of the data so the data sizes are off a bit - this is
 * a statistical issue only, the data is still accurately written out
*
* - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
* aggregator and the procs that feed it. Performance could be enhanced by a
* one-sided put algorithm. Current implementation allows only 1 round of
* data. Useful/allowed only when:
* 1.) The datatype is contiguous.
* 2.) The offsets are increasing in rank-order.
* 3.) There are no gaps between the offsets.
* 4.) No single rank has a data size which spans multiple file domains.
*
* - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ. File domain blocks are assigned
* to aggregators in a breadth-first fashion relative to the ions - additionally,
* file domains on the aggregators sharing the same bridgeset and ion have contiguous
* offsets. The breadth-first assignment improves performance in the case of
* a relatively small file of size less than the gpfs block size multiplied
* by the number of ions. Files: ad_gpfs_aggrs.c ad_bg_aggrs.c. Possible Values
* - 0 - assign file domain blocks in the traditional manner
* - 1 - if there are variable sized file domain blocks, spread them out
* (balance) across bridge nodes
*
* - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
* system. When experimenting with different two-phase I/O strategies, it's
* helpful to remove the highly variable file system from the experiment.
* - 0 (disabled) or 1 (enabled)
* - Default is 0
*
 * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ. Aggregator placement
 * optimization which forms a 5-d ring around the bridge node starting at
 * GPFSMPIO_BRIDGERINGAGG hops away. Experimental performance results
 * suggest the best value is 1, and only in conjunction with GPFSMPIO_P2PCONTIG
 * and GPFSMPIO_BALANCECONTIG. The number of aggregators selected is still
 * GPFSMPIO_NAGG_PSET however the bridge node itself is NOT selected.
*
*/
/* Read the GPFSMPIO_* / BGLOCKLESSMPIO_* environment variables once and
 * record them in the module-level tuning globals.  Each knob is first set
 * to its documented default, then overridden if the corresponding variable
 * is present (see the \page mpiio_vars comment above for meanings). */
void ad_gpfs_get_env_vars() {
    char *envval, *endp;

    gpfsmpio_comm = 0;
    if ((envval = getenv( "GPFSMPIO_COMM" )) != NULL)
        gpfsmpio_comm = atoi(envval);

    gpfsmpio_timing = 0;
    if ((envval = getenv( "GPFSMPIO_TIMING" )) != NULL)
        gpfsmpio_timing = atoi(envval);

    gpfsmpio_tunegather = 1;
    if ((envval = getenv( "GPFSMPIO_TUNEGATHER" )) != NULL)
        gpfsmpio_tunegather = atoi(envval);

    gpfsmpio_tuneblocking = 1;
    if ((envval = getenv( "GPFSMPIO_TUNEBLOCKING" )) != NULL)
        gpfsmpio_tuneblocking = atoi(envval);

    /* file-system magic number accepts hex/octal/decimal (base 0) */
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
    if ((envval = getenv( "BGLOCKLESSMPIO_F_TYPE" )) != NULL)
        bglocklessmpio_f_type = strtol(envval,&endp,0);
    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
            bglocklessmpio_f_type,bglocklessmpio_f_type);

    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
     * when we know a bit more about what "largest possible value" and
     * "smallest possible value" should be */
    gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
    if ((envval = getenv("GPFSMPIO_NAGG_PSET")) != NULL)
        gpfsmpio_bg_nagg_pset = atoi(envval);

    gpfsmpio_pthreadio = 0;
    if ((envval = getenv( "GPFSMPIO_PTHREADIO" )) != NULL)
        gpfsmpio_pthreadio = atoi(envval);

    gpfsmpio_p2pcontig = 0;
    if ((envval = getenv( "GPFSMPIO_P2PCONTIG" )) != NULL)
        gpfsmpio_p2pcontig = atoi(envval);

    gpfsmpio_balancecontig = 0;
    if ((envval = getenv( "GPFSMPIO_BALANCECONTIG" )) != NULL)
        gpfsmpio_balancecontig = atoi(envval);

    gpfsmpio_devnullio = 0;
    if ((envval = getenv( "GPFSMPIO_DEVNULLIO" )) != NULL)
        gpfsmpio_devnullio = atoi(envval);

    gpfsmpio_bridgeringagg = 0;
    if ((envval = getenv( "GPFSMPIO_BRIDGERINGAGG" )) != NULL)
        gpfsmpio_bridgeringagg = atoi(envval);
}
/* report timing breakdown for MPI I/O collective call */
/* Report the timing breakdown collected for a collective MPI-IO call.
 *
 * rw     - 0: report the collective-read profile (gpfsmpio_prof_cr),
 *          1: report the collective-write profile (gpfsmpio_prof_cw)
 * fd     - ADIO file handle; fd->comm and fd->is_agg select which ranks
 *          join the aggregator-only reductions
 * myrank - caller's rank (not used by this routine's body)
 * nprocs - communicator size (not used by this routine's body)
 *
 * Runs only when gpfsmpio_timing is enabled (GPFSMPIO_TIMING env var).
 * Output goes to stderr from rank 0 of the aggregator sub-communicator.
 */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (gpfsmpio_timing) {
	/* Timing across the whole communicator is a little bit interesting,
	 * but what is *more* interesting is if we single out the aggregators
	 * themselves.  non-aggregators spend a lot of time in "exchange" not
	 * exchanging data, but blocked because they are waiting for
	 * aggregators to finish writing.  If we focus on just the aggregator
	 * processes we will get a more clear picture about the data exchange
	 * vs. i/o time breakdown */

	/* if deferred open enabled, we could use the aggregator communicator */
	MPI_Comm agg_comm;
	int nr_aggs, agg_rank;
	/* aggregators use color 1; everyone else passes MPI_UNDEFINED and
	 * gets back MPI_COMM_NULL */
	MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
	if(agg_comm != MPI_COMM_NULL) {
	    MPI_Comm_size(agg_comm, &nr_aggs);
	    MPI_Comm_rank(agg_comm, &agg_rank);
	}

	/* select the read or write counter array according to 'rw' */
	double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
	if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;

	double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
	double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];

	if( agg_comm != MPI_COMM_NULL) {
	    MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
	    MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
	}
	if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {

	    /* SUM-reduced entries become per-aggregator averages once divided
	     * by the aggregator count */
	    for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;

	    /* derived bandwidths: total bytes across all aggregators divided
	     * by the slowest (max) time at each level */
	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] =
		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ];
	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] =
		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW ];

	    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
		gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
		gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];

	    fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
	    fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
	    fprintf(stderr,"SEEK-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ] );
	    fprintf(stderr,"SEEK-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ] );
	    fprintf(stderr,"LOCAL-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ] );
	    fprintf(stderr,"GATHER-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ] );
	    fprintf(stderr,"PATTERN-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ] );
	    fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ] );
	    fprintf(stderr,"MYREQ-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ] );
	    fprintf(stderr,"OTHERREQ-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ] );
	    fprintf(stderr,"EXCHANGE-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ] );
	    fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] );
	    fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP] );
	    fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET] );
	    fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT] );
	    fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE] );
	    fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ] );
	    fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
		    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ] );
	    fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ] );
	    fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
	    fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] );
	    fprintf(stderr,"MPI-BW-avg: %10.3f , ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] );
	    fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ",
		    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
	}
	if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
    }
}

Просмотреть файл

@ -0,0 +1,114 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_gpfs_tuning.h
* \brief ???
*/
/*---------------------------------------------------------------------
* ad_gpfs_tuning.h
*
* declares global variables and macros for performance tuning and
* functional debugging.
*---------------------------------------------------------------------*/
#ifndef AD_GPFS_TUNING_H_
#define AD_GPFS_TUNING_H_
#include "adio.h"
/*-----------------------------------------
* Global variables for the control of
* 1. timing
* 2. select specific optimizations
*-----------------------------------------*/
/* timing fields */
enum {
GPFSMPIO_CIO_DATA_SIZE=0,
GPFSMPIO_CIO_T_SEEK,
GPFSMPIO_CIO_T_LCOMP, /* time for ADIOI_Calc_my_off_len(), local */
GPFSMPIO_CIO_T_GATHER, /* time for previous MPI_Allgather, now Allreduce */
GPFSMPIO_CIO_T_PATANA, /* time for a quick test if access is contiguous or not, local */
GPFSMPIO_CIO_T_FD_PART, /* time for file domain partitioning, local */
GPFSMPIO_CIO_T_MYREQ, /* time for ADIOI_Calc_my_req(), local */
GPFSMPIO_CIO_T_OTHREQ, /* time for ADIOI_Calc_others_req(), short Alltoall */
GPFSMPIO_CIO_T_DEXCH, /* time for I/O data exchange */
/* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
GPFSMPIO_CIO_T_DEXCH_RECV_EXCH,/* time for each process to exchange recieve
size info with everyone else */
GPFSMPIO_CIO_T_DEXCH_SETUP, /* time for setup portion of I/O data exchange */
GPFSMPIO_CIO_T_DEXCH_NET, /* time for network portion of I/O data exchange */
GPFSMPIO_CIO_T_DEXCH_SORT, /* time to sort requesst in I/O data exchange */
GPFSMPIO_CIO_T_DEXCH_SIEVE, /* time for read portion of RMW in two phase */
GPFSMPIO_CIO_T_POSI_RW,
GPFSMPIO_CIO_B_POSI_RW,
GPFSMPIO_CIO_T_MPIO_RW, /* time for ADIOI_WriteContig() */
GPFSMPIO_CIO_B_MPIO_RW,
GPFSMPIO_CIO_T_MPIO_CRW, /* time for ADIOI_GPFS_WriteStridedColl() */
GPFSMPIO_CIO_B_MPIO_CRW,
GPFSMPIO_CIO_LAST
};
/* +1 because GPFSMPIO_CIO_LAST is actually used to say "zero this counter"" */
extern double gpfsmpio_prof_cw [GPFSMPIO_CIO_LAST+1];
extern double gpfsmpio_prof_cr [GPFSMPIO_CIO_LAST+1];
/* corresponds to environment variables to select optimizations and timing level */
extern int gpfsmpio_timing;
extern int gpfsmpio_timing_cw_level;
extern int gpfsmpio_comm;
extern int gpfsmpio_tunegather;
extern int gpfsmpio_tuneblocking;
extern long bglocklessmpio_f_type;
extern int gpfsmpio_pthreadio;
extern int gpfsmpio_p2pcontig;
extern int gpfsmpio_balancecontig;
extern int gpfsmpio_devnullio;
extern int gpfsmpio_bridgeringagg;
/* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
* i/o node and all compute nodes wired to it. On Blue Gene /Q that
* relationship is a lot more fluid. There are still I/O nodes, and compute
* nodes are assigned to an i/o node, but there are two routes to the i/o node,
* via compute nodes designated as "bridge nodes". In this code, what we used
* to call a "pset" is actually "compute nodes associated with and including a
* bridge node". So, "nAgg" is roughly "number of aggregators per bridge", but
* look closely at ADIOI_BG_persInfo_init() for the details */
#define ADIOI_BG_NAGG_PSET_DFLT 16
extern int gpfsmpio_bg_nagg_pset;
/* set internal variables for tuning environment variables */
void ad_gpfs_get_env_vars(void);
/* report timing breakdown for MPI I/O collective call */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
/* note:
* T := timing;
* CIO := collective I/O
*/
#define GPFSMPIO_T_CIO_RESET( RW ) \
{ \
int _i; \
for ( _i = 0; _i < GPFSMPIO_CIO_LAST; _i ++ ) \
gpfsmpio_prof_c##RW [ _i ] = 0; \
}
#define GPFSMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
ad_gpfs_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
#define GPFSMPIO_T_CIO_SET_GET(RW, ISSET, ISGET, VAR1, VAR2 ) \
{\
double temp = MPI_Wtime(); \
if ( ISSET ) gpfsmpio_prof_c##RW [ VAR1 ] = temp; \
if ( ISGET ) gpfsmpio_prof_c##RW [ VAR2 ] = temp - gpfsmpio_prof_c##RW [ VAR2 ] ;\
}
#endif /* AD_GPFS_TUNING_H_ */

Просмотреть файл

@ -2,7 +2,7 @@
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bgl_wrcoll.c
* \file ad_gpfs_wrcoll.c
* \brief ???
*/
@ -14,9 +14,12 @@
#include "adio.h"
#include "adio_extern.h"
#include "ad_bgl.h"
#include "ad_bgl_pset.h"
#include "ad_bgl_aggrs.h"
#include "ad_gpfs.h"
#include "ad_gpfs_aggrs.h"
#ifdef BGQPLATFORM
#include <mpix.h>
#endif
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
@ -25,6 +28,16 @@
#include "mpe.h"
#endif
#include <pthread.h>
#ifdef HAVE_GPFS_H
#include <gpfs.h>
#endif
#ifdef HAVE_GPFS_FCNTL_H
#include <gpfs_fcntl.h>
#endif
#include <limits.h>
/* prototypes of functions used for collective writes only. */
static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
datatype, int nprocs, int myrank, ADIOI_Access
@ -33,7 +46,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
min_st_offset, ADIO_Offset fd_size,
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
int *buf_idx, int *error_code);
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf,
ADIOI_Flatlist_node *flat_buf, ADIO_Offset
*offset_list, ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off, int size,
@ -45,10 +58,10 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
ADIOI_Access *others_req,
int *send_buf_idx, int *curr_to_proc,
int *done_to_proc, int *hole, int iter,
int *done_to_proc, int *hole, int iter,
MPI_Aint buftype_extent, int *buf_idx, int *error_code);
static void ADIOI_W_Exchange_data_alltoallv(
ADIO_File fd, void *buf,
ADIO_File fd, const void *buf,
char *write_buf, /* 1 */
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
@ -66,7 +79,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
int *done_to_proc, int *hole, /* 4 */
int iter, MPI_Aint buftype_extent, int *buf_idx,
int *error_code);
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
@ -77,7 +90,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
int *send_buf_idx, int *curr_to_proc,
int *done_to_proc, int iter,
MPI_Aint buftype_extent);
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
@ -93,7 +106,7 @@ static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
int nprocs, int nprocs_recv, int total_elements);
void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int
*error_code)
@ -118,30 +131,16 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off;
ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
*fd_end = NULL, *end_offsets = NULL;
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
int ii;
int *buf_idx = NULL;
ADIO_Offset *len_list = NULL;
#if BGL_PROFILE
BGLMPIO_T_CIO_RESET( 0, w )
#endif
#if 0
/* From common code - not implemented for bgl.*/
int old_error, tmp_error;
#endif
GPFSMPIO_T_CIO_RESET( w )
#ifdef PROFILE
MPE_Log_event(13, 0, "start computation");
#endif
#if 0
/* From common code - not implemented for bgl. */
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
ADIOI_IOStridedColl (fd, buf, count, ADIOI_WRITE, datatype,
file_ptr_type, offset, status, error_code);
return;
}
#endif
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank);
@ -151,9 +150,8 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
nprocs_for_coll = fd->hints->cb_nodes;
orig_fp = fd->fp_ind;
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 0, BGLMPIO_CIO_LCOMP, BGLMPIO_CIO_LAST )
#endif
GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST)
GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST )
/* only check for interleaving if cb_write isn't disabled */
@ -168,9 +166,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
&offset_list, &len_list, &start_offset,
&end_offset, &contig_access_count);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
#endif
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP )
/* each process communicates its start and end offsets to other
processes. The result is an array each of start and end offsets stored
@ -179,24 +175,24 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
if (bglmpio_tunegather) {
bgl_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
bgl_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
if (gpfsmpio_tunegather) {
gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
gpfs_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
for (ii=0; ii<nprocs; ii++) {
bgl_offsets0[ii*2] = 0;
bgl_offsets0[ii*2+1] = 0;
gpfs_offsets0[ii*2] = 0;
gpfs_offsets0[ii*2+1] = 0;
}
bgl_offsets0[myrank*2] = start_offset;
bgl_offsets0[myrank*2+1] = end_offset;
gpfs_offsets0[myrank*2] = start_offset;
gpfs_offsets0[myrank*2+1] = end_offset;
MPI_Allreduce( bgl_offsets0, bgl_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
MPI_Allreduce( gpfs_offsets0, gpfs_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
for (ii=0; ii<nprocs; ii++) {
st_offsets [ii] = bgl_offsets[ii*2] ;
end_offsets[ii] = bgl_offsets[ii*2+1];
st_offsets [ii] = gpfs_offsets[ii*2] ;
end_offsets[ii] = gpfs_offsets[ii*2+1];
}
ADIOI_Free( bgl_offsets0 );
ADIOI_Free( bgl_offsets );
ADIOI_Free( gpfs_offsets0 );
ADIOI_Free( gpfs_offsets );
} else {
MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
ADIO_OFFSET, fd->comm);
@ -204,9 +200,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
ADIO_OFFSET, fd->comm);
}
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_PATANA, BGLMPIO_CIO_GATHER )
#endif
GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER )
/* are the accesses of different processes interleaved? */
for (i=1; i<nprocs; i++)
@ -250,16 +244,14 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
return;
}
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_FD_PART, BGLMPIO_CIO_PATANA )
#endif
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA )
/* Divide the I/O workload among "nprocs_for_coll" processes. This is
done by (logically) dividing the file into file domains (FDs); each
process may directly access only its own file domain. */
if (bglmpio_tuneblocking)
ADIOI_BGL_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
if (gpfsmpio_tuneblocking)
ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
nprocs_for_coll, &min_st_offset,
&fd_start, &fd_end, &fd_size, fd->fs_ptr);
else
@ -269,15 +261,42 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
fd->hints->min_fdomain_size, &fd_size,
fd->hints->striping_unit);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
#endif
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
if (gpfsmpio_p2pcontig==1) {
/* For some simple yet common(?) workloads, full-on two-phase I/O is overkill. We can establish sub-groups of processes and their aggregator, and then these sub-groups will carry out a simplified two-phase over that sub-group.
*
* First verify that the filetype is contig and the offsets are
* increasing in rank order*/
int i, inOrderAndNoGaps = 1;
for (i=0;i<(nprocs-1);i++) {
if (end_offsets[i] != (st_offsets[i+1]-1))
inOrderAndNoGaps = 0;
}
if (inOrderAndNoGaps && buftype_is_contig) {
/* if these conditions exist then execute the P2PContig code else
* execute the original code */
ADIOI_P2PContigWriteAggregation(fd, buf,
error_code, st_offsets, end_offsets, fd_start, fd_end);
/* NOTE: we are skipping the rest of two-phase in this path */
GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
ADIOI_Free(offset_list);
ADIOI_Free(len_list);
ADIOI_Free(st_offsets);
ADIOI_Free(end_offsets);
ADIOI_Free(fd_start);
ADIOI_Free(fd_end);
goto fn_exit;
}
}
/* calculate what portions of the access requests of this process are
located in what file domains */
if (bglmpio_tuneblocking)
ADIOI_BGL_Calc_my_req(fd, offset_list, len_list, contig_access_count,
if (gpfsmpio_tuneblocking)
ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
min_st_offset, fd_start, fd_end, fd_size,
nprocs, &count_my_req_procs,
&count_my_req_per_proc, &my_req,
@ -287,12 +306,10 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
min_st_offset, fd_start, fd_end, fd_size,
nprocs, &count_my_req_procs,
&count_my_req_per_proc, &my_req,
&buf_idx);
&buf_idx);
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ )
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_OTHREQ, BGLMPIO_CIO_MYREQ )
#endif
/* based on everyone's my_req, calculate what requests of other
processes lie in this process's file domain.
count_others_req_procs = number of processes whose requests lie in
@ -300,8 +317,8 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
count_others_req_per_proc[i] indicates how many separate contiguous
requests of proc. i lie in this process's file domain. */
if (bglmpio_tuneblocking)
ADIOI_BGL_Calc_others_req(fd, count_my_req_procs,
if (gpfsmpio_tuneblocking)
ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
count_my_req_per_proc, my_req,
nprocs, myrank,
&count_others_req_procs, &others_req);
@ -309,11 +326,9 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
ADIOI_Calc_others_req(fd, count_my_req_procs,
count_my_req_per_proc, my_req,
nprocs, myrank,
&count_others_req_procs, &others_req);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_DEXCH, BGLMPIO_CIO_OTHREQ )
#endif
&count_others_req_procs, &others_req);
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ )
ADIOI_Free(count_my_req_per_proc);
for (i=0; i < nprocs; i++) {
@ -330,54 +345,11 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
len_list, contig_access_count, min_st_offset,
fd_size, fd_start, fd_end, buf_idx, error_code);
#if BGL_PROFILE
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_DEXCH )
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_MPIO_CRW )
GPFSMPIO_T_CIO_SET_GET( w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH )
GPFSMPIO_T_CIO_SET_GET( w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW )
BGLMPIO_T_CIO_REPORT( 0, w, fd, myrank )
#endif
#if 0
/* From common code - not implemented for bgl.
*
* If this collective write is followed by an independent write,
* it's possible to have those subsequent writes on other processes
* race ahead and sneak in before the read-modify-write completes.
* We carry out a collective communication at the end here so no one
* can start independent i/o before collective I/O completes.
*
* need to do some gymnastics with the error codes so that if something
* went wrong, all processes report error, but if a process has a more
* specific error code, we can still have that process report the
* additional information */
GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
old_error = *error_code;
if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;
/* optimization: if only one process performing i/o, we can perform
* a less-expensive Bcast */
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
#endif
if (fd->hints->cb_nodes == 1)
MPI_Bcast(error_code, 1, MPI_INT,
fd->hints->ranklist[0], fd->comm);
else {
tmp_error = *error_code;
MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
MPI_MAX, fd->comm);
}
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
#endif
#ifdef AGGREGATION_PROFILE
MPE_Log_event (5012, 0, NULL);
#endif
if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
*error_code = old_error;
#endif
/* free all memory allocated for collective I/O */
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
@ -398,11 +370,12 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
ADIOI_Free(fd_start);
ADIOI_Free(fd_end);
fn_exit:
#ifdef HAVE_STATUS_SET_BYTES
if (status) {
int bufsize, size;
MPI_Count bufsize, size;
/* Don't set status if it isn't needed */
MPI_Type_size(datatype, &size);
MPI_Type_size_x(datatype, &size);
bufsize = size * count;
MPIR_Status_set_bytes(status, datatype, bufsize);
}
@ -416,6 +389,100 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
#endif
}
static void gpfs_wr_access_start(int fd, ADIO_Offset offset, ADIO_Offset length)
{
int rc=0;
#ifdef HAVE_GPFS_FCNTL_H
struct {
gpfsFcntlHeader_t header;
gpfsAccessRange_t access;
} take_locks;
take_locks.header.totalLength = sizeof(take_locks);
take_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
take_locks.header.fcntlReserved = 0;
take_locks.access.structLen = sizeof(take_locks.access);
take_locks.access.structType = GPFS_ACCESS_RANGE;
take_locks.access.start = offset;
take_locks.access.length = length;
take_locks.access.isWrite = 1;
rc = gpfs_fcntl(fd, &take_locks);
#endif
ADIOI_Assert(rc == 0);
}
static void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
{
int rc=0;
#ifdef HAVE_GPFS_FCNTL_H
struct {
gpfsFcntlHeader_t header;
gpfsFreeRange_t free;
} free_locks;
free_locks.header.totalLength = sizeof(free_locks);
free_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
free_locks.header.fcntlReserved = 0;
free_locks.free.structLen = sizeof(free_locks.free);
free_locks.free.structType = GPFS_FREE_RANGE;
free_locks.free.start = offset;
free_locks.free.length = length;
rc = gpfs_fcntl(fd, &free_locks);
#endif
ADIOI_Assert(rc == 0);
}
#ifdef BGQPLATFORM
/* my_start, my_end: this processes file domain. coudd be -1,-1 for "no i/o"
* fd_start, fd_end: arrays of length fd->hints->cb_nodes specifying all file domains */
static int gpfs_find_access_for_ion(ADIO_File fd,
ADIO_Offset my_start, ADIO_Offset my_end,
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
ADIO_Offset *start, ADIO_Offset *end)
{
int my_ionode = MPIX_IO_node_id();
int *rank_to_ionode;
int i, nprocs, rank;
ADIO_Offset group_start=LLONG_MAX, group_end=0;
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &rank);
rank_to_ionode = ADIOI_Calloc(nprocs, sizeof(int));
MPI_Allgather(&my_ionode, 1, MPI_INT, rank_to_ionode, 1, MPI_INT, fd->comm);
/* rank_to_ionode now contains a mapping from MPI rank to IO node */
/* fd->hints->ranklist[] contains a list of MPI ranks that are aggregators */
/* fd_start[] and fd_end[] contain a list of file domains. */
/* what we really want to do is take all the file domains associated
* with a given i/o node and find the begin/end of that range.
*
* Because gpfs_fcntl hints are expected to be released, we'll pass this
* start/end back to the caller, who will both declare and free this range
*/
if (my_start == -1 || my_end == -1) {
ADIOI_Free(rank_to_ionode);
return 0; /* no work to do */
}
for (i=0; i<fd->hints->cb_nodes; i++ ){
if (my_ionode == rank_to_ionode[fd->hints->ranklist[i]] ) {
group_start = ADIOI_MIN(fd_start[i], group_start);
group_end = ADIOI_MAX(fd_end[i], group_end);
}
}
*start = group_start;
*end = group_end;
ADIOI_Free(rank_to_ionode);
return 1;
}
#endif // BGQPLATFORM
/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
@ -444,7 +511,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
ADIO_Offset size=0;
int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
char *write_buf=NULL;
char *write_buf=NULL, *write_buf2=NULL;
int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
int *partial_recv, *sent_to_proc, *start_pos, flag;
int *send_buf_idx, *curr_to_proc, *done_to_proc;
@ -454,6 +521,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
int info_flag, coll_bufsize;
char *value;
static char myname[] = "ADIOI_EXCH_AND_WRITE";
pthread_t io_thread;
void *thread_ret;
ADIOI_IO_ThreadFuncData io_thread_args;
*error_code = MPI_SUCCESS; /* changed below if error */
/* only I/O errors are currently reported */
@ -468,6 +538,11 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
coll_bufsize = atoi(value);
ADIOI_Free(value);
if (gpfsmpio_pthreadio == 1){
/* ROMIO will spawn an additional thread. both threads use separate
* halves of the collective buffer*/
coll_bufsize = coll_bufsize/2;
}
for (i=0; i < nprocs; i++) {
if (others_req[i].count) {
@ -491,11 +566,35 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
if ((st_loc==-1) && (end_loc==-1)) {
ntimes = 0; /* this process does no writing. */
}
if (ntimes > 0) { /* only set the gpfs hint if we have io - ie this rank is
an aggregator -- otherwise will fail for deferred open */
if (getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
gpfs_wr_access_start(fd->fd_sys, st_loc, end_loc - st_loc);
}
}
ADIO_Offset st_loc_ion=0, end_loc_ion=0, needs_gpfs_access_cleanup=0;
#ifdef BGQPLATFORM
if (ntimes > 0) { /* only set the gpfs hint if we have io - ie this rank is
an aggregator -- otherwise will fail for deferred open */
if (getenv("ROMIO_GPFS_DECLARE_ION_ACCESS")!=NULL) {
if (gpfs_find_access_for_ion(fd, st_loc, end_loc, fd_start, fd_end,
&st_loc_ion, &end_loc_ion)) {
gpfs_wr_access_start(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
needs_gpfs_access_cleanup=1;
}
}
}
#endif
MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
fd->comm);
if (ntimes) write_buf = (char *) ADIOI_Malloc(coll_bufsize);
write_buf = fd->io_buf;
if (gpfsmpio_pthreadio == 1) {
write_buf2 = fd->io_buf + coll_bufsize;
}
curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
/* its use is explained below. calloc initializes to 0. */
@ -552,6 +651,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
done = 0;
off = st_loc;
if(gpfsmpio_pthreadio == 1)
io_thread = pthread_self();
#ifdef PROFILE
MPE_Log_event(14, 0, "end computation");
#endif
@ -642,22 +744,22 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
MPE_Log_event(14, 0, "end computation");
MPE_Log_event(7, 0, "start communication");
#endif
if (bglmpio_comm == 1)
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
if (gpfsmpio_comm == 1)
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
buftype_is_contig, contig_access_count,
min_st_offset, fd_size, fd_start, fd_end,
others_req, send_buf_idx, curr_to_proc,
done_to_proc, &hole, m, buftype_extent, buf_idx,
error_code);
else
if (bglmpio_comm == 0)
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
if (gpfsmpio_comm == 0)
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
buftype_is_contig, contig_access_count,
min_st_offset, fd_size, fd_start, fd_end,
others_req, send_buf_idx, curr_to_proc,
@ -673,15 +775,52 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
if (count[i]) flag = 1;
if (flag) {
char round[50];
sprintf(round, "two-phase-round=%d", m);
setenv("LIBIOLOG_EXTRA_INFO", round, 1);
ADIOI_Assert(size == (int)size);
ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
off, &status, error_code);
if (*error_code != MPI_SUCCESS) return;
if (gpfsmpio_pthreadio == 1) {
/* there is no such thing as "invalid pthread identifier", so
* we'll use pthread_self() instead. Before we do I/O we want
* to complete I/O from any previous iteration -- but only a
* previous iteration that had I/O work to do (i.e. set 'flag')
*/
if(!pthread_equal(io_thread, pthread_self())) {
pthread_join(io_thread, &thread_ret);
*error_code = *(int *)thread_ret;
if (*error_code != MPI_SUCCESS) return;
io_thread = pthread_self();
}
io_thread_args.fd = fd;
/* do a little pointer shuffling: background I/O works from one
* buffer while two-phase machinery fills up another */
io_thread_args.buf = write_buf;
ADIOI_SWAP(write_buf, write_buf2, char*);
io_thread_args.io_kind = ADIOI_WRITE;
io_thread_args.size = size;
io_thread_args.offset = off;
io_thread_args.status = status;
io_thread_args.error_code = *error_code;
if ( (pthread_create(&io_thread, NULL,
ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
io_thread = pthread_self();
} else {
ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE,
ADIO_EXPLICIT_OFFSET, off, &status, error_code);
if (*error_code != MPI_SUCCESS) return;
}
}
off += size;
done += size;
}
if (gpfsmpio_pthreadio == 1) {
if ( !pthread_equal(io_thread, pthread_self()) ) {
pthread_join(io_thread, &thread_ret);
*error_code = *(int *)thread_ret;
}
}
for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;
#ifdef PROFILE
@ -689,22 +828,22 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
#endif
for (m=ntimes; m<max_ntimes; m++)
/* nothing to recv, but check for send. */
if (bglmpio_comm == 1)
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
if (gpfsmpio_comm == 1)
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
buftype_is_contig, contig_access_count,
min_st_offset, fd_size, fd_start, fd_end,
others_req, send_buf_idx,
curr_to_proc, done_to_proc, &hole, m,
buftype_extent, buf_idx, error_code);
else
if (bglmpio_comm == 0)
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
if (gpfsmpio_comm == 0)
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
len_list, send_size, recv_size, off, size, count,
start_pos, partial_recv,
sent_to_proc, nprocs, myrank,
buftype_is_contig, contig_access_count,
min_st_offset, fd_size, fd_start, fd_end,
others_req, send_buf_idx,
@ -715,7 +854,6 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
MPE_Log_event(8, 0, "end communication");
#endif
if (ntimes) ADIOI_Free(write_buf);
ADIOI_Free(curr_offlen_ptr);
ADIOI_Free(count);
ADIOI_Free(partial_recv);
@ -726,6 +864,17 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
ADIOI_Free(send_buf_idx);
ADIOI_Free(curr_to_proc);
ADIOI_Free(done_to_proc);
if (ntimes != 0 && getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
gpfs_wr_access_end(fd->fd_sys, st_loc, end_loc-st_loc);
}
if (needs_gpfs_access_cleanup) {
gpfs_wr_access_end(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
needs_gpfs_access_cleanup=0;
}
unsetenv("LIBIOLOG_EXTRA_INFO");
}
@ -783,8 +932,8 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
tmp_len[i] = others_req[i].lens[k];
others_req[i].lens[k] = partial_recv[i];
}
MPI_Type_hindexed(count[i],
&(others_req[i].lens[start_pos[i]]),
ADIOI_Type_create_hindexed_x(count[i],
&(others_req[i].lens[start_pos[i]]),
&(others_req[i].mem_ptrs[start_pos[i]]),
MPI_BYTE, recv_types+j);
/* absolute displacements; use MPI_BOTTOM in recv */
@ -799,15 +948,12 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
sum = 0;
for (i=0; i<nprocs; i++) sum += count[i];
/* valgrind-detcted optimization: if there is no work on this process we do
* not need to search for holes */
if (sum) {
srt_off = (ADIO_Offset *) ADIOI_Malloc((sum)*sizeof(ADIO_Offset));
srt_len = (int *) ADIOI_Malloc((sum)*sizeof(int));
srt_off = (ADIO_Offset *) ADIOI_Malloc((sum+1)*sizeof(ADIO_Offset));
srt_len = (int *) ADIOI_Malloc((sum+1)*sizeof(int));
/* +1 to avoid a 0-size malloc */
ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
nprocs, nprocs_recv, sum);
}
ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
nprocs, nprocs_recv, sum);
/* for partial recvs, restore original lengths */
for (i=0; i<nprocs; i++)
@ -824,28 +970,28 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
* #835). Missing these holes would result in us writing more data than
* recieved by everyone else. */
*hole = 0;
if (sum) {
if (off != srt_off[0]) /* hole at the front */
if (off != srt_off[0]) /* hole at the front */
*hole = 1;
else { /* coalesce the sorted offset-length pairs */
for (i=1; i<sum; i++) {
if (srt_off[i] <= srt_off[0] + srt_len[0]) {
int new_len = srt_off[i] + srt_len[i] - srt_off[0];
if (new_len > srt_len[0]) srt_len[0] = new_len;
}
else
break;
}
if (i < sum || size != srt_len[0]) /* hole in middle or end */
*hole = 1;
else { /* coalesce the sorted offset-length pairs */
for (i=1; i<sum; i++) {
if (srt_off[i] <= srt_off[0] + srt_len[0]) {
int new_len = srt_off[i] + srt_len[i] - srt_off[0];
if (new_len > srt_len[0]) srt_len[0] = new_len;
}
else
break;
}
if (i < sum || size != srt_len[0]) /* hole in middle or end */
*hole = 1;
}
}
ADIOI_Free(srt_off);
ADIOI_Free(srt_len);
}
if (nprocs_recv) {
if (*hole) {
const char * stuff = "data-sieve-in-two-phase";
setenv("LIBIOLOG_EXTRA_INFO", stuff, 1);
ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
ADIO_EXPLICIT_OFFSET, off, &status, &err);
/* --BEGIN ERROR HANDLING-- */
@ -857,6 +1003,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
return;
}
/* --END ERROR HANDLING-- */
unsetenv("LIBIOLOG_EXTRA_INFO");
}
}
@ -1027,7 +1174,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
ADIOI_BUF_INCR \
}
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
@ -1079,7 +1226,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
* longer than the single region that processor "p" is responsible
* for.
*/
p = ADIOI_BGL_Calc_aggregator(fd,
p = ADIOI_GPFS_Calc_aggregator(fd,
off,
min_st_offset,
&len,
@ -1140,7 +1287,7 @@ static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
{
typedef struct {
ADIO_Offset *off_list;
int *len_list;
ADIO_Offset *len_list;
int nelem;
} heap_struct;
@ -1256,7 +1403,7 @@ static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
static void ADIOI_W_Exchange_data_alltoallv(
ADIO_File fd, void *buf,
ADIO_File fd, const void *buf,
char *write_buf, /* 1 */
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
@ -1287,11 +1434,15 @@ static void ADIOI_W_Exchange_data_alltoallv(
int *srt_len, sum;
ADIO_Offset *srt_off;
static char myname[] = "ADIOI_W_EXCHANGE_DATA";
double io_time;
io_time = MPI_Wtime();
/* exchange recv_size info so that each process knows how much to
send to whom. */
MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] += MPI_Wtime() - io_time;
io_time = MPI_Wtime();
nprocs_recv = 0;
for (i=0; i<nprocs; i++) if (recv_size[i]) { nprocs_recv++; }
@ -1334,14 +1485,23 @@ static void ADIOI_W_Exchange_data_alltoallv(
min_st_offset, fd_size, fd_start, fd_end,
send_buf_idx, curr_to_proc, done_to_proc, iter,
buftype_extent);
ADIOI_Free(send_buf);
}
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += MPI_Wtime() - io_time;
io_time = MPI_Wtime();
/* alltoallv */
MPI_Alltoallv(
all_send_buf, send_size, sdispls, MPI_BYTE,
all_recv_buf, recv_size, rdispls, MPI_BYTE,
fd->comm );
ADIOI_Free( all_send_buf );
ADIOI_Free(sdispls);
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_NET] += MPI_Wtime() - io_time;
io_time = MPI_Wtime();
/* data sieving pre-read */
/* To avoid a read-modify-write, check if there are holes in the
data to be written. For this, merge the (sorted) offset lists
@ -1373,6 +1533,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
ADIOI_Free(srt_off);
ADIOI_Free(srt_len);
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SORT] += MPI_Wtime() - io_time;
io_time = MPI_Wtime();
if (nprocs_recv) {
if (*hole) {
ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
@ -1388,7 +1550,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
/* --END ERROR HANDLING-- */
}
}
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SIEVE] += MPI_Wtime() - io_time;
/* scatter all_recv_buf into 4M cb_buffer */
tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int));
for (i=0; i<nprocs; i++)
@ -1419,14 +1582,12 @@ static void ADIOI_W_Exchange_data_alltoallv(
}
ADIOI_Free( tmp_len );
ADIOI_Free( all_send_buf );
ADIOI_Free( all_recv_buf );
ADIOI_Free(sdispls);
ADIOI_Free(rdispls);
return;
}
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
*flat_buf, char **send_buf, ADIO_Offset
*offset_list, ADIO_Offset *len_list, int *send_size,
MPI_Request *requests, int *sent_to_proc,
@ -1478,7 +1639,7 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
* longer than the single region that processor "p" is responsible
* for.
*/
p = ADIOI_BGL_Calc_aggregator(fd,
p = ADIOI_GPFS_Calc_aggregator(fd,
off,
min_st_offset,
&len,

Просмотреть файл

@ -0,0 +1,18 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2012 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
# When configure selected the Blue Gene flavour of the GPFS ADIO driver,
# ship its headers and compile its platform-specific sources.  The
# noinst_HEADERS / romio_other_sources variables are accumulated (+=)
# across the per-driver Makefile.mk fragments included by the top-level
# Makefile.am.
if BUILD_AD_BG
noinst_HEADERS += \
adio/ad_gpfs/bg/ad_bg_aggrs.h \
adio/ad_gpfs/bg/ad_bg_pset.h
romio_other_sources += \
adio/ad_gpfs/bg/ad_bg_aggrs.c \
adio/ad_gpfs/bg/ad_bg_pset.c
endif BUILD_AD_BG

Просмотреть файл

@ -0,0 +1,675 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_aggrs.c
* \brief The externally used function from this file is declared in ad_bg_aggrs.h
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*#define TRACE_ON */
// Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
// #define balancecontigtrace 1
// #define bridgeringaggtrace 1
#include "adio.h"
#include "adio_cb_config_list.h"
#include "../ad_gpfs.h"
#include "ad_bg_pset.h"
#include "ad_bg_aggrs.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
#ifndef TRACE_ERR
# define TRACE_ERR(format...)
#endif
/* Comments copied from common:
* This file contains four functions:
*
* ADIOI_Calc_aggregator()
* ADIOI_Calc_file_domains()
* ADIOI_Calc_my_req()
* ADIOI_Calc_others_req()
*
* The last three of these were originally in ad_read_coll.c, but they are
* also shared with ad_write_coll.c. I felt that they were better kept with
* the rest of the shared aggregation code.
*/
/* Discussion of values available from above:
*
* ADIO_Offset st_offsets[0..nprocs-1]
* ADIO_Offset end_offsets[0..nprocs-1]
* These contain a list of start and end offsets for each process in
* the communicator. For example, an access at loc 10, size 10 would
* have a start offset of 10 and end offset of 19.
* int nprocs
* number of processors in the collective I/O communicator
* ADIO_Offset min_st_offset
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
* starting location of "file domain"; region that a given process will
* perform aggregation for (i.e. actually do I/O)
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
* start + size - 1 roughly, but it can be less, or 0, in the case of
* uneven distributions
*/
/* forward declaration */
static void
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
const ADIOI_BG_ConfInfo_t *confInfo,
ADIOI_BG_ProcInfo_t *all_procInfo);
/*
* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
* The parameters are
* . the number of aggregators (proxies) : fd->hints->cb_nodes
* . the ranks of the aggregators : fd->hints->ranklist
* By compute these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
* ADIO can work more efficiently.
*/
/* Collective entry point: computes fd->hints->cb_nodes and
 * fd->hints->ranklist in a Blue Gene PSET-aware way.
 *
 * All ranks of fd->comm gather their per-process personality info to
 * rank 0; rank 0 picks the aggregators and the result is broadcast back
 * via ADIOI_cb_bcast_rank_map().  Must therefore be called by every rank
 * of fd->comm.  Always returns 0. */
int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
{
int r, s;
ADIOI_BG_ProcInfo_t *procInfo, *all_procInfo;
ADIOI_BG_ConfInfo_t *confInfo;
TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
MPI_Comm_size( fd->comm, &s );
MPI_Comm_rank( fd->comm, &r );
/* Collect individual BG personality information */
confInfo = ADIOI_BG_ConfInfo_new ();
procInfo = ADIOI_BG_ProcInfo_new ();
ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
/* Gather BG personality information onto process 0 */
/* if (r == 0) */
all_procInfo = ADIOI_BG_ProcInfo_new_n (s);
/* NOTE: all_procInfo is allocated on every rank (not just the root of
 * the gather), so the unconditional free below is safe everywhere. */
MPI_Gather( (void *)procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
(void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
0,
fd->comm );
/* Compute a list of the ranks of chosen IO proxy CN on process 0 */
if (r == 0) {
ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo);
/* ADIOI_BG_ProcInfo_free (all_procInfo);*/
}
ADIOI_BG_ProcInfo_free (all_procInfo);
/* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
Declared in adio_cb_config_list.h */
ADIOI_cb_bcast_rank_map(fd);
if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
bridgelistnum and numbridges to all
ranks */
/* non-root ranks must allocate the receive buffers before the Bcasts;
 * rank 0 already filled them in the serial computation above */
if (r != 0) {
fd->hints->fs_hints.bg.bridgelist =
ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
if (fd->hints->fs_hints.bg.bridgelist == NULL) {
/* NEED TO HANDLE ENOMEM */
}
}
MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
fd->comm);
if (r != 0) {
fd->hints->fs_hints.bg.bridgelistnum =
ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
/* NEED TO HANDLE ENOMEM */
}
}
MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
MPI_INT, 0, fd->comm);
MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
fd->comm);
}
ADIOI_BG_persInfo_free( confInfo, procInfo );
TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
return 0;
}
/* There are some number of bridge nodes (randomly) distributed through the job
* We need to split the nodes among the bridge nodes */
/* Maybe find which bridge node is closer (manhattan distance) and try to
* distribute evenly.
*/
/*
* Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
* The first order of tmp_ranklist is : PSET number
* The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
*/
/* Pairs an MPI rank with the encoded coordinates of its bridge node;
 * element type of the arrays ordered by intsort() below. */
typedef struct
{
    int rank;
    int bridge;
} sortstruct;

/* Running tally of how many aggregators have been assigned to one
 * bridge rank (used by the bridge-ring selection algorithm). */
typedef struct
{
    int bridgeRank;
    int numAggsAssigned;
} bridgeAggAssignment;

/* qsort() comparator: orders sortstruct entries by ascending bridge
 * field.  Returns <0, 0, >0 in the usual strcmp-style convention. */
static int intsort(const void *p1, const void *p2)
{
    const sortstruct *lhs = (const sortstruct *)p1;
    const sortstruct *rhs = (const sortstruct *)p2;
    return lhs->bridge - rhs->bridge;
}
/* Runs on rank 0 only.  Selects the I/O aggregator ranks and writes them
 * into tmp_ranklist (caller-allocated, at least confInfo->nProcs ints).
 * Returns the number of aggregators chosen (aggTotal).
 *
 * Two selection strategies:
 *  - gpfsmpio_bridgeringagg > 0: pick aggregators in "rings" around each
 *    bridge node, starting at that many torus hops away and growing the
 *    radius until every bridge has numAggs aggregators.
 *  - otherwise (default): sort ranks by bridge node and pick evenly
 *    spaced midpoints within each bridge's set, plus the bridge itself. */
static int
ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
ADIOI_BG_ProcInfo_t *all_procInfo,
int *tmp_ranklist)
{
TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
/* BES: This should be done in the init routines probably. */
int i, j;
int aggTotal;
int *aggList;
if (gpfsmpio_bridgeringagg > 0) {
/* aggRatio * ioMinSize gives the per-bridge aggregator budget */
int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
/* the number of aggregators is (numAggs per bridgenode) */
if(numAggs == 1)
aggTotal = 1;
else
aggTotal = confInfo->numBridgeRanks * numAggs;
aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
if(aggTotal == 1) { /* special case when we only have one bridge node */
sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
for(i=0; i < confInfo->nProcs; i++)
{
bridgelist[i].bridge = all_procInfo[i].bridgeRank;
bridgelist[i].rank = i;
TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
}
/* This list contains rank->bridge info. Now, we need to sort this list. */
qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
/* after the sort, entry 0 holds the lowest bridge rank: use it as the
 * single aggregator */
aggList[0] = bridgelist[0].bridge;
ADIOI_Free(bridgelist);
}
else { // aggTotal > 1
int currentAggListSize = 0;
int numBridgesWithAggAssignments = 0;
bridgeAggAssignment *aggAssignments = (bridgeAggAssignment *)ADIOI_Malloc(confInfo->numBridgeRanks * sizeof(bridgeAggAssignment));
int partitionSize = all_procInfo[0].numNodesInPartition;
/* one flag per compute node so at most one rank per node becomes an agg */
int *nodesAssigned = (int *)ADIOI_Malloc(partitionSize * sizeof(int));
for (i=0;i<partitionSize;i++)
nodesAssigned[i] = 0;
int currentNumHops = gpfsmpio_bridgeringagg;
int allAggsAssigned = 0;
/* Iterate thru the process infos and select aggregators starting at currentNumHops
away. Increase the currentNumHops until all bridges have numAggs assigned to them.
*/
while (!allAggsAssigned) {
/* track whether any aggs are selected durng this round */
int startingCurrentAggListSize = currentAggListSize;
int numIterForHopsWithNoAggs = 0;
for (i=0;i<confInfo->nProcs;i++) {
if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
if (nodesAssigned[all_procInfo[i].nodeRank] == 0) { // node is not assigned as an agg yet
int foundBridge = 0;
for (j=0;(j<numBridgesWithAggAssignments && !foundBridge);j++) {
if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
foundBridge = 1;
if (aggAssignments[j].numAggsAssigned < numAggs) {
aggAssignments[j].numAggsAssigned++;
nodesAssigned[all_procInfo[i].nodeRank] = 1;
aggList[currentAggListSize] = all_procInfo[i].rank;
currentAggListSize++;
#ifdef bridgeringaggtrace
printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
#endif
}
}
}
/* first time we see this bridge: create its assignment entry and
 * take this rank as its first aggregator */
if (!foundBridge) {
aggAssignments[numBridgesWithAggAssignments].bridgeRank = all_procInfo[i].bridgeRank;
aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
numBridgesWithAggAssignments++;
nodesAssigned[all_procInfo[i].nodeRank] = 1;
aggList[currentAggListSize] = all_procInfo[i].rank;
currentAggListSize++;
#ifdef bridgeringaggtrace
printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
#endif
}
}
}
}
if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
allAggsAssigned = 1;
for (i=0;(i<numBridgesWithAggAssignments && allAggsAssigned);i++) {
if (aggAssignments[i].numAggsAssigned < numAggs)
allAggsAssigned = 0;
}
}
currentNumHops++;
/* If 3 rounds go by without selecting an agg abort to avoid
infinite loop.
*/
if (startingCurrentAggListSize == currentAggListSize)
numIterForHopsWithNoAggs++;
else
numIterForHopsWithNoAggs = 0;
/* NOTE(review): this hard-aborts (assert) instead of falling back to
 * the default selection — confirm that is the intended behavior */
ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
}
ADIOI_Free(aggAssignments);
ADIOI_Free(nodesAssigned);
} // else aggTotal > 1
memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
} // gpfsmpio_bridgeringagg > 0
else { // gpfsmpio_bridgeringagg unset - default code
int distance, numAggs;
/* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
* bridge node */
sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
for(i=0; i < confInfo->nProcs; i++)
{
bridgelist[i].bridge = all_procInfo[i].bridgeRank;
bridgelist[i].rank = i;
TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
}
/* This list contains rank->bridge info. Now, we need to sort this list. */
qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
/* In this array, we can pick an appropriate number of midpoints based on
* our bridgenode index and the number of aggregators */
numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
if(numAggs == 1)
aggTotal = 1;
else
/* the number of aggregators is (numAggs per bridgenode) plus each
* bridge node is an aggregator */
aggTotal = confInfo->numBridgeRanks * (numAggs+1);
if(aggTotal>confInfo->nProcs) aggTotal=confInfo->nProcs;
TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks, confInfo->ioMinSize, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, aggTotal);
aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
/* For each bridge node, determine who the aggregators will be */
/* basically, the n*distance and bridge node */
if(aggTotal == 1) /* special case when we only have one bridge node */
aggList[0] = bridgelist[0].bridge;
else
{
/* walk the sorted list backwards, counting the size of each bridge's
 * set (psetSize), then pick the bridge plus numAggs evenly spaced
 * ranks from that set */
int lastBridge = bridgelist[confInfo->nProcs-1].bridge;
int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
int psetSize = 0;
int procIndex;
for(procIndex=confInfo->nProcs-1; procIndex>=0; procIndex--)
{
TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n",procIndex, bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
if(lastBridge == bridgelist[procIndex].bridge)
{
psetSize++;
if(procIndex) continue;
else procIndex--;/* procIndex == 0 */
}
/* Sets up a list of nodes which will act as aggregators. numAggs
* per bridge node total. The list of aggregators is
* bridgeNode 0
* bridgeNode 1
* bridgeNode ...
* bridgeNode N
* bridgeNode[0]aggr[0]
* bridgeNode[0]aggr[1]...
* bridgeNode[0]aggr[N]...
* ...
* bridgeNode[N]aggr[0]..
* bridgeNode[N]aggr[N]
*/
aggList[nextBridge]=lastBridge;
distance = psetSize/numAggs;
TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n",nextBridge, aggList[nextBridge],distance,psetSize);
if(numAggs>1)
{
for(j = 0; j < numAggs; j++)
{
ADIOI_Assert(nextAggr<aggTotal);
aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
{
aggList[nextAggr] = bridgelist[procIndex+psetSize].rank; /* take the last one in the pset */
TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+psetSize,aggList[nextAggr]);
}
nextAggr++;
}
}
if(procIndex<0) break;
lastBridge = bridgelist[procIndex].bridge;
psetSize = 1;
nextBridge++;
}
}
TRACE_ERR("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",numAggs,confInfo->numBridgeRanks,numAggs,(numAggs*confInfo->numBridgeRanks+numAggs),aggTotal);
memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
for(i=0;i<aggTotal;i++)
{
TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
}
ADIOI_Free (bridgelist);
TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
}
ADIOI_Free (aggList);
return aggTotal;
}
/*
* compute aggregators ranklist and put it into fd->hints struct
*/
/* Runs on rank 0 only.  Computes the aggregator rank list via
 * ADIOI_BG_compute_agg_ranklist_serial_do() and installs it (plus, when
 * gpfsmpio_balancecontig is set, the per-bridge bookkeeping arrays) into
 * fd->hints.  Frees tmp_ranklist and all scratch arrays before return;
 * ownership of the installed arrays passes to fd->hints. */
static void
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
const ADIOI_BG_ConfInfo_t *confInfo,
ADIOI_BG_ProcInfo_t *all_procInfo)
{
TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
int i;
int naggs;
int size;
int *tmp_ranklist;
/* compute the ranklist of IO aggregators and put into tmp_ranklist */
tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
# if AGG_DEBUG
for (i=0; i<confInfo->nProcs; i++) {
DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].coreID, all_procInfo[i].rank );
}
# endif
naggs=
ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, tmp_ranklist);
/* NOTE(review): VERIFY is unconditionally defined, so this debug print
 * is always compiled in — confirm that is intentional */
# define VERIFY 1
# if VERIFY
DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, tsize: %3d, ratio: %.4f; naggs = %d\n",
confInfo->ioMinSize ,
confInfo->ioMaxSize ,
confInfo->nAggrs ,
confInfo->numBridgeRanks ,
confInfo->nProcs ,
confInfo->ioMaxSize /*virtualPsetSize*/ ,
confInfo->cpuIDsize,
confInfo->aggRatio ,
naggs );
# endif
MPI_Comm_size( fd->comm, &size );
/* This fix is for when the bridgenode rnk is not part of the particular
* subcomm associated with this MPI File operation. I don't know if
* this is the best/right answer but it passes the test cases at least.
* I don't know how common file IO in subcomms is anyway... */
for(i=0;i<naggs;i++)
{
/* NOTE(review): valid ranks are 0..size-1, so this check looks like it
 * should be >= size rather than > size — confirm */
if(tmp_ranklist[i] > size)
{
TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
i, tmp_ranklist[i], fd->comm);
tmp_ranklist[i] = 0;
}
}
# if AGG_DEBUG
for (i=0; i<naggs; i++) {
DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
}
# endif
if (gpfsmpio_balancecontig == 1) {
/* what comes out of this code block is the agg ranklist sorted by
* bridge set and ion id with associated bridge info stored in the
* hints structure for later access during file domain assignment */
// sort the agg ranklist by ions and bridges
int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // resorted agg rank list
/* list of all bridge ranks */
int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
/* each entry here is the number of aggregators associated with the
* bridge rank of the same index in bridgelist */
int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
/* list of all ion IDs corresponding with bridgelist entries of same index */
int *ionlist = (int *) ADIOI_Malloc (naggs * sizeof(int));
int numbridges = 0;
for (i=0;i<naggs;i++)
bridgelistnum[i] = 0;
/* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
* agg rank on that ion. */
int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
for (i=0;i<naggs;i++)
summarybridgeminionaggrank[i] = -1;
/* build the bridgelist, ionlist and bridgelistnum data by going thru each agg
* entry and find the associated bridge list index - at the end we will
* know how many aggs belong to each bridge in each ion */
for (i=0;i<naggs;i++) {
int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
int foundrank = 0;
int summaryranklistbridgeindex = 0;
int j;
for (j=0;(j<numbridges && !foundrank);j++) {
if (bridgelist[j] == aggbridgerank) {
foundrank = 1;
summaryranklistbridgeindex = j;
}
else
summaryranklistbridgeindex++;
}
if (!foundrank) {
bridgelist[summaryranklistbridgeindex] = aggbridgerank;
ionlist[summaryranklistbridgeindex] = aggionid;
if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
numbridges++;
}
bridgelistnum[summaryranklistbridgeindex]++;
}
/* at this point summarybridgeminionaggrank has the agg rank of the bridge for entries,
* need to make each entry the minimum bridge rank for the entire ion. */
for (i=0;i<numbridges;i++) {
int aggIonId = ionlist[i];
int j;
for (j=0;j<numbridges;j++) {
if (ionlist[j] == aggIonId) {
if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
}
}
}
// resort by io node minimum bridge rank
/* bubble sort keeping the four parallel arrays in sync */
int x;
for (x=0;x<numbridges;x++) {
for (i=0;i<(numbridges-1);i++) {
if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i+1]) {
int tmpminionaggrank = summarybridgeminionaggrank[i];
summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i+1];
summarybridgeminionaggrank[i+1] = tmpminionaggrank;
int tmpionid = ionlist[i];
ionlist[i] = ionlist[i+1];
ionlist[i+1] = tmpionid;
int tmpbridgerank = bridgelist[i];
bridgelist[i] = bridgelist[i+1];
bridgelist[i+1] = tmpbridgerank;
int tmpbridgeranknum = bridgelistnum[i];
bridgelistnum[i] = bridgelistnum[i+1];
bridgelistnum[i+1] = tmpbridgeranknum;
}
}
}
// for each io node make sure bridgelist is in rank order
int startSortIndex = -1;
int endSortIndex = -1;
int currentBridgeIndex = 0;
while (currentBridgeIndex < numbridges) {
int currentIonId = ionlist[currentBridgeIndex];
startSortIndex = currentBridgeIndex;
while (ionlist[currentBridgeIndex] == currentIonId)
currentBridgeIndex++;
endSortIndex = currentBridgeIndex-1;
for (x=startSortIndex;x<=endSortIndex;x++) {
for (i=startSortIndex;i<endSortIndex;i++) {
if (bridgelist[i] > bridgelist[i+1]) {
int tmpbridgerank = bridgelist[i];
bridgelist[i] = bridgelist[i+1];
bridgelist[i+1] = tmpbridgerank;
int tmpbridgeranknum = bridgelistnum[i];
bridgelistnum[i] = bridgelistnum[i+1];
bridgelistnum[i+1] = tmpbridgeranknum;
}
}
}
}
/* populate interleavedbridgeranklist - essentially the agg rank list
* is now sorted by the ion minimum bridge rank and bridge node */
int currentrankoffset = 0;
for (i=0;i<numbridges;i++) {
int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
int numAggsForThisBridge = 0;
int k;
for (k=0;k<naggs;k++) {
int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
if (aggbridgerank == bridgelist[i]) {
thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
numAggsForThisBridge++;
}
}
// sort thisBridgeAggList
for (x=0;x<numAggsForThisBridge;x++) {
int n;
for (n=0;n<(numAggsForThisBridge-1);n++) {
if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
int tmpthisBridgeAggList = thisBridgeAggList[n];
thisBridgeAggList[n] = thisBridgeAggList[n+1];
thisBridgeAggList[n+1] = tmpthisBridgeAggList;
}
}
}
int n;
for (n=0;n<numAggsForThisBridge;n++) {
interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
currentrankoffset++;
}
ADIOI_Free(thisBridgeAggList);
}
#ifdef balancecontigtrace
fprintf(stderr,"Interleaved aggregator list:\n");
for (i=0;i<naggs;i++) {
fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank,all_procInfo[interleavedbridgeranklist[i]].ionID);
}
fprintf(stderr,"Bridges list:\n");
for (i=0;i<numbridges;i++) {
fprintf(stderr,"bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n",i,summarybridgeminionaggrank[i],bridgelist[i],bridgelistnum[i],ionlist[i]);
}
#endif
/* copy the ranklist of IO aggregators to fd->hints */
if(fd->hints->ranklist != NULL)
ADIOI_Free (fd->hints->ranklist);
if(fd->hints->fs_hints.bg.bridgelist != NULL)
ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);
fd->hints->cb_nodes = naggs;
fd->hints->fs_hints.bg.numbridges = numbridges;
fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );
fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );
fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );
ADIOI_Free(summarybridgeminionaggrank);
ADIOI_Free( tmp_ranklist );
ADIOI_Free( bridgelistnum );
ADIOI_Free( bridgelist );
ADIOI_Free( interleavedbridgeranklist );
ADIOI_Free(ionlist);
} else {
/* classic topology-agnostic copy of the ranklist of IO aggregators to
* fd->hints */
if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
fd->hints->cb_nodes = naggs;
fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
ADIOI_Free( tmp_ranklist );
}
TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
return;
}

Просмотреть файл

@ -0,0 +1,33 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_aggrs.h
* \brief Declarations for BlueGene-specific I/O aggregator selection within the GPFS ADIO driver.
*/
/*
*
* Declares functions specific for the BlueGene platform within the GPFS
* parallel I/O solution. Implements aligned file-domain partitioning
* (7/28/2005); persistent file domain work not implemented
*
*/
#ifndef AD_BG_AGGRS_H_
#define AD_BG_AGGRS_H_
#include "adio.h"
#include <sys/stat.h>
#ifdef HAVE_GPFS_H
#include <gpfs.h>
#endif
/* GPFS filesystem magic number ("GPFS" in ASCII); fallback definition for
 * systems whose headers do not provide it */
#if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#endif
/* generate a list of I/O aggregators that utilizes BG-PSET organization. */
int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
#endif /* AD_BG_AGGRS_H_ */

Просмотреть файл

@ -3,28 +3,37 @@
/* ---------------------------------------------------------------- */
/**
* \file ad_bg_pset.c
* \brief Definition of functions associated to structs ADIOI_BG_ProcInfo_t and ADIOI_BG_ConfInfo_t
* \brief Definition of functions associated to structs ADIOI_BG_ProcInfo_t and ADIOI_BG_ConfInfo_t
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997 University of Chicago.
/*
* Copyright (C) 1997 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/* #define TRACE_ON */
// #define bridgeringaggtrace 1
#include <stdlib.h>
#include "ad_bg.h"
#include "../ad_gpfs.h"
#include "ad_bg_pset.h"
#include "mpidimpl.h"
#include <spi/include/kernel/process.h>
#include <firmware/include/personality.h>
#ifdef HAVE_MPIX_H
#include <mpix.h>
#endif
#ifndef TRACE_ERR
# define TRACE_ERR(fmt...)
#endif
ADIOI_BG_ProcInfo_t *
ADIOI_BG_ProcInfo_new()
{
ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ProcInfo_t));
ADIOI_BG_assert ((p != NULL));
ADIOI_Assert ((p != NULL));
return p;
}
@ -32,7 +41,7 @@ ADIOI_BG_ProcInfo_t *
ADIOI_BG_ProcInfo_new_n( int n )
{
ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (n * sizeof(ADIOI_BG_ProcInfo_t));
ADIOI_BG_assert ((p != NULL));
ADIOI_Assert ((p != NULL));
return p;
}
@ -46,7 +55,7 @@ ADIOI_BG_ConfInfo_t *
ADIOI_BG_ConfInfo_new ()
{
ADIOI_BG_ConfInfo_t *p = (ADIOI_BG_ConfInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ConfInfo_t));
ADIOI_BG_assert ((p != NULL));
ADIOI_Assert ((p != NULL));
return p;
}
@ -72,10 +81,40 @@ static int intsort(const void *p1, const void *p2)
return(i1->bridgeCoord - i2->bridgeCoord);
}
unsigned torusSize[MPIX_TORUS_MAX_DIMS];
unsigned dimTorus[MPIX_TORUS_MAX_DIMS];
void
ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
ADIOI_BG_ProcInfo_t *proc,
/* This function computes the number of hops between the torus coordinates of the
* aggCoords and bridgeCoords parameters.
*/
/* Sum over all torus dimensions of the per-dimension hop count between
 * aggCoords and bridgeCoords (both arrays of MPIX_TORUS_MAX_DIMS entries),
 * taking a wrap-around link into account for dimensions that are a torus
 * (dimTorus[i] non-zero; torusSize/dimTorus are file-scope globals filled
 * in by ADIOI_BG_persInfo_init). */
static unsigned procManhattanDistance(unsigned *aggCoords, unsigned *bridgeCoords) {
unsigned totalDistance = 0;
int i;
for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
unsigned dimDistance = abs((int)aggCoords[i] - (int)bridgeCoords[i]);
if (dimDistance > 0) { // could torus make it closer?
if (dimTorus[i]) {
/* NOTE(review): coordinates presumably range 0..torusSize[i]-1, so
 * comparing against torusSize[i] (not torusSize[i]-1) looks like it
 * can never match — confirm against the BG personality docs */
if (aggCoords[i] == torusSize[i]) { // is wrap-around closer
if ((bridgeCoords[i]+1) < dimDistance) // assume will use torus link
dimDistance = bridgeCoords[i]+1;
}
else if (bridgeCoords[i] == torusSize[i]) { // is wrap-around closer
if ((aggCoords[i]+1) < dimDistance) // assume will use torus link
dimDistance = aggCoords[i]+1;
}
}
} /* else: dimDistance == 0, meaning aggCoords[i] and bridgeCoords[i] are
the same and there's no closer point to pick */
totalDistance += dimDistance;
}
return totalDistance;
}
void
ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
ADIOI_BG_ProcInfo_t *proc,
int size, int rank, int n_aggrs, MPI_Comm comm)
{
int i, iambridge=0, bridgerank = -1, bridgeIndex;
@ -95,11 +134,43 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
proc->rank = rank;
proc->coreID = hw.coreID;
if (gpfsmpio_bridgeringagg > 0) {
#ifdef bridgeringaggtrace
if (rank == 0)
fprintf(stderr,"Block dimensions:\n");
#endif
/* Set the numNodesInPartition and nodeRank for this proc
*/
proc->numNodesInPartition = 1;
proc->nodeRank = 0;
for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
torusSize[i] = hw.Size[i];
dimTorus[i] = hw.isTorus[i];
proc->numNodesInPartition *= hw.Size[i];
int baseNum = 1, j;
for (j=0;j<i;j++)
baseNum *= hw.Size[j];
proc->nodeRank += (hw.Coords[i] * baseNum);
#ifdef bridgeringaggtrace
if (rank == 0)
fprintf(stderr,"Dimension %d has %d elements wrap-around value is %d\n",i,torusSize[i],dimTorus[i]);
#endif
}
}
MPI_Comm_size(comm, &commsize);
proc->ionID = MPIX_IO_node_id ();
if(size == 1)
{
proc->iamBridge = 1;
proc->bridgeRank = rank;
if (gpfsmpio_bridgeringagg > 0) {
proc->manhattanDistanceToBridge = 0;
}
/* Set up the other parameters */
proc->myIOSize = size;
@ -111,7 +182,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
conf->cpuIDsize = hw.ppn;
/*conf->virtualPsetSize = conf->ioMaxSize * conf->cpuIDsize;*/
conf->nAggrs = 1;
conf->aggRatio = 1. * conf->nAggrs / conf->ioMaxSize /*virtualPsetSize*/;
conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize*/;
if(conf->aggRatio > 1) conf->aggRatio = 1.;
TRACE_ERR("I am (single) Bridge rank\n");
return;
@ -120,21 +191,45 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
/* Find the nearest bridge node coords. We don't know the
rank in our comm so we will collective find/pick a bridge
rank later.
*/
*/
int32_t bridgeCoords;
bridgeCoords = pers.Network_Config.cnBridge_A << 24 |
pers.Network_Config.cnBridge_B << 18 |
pers.Network_Config.cnBridge_C << 12 |
pers.Network_Config.cnBridge_D << 6 |
bridgeCoords = pers.Network_Config.cnBridge_A << 24 |
pers.Network_Config.cnBridge_B << 18 |
pers.Network_Config.cnBridge_C << 12 |
pers.Network_Config.cnBridge_D << 6 |
pers.Network_Config.cnBridge_E << 2;
ADIOI_BG_assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */
ADIOI_Assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */
if((hw.Coords[0] == pers.Network_Config.cnBridge_A) &&
(hw.Coords[1] == pers.Network_Config.cnBridge_B) &&
(hw.Coords[2] == pers.Network_Config.cnBridge_C) &&
(hw.Coords[3] == pers.Network_Config.cnBridge_D) &&
(hw.Coords[4] == pers.Network_Config.cnBridge_E))
if((hw.Coords[0] == pers.Network_Config.cnBridge_A) &&
(hw.Coords[1] == pers.Network_Config.cnBridge_B) &&
(hw.Coords[2] == pers.Network_Config.cnBridge_C) &&
(hw.Coords[3] == pers.Network_Config.cnBridge_D) &&
(hw.Coords[4] == pers.Network_Config.cnBridge_E)) {
iambridge = 1; /* I am bridge */
if (gpfsmpio_bridgeringagg > 0) {
proc->manhattanDistanceToBridge = 0;
}
}
else { // calculate manhattan distance to bridge if gpfsmpio_bridgeringagg is set
if (gpfsmpio_bridgeringagg > 0) {
unsigned aggCoords[MPIX_TORUS_MAX_DIMS],manhattanBridgeCoords[MPIX_TORUS_MAX_DIMS];
aggCoords[0] = hw.Coords[0];
manhattanBridgeCoords[0] = pers.Network_Config.cnBridge_A;
aggCoords[1] = hw.Coords[1];
manhattanBridgeCoords[1] = pers.Network_Config.cnBridge_B;
aggCoords[2] = hw.Coords[2];
manhattanBridgeCoords[2] = pers.Network_Config.cnBridge_C;
aggCoords[3] = hw.Coords[3];
manhattanBridgeCoords[3] = pers.Network_Config.cnBridge_D;
aggCoords[4] = hw.Coords[4];
manhattanBridgeCoords[4] = pers.Network_Config.cnBridge_E;
proc->manhattanDistanceToBridge= procManhattanDistance(aggCoords, manhattanBridgeCoords);
#ifdef bridgeringaggtrace
fprintf(stderr,"agg coords are %u %u %u %u %u bridge coords are %u %u %u %u %u distance is %u\n",aggCoords[0],aggCoords[1],aggCoords[2],aggCoords[3],aggCoords[4],manhattanBridgeCoords[0],manhattanBridgeCoords[1],manhattanBridgeCoords[2],manhattanBridgeCoords[3],manhattanBridgeCoords[4], proc->manhattanDistanceToBridge);
#endif
}
}
TRACE_ERR("Bridge coords(%8.8X): %d %d %d %d %d, %d. iambridge %d\n",bridgeCoords, pers.Network_Config.cnBridge_A,pers.Network_Config.cnBridge_B,pers.Network_Config.cnBridge_C,pers.Network_Config.cnBridge_D,pers.Network_Config.cnBridge_E,0, iambridge);
@ -143,16 +238,16 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
bridges = (sortstruct *) ADIOI_Malloc(sizeof(sortstruct) * size);
/* We're going to sort this structure by bridgeCoord:
typedef struct
{
int rank;
int bridgeCoord;
} sortstruct;
and I want the rank that IS the bridge to sort first, so
OR in '1' on non-bridge ranks that use a bridge coord.
*/
} sortstruct;
and I want the rank that IS the bridge to sort first, so
OR in '1' on non-bridge ranks that use a bridge coord.
*/
/* My input to the collective */
bridges[rank].rank = rank;
@ -173,18 +268,18 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
tempRank = bridges[0].rank;
countPset=1;
bridgeIndex = 0;
bridgeIndex = 0;
mincompute = size+1;
maxcompute = 1;
for(i=1; i<size; i++)
{
if((bridges[i].bridgeCoord & ~1) == tempCoords)
if((bridges[i].bridgeCoord & ~1) == tempCoords)
countPset++; /* same bridge (pset), count it */
else /* new bridge found */
{
#ifdef TRACE_ON
if(rank == 0)
if(rank == 0)
TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
bridgeIndex, tempRank, tempCoords, countPset);
#endif
@ -193,13 +288,13 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
if(countPset < mincompute)
mincompute = countPset;
/* Is this my bridge? */
/* Was this my bridge we finished? */
if(tempCoords == bridgeCoords)
{
/* Am I the bridge rank? */
if(tempRank == rank)
iambridge = 1;
else
else
iambridge = 0; /* Another rank on my node may have taken over */
TRACE_ERR("Rank %u, bridge set %u, bridge rank %d (%#8.8X) has %d ranks, iambridge %u\n",
rank, bridgeIndex, tempRank, tempCoords, countPset,iambridge);
@ -207,6 +302,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
proc->myIOSize = countPset;
proc->ioNodeIndex = bridgeIndex;
}
/* Setup next bridge */
tempCoords = bridges[i].bridgeCoord & ~1;
tempRank = bridges[i].rank;
bridgeIndex++;
@ -216,7 +312,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
/* Process last bridge */
#ifdef TRACE_ON
if(rank == 0)
if(rank == 0)
TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
bridgeIndex, tempRank, tempCoords, countPset);
#endif
@ -225,21 +321,21 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
if(countPset < mincompute)
mincompute = countPset;
/* Is this my bridge? */
/* Was this my bridge? */
if(tempCoords == bridgeCoords)
{
/* Am I the bridge rank? */
if(tempRank == rank)
iambridge = 1;
else
else
iambridge = 0; /* Another rank on my node may have taken over */
bridgerank = tempRank;
proc->myIOSize = countPset;
proc->ioNodeIndex = bridgeIndex;
}
if(rank == 0)
if(rank == 0)
{
/* Only rank 0 has a conf structure, fill in stuff as appropriate */
conf->ioMinSize = mincompute;
@ -248,21 +344,23 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
conf->nProcs = size;
conf->cpuIDsize = hw.ppn;
/*conf->virtualPsetSize = maxcompute * conf->cpuIDsize;*/
conf->nAggrs = n_aggrs;
/* First pass gets nAggrs = -1 */
if(conf->nAggrs <=0 ||
MIN(conf->nProcs, conf->ioMaxSize /*virtualPsetSize*/) < conf->nAggrs)
conf->nAggrs = ADIOI_BG_NAGG_PSET_DFLT;
if(conf->nAggrs > conf->numBridgeRanks) /* maybe? * conf->cpuIDsize) */
conf->nAggrs = conf->numBridgeRanks; /* * conf->cpuIDsize; */
conf->aggRatio = 1. * conf->nAggrs / conf->ioMaxSize /*virtualPsetSize*/;
if(conf->aggRatio > 1) conf->aggRatio = 1.;
TRACE_ERR("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, vps: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n", maxcompute, mincompute, conf->nAggrs, conf->ioMaxSize /*virtualPsetSize*/, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT, conf->nAggrs, conf->aggRatio);
if(conf->nAggrs <=0)
conf->nAggrs = gpfsmpio_bg_nagg_pset;
if(conf->ioMinSize <= conf->nAggrs)
conf->nAggrs = ADIOI_MAX(1,conf->ioMinSize-1); /* not including bridge itself */
/* if(conf->nAggrs > conf->numBridgeRanks)
conf->nAggrs = conf->numBridgeRanks;
*/
conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize*/;
/* if(conf->aggRatio > 1) conf->aggRatio = 1.; */
TRACE_ERR("n_aggrs %zd, conf->nProcs %zu, conf->ioMaxSize %zu, ADIOI_BG_NAGG_PSET_DFLT %zu,conf->numBridgeRanks %zu,conf->nAggrs %zu\n",(size_t)n_aggrs, (size_t)conf->nProcs, (size_t)conf->ioMaxSize, (size_t)ADIOI_BG_NAGG_PSET_DFLT,(size_t)conf->numBridgeRanks,(size_t)conf->nAggrs);
TRACE_ERR("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n", maxcompute, mincompute, conf->nAggrs, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT, conf->nAggrs, conf->aggRatio);
}
ADIOI_BG_assert((bridgerank != -1));
ADIOI_Assert((bridgerank != -1));
proc->bridgeRank = bridgerank;
proc->iamBridge = iambridge;
TRACE_ERR("Rank %d has bridge set index %d (bridge rank: %d) with %d other ranks, ioNodeIndex: %d\n", rank, proc->ioNodeIndex, bridgerank, proc->myIOSize, proc->ioNodeIndex);
@ -271,7 +369,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
}
void
void
ADIOI_BG_persInfo_free( ADIOI_BG_ConfInfo_t *conf, ADIOI_BG_ProcInfo_t *proc )
{
ADIOI_BG_ConfInfo_free( conf );

Просмотреть файл

@ -8,7 +8,7 @@
/* File: ad_bg_pset.h
*
* Defines two structures that keep BG/L PSET specific information and their public interfaces:
* Defines two structures that keep BlueGene PSET specific information and their public interfaces:
* . ADIOI_BG_ProcInfo_t object keeps specific information to each process
* . ADIOI_BG_ConfInfo_t object keeps general information for the whole communicator, only kept
* on process 0.
@ -17,10 +17,15 @@
#ifndef AD_BG_PSET_H_
#define AD_BG_PSET_H_
#ifdef HAVE_MPIX_H
#include <mpix.h>
#endif
/* Keeps specific information to each process, will be exchanged among processes */
typedef struct {
int ioNodeIndex; /* similar to psetNum on BGL/BGP */
int rank; /* my rank */
int ionID; /* ion id this cn is using */
/* int myCoords[5]; */
int bridgeRank; /* my bridge node (or proxy) rank */
unsigned char coreID;
@ -30,6 +35,9 @@ typedef struct {
node, i.e. psetsize*/
int iamBridge; /* am *I* the bridge rank? */
int __ipad[2];
unsigned nodeRank; /* torus coords converted to an integer for use with gpfsmpio_bridgeringagg */
unsigned numNodesInPartition; /* number of physical nodes in the job partition */
unsigned manhattanDistanceToBridge; /* number of hops between this rank and the bridge node */
} ADIOI_BG_ProcInfo_t __attribute__((aligned(16)));
/* Keeps general information for the whole communicator, only on process 0 */
@ -48,15 +56,9 @@ typedef struct {
#undef MIN
#define MIN(a,b) ((a<b ? a : b))
#define MIN(a,b) (((a)<(b) ? (a) : (b)))
/* Default is to choose 8 aggregator nodes in each 32 CN pset.
Also defines default ratio of aggregator nodes in each a pset.
For Virtual Node Mode, the ratio is 8/64 */
#define ADIOI_BG_NAGG_PSET_MIN 1
#define ADIOI_BG_NAGG_PSET_DFLT 8
#define ADIOI_BG_PSET_SIZE_DFLT 32
/* public funcs for ADIOI_BG_ProcInfo_t objects */

Просмотреть файл

@ -0,0 +1,16 @@
## -*- Mode: Makefile; -*-
## vim: set ft=automake :
##
## (C) 2012 by Argonne National Laboratory.
## See COPYRIGHT in top-level directory.
##
if BUILD_AD_PE
noinst_HEADERS += \
adio/ad_gpfs/pe/ad_pe_aggrs.h
romio_other_sources += \
adio/ad_gpfs/pe/ad_pe_aggrs.c
endif BUILD_AD_PE

Просмотреть файл

@ -0,0 +1,276 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_pe_aggrs.c
* \brief The externally used function from this file is is declared in ad_pe_aggrs.h
*/
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (C) 1997-2001 University of Chicago.
* See COPYRIGHT notice in top-level directory.
*/
/*#define TRACE_ON */
#include "adio.h"
#include "adio_cb_config_list.h"
#include "../ad_gpfs.h"
#include "ad_pe_aggrs.h"
#include "mpiimpl.h"
#ifdef AGGREGATION_PROFILE
#include "mpe.h"
#endif
#ifdef USE_DBG_LOGGING
#define AGG_DEBUG 1
#endif
#ifndef TRACE_ERR
# define TRACE_ERR(format...)
#endif
/*
* Compute the aggregator-related parameters that are required in 2-phase
* collective IO of ADIO.
* The parameters are
* . the number of aggregators (proxies) : fd->hints->cb_nodes
* . the ranks of the aggregators : fd->hints->ranklist
* If MP_IONODEFILE is defined, POE determines all tasks on every node listed
* in the node file and defines MP_IOTASKLIST with them, making them all
* aggregators. Alternatively, the user can explictly set MP_IOTASKLIST
* themselves. The format of the MP_IOTASKLIST is a colon-delimited list of
* task ids, the first entry being the total number of aggregators, for example
* to specify 4 aggregators on task ids 0,8,16,24 the value would be:
* 4:0:8:16:24. If there is no MP_IONODEFILE, or MP_IOTASKLIST, then the
* default aggregator selection is 1 task per node for every node of the job -
* additionally, an environment variable MP_IOAGGR_CNT can be specified, which
* defines the total number of aggregators, spread evenly across all the nodes.
* The romio_cb_nodes and romio_cb_config_list hint user settings are ignored.
*/
int
ADIOI_PE_gen_agg_ranklist(ADIO_File fd)
{
int numAggs = 0;
char *ioTaskList = getenv( "MP_IOTASKLIST" );
char *ioAggrCount = getenv("MP_IOAGGR_CNT");
int i,j;
int inTERcommFlag = 0;
int myRank,commSize;
MPI_Comm_rank(fd->comm, &myRank);
MPI_Comm_size(fd->comm, &commSize);
MPI_Comm_test_inter(fd->comm, &inTERcommFlag);
if (inTERcommFlag) {
FPRINTF(stderr,"ERROR: ATTENTION: inTERcomms are not supported in MPI-IO - aborting....\n");
perror("ADIOI_PE_gen_agg_ranklist:");
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (ioTaskList) {
int ioTaskListLen = strlen(ioTaskList);
int ioTaskListPos = 0;
char tmpBuf[8]; /* Big enough for 1M tasks (7 digits task ID). */
tmpBuf[7] = '\0';
for (i=0; i<7; i++) {
tmpBuf[i] = *ioTaskList++; /* Maximum is 7 digits for 1 million. */
ioTaskListPos++;
if (*ioTaskList == ':') { /* If the next char is a ':' ends it. */
tmpBuf[i+1] = '\0';
break;
}
}
numAggs = atoi(tmpBuf);
if (numAggs == 0)
FPRINTF(stderr,"ERROR: ATTENTION: Number of aggregators specified in MP_IOTASKLIST set at 0 - default aggregator selection will be used.\n");
else if (!((numAggs > 0 ) && (numAggs <= commSize))) {
FPRINTF(stderr,"ERROR: ATTENTION: The number of aggregators (%s) specified in MP_IOTASKLIST is outside the communicator task range of %d.\n",tmpBuf,commSize);
numAggs = commSize;
}
fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));
int aggIndex = 0;
while (aggIndex < numAggs) {
ioTaskList++; /* Advance past the ':' */
ioTaskListPos++;
int allDigits=1;
for (i=0; i<7; i++) {
if (*ioTaskList < '0' || *ioTaskList > '9')
allDigits=0;
tmpBuf[i] = *ioTaskList++;
ioTaskListPos++;
if ( (*ioTaskList == ':') || (*ioTaskList == '\0') ) {
tmpBuf[i+1] = '\0';
break;
}
}
if (allDigits) {
int newAggRank = atoi(tmpBuf);
if (!((newAggRank >= 0 ) && (newAggRank < commSize))) {
FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not within the communicator task range of 0 to %d - it will be ignored.\n",tmpBuf,commSize-1);
}
else {
int aggAlreadyAdded = 0;
for (i=0;i<aggIndex;i++)
if (fd->hints->ranklist[i] == newAggRank) {
aggAlreadyAdded = 1;
break;
}
if (!aggAlreadyAdded)
fd->hints->ranklist[aggIndex++] = newAggRank;
else
FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%d' is specified multiple times in MP_IOTASKLIST - duplicates are ignored.\n",newAggRank);
}
}
else {
FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not a valid integer task id - it will be ignored.\n",tmpBuf);
}
/* At the end check whether the list is shorter than specified. */
if (ioTaskListPos == ioTaskListLen) {
if (aggIndex == 0) {
FPRINTF(stderr,"ERROR: ATTENTION: No aggregators were correctly specified in MP_IOTASKLIST - default aggregator selection will be used.\n");
ADIOI_Free(fd->hints->ranklist);
}
else if (aggIndex < numAggs)
FPRINTF(stderr,"ERROR: ATTENTION: %d aggregators were specified in MP_IOTASKLIST but only %d were correctly specified - setting the number of aggregators to %d.\n",numAggs, aggIndex,aggIndex);
numAggs = aggIndex;
}
}
}
if (numAggs == 0) {
MPID_Comm *mpidCommData;
MPID_Comm_get_ptr(fd->comm,mpidCommData);
int localSize = mpidCommData->local_size;
// get my node rank
int myNodeRank = mpidCommData->intranode_table[mpidCommData->rank];
int *allNodeRanks = (int *) ADIOI_Malloc (localSize * sizeof(int));
allNodeRanks[myRank] = myNodeRank;
MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, allNodeRanks, 1, MPI_INT, fd->comm);
#ifdef AGG_DEBUG
printf("MPID_Comm data: local_size is %d\nintranode_table entries:\n",mpidCommData->local_size);
for (i=0;i<localSize;i++) {
printf("%d ",mpidCommData->intranode_table[i]);
}
printf("\ninternode_table entries:\n");
for (i=0;i<localSize;i++) {
printf("%d ",mpidCommData->internode_table[i]);
}
printf("\n");
printf("\nallNodeRanks entries:\n");
for (i=0;i<localSize;i++) {
printf("%d ",allNodeRanks[i]);
}
printf("\n");
#endif
if (ioAggrCount) {
int cntType = -1;
if ( strcasecmp(ioAggrCount, "ALL") ) {
if ( (cntType = atoi(ioAggrCount)) <= 0 ) {
/* Input is other non-digit or less than 1 the assume */
/* 1 aggregator per node. Note: atoi(-1) reutns -1. */
/* No warning message given here -- done earlier. */
cntType = -1;
}
}
else {
/* ALL is specified set aggr count to localSize */
cntType = -2;
}
switch(cntType) {
case -1:
/* 1 aggr/node case */
{
int rankListIndex = 0;
fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
for (i=0;i<localSize;i++) {
if (allNodeRanks[i] == 0) {
fd->hints->ranklist[rankListIndex++] = i;
numAggs++;
}
}
}
break;
case -2:
/* ALL tasks case */
fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
for (i=0;i<localSize;i++) {
fd->hints->ranklist[i] = i;
numAggs++;
}
break;
default:
/* Specific aggr count case -- MUST be less than localSize, otherwise set to localSize */
if (cntType > localSize)
cntType = localSize;
numAggs = cntType;
// Round-robin thru allNodeRanks - pick the 0's, then the 1's, etc
int currentNodeRank = 0; // node rank currently being selected as aggregator
int rankListIndex = 0;
int currentAllNodeIndex = 0;
fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));
while (rankListIndex < numAggs) {
int foundEntry = 0;
while (!foundEntry && (currentAllNodeIndex < localSize)) {
if (allNodeRanks[currentAllNodeIndex] == currentNodeRank) {
fd->hints->ranklist[rankListIndex++] = currentAllNodeIndex;
foundEntry = 1;
}
currentAllNodeIndex++;
}
if (!foundEntry) {
currentNodeRank++;
currentAllNodeIndex = 0;
}
} // while
break;
} // switch(cntType)
} // if (ioAggrCount)
else { // default is 1 aggregator per node
// take the 0 entries from allNodeRanks
int rankListIndex = 0;
fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
for (i=0;i<localSize;i++) {
if (allNodeRanks[i] == 0) {
fd->hints->ranklist[rankListIndex++] = i;
numAggs++;
}
}
}
ADIOI_Free(allNodeRanks);
}
if ( getenv("MP_I_SHOW_AGGRS") ) {
if (myRank == 0) {
printf("Agg rank list of %d generated:\n", numAggs);
for (i=0;i<numAggs;i++) {
printf("%d ",fd->hints->ranklist[i]);
}
printf("\n");
}
}
fd->hints->cb_nodes = numAggs;
return 0;
}

Просмотреть файл

@ -0,0 +1,30 @@
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
* \file ad_pe_aggrs.h
* \brief ???
*/
/*
*
* Declares functions specific for the PE platform within the GPFS
* parallel I/O solution. For now simply processes the MP_IOTASKLIST
* env var.
*
*/
#ifndef AD_PE_AGGRS_H_
#define AD_PE_AGGRS_H_
#include "adio.h"
#include <sys/stat.h>
#if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#endif
/* generate a list of I/O aggregators following a methodology specific for PE */
int ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
#endif /* AD_PE_AGGRS_H_ */

Просмотреть файл

@ -34,4 +34,6 @@ struct ADIOI_Fns_struct ADIO_GRIDFTP_operations = {
ADIOI_GRIDFTP_Resize, /* Resize */
ADIOI_GRIDFTP_Delete, /* Delete */
ADIOI_GRIDFTP_Feature, /* Features */
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
};

Просмотреть файл

@ -1,3 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
int ADIOI_GRIDFTP_Feature (ADIO_File fd, int flag)
{
switch(flag) {

Просмотреть файл

@ -106,7 +106,8 @@ void ADIOI_GRIDFTP_ReadContig(ADIO_File fd, void *buf, int count,
*error_code)
{
static char myname[]="ADIOI_GRIDFTP_ReadContig";
int myrank, nprocs, datatype_size;
int myrank, nprocs;
MPI_Count datatype_size;
globus_size_t len,bytes_read=0;
globus_off_t goff;
globus_result_t result;
@ -121,7 +122,7 @@ void ADIOI_GRIDFTP_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank);
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
{
@ -219,11 +220,11 @@ void ADIOI_GRIDFTP_ReadDiscontig(ADIO_File fd, void *buf, int count,
MPI_Comm_size(fd->comm,&nprocs);
etype_size=fd->etype_size;
MPI_Type_size(fd->filetype,&ftype_size);
MPI_Type_size_x(fd->filetype,&ftype_size);
MPI_Type_extent(fd->filetype,&ftype_extent);
/* This is arguably unnecessary, as this routine assumes that the
buffer in memory is contiguous */
MPI_Type_size(datatype,&btype_size);
MPI_Type_size_x(datatype,&btype_size);
MPI_Type_extent(datatype,&btype_extent);
ADIOI_Datatype_iscontig(datatype,&buf_contig);
@ -415,7 +416,7 @@ void ADIOI_GRIDFTP_ReadStrided(ADIO_File fd, void *buf, int count,
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank);
MPI_Type_size(datatype,&btype_size);
MPI_Type_size_x(datatype,&btype_size);
bufsize=count*btype_size;
ADIOI_Datatype_iscontig(fd->filetype,&file_contig);
ADIOI_Datatype_iscontig(datatype,&buf_contig);

Просмотреть файл

@ -112,7 +112,8 @@ void ADIOI_GRIDFTP_WriteContig(ADIO_File fd, void *buf, int count,
*error_code)
{
char myname[]="ADIOI_GRIDFTP_WriteContig";
int myrank, nprocs, datatype_size;
int myrank, nprocs;
MPI_Count datatype_size;
globus_size_t len,bytes_written=0;
globus_off_t goff;
globus_result_t result;
@ -127,7 +128,7 @@ void ADIOI_GRIDFTP_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank);
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
{
@ -219,11 +220,11 @@ void ADIOI_GRIDFTP_WriteDiscontig(ADIO_File fd, void *buf, int count,
MPI_Comm_rank(fd->comm,&myrank);
MPI_Comm_size(fd->comm,&nprocs);
etype_size=fd->etype_size;
MPI_Type_size(fd->filetype,&ftype_size);
MPI_Type_size_x(fd->filetype,&ftype_size);
MPI_Type_extent(fd->filetype,&ftype_extent);
/* This is arguably unnecessary, as this routine assumes that the
buffer in memory is contiguous */
MPI_Type_size(datatype,&btype_size);
MPI_Type_size_x(datatype,&btype_size);
MPI_Type_extent(datatype,&btype_extent);
ADIOI_Datatype_iscontig(datatype,&buf_contig);
@ -406,7 +407,7 @@ void ADIOI_GRIDFTP_WriteStrided(ADIO_File fd, void *buf, int count,
MPI_Comm_size(fd->comm, &nprocs);
MPI_Comm_rank(fd->comm, &myrank);
MPI_Type_size(datatype,&btype_size);
MPI_Type_size_x(datatype,&btype_size);
bufsize=count*btype_size;
ADIOI_Datatype_iscontig(fd->filetype,&file_contig);
ADIOI_Datatype_iscontig(datatype,&buf_contig);

Просмотреть файл

@ -33,4 +33,6 @@ struct ADIOI_Fns_struct ADIO_HFS_operations = {
ADIOI_GEN_Flush, /* Flush */
ADIOI_HFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
};

Просмотреть файл

@ -15,12 +15,12 @@ void ADIOI_HFS_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
int err=-1, datatype_size, len;
MPI_Count err=-1, datatype_size, len;
#ifndef PRINT_ERR_MSG
static char myname[] = "ADIOI_HFS_READCONTIG";
#endif
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
#ifdef SPPUX

Просмотреть файл

@ -15,12 +15,12 @@ void ADIOI_HFS_WriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
int err=-1, datatype_size, len;
MPI_Count err=-1, datatype_size, len;
#ifndef PRINT_ERR_MSG
static char myname[] = "ADIOI_HFS_WRITECONTIG";
#endif
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
#ifdef SPPUX

Просмотреть файл

@ -40,4 +40,7 @@ struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
ADIOI_GEN_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature, /* Features */
"LUSTRE:",
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
};

Просмотреть файл

@ -48,15 +48,15 @@ void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);

Просмотреть файл

@ -215,8 +215,8 @@ void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
my_req[i].offsets = (ADIO_Offset *)
ADIOI_Malloc(count_my_req_per_proc[i] *
sizeof(ADIO_Offset));
my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
sizeof(int));
my_req[i].lens = ADIOI_Malloc(count_my_req_per_proc[i] *
sizeof(ADIO_Offset));
count_my_req_procs++;
}
my_req[i].count = 0; /* will be incremented where needed later */

Просмотреть файл

@ -10,14 +10,18 @@
#include "ad_lustre.h"
#include "adio_extern.h"
#include "hint_fns.h"
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
char *value;
int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
int flag;
ADIO_Offset stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
struct lov_user_md lum = { 0 };
int err, myrank, fd_sys, perm, amode, old_mask;
int int_val, tmp_val;
static char myname[] = "ADIOI_LUSTRE_SETINFO";
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
@ -44,17 +48,17 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag)
str_unit=atoi(value);
str_unit=atoll(value);
ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
value, &flag);
if (flag)
str_factor=atoi(value);
str_factor=atoll(value);
ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
MPI_MAX_INFO_VAL, value, &flag);
if (flag)
start_iodev=atoi(value);
start_iodev=atoll(value);
/* direct read and write */
ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
@ -78,7 +82,7 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
stripe_val[1] = str_unit;
stripe_val[2] = start_iodev;
}
MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);
MPI_Bcast(stripe_val, 3, MPI_OFFSET, 0, fd->comm);
if (stripe_val[0] != str_factor
|| stripe_val[1] != str_unit
@ -121,8 +125,20 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
lum.lmm_magic = LOV_USER_MAGIC;
lum.lmm_pattern = 0;
lum.lmm_stripe_size = str_unit;
/* crude check for overflow of lustre internal datatypes.
* Silently cap to large value if user provides a value
* larger than lustre supports */
if (lum.lmm_stripe_size != str_unit) {
lum.lmm_stripe_size = UINT_MAX;
}
lum.lmm_stripe_count = str_factor;
if ( lum.lmm_stripe_count != str_factor) {
lum.lmm_stripe_count = USHRT_MAX;
}
lum.lmm_stripe_offset = start_iodev;
if (lum.lmm_stripe_offset != start_iodev) {
lum.lmm_stripe_offset = USHRT_MAX;
}
err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
if (err == -1 && errno != EEXIST) {
@ -138,56 +154,19 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
if (users_info != MPI_INFO_NULL) {
/* CO: IO Clients/OST,
* to keep the load balancing between clients and OSTs */
ADIOI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value,
&flag);
if (flag && (int_val = atoi(value)) > 0) {
tmp_val = int_val;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_co_ratio",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", value);
fd->hints->fs_hints.lustre.co_ratio = atoi(value);
}
ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_co_ratio",
&(fd->hints->fs_hints.lustre.co_ratio), myname, error_code );
/* coll_threshold:
* if the req size is bigger than this, collective IO may not be performed.
*/
ADIOI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value,
&flag);
if (flag && (int_val = atoi(value)) > 0) {
tmp_val = int_val;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_coll_threshold",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
}
ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_coll_threshold",
&(fd->hints->fs_hints.lustre.coll_threshold), myname, error_code );
/* ds_in_coll: disable data sieving in collective IO */
ADIOI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "disable") ||
!strcmp(value, "DISABLE"))) {
tmp_val = int_val = 2;
MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm);
if (tmp_val != int_val) {
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
"romio_lustre_ds_in_coll",
error_code);
ADIOI_Free(value);
return;
}
ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
}
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_lustre_ds_in_coll",
&(fd->hints->fs_hints.lustre.ds_in_coll), myname, error_code );
}
/* set the values for collective I/O and data sieving parameters */
ADIOI_GEN_SetInfo(fd, users_info, error_code);

Просмотреть файл

@ -105,50 +105,7 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
/* --BEGIN ERROR HANDLING-- */
if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
(fd->direct_write || fd->direct_read))) {
if (errno == ENAMETOOLONG)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_BAD_FILE,
"**filenamelong",
"**filenamelong %s %d",
fd->filename,
strlen(fd->filename));
else if (errno == ENOENT)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_NO_SUCH_FILE,
"**filenoexist",
"**filenoexist %s",
fd->filename);
else if (errno == ENOTDIR || errno == ELOOP)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_BAD_FILE,
"**filenamedir",
"**filenamedir %s",
fd->filename);
else if (errno == EACCES) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_ACCESS,
"**fileaccess",
"**fileaccess %s",
fd->filename );
}
else if (errno == EROFS) {
/* Read only file or file system and write access requested */
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_READ_ONLY,
"**ioneedrd", 0 );
}
else {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
}
/* --END ERROR HANDLING-- */
else *error_code = MPI_SUCCESS;

Просмотреть файл

@ -8,15 +8,22 @@
* Copyright (C) 2008 Sun Microsystems, Lustre group
*/
#ifdef _STDC_C99
#define _XOPEN_SOURCE 600
#else
#define _XOPEN_SOURCE 500
#endif
#include <unistd.h>
#include <stdlib.h>
#include <malloc.h>
#include "ad_lustre.h"
#define LUSTRE_MEMALIGN (1<<12) /* to use page_shift */
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len,
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, int len,
ADIO_Offset offset, int *err);
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len,
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, int len,
ADIO_Offset offset, int *err)
{
int rem, size, nbytes;
@ -33,29 +40,29 @@ static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len
}
}
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, int len,
ADIO_Offset offset, int *err);
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, int len,
ADIO_Offset offset, int *err)
{
int rem, size, nbytes;
if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz))
*err = pread(fd->fd_direct, buf, len, offset);
*err = pread(fd->fd_direct, (void *)buf, len, offset);
else if (len < fd->d_miniosz)
*err = pread(fd->fd_sys, buf, len, offset);
*err = pread(fd->fd_sys, (void *)buf, len, offset);
else {
rem = len % fd->d_miniosz;
size = len - rem;
nbytes = pread(fd->fd_direct, buf, size, offset);
nbytes = pread(fd->fd_direct, (void *)buf, size, offset);
nbytes += pread(fd->fd_sys, ((char *)buf) + size, rem, offset+size);
*err = nbytes;
}
}
static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
static int ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, int len,
off_t offset, int rw);
static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
static int ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, int len,
off_t offset, int rw)
{
int err=-1, diff, size=len, nbytes = 0;
@ -65,9 +72,9 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
diff = fd->d_miniosz - (offset % fd->d_miniosz);
diff = ADIOI_MIN(diff, len);
if (rw)
nbytes = pwrite(fd->fd_sys, buf, diff, offset);
nbytes = pwrite(fd->fd_sys, (void *)buf, diff, offset);
else
nbytes = pread(fd->fd_sys, buf, diff, offset);
nbytes = pread(fd->fd_sys, (void *)buf, diff, offset);
buf = ((char *) buf) + diff;
offset += diff;
size = len - diff;
@ -100,30 +107,31 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
if (newbuf) {
ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err);
if (err > 0) memcpy(buf, newbuf, err);
if (err > 0) memcpy((void *)buf, newbuf, err);
nbytes += err;
ADIOI_Free(newbuf);
}
else nbytes += pread(fd->fd_sys, buf, size, offset);
else nbytes += pread(fd->fd_sys, (void *)buf, size, offset);
}
err = nbytes;
}
return err;
}
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int io_mode, int *error_code);
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int io_mode, int *error_code)
{
int err=-1, datatype_size, len;
int err=-1;
MPI_Count datatype_size, len;
static char myname[] = "ADIOI_LUSTRE_IOCONTIG";
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
if (file_ptr_type == ADIO_INDIVIDUAL) {
@ -148,7 +156,7 @@ static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
#endif
err = read(fd->fd_sys, buf, len);
err = read(fd->fd_sys, (void *)buf, len);
#ifdef ADIOI_MPE_LOGGING
MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
#endif
@ -183,7 +191,7 @@ ioerr:
/* --END ERROR HANDLING-- */
}
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)
{

Просмотреть файл

@ -12,7 +12,7 @@
#include "adio_extern.h"
/* prototypes of functions used for collective writes only. */
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
MPI_Datatype datatype, int nprocs,
int myrank,
ADIOI_Access *others_req,
@ -22,7 +22,7 @@ static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
int contig_access_count,
int *striping_info,
int **buf_idx, int *error_code);
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
ADIOI_Flatlist_node *flat_buf,
char **send_buf,
ADIO_Offset *offset_list,
@ -35,14 +35,14 @@ static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
int *curr_to_proc,
int *done_to_proc, int iter,
MPI_Aint buftype_extent);
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
char *write_buf,
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off,
int size, int *count,
int *start_pos,
int *start_pos,
int *sent_to_proc, int nprocs,
int myrank, int buftype_is_contig,
int contig_access_count,
@ -59,7 +59,7 @@ void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
ADIO_Offset *srt_off, int *srt_len, int *start_pos,
int nprocs, int nprocs_recv, int total_elements);
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype,
int file_ptr_type, ADIO_Offset offset,
ADIO_Status *status, int *error_code)
@ -266,9 +266,9 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
#ifdef HAVE_STATUS_SET_BYTES
if (status) {
int bufsize, size;
MPI_Count bufsize, size;
/* Don't set status if it isn't needed */
MPI_Type_size(datatype, &size);
MPI_Type_size_x(datatype, &size);
bufsize = size * count;
MPIR_Status_set_bytes(status, datatype, bufsize);
}
@ -283,7 +283,7 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
* code is created and returned in error_code.
*/
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
MPI_Datatype datatype, int nprocs,
int myrank, ADIOI_Access *others_req,
ADIOI_Access *my_req,
@ -613,14 +613,14 @@ over:
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
* in the case of error.
*/
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
char *write_buf,
ADIOI_Flatlist_node *flat_buf,
ADIO_Offset *offset_list,
ADIO_Offset *len_list, int *send_size,
int *recv_size, ADIO_Offset off,
int size, int *count,
int *start_pos,
int *start_pos,
int *sent_to_proc, int nprocs,
int myrank, int buftype_is_contig,
int contig_access_count,
@ -656,7 +656,7 @@ static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
j = 0;
for (i = 0; i < nprocs; i++) {
if (recv_size[i]) {
MPI_Type_hindexed(count[i],
ADIOI_Type_create_hindexed_x(count[i],
&(others_req[i].lens[start_pos[i]]),
&(others_req[i].mem_ptrs[start_pos[i]]),
MPI_BYTE, recv_types + j);
@ -885,7 +885,7 @@ static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
ADIOI_BUF_INCR \
}
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
ADIOI_Flatlist_node *flat_buf,
char **send_buf,
ADIO_Offset *offset_list,

Просмотреть файл

@ -144,7 +144,7 @@
} \
}
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status * status,
int *error_code)
@ -156,7 +156,7 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
int n_etypes_in_filetype;
ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size;
MPI_Count filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off;
@ -186,7 +186,7 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
MPI_Type_size(fd->filetype, &filetype_size);
MPI_Type_size_x(fd->filetype, &filetype_size);
if (!filetype_size) {
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, 0);
@ -196,7 +196,7 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_size_x(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;

Просмотреть файл

@ -37,4 +37,7 @@ struct ADIOI_Fns_struct ADIO_NFS_operations = {
ADIOI_NFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_NFS_Feature, /* Features */
"NFS:", /* fsname: just a string */
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
};

Просмотреть файл

@ -74,7 +74,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
ADIO_Offset offset, ADIO_Status *status, int
*error_code);
void ADIOI_NFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp,
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp,
int *error_code);
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);

Просмотреть файл

@ -1,3 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "adio.h"
#include "ad_nfs.h"
@ -11,6 +17,7 @@ int ADIOI_NFS_Feature(ADIO_File fd, int flag)
return 1;
case ADIO_SCALABLE_OPEN:
case ADIO_UNLINK_AFTER_CLOSE:
case ADIO_SCALABLE_RESIZE:
default:
return 0;
}

Просмотреть файл

@ -12,11 +12,11 @@
shared_fp by the number of etypes to be accessed (incr) in the read
or write following this function. */
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, int incr, ADIO_Offset *shared_fp,
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
int *error_code)
{
ADIO_Offset new_fp;
int err;
ssize_t err;
MPI_Comm dupcommself;
static char myname[] = "ADIOI_NFS_GET_SHARED_FP";

Просмотреть файл

@ -13,11 +13,11 @@ void ADIOI_NFS_IreadContig(ADIO_File fd, void *buf, int count,
ADIO_Offset offset, ADIO_Request *request,
int *error_code)
{
int len, typesize;
MPI_Count len, typesize;
int aio_errno = 0;
static char myname[] = "ADIOI_NFS_IREADCONTIG";
MPI_Type_size(datatype, &typesize);
MPI_Type_size_x(datatype, &typesize);
len = count * typesize;
if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;

Просмотреть файл

@ -20,11 +20,11 @@ void ADIOI_NFS_IwriteContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int *error_code)
{
int len, typesize;
MPI_Count len, typesize;
int aio_errno = 0;
static char myname[] = "ADIOI_NFS_IWRITECONTIG";
MPI_Type_size(datatype, &typesize);
MPI_Type_size_x(datatype, &typesize);
len = count * typesize;
if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;

Просмотреть файл

@ -52,64 +52,7 @@ void ADIOI_NFS_Open(ADIO_File fd, int *error_code)
}
if (fd->fd_sys == -1) {
/* Check for special error codes for those MPI error
classes that relate to particular problems */
if (errno == ENAMETOOLONG)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_BAD_FILE,
"**filenamelong",
"**filenamelong %s %d",
fd->filename,
strlen(fd->filename));
else if (errno == ENOENT)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_NO_SUCH_FILE,
"**filenoexist",
"**filenoexist %s",
fd->filename);
else if (errno == ENOTDIR || errno == ELOOP)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_BAD_FILE,
"**filenamedir",
"**filenamedir %s",
fd->filename);
else if (errno == EACCES) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_ACCESS,
"**fileaccess",
"**fileaccess %s",
fd->filename);
}
else if (errno == EROFS) {
/* Read only file or file system and write access requested */
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_READ_ONLY,
"**ioneedrd", 0);
}
else if(errno == EISDIR) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_BAD_FILE,
"**filename", 0);
}
else if(errno == EEXIST) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_FILE_EXISTS,
"**fileexist", 0);
}
else {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
}
else *error_code = MPI_SUCCESS;
}

Просмотреть файл

@ -12,10 +12,11 @@ void ADIOI_NFS_ReadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
int err=-1, datatype_size, len;
int err=-1;
MPI_Count datatype_size, len;
static char myname[] = "ADIOI_NFS_READCONTIG";
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
@ -171,7 +172,8 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size, req_len, partial_read;
int req_len, partial_read;
MPI_Count filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off;
@ -185,7 +187,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
MPI_Type_size(fd->filetype, &filetype_size);
MPI_Type_size_x(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, 0);
@ -195,7 +197,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_size_x(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;

Просмотреть файл

@ -30,7 +30,7 @@ Unlock
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
{
int err;
ssize_t err;
MPI_Comm dupcommself;
static char myname[] = "ADIOI_NFS_SET_SHARED_FP";

Просмотреть файл

@ -12,10 +12,11 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status, int *error_code)
{
int err=-1, datatype_size, len;
int err=-1;
MPI_Count datatype_size, len;
static char myname[] = "ADIOI_NFS_WRITECONTIG";
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
@ -110,7 +111,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
return; \
goto fn_exit; \
} \
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
@ -140,7 +141,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
return; \
goto fn_exit; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
@ -164,7 +165,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
return; \
goto fn_exit; \
} \
} \
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
@ -186,7 +187,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
MPIR_ERR_RECOVERABLE, myname, \
__LINE__, MPI_ERR_IO, \
"**ioRMWrdwr", 0); \
return; \
goto fn_exit; \
} \
write_sz = ADIOI_MIN(req_len, writebuf_len); \
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
@ -275,12 +276,13 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
int n_filetypes, etype_in_filetype;
ADIO_Offset abs_off_in_filetype=0;
int filetype_size, etype_size, buftype_size, req_len;
int req_len;
MPI_Count filetype_size, etype_size, buftype_size;
MPI_Aint filetype_extent, buftype_extent;
int buf_count, buftype_is_contig, filetype_is_contig;
ADIO_Offset userbuf_off;
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
char *writebuf, *value;
char *writebuf=NULL, *value;
int st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize;
static char myname[] = "ADIOI_NFS_WRITESTRIDED";
@ -288,7 +290,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
MPI_Type_size(fd->filetype, &filetype_size);
MPI_Type_size_x(fd->filetype, &filetype_size);
if ( ! filetype_size ) {
#ifdef HAVE_STATUS_SET_BYTES
MPIR_Status_set_bytes(status, datatype, 0);
@ -298,7 +300,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
}
MPI_Type_extent(fd->filetype, &filetype_extent);
MPI_Type_size(datatype, &buftype_size);
MPI_Type_size_x(datatype, &buftype_size);
MPI_Type_extent(datatype, &buftype_extent);
etype_size = fd->etype_size;
@ -364,8 +366,6 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
if (fd->atomicity)
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
if (err_flag) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
@ -517,8 +517,8 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
myname, __LINE__,
MPI_ERR_IO,
"ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
return;
}
goto fn_exit;
}
if (buftype_is_contig && !filetype_is_contig) {
@ -653,8 +653,6 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
if (err == -1) err_flag = 1;
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
if (err_flag) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
@ -674,4 +672,8 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
#endif
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
fn_exit:
if (writebuf != NULL) ADIOI_Free(writebuf);
return;
}

Просмотреть файл

@ -12,7 +12,7 @@
struct ADIOI_Fns_struct ADIO_NTFS_operations = {
ADIOI_NTFS_Open, /* Open */
ADIOI_GEN_OpenColl, /* OpenColl */
ADIOI_FAILSAFE_OpenColl, /* OpenColl */
ADIOI_NTFS_ReadContig, /* ReadContig */
ADIOI_NTFS_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
@ -34,5 +34,7 @@ struct ADIOI_Fns_struct ADIO_NTFS_operations = {
ADIOI_NTFS_Flush, /* Flush */
ADIOI_NTFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_NTFS_Feature /* Features */
ADIOI_NTFS_Feature, /* Features */
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
};

Просмотреть файл

@ -1,3 +1,9 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
*
* (C) 2008 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "adio.h"
int ADIOI_NTFS_Feature(ADIO_File fd, int flag)

Просмотреть файл

@ -10,11 +10,11 @@ void ADIOI_NTFS_IreadContig(ADIO_File fd, void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Request *request, int *error_code)
{
int len, typesize;
MPI_Count len, typesize;
int err;
static char myname[] = "ADIOI_NTFS_IreadContig";
MPI_Type_size(datatype, &typesize);
MPI_Type_size_x(datatype, &typesize);
len = count * typesize;
if (file_ptr_type == ADIO_INDIVIDUAL)

Просмотреть файл

@ -171,11 +171,11 @@ void ADIOI_NTFS_IwriteContig(ADIO_File fd, void *buf, int count,
ADIO_Offset offset, ADIO_Request *request,
int *error_code)
{
int len, typesize;
MPI_Count len, typesize;
int err;
static char myname[] = "ADIOI_NTFS_IwriteContig";
MPI_Type_size(datatype, &typesize);
MPI_Type_size_x(datatype, &typesize);
len = count * typesize;
if (file_ptr_type == ADIO_INDIVIDUAL)

Просмотреть файл

@ -13,7 +13,8 @@ void ADIOI_NTFS_ReadContig(ADIO_File fd, void *buf, int count,
{
LONG dwTemp;
DWORD dwNumRead = 0;
int err=-1, datatype_size, len;
int err=-1;
MPI_Count datatype_size, len;
static char myname[] = "ADIOI_NTFS_ReadContig";
OVERLAPPED *pOvl;
@ -23,7 +24,7 @@ void ADIOI_NTFS_ReadContig(ADIO_File fd, void *buf, int count,
offset = fd->fp_ind;
}
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);

Просмотреть файл

@ -14,7 +14,7 @@ void ADIOI_NTFS_WriteContig(ADIO_File fd, void *buf, int count,
static char myname[] = "ADIOI_NTFS_WriteContig";
LONG dwTemp;
DWORD dwNumWritten = 0;
int err=-1, datatype_size, len;
MPI_Count err=-1, datatype_size, len;
OVERLAPPED *pOvl;
/* If file pointer type in ADIO_INDIVIDUAL then offset should be
@ -23,7 +23,7 @@ void ADIOI_NTFS_WriteContig(ADIO_File fd, void *buf, int count,
offset = fd->fp_ind;
}
MPI_Type_size(datatype, &datatype_size);
MPI_Type_size_x(datatype, &datatype_size);
len = datatype_size * count;
pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);

Просмотреть файл

@ -7,10 +7,6 @@
if BUILD_AD_PANFS
# I don't like this hard-coded path to the PANFS headers but I guess that's
# where they always are?
AM_CPPFLAGS += -I/opt/panfs/include
noinst_HEADERS += adio/ad_panfs/ad_panfs.h
romio_other_sources += \

Просмотреть файл

@ -41,4 +41,7 @@ struct ADIOI_Fns_struct ADIO_PANFS_operations = {
ADIOI_PANFS_Resize, /* Resize */
ADIOI_GEN_Delete, /* Delete */
ADIOI_GEN_Feature,
"PANFS: Panasas PanFS",
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
};

Просмотреть файл

@ -32,11 +32,17 @@ void ADIOI_PANFS_ReadContig(ADIO_File fd, void *buf, int count,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
void ADIOI_PANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
void ADIOI_PANFS_WriteContig(ADIO_File fd, void *buf, int count,
void ADIOI_PANFS_WriteContig(ADIO_File fd, const void *buf, int count,
MPI_Datatype datatype, int file_ptr_type,
ADIO_Offset offset, ADIO_Status *status,
int *error_code);
/* TODO: move this to common code and have all routines retry. */
/* TODO: also check for EWOULDBLOCK */
#if defined(NEEDS_USLEEP_DECL)
int usleep(useconds_t usec);
#endif
/* Delay 1 ms */
#define AD_PANFS_RETRY_DELAY 1000

Просмотреть файл

@ -8,21 +8,13 @@
#include "ad_panfs.h"
#include <pan_fs_client_cw_mode.h>
#include "hint_fns.h"
void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
#if defined(MPICH) || !defined(PRINT_ERR_MSG)
static char myname[] = "ADIOI_PANFS_SETINFO";
#endif
char* value;
int flag, tmp_val = -1;
unsigned long int concurrent_write = 0;
pan_fs_client_layout_agg_type_t layout_type = PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT;
unsigned long int layout_stripe_unit = 0;
unsigned long int layout_parity_stripe_width = 0;
unsigned long int layout_parity_stripe_depth = 0;
unsigned long int layout_total_num_comps = 0;
pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;
int gen_error_code;
*error_code = MPI_SUCCESS;
@ -33,104 +25,39 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
*/
MPI_Info_create(&(fd->info));
/* anticipate concurrent writes in an MPI-IO application */
ADIOI_Info_set (fd->info, "panfs_concurrent_write", "1");
/* has user specified striping parameters
and do they have the same value on all processes? */
if (users_info != MPI_INFO_NULL) {
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
ADIOI_Info_get(users_info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
concurrent_write = strtoul(value,NULL,10);
tmp_val = concurrent_write;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != concurrent_write) {
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_concurrent_write\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
ADIOI_Info_set(fd->info, "panfs_concurrent_write", value);
}
ADIOI_Info_check_and_install_int(fd, users_info, "panfs_concurrent_write",
NULL, myname, error_code);
ADIOI_Info_get(users_info, "panfs_layout_type", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_type = strtoul(value,NULL,10);
tmp_val = layout_type;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != layout_type) {
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_type\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
ADIOI_Info_set(fd->info, "panfs_layout_type", value);
}
ADIOI_Info_check_and_install_int(fd, users_info, "panfs_layout_type",
NULL, myname, error_code);
ADIOI_Info_get(users_info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_stripe_unit = strtoul(value,NULL,10);
tmp_val = layout_stripe_unit;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != layout_stripe_unit) {
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_stripe_unit\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", value);
}
ADIOI_Info_check_and_install_int(fd, users_info, "panfs_layout_stripe_unit",
NULL, myname, error_code);
ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
layout_parity_stripe_width = strtoul(value,NULL,10);
tmp_val = layout_parity_stripe_width;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != layout_parity_stripe_width) {
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_width\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", value);
}
/* strange: there was a check "layout_type ==
* PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE, but
* nothing ever touched layout_type */
ADIOI_Info_check_and_install_int(fd, users_info,
"panfs_layout_parity_stripe_width", NULL, myname, error_code);
ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
layout_parity_stripe_depth = strtoul(value,NULL,10);
tmp_val = layout_parity_stripe_depth;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != layout_parity_stripe_depth) {
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_depth\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", value);
}
ADIOI_Info_get(users_info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
value, &flag);
if (flag) {
layout_total_num_comps = strtoul(value,NULL,10);
tmp_val = layout_total_num_comps;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != layout_total_num_comps) {
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_total_num_comps\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", value);
}
ADIOI_Info_get(users_info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE || layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
layout_visit_policy = strtoul(value,NULL,10);
tmp_val = layout_visit_policy;
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
if (tmp_val != layout_visit_policy) {
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_visit_policy\" must be the same on all processes\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", value);
}
ADIOI_Free(value);
ADIOI_Info_check_and_install_int(fd, users_info,
"panfs_layout_parity_stripe_depth", NULL, myname, error_code);
ADIOI_Info_check_and_install_int(fd, users_info,
"panfs_layout_total_num_comps", NULL, myname, error_code);
/* this hint used to check for
* PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE or
* PAN_FS_CLIENT_LAYOUT_TYPE__RAID10, but again, layout_type never
* gets updated */
ADIOI_Info_check_and_install_int(fd, users_info,
"panfs_layout_visit_policy", NULL, myname, error_code);
}
}

Просмотреть файл

@ -191,7 +191,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
}
/* create PanFS object */
bzero(&file_create_args,sizeof(pan_fs_client_layout_create_args_t));
memset(&file_create_args,0,sizeof(pan_fs_client_layout_create_args_t));
/* open directory */
fd_dir = open(path, O_RDONLY);
if (fd_dir < 0) {
@ -285,7 +285,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
int rc;
char temp_buffer[TEMP_BUFFER_SIZE];
pan_fs_client_layout_query_args_t file_query_args;
bzero(&file_query_args,sizeof(pan_fs_client_layout_query_args_t));
memset(&file_query_args,0,sizeof(pan_fs_client_layout_query_args_t));
file_query_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
rc = ioctl(fd->fd_sys, PAN_FS_CLIENT_LAYOUT_QUERY_FILE, &file_query_args);
if (rc < 0)
@ -327,6 +327,10 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.layout_visit_policy);
ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
break;
case PAN_FS_CLIENT_LAYOUT_TYPE__INVALID:
case PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT:
MPI_Info_set(fd->info, "panfs_layout_type",
"PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
default:
break;
}
@ -338,50 +342,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
if (fd->fd_sys == -1) {
if (errno == ENAMETOOLONG)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_BAD_FILE,
"**filenamelong",
"**filenamelong %s %d",
fd->filename,
strlen(fd->filename));
else if (errno == ENOENT)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_NO_SUCH_FILE,
"**filenoexist",
"**filenoexist %s",
fd->filename);
else if (errno == ENOTDIR || errno == ELOOP)
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE,
myname, __LINE__,
MPI_ERR_BAD_FILE,
"**filenamedir",
"**filenamedir %s",
fd->filename);
else if (errno == EACCES) {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_ACCESS,
"**fileaccess",
"**fileaccess %s",
fd->filename );
}
else if (errno == EROFS) {
/* Read only file or file system and write access requested */
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_READ_ONLY,
"**ioneedrd", 0 );
}
else {
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
MPIR_ERR_RECOVERABLE, myname,
__LINE__, MPI_ERR_IO, "**io",
"**io %s", strerror(errno));
}
*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
}
else *error_code = MPI_SUCCESS;
}

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше