refresh ROMIO based on v3.2a2-84-gef1cf14
Этот коммит содержится в:
родитель
c857cc926c
Коммит
0914de9eae
13
ompi/mca/io/romio/romio/.gitignore
поставляемый
Обычный файл
13
ompi/mca/io/romio/romio/.gitignore
поставляемый
Обычный файл
@ -0,0 +1,13 @@
|
||||
/Makefile
|
||||
/.deps
|
||||
/*.bb
|
||||
/*.bbg
|
||||
/*.gcda
|
||||
/*.gcno
|
||||
/.libs
|
||||
/.libstamp*
|
||||
/*.lo
|
||||
/.*-cache
|
||||
.state-cache
|
||||
version.m4
|
||||
confdb/config.rpath
|
@ -5,15 +5,15 @@
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# (C) 2011 by Argonne National Laboratory.
|
||||
@ -33,11 +33,11 @@ include $(top_srcdir)/Makefile.options
|
||||
ACLOCAL_AMFLAGS = -I confdb
|
||||
|
||||
# empty variable initializations so that later code can append (+=)
|
||||
include_HEADERS =
|
||||
include_HEADERS =
|
||||
nodist_include_HEADERS =
|
||||
noinst_HEADERS =
|
||||
noinst_HEADERS =
|
||||
EXTRA_DIST =
|
||||
SUFFIXES =
|
||||
SUFFIXES =
|
||||
doc1_src_txt =
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
@ -47,14 +47,14 @@ doc1_src_txt =
|
||||
# In MPICH these will have an MPI_ and a PMPI_ version. Other implementations
|
||||
# (like OMPI) only want these to be MPI_ routines, possibly with some
|
||||
# name-shifting prefix.
|
||||
romio_mpi_sources =
|
||||
romio_mpi_sources =
|
||||
|
||||
# regular old source files that implement ROMIO, such as ADIO code
|
||||
romio_other_sources =
|
||||
|
||||
# code that may need to be "up" called from the MPI library and/or is
|
||||
# MPI-implementation-specific in some way
|
||||
glue_sources =
|
||||
glue_sources =
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# when building under MPICH we must be able to find mpi.h
|
||||
@ -88,25 +88,30 @@ libromio_dist_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_so
|
||||
|
||||
## NOTE: ROMIO's old build system builds a bunch of _foo.o objects that contain
|
||||
## PMPI_ implementations as well as calls to only other PMPI routines. In
|
||||
## MPICH, these are the objects that need to go into libmpich, while the foo.o
|
||||
## objects should go into libpmpich. Furthermore, the -D option for ROMIO's
|
||||
## MPICH, these are the objects that need to go into libmpi, while the foo.o
|
||||
## objects should go into libpmpi. Furthermore, the -D option for ROMIO's
|
||||
## source files is different and inverted (in the boolean sense) compared with
|
||||
## MPICH's definition. And ROMIO was dumping all of the symbols into the main
|
||||
## libmpich library, regardless of the separate profiling library's existence.
|
||||
## libmpi library, regardless of the separate profiling library's existence.
|
||||
##
|
||||
## Annoying, right?
|
||||
if BUILD_PROFILING_LIB
|
||||
# The current best strategy for now is to build the PMPI symbols as a separate
|
||||
# convenience lib to permit adding the special "-D..." argument for all objects.
|
||||
# MPICH will then link in both convenience library into libmpich, since it
|
||||
# MPICH will then link in both convenience libraries into libmpi, since it
|
||||
# won't work very well the other way around.
|
||||
noinst_LTLIBRARIES += libpromio.la
|
||||
libpromio_la_SOURCES = $(romio_mpi_sources)
|
||||
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
|
||||
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
|
||||
endif BUILD_PROFILING_LIB
|
||||
|
||||
else !BUILD_ROMIO_EMBEDDED
|
||||
## TODO build a libromio.la (non-convenience) and possibly a libglue.la or something?
|
||||
lib_LTLIBRARIES = libromio.la
|
||||
libromio_la_SOURCES = $(romio_mpi_sources) $(romio_other_sources) $(glue_sources)
|
||||
if BUILD_PROFILING_LIB
|
||||
libpromio_la_SOURCES = $(romio_mpi_sources)
|
||||
libpromio_la_CPPFLAGS = $(AM_CPPFLAGS) -DMPIO_BUILD_PROFILING
|
||||
endif BUILD_PROFILING_LIB
|
||||
|
||||
endif
|
||||
|
||||
@ -147,20 +152,27 @@ mandoc_path3=$(abs_top_builddir)/man/man3
|
||||
htmldoc_path1=$(abs_top_builddir)/www/www1
|
||||
htmldoc_path3=$(abs_top_builddir)/www/www3
|
||||
doctext_docnotes=
|
||||
# Provide an easily replaced url root for the generated index file.
|
||||
# You can override this with URL desired in the index file generated by doctext.
|
||||
# You can ignore this if you don't use mapnames or tohtml to add links
|
||||
# to the MPI manual pages to documents.
|
||||
htmldoc_root3="--your-url-here--"
|
||||
|
||||
.c.man-phony:
|
||||
$(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path3) -ext 3 \
|
||||
-heading MPI -quotefmt $(doctext_docnotes) $<
|
||||
-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
|
||||
.c.html-phony:
|
||||
$(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path3) \
|
||||
-heading MPI -quotefmt $(doctext_docnotes) $<
|
||||
-heading MPI -quotefmt -nolocation \
|
||||
-index $(htmldoc_path3)/mpi.cit -indexdir $(htmldoc_root3) \
|
||||
$(doctext_docnotes) $<
|
||||
|
||||
.txt.man1-phony:
|
||||
$(doctextman_verbose)$(DOCTEXT) -man -mpath $(mandoc_path1) -ext 1 \
|
||||
-heading MPI -quotefmt $(doctext_docnotes) $<
|
||||
-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
|
||||
.txt.html1-phony:
|
||||
$(doctexthtml_verbose)$(DOCTEXT) -html -mpath $(htmldoc_path1) \
|
||||
-heading MPI -quotefmt $(doctext_docnotes) $<
|
||||
-heading MPI -quotefmt -nolocation $(doctext_docnotes) $<
|
||||
|
||||
# use mandoc-local target to force directory creation before running DOCTEXT
|
||||
mandoc:
|
||||
|
@ -6,14 +6,14 @@
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
|
@ -492,7 +492,7 @@ to include the file mpio.h for C or mpiof.h for Fortran in your MPI-IO
|
||||
program.
|
||||
|
||||
Note that on HP machines running HPUX and on NEC SX-4, you need to
|
||||
compile Fortran programs with mpif90, because the f77 compilers on
|
||||
compile Fortran programs with mpifort, because the f77 compilers on
|
||||
these machines don't support 8-byte integers.
|
||||
|
||||
With MPICH, HP MPI, or NEC MPI, you can compile MPI-IO programs as
|
||||
@ -500,9 +500,9 @@ With MPICH, HP MPI, or NEC MPI, you can compile MPI-IO programs as
|
||||
or
|
||||
mpif77 foo.f
|
||||
or
|
||||
mpif90 foo.f
|
||||
mpifort foo.f
|
||||
|
||||
As mentioned above, mpif90 is preferred over mpif77 on HPUX and NEC
|
||||
As mentioned above, mpifort is preferred over mpif77 on HPUX and NEC
|
||||
because the f77 compilers on those machines do not support 8-byte integers.
|
||||
|
||||
With SGI MPI, you can compile MPI-IO programs as
|
||||
@ -566,7 +566,7 @@ systems because they don't support fcntl file locks, and ROMIO uses
|
||||
that feature to implement shared file pointers.
|
||||
|
||||
* On HP machines running HPUX and on NEC SX-4, you need to compile
|
||||
Fortran programs with mpif90 instead of mpif77, because the f77
|
||||
Fortran programs with mpifort instead of mpif77, because the f77
|
||||
compilers on these machines don't support 8-byte integers.
|
||||
|
||||
* The file-open mode MPI_MODE_EXCL does not work on Intel PFS file system,
|
||||
|
@ -1,28 +1,11 @@
|
||||
Please note that this is *NOT* a vanilla MPICH 3.0.4 distribution of the
|
||||
ROMIO package from Argonne National Labs. Various customizations had
|
||||
to be applied to the configuration process. More to the point -- if
|
||||
replace this copy of ROMIO with a newer version, it will likely not
|
||||
work. :-(
|
||||
Please note that this is *NOT* a vanilla MPICH v3.2a2-84-gef1cf14
|
||||
distribution of the ROMIO package from Argonne National Labs.
|
||||
Various customizations had to be applied to the configuration process.
|
||||
More to the point -- if you replace this copy of ROMIO with a newer version,
|
||||
it will likely not work. :-(
|
||||
|
||||
- The Open MPI Team
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
Local modifications:
|
||||
====================
|
||||
|
||||
- Moved aclocal.m4 -> acinclude.m4
|
||||
- Bunches of changes in acinclude.m4 to make it work with modern
|
||||
versions of the GNU auto tools -- see comments in file.
|
||||
- Bunches of changes in configure.ac to make it work with modern
|
||||
versions of the GNU auto tools -- see comments in file.
|
||||
- We define MPI_MAX_DATAREP_STRING, therefore
|
||||
protect the redefinition in include/mpio.h
|
||||
- Not all systems have snprintf(); include "opal/util/printf.h" in
|
||||
test/noncontig_coll.c
|
||||
|
||||
Patches past 3.0.4: (Update whenever ROMIO is updated)
|
||||
- Deal with endless ESTALE cases:
|
||||
http://git.mpich.org/mpich.git/commit/b250d338e66667a8a1071a5f73a4151fd59f83b2
|
||||
- Fix compile error with Lustre 2.4
|
||||
http://trac.mpich.org/projects/mpich/changeset/a0c4278f1400a73eb63c5106e2bd3b1a6565ad5a
|
||||
Local modifications are in ompi.patch
|
||||
|
@ -19,13 +19,11 @@ noinst_HEADERS += \
|
||||
adio/include/mpio_error.h \
|
||||
adio/include/mpipr.h \
|
||||
adio/include/mpiu_greq.h \
|
||||
adio/include/nopackage.h \
|
||||
adio/include/mpiu_external32.h \
|
||||
adio/include/romioconf-undefs.h
|
||||
adio/include/nopackage.h
|
||||
|
||||
include $(top_srcdir)/adio/ad_bg/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_bgl/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_bglockless/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_gpfs/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_gpfs/bg/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_gpfs/pe/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_gridftp/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_hfs/Makefile.mk
|
||||
include $(top_srcdir)/adio/ad_lustre/Makefile.mk
|
||||
|
@ -1,35 +0,0 @@
|
||||
## -*- Mode: Makefile; -*-
|
||||
## vim: set ft=automake :
|
||||
##
|
||||
## (C) 2011 by Argonne National Laboratory.
|
||||
## See COPYRIGHT in top-level directory.
|
||||
##
|
||||
|
||||
if BUILD_AD_BG
|
||||
|
||||
AM_CPPFLAGS += -DBGL_OPTIM_STEP1_2=1 -DBGL_OPTIM_STEP1_1=1
|
||||
|
||||
noinst_HEADERS += \
|
||||
adio/ad_bg/ad_bg_aggrs.h \
|
||||
adio/ad_bg/ad_bg.h \
|
||||
adio/ad_bg/ad_bg_pset.h \
|
||||
adio/ad_bg/ad_bg_tuning.h
|
||||
|
||||
romio_other_sources += \
|
||||
adio/ad_bg/ad_bg_aggrs.c \
|
||||
adio/ad_bg/ad_bg_close.c \
|
||||
adio/ad_bg/ad_bg_flush.c \
|
||||
adio/ad_bg/ad_bg_hints.c \
|
||||
adio/ad_bg/ad_bg_pset.c \
|
||||
adio/ad_bg/ad_bg_read.c \
|
||||
adio/ad_bg/ad_bg_tuning.c \
|
||||
adio/ad_bg/ad_bg_write.c \
|
||||
adio/ad_bg/ad_bg.c \
|
||||
adio/ad_bg/ad_bg_fcntl.c \
|
||||
adio/ad_bg/ad_bg_getsh.c \
|
||||
adio/ad_bg/ad_bg_open.c \
|
||||
adio/ad_bg/ad_bg_rdcoll.c \
|
||||
adio/ad_bg/ad_bg_setsh.c \
|
||||
adio/ad_bg/ad_bg_wrcoll.c
|
||||
|
||||
endif BUILD_AD_BG
|
@ -1,51 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 2001 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
#define BG_OPTIM_STEP1_1 1
|
||||
#include "ad_bg.h"
|
||||
|
||||
/* adioi.h has the ADIOI_Fns_struct define */
|
||||
#include "adioi.h"
|
||||
|
||||
/* Function table for the BG ADIO driver.  The initializer is positional:
 * each entry fills the slot named in its trailing comment, so the order
 * must not be changed.  Slots without a BG-specific routine use the
 * ADIOI_GEN_* implementations.  The nonblocking contiguous read/write
 * slots use the ADIOI_GEN_* routines only when ROMIO_HAVE_WORKING_AIO is
 * defined (with a compile-time warning); otherwise they fall back to the
 * ADIOI_FAKE_* routines. */
struct ADIOI_Fns_struct ADIO_BG_operations = {
|
||||
ADIOI_BG_Open, /* Open */
|
||||
ADIOI_GEN_OpenColl, /* Collective open */
|
||||
ADIOI_BG_ReadContig, /* ReadContig */
|
||||
ADIOI_BG_WriteContig, /* WriteContig */
|
||||
ADIOI_BG_ReadStridedColl, /* ReadStridedColl */
|
||||
ADIOI_BG_WriteStridedColl, /* WriteStridedColl */
|
||||
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
|
||||
ADIOI_BG_Fcntl, /* Fcntl */
|
||||
ADIOI_BG_SetInfo, /* SetInfo */
|
||||
ADIOI_BG_ReadStrided, /* ReadStrided */
|
||||
ADIOI_BG_WriteStrided, /* WriteStrided */
|
||||
ADIOI_BG_Close, /* Close */
|
||||
#ifdef ROMIO_HAVE_WORKING_AIO
|
||||
#warning Consider BG support for NFS before enabling this.
|
||||
ADIOI_GEN_IreadContig, /* IreadContig */
|
||||
ADIOI_GEN_IwriteContig, /* IwriteContig */
|
||||
#else
|
||||
ADIOI_FAKE_IreadContig, /* IreadContig */
|
||||
ADIOI_FAKE_IwriteContig, /* IwriteContig */
|
||||
#endif
|
||||
ADIOI_GEN_IODone, /* ReadDone */
|
||||
ADIOI_GEN_IODone, /* WriteDone */
|
||||
ADIOI_GEN_IOComplete, /* ReadComplete */
|
||||
ADIOI_GEN_IOComplete, /* WriteComplete */
|
||||
ADIOI_GEN_IreadStrided, /* IreadStrided */
|
||||
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
|
||||
ADIOI_BG_Flush, /* Flush */
|
||||
ADIOI_GEN_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_GEN_Feature, /* Features */
|
||||
};
|
@ -1,97 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg.h
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#ifndef AD_BG_INCLUDE
|
||||
#define AD_BG_INCLUDE
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include <fcntl.h>
|
||||
#include "adio.h"
|
||||
|
||||
#ifdef HAVE_SIGNAL_H
|
||||
#include <signal.h>
|
||||
#endif
|
||||
#ifdef HAVE_AIO_H
|
||||
#include <aio.h>
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
int ADIOI_BG_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
|
||||
int wr, void *handle);
|
||||
#endif
|
||||
|
||||
void ADIOI_BG_Open(ADIO_File fd, int *error_code);
|
||||
|
||||
void ADIOI_BG_Close(ADIO_File fd, int *error_code);
|
||||
|
||||
void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code);
|
||||
void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code);
|
||||
#if 0
|
||||
void ADIOI_BG_IwriteContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Request *request, int
|
||||
*error_code);
|
||||
void ADIOI_BG_IreadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Request *request, int
|
||||
*error_code);
|
||||
int ADIOI_BG_ReadDone(ADIO_Request *request, ADIO_Status *status, int
|
||||
*error_code);
|
||||
int ADIOI_BG_WriteDone(ADIO_Request *request, ADIO_Status *status, int
|
||||
*error_code);
|
||||
void ADIOI_BG_ReadComplete(ADIO_Request *request, ADIO_Status *status, int
|
||||
*error_code);
|
||||
void ADIOI_BG_WriteComplete(ADIO_Request *request, ADIO_Status *status,
|
||||
int *error_code);
|
||||
#endif
|
||||
void ADIOI_BG_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int
|
||||
*error_code);
|
||||
void ADIOI_BG_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
|
||||
|
||||
void ADIOI_BG_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code);
|
||||
void ADIOI_BG_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code);
|
||||
|
||||
void ADIOI_BG_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code);
|
||||
|
||||
void ADIOI_BG_WriteStridedColl(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code);
|
||||
|
||||
void ADIOI_BG_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp, int *error_code);
|
||||
void ADIOI_BG_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
|
||||
|
||||
void ADIOI_BG_Flush(ADIO_File fd, int *error_code);
|
||||
|
||||
#include "ad_bg_tuning.h"
|
||||
|
||||
|
||||
#endif
|
@ -1,53 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_close.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
#include "ad_bg_aggrs.h"
|
||||
|
||||
/*
 * ADIOI_BG_Close - close the system descriptor(s) held by an ADIO file.
 *
 * Closes fd->fd_sys and, when fd->fd_direct is non-negative, fd->fd_direct
 * too.  fd->fs_ptr is freed (if set) and both descriptors are reset to -1
 * whether or not a close() call failed.
 *
 * fd          - ADIO file whose descriptors are closed
 * error_code  - out: MPI_SUCCESS, or an MPI_ERR_IO code whose message is
 *               built from the errno of the (last) close() that failed
 */
void ADIOI_BG_Close(ADIO_File fd, int *error_code)
{
    int err, derr = 0;
    /* errno is captured right after each failing close(): the calls that
     * follow (second close(), ADIOI_Free(), optional logging) are allowed
     * to modify errno even when they succeed. */
    int saved_errno = 0;
    static char myname[] = "ADIOI_BG_CLOSE";

#ifdef PROFILE
    MPE_Log_event(9, 0, "start close");
#endif

    err = close(fd->fd_sys);
    if (err == -1)
        saved_errno = errno;

    if (fd->fd_direct >= 0)
    {
        derr = close(fd->fd_direct);
        /* as before, a failure of the second close() takes precedence */
        if (derr == -1)
            saved_errno = errno;
    }

#ifdef PROFILE
    MPE_Log_event(10, 0, "end close");
#endif

    /* release the file-system private data attached to this file */
    if (fd->fs_ptr != NULL) {
        ADIOI_Free(fd->fs_ptr);
        fd->fs_ptr = NULL;
    }
    fd->fd_sys    = -1;
    fd->fd_direct = -1;

    if (err == -1 || derr == -1)
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(saved_errno));
    }
    else *error_code = MPI_SUCCESS;
}
|
@ -1,58 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_fcntl.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
#include "adio_extern.h"
|
||||
/* #ifdef MPISGI
|
||||
#include "mpisgi2.h"
|
||||
#endif */
|
||||
|
||||
/*
 * ADIOI_BG_Fcntl - handle the ADIO fcntl requests for the BG driver.
 *
 * fd            - ADIO file being operated on
 * flag          - request selector:
 *                   ADIO_FCNTL_GET_FSIZE      store the file size (found by
 *                                             seeking to the end) in
 *                                             fcntl_struct->fsize
 *                   ADIO_FCNTL_SET_DISKSPACE  forward to ADIOI_GEN_Prealloc
 *                                             with fcntl_struct->diskspace
 *                   ADIO_FCNTL_SET_ATOMICITY  set fd->atomicity to 0 or 1
 * fcntl_struct  - in/out argument block for the request
 * error_code    - out: MPI_SUCCESS, an MPI_ERR_IO code if the size query
 *                 failed, or an MPI_ERR_ARG code for an unknown flag
 */
void ADIOI_BG_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
                    int *error_code)
{
    static char myname[] = "ADIOI_BG_FCNTL";
    int saved_errno;

    switch(flag) {
    case ADIO_FCNTL_GET_FSIZE:
        fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
        /* remember why the size query failed before the next lseek()
         * gets a chance to overwrite errno */
        saved_errno = errno;
        /* put the file pointer back where ADIO believes it is */
        if (fd->fp_sys_posn != -1)
            lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);
        if (fcntl_struct->fsize == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(saved_errno));
        }
        else *error_code = MPI_SUCCESS;
        break;

    case ADIO_FCNTL_SET_DISKSPACE:
        ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
        break;

    case ADIO_FCNTL_SET_ATOMICITY:
        /* normalize any non-zero request to 1 */
        fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
        *error_code = MPI_SUCCESS;
        break;

    /* --BEGIN ERROR HANDLING-- */
    default:
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_ARG,
                                           "**flag", "**flag %d", flag);
        break;
    /* --END ERROR HANDLING-- */
    }
}
|
@ -1,90 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_flush.c
|
||||
* \brief Scalable flush based on underlying filesystem and psets
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
#include "ad_bg_aggrs.h"
|
||||
|
||||
/*
 * ADIOI_BG_Flush - fsync the file, optionally aggregating the fsync calls.
 *
 * When the ADIOI_BG_FSYNC_AGGREGATION_ENABLED bit is set in the file's
 * fsync_aggr field, all ranks synchronize on a barrier, only the ranks
 * carrying the ADIOI_BG_FSYNC_AGGREGATOR bit call fsync(), and the
 * resulting errno values are combined across fd->comm with an unsigned
 * MPI_MAX reduction.  Otherwise every rank simply calls fsync() itself.
 *
 * fd          - ADIO file to flush
 * error_code  - out: MPI_SUCCESS, or an MPI_ERR_IO code built from errno
 */
void ADIOI_BG_Flush(ADIO_File fd, int *error_code)
{
    static char myname[] = "ADIOI_BG_FLUSH";
    int rc = 0;

    if (!(((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BG_FSYNC_AGGREGATION_ENABLED))
    {
        /* Non-aggregated fsync: every rank syncs on its own */
#ifdef USE_DBG_LOGGING
        int myrank;
#endif
        rc = fsync(fd->fd_sys);
#ifdef USE_DBG_LOGGING
        MPI_Comm_rank(fd->comm, &myrank);

        if (myrank != 0)
        {
            DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, rc, errno);
        }
        else
        {
            DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, rc, errno);
        }
#endif
    }
    else
    {
        int myrank;

        /* Barrier so we can collectively do fewer fsync's */
        MPI_Barrier(fd->comm);

        MPI_Comm_rank(fd->comm, &myrank);

        /* Only ranks flagged as fsync aggregators touch the file system;
           the reduction below is what makes this work for any number of
           aggregators rather than a single broadcasting rank. */
        if (((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr & ADIOI_BG_FSYNC_AGGREGATOR)
        {
            rc = fsync(fd->fd_sys);
            DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, rc, errno);
            /* carry errno (or 0 on success) into the reduction */
            rc = (rc == -1) ? errno : 0;
        }

        /* Just pick an errno (using unsigned MPI_MAX) from any failures */
        MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&rc, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
        DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, rc);

        /* a non-zero reduction result is an errno from some rank */
        if (rc)
        {
            errno = rc;
            rc = -1;
        }
    }

    if (rc != -1)
    {
        *error_code = MPI_SUCCESS;
    }
    /* --BEGIN ERROR HANDLING-- */
    else
    {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(errno));
        DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, rc, errno);
    }
    /* --END ERROR HANDLING-- */
}
|
||||
|
@ -1,84 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_getsh.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
|
||||
/* returns the current location of the shared_fp in terms of the
|
||||
no. of etypes relative to the current view, and also increments the
|
||||
shared_fp by the number of etypes to be accessed (incr) in the read
|
||||
or write following this function. */
|
||||
|
||||
/*
 * ADIOI_BG_Get_shared_fp - fetch and advance the shared file pointer.
 *
 * The shared file pointer is kept as one ADIO_Offset at the start of a
 * helper file (fd->shared_fp_fd), which is opened on first use.  Under a
 * write lock on that region the current value is read into *shared_fp and
 * the value plus incr is written back.
 *
 * fd          - ADIO file owning the shared file pointer
 * incr        - amount by which the stored pointer is advanced
 * shared_fp   - out: value of the shared file pointer before the increment
 * error_code  - out: MPI_SUCCESS, the error from ADIO_Open, or an
 *               MPI_ERR_IO code built from errno
 */
void ADIOI_BG_Get_shared_fp(ADIO_File fd, int incr, ADIO_Offset *shared_fp,
                            int *error_code)
{
    static char myname[] = "ADIOI_BG_GET_SHARED_FP";
    ADIO_Offset next_fp;
    MPI_Comm self_dup;
    int rc;

    if (fd->shared_fp_fd != ADIO_FILE_NULL) {
        /* helper file already open: rewind and read the stored pointer */
        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

        rc = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
        if (rc == 0) {
            rc = read(fd->shared_fp_fd->fd_sys, shared_fp,
                      sizeof(ADIO_Offset));
        }
        if (rc == -1) {
            ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
            goto io_error;
        }
    }
    else {
        /* first use: open the helper file on a private communicator */
        MPI_Comm_dup(MPI_COMM_SELF, &self_dup);
        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF,
                                     self_dup,
                                     fd->shared_fp_fname,
                                     fd->file_system,
                                     fd->fns,
                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
                                     0,
                                     MPI_BYTE,
                                     MPI_BYTE,
                                     MPI_INFO_NULL,
                                     ADIO_PERM_NULL,
                                     error_code);
        if (*error_code != MPI_SUCCESS) return;

        /* The read result is intentionally not checked: on an empty file
           the read may fail (reading past end of file), and the 0 stored
           here is then the correct starting value. */
        *shared_fp = 0;
        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
        rc = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
    }

    /* store the advanced pointer back at the start of the helper file */
    next_fp = *shared_fp + incr;

    rc = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
    if (rc == 0) {
        rc = write(fd->shared_fp_fd->fd_sys, &next_fp, sizeof(ADIO_Offset));
    }
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
    if (rc == -1) goto io_error;

    *error_code = MPI_SUCCESS;
    return;

io_error:
    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                       myname, __LINE__, MPI_ERR_IO,
                                       "**io",
                                       "**io %s", strerror(errno));
}
|
@ -1,542 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_hints.c
|
||||
* \brief BlueGene hint processing
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_extern.h"
|
||||
|
||||
#include "ad_bg.h"
|
||||
#include "ad_bg_pset.h"
|
||||
#include "ad_bg_aggrs.h"
|
||||
|
||||
#define ADIOI_BG_CB_BUFFER_SIZE_DFLT "16777216"
|
||||
#define ADIOI_BG_IND_RD_BUFFER_SIZE_DFLT "4194304"
|
||||
#define ADIOI_BG_IND_WR_BUFFER_SIZE_DFLT "4194304"
|
||||
#define ADIOI_BG_NAGG_IN_PSET_HINT_NAME "bg_nodes_pset"
|
||||
/** \page mpiio_vars MPIIO Configuration
|
||||
*
|
||||
* BlueGene MPIIO configuration and performance tuning. Used by ad_bg and ad_bglockless ADIO's.
|
||||
*
|
||||
* \section hint_sec Hints
|
||||
* - bg_nodes_pset - Specify how many aggregators to use per pset.
|
||||
* This hint will override the cb_nodes hint based on BlueGene psets.
|
||||
* - N - Use N nodes per pset as aggregators.
|
||||
* - Default is based on partition configuration and cb_nodes.
|
||||
*
|
||||
* The following default key/value pairs may differ from other platform defaults.
|
||||
*
|
||||
* - key = cb_buffer_size value = 16777216
|
||||
* - key = romio_cb_read value = enable
|
||||
* - key = romio_cb_write value = enable
|
||||
* - key = ind_rd_buffer_size value = 4194304
|
||||
* - key = ind_wr_buffer_size value = 4194304
|
||||
*/
|
||||
|
||||
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
|
||||
extern int
|
||||
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
|
||||
|
||||
void ADIOI_BG_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
{
|
||||
/* if fd->info is null, create a new info object.
|
||||
Initialize fd->info to default values.
|
||||
Initialize fd->hints to default values.
|
||||
Examine the info object passed by the user. If it contains values that
|
||||
ROMIO understands, override the default. */
|
||||
|
||||
MPI_Info info;
|
||||
char *value;
|
||||
int flag, intval, tmp_val, nprocs=0, nprocs_is_valid = 0;
|
||||
static char myname[] = "ADIOI_BG_SETINFO";
|
||||
|
||||
int did_anything = 0;
|
||||
|
||||
if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
|
||||
info = fd->info;
|
||||
|
||||
/* Note that fd->hints is allocated at file open time; thus it is
|
||||
* not necessary to allocate it, or check for allocation, here.
|
||||
*/
|
||||
|
||||
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
|
||||
ADIOI_BG_assert ((value != NULL));
|
||||
|
||||
/* initialize info and hints to default values if they haven't been
|
||||
* previously initialized
|
||||
*/
|
||||
if (!fd->hints->initialized) {
|
||||
|
||||
did_anything = 1;
|
||||
|
||||
/* buffer size for collective I/O */
|
||||
ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BG_CB_BUFFER_SIZE_DFLT);
|
||||
fd->hints->cb_buffer_size = atoi(ADIOI_BG_CB_BUFFER_SIZE_DFLT);
|
||||
|
||||
/* default is to let romio automatically decide when to use
|
||||
* collective buffering
|
||||
*/
|
||||
ADIOI_Info_set(info, "romio_cb_read", "enable");
|
||||
fd->hints->cb_read = ADIOI_HINT_ENABLE;
|
||||
ADIOI_Info_set(info, "romio_cb_write", "enable");
|
||||
fd->hints->cb_write = ADIOI_HINT_ENABLE;
|
||||
|
||||
if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
|
||||
fd->hints->cb_config_list = NULL;
|
||||
|
||||
/* number of processes that perform I/O in collective I/O */
|
||||
MPI_Comm_size(fd->comm, &nprocs);
|
||||
nprocs_is_valid = 1;
|
||||
ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
|
||||
ADIOI_Info_set(info, "cb_nodes", value);
|
||||
fd->hints->cb_nodes = -1;
|
||||
|
||||
/* hint indicating that no indep. I/O will be performed on this file */
|
||||
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
|
||||
fd->hints->no_indep_rw = 0;
|
||||
|
||||
/* bg is not implementing file realms (ADIOI_IOStridedColl),
|
||||
initialize to disabled it. */
|
||||
/* hint instructing the use of persistent file realms */
|
||||
ADIOI_Info_set(info, "romio_cb_pfr", "disable");
|
||||
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
|
||||
|
||||
/* hint guiding the assignment of persistent file realms */
|
||||
ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
|
||||
fd->hints->cb_fr_type = ADIOI_FR_AAR;
|
||||
|
||||
/* hint to align file realms with a certain byte value */
|
||||
ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
|
||||
fd->hints->cb_fr_alignment = 1;
|
||||
|
||||
/* hint to set a threshold percentage for a datatype's size/extent at
|
||||
* which data sieving should be done in collective I/O */
|
||||
ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
|
||||
fd->hints->cb_ds_threshold = 0;
|
||||
|
||||
/* hint to switch between point-to-point or all-to-all for two-phase */
|
||||
ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
|
||||
fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
|
||||
|
||||
/* deferred_open derived from no_indep_rw and cb_{read,write} */
|
||||
fd->hints->deferred_open = 0;
|
||||
|
||||
/* buffer size for data sieving in independent reads */
|
||||
ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BG_IND_RD_BUFFER_SIZE_DFLT);
|
||||
fd->hints->ind_rd_buffer_size = atoi(ADIOI_BG_IND_RD_BUFFER_SIZE_DFLT);
|
||||
|
||||
/* buffer size for data sieving in independent writes */
|
||||
ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BG_IND_WR_BUFFER_SIZE_DFLT);
|
||||
fd->hints->ind_wr_buffer_size = atoi(ADIOI_BG_IND_WR_BUFFER_SIZE_DFLT);
|
||||
|
||||
if(fd->file_system == ADIO_UFS)
|
||||
{
|
||||
/* default for ufs/pvfs is to disable data sieving */
|
||||
ADIOI_Info_set(info, "romio_ds_read", "disable");
|
||||
fd->hints->ds_read = ADIOI_HINT_DISABLE;
|
||||
ADIOI_Info_set(info, "romio_ds_write", "disable");
|
||||
fd->hints->ds_write = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* default is to let romio automatically decide when to use data
|
||||
* sieving
|
||||
*/
|
||||
ADIOI_Info_set(info, "romio_ds_read", "automatic");
|
||||
fd->hints->ds_read = ADIOI_HINT_AUTO;
|
||||
ADIOI_Info_set(info, "romio_ds_write", "automatic");
|
||||
fd->hints->ds_write = ADIOI_HINT_AUTO;
|
||||
}
|
||||
|
||||
/* still to do: tune this a bit for a variety of file systems. there's
|
||||
* no good default value so just leave it unset */
|
||||
fd->hints->min_fdomain_size = 0;
|
||||
fd->hints->striping_unit = 0;
|
||||
|
||||
fd->hints->initialized = 1;
|
||||
}
|
||||
|
||||
/* add in user's info if supplied */
|
||||
if (users_info != MPI_INFO_NULL) {
|
||||
ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && ((intval=atoi(value)) > 0)) {
|
||||
tmp_val = intval;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != intval) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"cb_buffer_size",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
ADIOI_Info_set(info, "cb_buffer_size", value);
|
||||
fd->hints->cb_buffer_size = intval;
|
||||
|
||||
}
|
||||
#if 0
|
||||
/* bg is not implementing file realms (ADIOI_IOStridedColl) ... */
|
||||
/* aligning file realms to certain sizes (e.g. stripe sizes)
|
||||
* may benefit I/O performance */
|
||||
ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && ((intval=atoi(value)) > 0)) {
|
||||
tmp_val = intval;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != intval) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_cb_fr_alignment",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
ADIOI_Info_set(info, "romio_cb_fr_alignment", value);
|
||||
fd->hints->cb_fr_alignment = intval;
|
||||
|
||||
}
|
||||
|
||||
/* for collective I/O, try to be smarter about when to do data sieving
|
||||
* using a specific threshold for the datatype size/extent
|
||||
* (percentage 0-100%) */
|
||||
ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && ((intval=atoi(value)) > 0)) {
|
||||
tmp_val = intval;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != intval) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_cb_ds_threshold",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
ADIOI_Info_set(info, "romio_cb_ds_threshold", value);
|
||||
fd->hints->cb_ds_threshold = intval;
|
||||
|
||||
}
|
||||
ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag) {
|
||||
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
|
||||
ADIOI_Info_set(info, "romio_cb_alltoall", value);
|
||||
fd->hints->cb_read = ADIOI_HINT_ENABLE;
|
||||
}
|
||||
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
|
||||
ADIOI_Info_set(info, "romio_cb_alltoall", value);
|
||||
fd->hints->cb_read = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
|
||||
{
|
||||
ADIOI_Info_set(info, "romio_cb_alltoall", value);
|
||||
fd->hints->cb_read = ADIOI_HINT_AUTO;
|
||||
}
|
||||
|
||||
tmp_val = fd->hints->cb_alltoall;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != fd->hints->cb_alltoall) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_cb_alltoall",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
}
|
||||
#endif
|
||||
/* new hints for enabling/disabling coll. buffering on
|
||||
* reads/writes
|
||||
*/
|
||||
ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag) {
|
||||
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
|
||||
ADIOI_Info_set(info, "romio_cb_read", value);
|
||||
fd->hints->cb_read = ADIOI_HINT_ENABLE;
|
||||
}
|
||||
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
|
||||
/* romio_cb_read overrides no_indep_rw */
|
||||
ADIOI_Info_set(info, "romio_cb_read", value);
|
||||
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
|
||||
fd->hints->cb_read = ADIOI_HINT_DISABLE;
|
||||
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
|
||||
{
|
||||
ADIOI_Info_set(info, "romio_cb_read", value);
|
||||
fd->hints->cb_read = ADIOI_HINT_AUTO;
|
||||
}
|
||||
|
||||
tmp_val = fd->hints->cb_read;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != fd->hints->cb_read) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_cb_read",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
}
|
||||
ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag) {
|
||||
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
|
||||
ADIOI_Info_set(info, "romio_cb_write", value);
|
||||
fd->hints->cb_write = ADIOI_HINT_ENABLE;
|
||||
}
|
||||
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE"))
|
||||
{
|
||||
/* romio_cb_write overrides no_indep_rw, too */
|
||||
ADIOI_Info_set(info, "romio_cb_write", value);
|
||||
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
|
||||
fd->hints->cb_write = ADIOI_HINT_DISABLE;
|
||||
fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
else if (!strcmp(value, "automatic") ||
|
||||
!strcmp(value, "AUTOMATIC"))
|
||||
{
|
||||
ADIOI_Info_set(info, "romio_cb_write", value);
|
||||
fd->hints->cb_write = ADIOI_HINT_AUTO;
|
||||
}
|
||||
|
||||
tmp_val = fd->hints->cb_write;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != fd->hints->cb_write) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_cb_write",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* bg is not implementing file realms (ADIOI_IOStridedColl) ... */
|
||||
/* enable/disable persistent file realms for collective I/O */
|
||||
/* may want to check for no_indep_rdwr hint as well */
|
||||
ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag) {
|
||||
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
|
||||
ADIOI_Info_set(info, "romio_cb_pfr", value);
|
||||
fd->hints->cb_pfr = ADIOI_HINT_ENABLE;
|
||||
}
|
||||
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
|
||||
ADIOI_Info_set(info, "romio_cb_pfr", value);
|
||||
fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
|
||||
{
|
||||
ADIOI_Info_set(info, "romio_cb_pfr", value);
|
||||
fd->hints->cb_pfr = ADIOI_HINT_AUTO;
|
||||
}
|
||||
|
||||
tmp_val = fd->hints->cb_pfr;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != fd->hints->cb_pfr) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_cb_pfr",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
}
|
||||
|
||||
/* file realm assignment types ADIOI_FR_AAR(0),
|
||||
ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
|
||||
a regular fr size in bytes. probably not the best way... */
|
||||
ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && ((intval=atoi(value)) >= -2)) {
|
||||
tmp_val = intval;
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != intval) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_cb_fr_type",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
ADIOI_Info_set(info, "romio_cb_fr_type", value);
|
||||
fd->hints->cb_fr_type = intval;
|
||||
|
||||
}
|
||||
#endif
|
||||
/* new hint for specifying no indep. read/write will be performed */
|
||||
ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag) {
|
||||
if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
|
||||
/* if 'no_indep_rw' set, also hint that we will do
|
||||
* collective buffering: if we aren't doing independent io,
|
||||
* then we have to do collective */
|
||||
ADIOI_Info_set(info, "romio_no_indep_rw", value);
|
||||
ADIOI_Info_set(info, "romio_cb_write", "enable");
|
||||
ADIOI_Info_set(info, "romio_cb_read", "enable");
|
||||
fd->hints->no_indep_rw = 1;
|
||||
fd->hints->cb_read = 1;
|
||||
fd->hints->cb_write = 1;
|
||||
tmp_val = 1;
|
||||
}
|
||||
else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
|
||||
ADIOI_Info_set(info, "romio_no_indep_rw", value);
|
||||
fd->hints->no_indep_rw = 0;
|
||||
tmp_val = 0;
|
||||
}
|
||||
else {
|
||||
/* default is above */
|
||||
tmp_val = 0;
|
||||
}
|
||||
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (tmp_val != fd->hints->no_indep_rw) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_no_indep_rw",
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
}
|
||||
/* new hints for enabling/disabling data sieving on
|
||||
* reads/writes
|
||||
*/
|
||||
ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag) {
|
||||
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
|
||||
ADIOI_Info_set(info, "romio_ds_read", value);
|
||||
fd->hints->ds_read = ADIOI_HINT_ENABLE;
|
||||
}
|
||||
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
|
||||
ADIOI_Info_set(info, "romio_ds_read", value);
|
||||
fd->hints->ds_read = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
|
||||
{
|
||||
ADIOI_Info_set(info, "romio_ds_read", value);
|
||||
fd->hints->ds_read = ADIOI_HINT_AUTO;
|
||||
}
|
||||
/* otherwise ignore */
|
||||
}
|
||||
ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag) {
|
||||
if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
|
||||
ADIOI_Info_set(info, "romio_ds_write", value);
|
||||
fd->hints->ds_write = ADIOI_HINT_ENABLE;
|
||||
}
|
||||
else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
|
||||
ADIOI_Info_set(info, "romio_ds_write", value);
|
||||
fd->hints->ds_write = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
|
||||
{
|
||||
ADIOI_Info_set(info, "romio_ds_write", value);
|
||||
fd->hints->ds_write = ADIOI_HINT_AUTO;
|
||||
}
|
||||
/* otherwise ignore */
|
||||
}
|
||||
|
||||
ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && ((intval = atoi(value)) > 0)) {
|
||||
ADIOI_Info_set(info, "ind_wr_buffer_size", value);
|
||||
fd->hints->ind_wr_buffer_size = intval;
|
||||
}
|
||||
|
||||
ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && ((intval = atoi(value)) > 0)) {
|
||||
ADIOI_Info_set(info, "ind_rd_buffer_size", value);
|
||||
fd->hints->ind_rd_buffer_size = intval;
|
||||
}
|
||||
|
||||
memset( value, 0, MPI_MAX_INFO_VAL+1 );
|
||||
ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if ( flag && ((intval = atoi(value)) > 0) ) {
|
||||
ADIOI_Info_set(info, "romio_min_fdomain_size", value);
|
||||
fd->hints->min_fdomain_size = intval;
|
||||
}
|
||||
/* Now we use striping unit in common code so we should
|
||||
process hints for it. */
|
||||
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if ( flag && ((intval = atoi(value)) > 0) ) {
|
||||
ADIOI_Info_set(info, "striping_unit", value);
|
||||
fd->hints->striping_unit = intval;
|
||||
}
|
||||
|
||||
memset( value, 0, MPI_MAX_INFO_VAL+1 );
|
||||
ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && ((intval = atoi(value)) > 0)) {
|
||||
|
||||
did_anything = 1;
|
||||
ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
|
||||
fd->hints->cb_nodes = intval;
|
||||
}
|
||||
}
|
||||
|
||||
/* associate CB aggregators to certain CNs in every involved PSET */
|
||||
if (did_anything) {
|
||||
ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
|
||||
}
|
||||
/* ignore deferred open hints and do not enable it for bluegene: need all
|
||||
* processors in the open path so we can stat-and-broadcast the blocksize
|
||||
*/
|
||||
ADIOI_Info_set(info, "romio_no_indep_rw", "false");
|
||||
fd->hints->no_indep_rw = 0;
|
||||
fd->hints->deferred_open = 0;
|
||||
|
||||
/* BobC commented this out, but since hint processing runs on both bg and
|
||||
* bglockless, we need to keep DS writes enabled on gpfs and disabled on
|
||||
* PVFS */
|
||||
if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
|
||||
/* disable data sieving for fs that do not
|
||||
support file locking */
|
||||
ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag) {
|
||||
/* get rid of this value if it is set */
|
||||
ADIOI_Info_delete(info, "ind_wr_buffer_size");
|
||||
}
|
||||
/* note: leave ind_wr_buffer_size alone; used for other cases
|
||||
* as well. -- Rob Ross, 04/22/2003
|
||||
*/
|
||||
ADIOI_Info_set(info, "romio_ds_write", "disable");
|
||||
fd->hints->ds_write = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
|
||||
ADIOI_Free(value);
|
||||
|
||||
*error_code = MPI_SUCCESS;
|
||||
}
|
@ -1,307 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_open.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
#include "ad_bg_aggrs.h"
|
||||
|
||||
#include <sys/statfs.h>
|
||||
#include <sys/vfs.h>
|
||||
|
||||
/* COPIED FROM ad_fstype.c since it is static in that file
|
||||
|
||||
ADIO_FileSysType_parentdir - determines a string pathname for the
|
||||
parent directory of a given filename.
|
||||
|
||||
Input Parameters:
|
||||
. filename - pointer to file name character array
|
||||
|
||||
Output Parameters:
|
||||
. dirnamep - pointer to location in which to store a pointer to a string
|
||||
|
||||
Note that the caller should free the memory located at the pointer returned
|
||||
after the string is no longer needed.
|
||||
*/
|
||||
|
||||
#ifndef PATH_MAX
|
||||
#define PATH_MAX 65535
|
||||
#endif
|
||||
|
||||
/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that
|
||||
here. We assume that S_ISLNK is *always* defined as a macro. If
|
||||
that is not universally true, then add a test to the romio
|
||||
configure that trys to link a program that references S_ISLNK */
|
||||
#if !defined(S_ISLNK)
|
||||
# if defined(S_IFLNK)
|
||||
/* Check for the link bit */
|
||||
# define S_ISLNK(mode) ((mode) & S_IFLNK)
|
||||
# else
|
||||
/* no way to check if it is a link, so say false */
|
||||
# define S_ISLNK(mode) 0
|
||||
# endif
|
||||
#endif /* !(S_ISLNK) */
|
||||
|
||||
/* ADIO_FileSysType_parentdir
|
||||
*
|
||||
* Returns pointer to string in dirnamep; that string is allocated with
|
||||
* strdup and must be free()'d.
|
||||
*/
|
||||
static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
|
||||
{
|
||||
int err;
|
||||
char *dir = NULL, *slash;
|
||||
struct stat statbuf;
|
||||
|
||||
err = lstat(filename, &statbuf);
|
||||
|
||||
if (err || (!S_ISLNK(statbuf.st_mode))) {
|
||||
/* no such file, or file is not a link; these are the "normal"
|
||||
* cases where we can just return the parent directory.
|
||||
*/
|
||||
dir = ADIOI_Strdup(filename);
|
||||
}
|
||||
else {
|
||||
/* filename is a symlink. we've presumably already tried
|
||||
* to stat it and found it to be missing (dangling link),
|
||||
* but this code doesn't care if the target is really there
|
||||
* or not.
|
||||
*/
|
||||
int namelen;
|
||||
char *linkbuf;
|
||||
|
||||
linkbuf = ADIOI_Malloc(PATH_MAX+1);
|
||||
namelen = readlink(filename, linkbuf, PATH_MAX+1);
|
||||
if (namelen == -1) {
|
||||
/* something strange has happened between the time that
|
||||
* we determined that this was a link and the time that
|
||||
* we attempted to read it; punt and use the old name.
|
||||
*/
|
||||
dir = ADIOI_Strdup(filename);
|
||||
}
|
||||
else {
|
||||
/* successfully read the link */
|
||||
linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
|
||||
dir = ADIOI_Strdup(linkbuf);
|
||||
ADIOI_Free(linkbuf);
|
||||
}
|
||||
}
|
||||
|
||||
slash = strrchr(dir, '/');
|
||||
if (!slash) ADIOI_Strncpy(dir, ".", 2);
|
||||
else {
|
||||
if (slash == dir) *(dir + 1) = '\0';
|
||||
else *slash = '\0';
|
||||
}
|
||||
|
||||
*dirnamep = dir;
|
||||
return;
|
||||
}
|
||||
|
||||
/* scaleable_stat
 *
 * Has rank 0 of fd->comm query the file's block size (stat64) and the file
 * system type (statfs, falling back to the parent directory when statfs on
 * the file itself fails), broadcasts both values to every rank, and records
 * the results in the ADIOI_BG_fs structure at fd->fs_ptr:
 *   - blksize    <- block size
 *   - fsync_aggr <- aggregation enabled for GPFS / bglocklessmpio_f_type,
 *                   with rank 0 additionally marked as the aggregator
 *
 * Collective over fd->comm (every rank takes part in the MPI_Bcast).
 * fd->fs_ptr must already point to an ADIOI_BG_fs.
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bg_stat;
    struct statfs bg_statfs;
    int rank, rc;
    char * dir;
    long buf[2];
    MPI_Comm_rank(fd->comm, &rank);

    if (rank == 0) {
        /* Start from the block size currently stored in fs_ptr so that a
         * failed stat64 broadcasts that value instead of uninitialized
         * memory.  NOTE(review): assumes the caller has set blksize to its
         * default before calling, as ADIOI_BG_Open does -- confirm for any
         * other caller. */
        buf[0] = ((ADIOI_BG_fs*)fd->fs_ptr)->blksize;

        /* Get the (real) underlying file system block size */
        rc = stat64(fd->filename, &bg_stat);
        if (rc >= 0)
        {
            buf[0] = bg_stat.st_blksize;
            DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                    fd->filename,(long)bg_stat.st_blksize);
        }
        else
        {
            DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                    fd->filename,rc,errno);
        }
        /* Get the (real) underlying file system type so we can
         * plan our fsync scaling strategy */
        rc = statfs(fd->filename,&bg_statfs);
        if (rc >= 0)
        {
            DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",
                    fd->filename,(unsigned long)bg_statfs.f_type);
            buf[1] = bg_statfs.f_type;
        }
        else
        {
            DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
                    fd->filename,rc,errno);
            /* the file itself could not be queried; try its directory */
            ADIO_FileSysType_parentdir(fd->filename, &dir);
            rc = statfs(dir,&bg_statfs);
            if (rc >= 0)
            {
                DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#lX\n",dir,(unsigned long)bg_statfs.f_type);
                buf[1] = bg_statfs.f_type;
            }
            else
            {
                /* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
                 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
                buf[1] = -1; /* bogus magic number */
                DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
            }
            /* dir was allocated by ADIOI_Strdup inside
             * ADIO_FileSysType_parentdir, so release it with the matching
             * ADIOI_Free rather than free() */
            ADIOI_Free(dir);
        }
    }
    /* now we can broadcast the stat/statfs data to everyone else */
    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->comm);
    bg_stat.st_blksize = buf[0];
    bg_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = bg_stat.st_blksize;

    /* data from statfs */
    if ((bg_statfs.f_type == GPFS_SUPER_MAGIC) ||
        (bg_statfs.f_type == bglocklessmpio_f_type))
    {
        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BG_FSYNC_AGGREGATION_ENABLED;

        /* Only one rank is an "fsync aggregator" because only one
         * fsync is needed */
        if (rank == 0)
        {
            ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr |=
                ADIOI_BG_FSYNC_AGGREGATOR;
            DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
        }
        /* else: aggregation enabled but this rank is not an aggregator */
    }
    /* else: other filesystems keep whatever fsync_aggr the caller set
     * (no fsync aggregation by default) */
}
|
||||
|
||||
|
||||
/* ADIOI_BG_Open
 *
 * Opens fd->filename with open(2), translating fd->access_mode into O_*
 * flags, and stores the descriptor in fd->fd_sys.  On success it allocates
 * the ADIOI_BG_fs structure at fd->fs_ptr, fills in defaults (1M block
 * size, fsync aggregation disabled) and calls scaleable_stat() to refine
 * them.
 *
 * Parameters:
 *   fd         - ADIO file handle; filename, access_mode, perm and comm are
 *                read, fd_sys / fd_direct / fs_ptr (and the file pointers
 *                for ADIO_APPEND) are written
 *   error_code - set to MPI_SUCCESS, or to an MPI error code derived from
 *                the errno that open() reported
 */
void ADIOI_BG_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode;
    int open_errno;
    static char myname[] = "ADIOI_BG_OPEN";

    /* set internal variables for tuning environment variables */
    ad_bg_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* read the current umask (umask() can only be queried by setting
         * it, so put the old value straight back) */
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
    /* Capture errno immediately: the logging and debug calls below may
     * change it before the error classification at the end runs. */
    open_errno = errno;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,open_errno);
    fd->fd_direct = -1;

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {

        /* Initialize the ad_bg file system specific information */
        ADIOI_BG_assert(fd->fs_ptr == NULL);
        fd->fs_ptr = (ADIOI_BG_fs*) ADIOI_Malloc(sizeof(ADIOI_BG_fs));

        ((ADIOI_BG_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */

        /* default is no fsync aggregation */
        ((ADIOI_BG_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BG_FSYNC_AGGREGATION_DISABLED;


#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
    }

    if (fd->fd_sys == -1) {
        /* map the errno reported by open() onto an MPI error class */
        if (open_errno == ENAMETOOLONG)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_BAD_FILE,
                                               "**filenamelong",
                                               "**filenamelong %s %d",
                                               fd->filename,
                                               /* %d expects an int, strlen
                                                * returns size_t */
                                               (int) strlen(fd->filename));
        else if (open_errno == ENOENT)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_NO_SUCH_FILE,
                                               "**filenoexist",
                                               "**filenoexist %s",
                                               fd->filename);
        else if (open_errno == ENOTDIR || open_errno == ELOOP)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_BAD_FILE,
                                               "**filenamedir",
                                               "**filenamedir %s",
                                               fd->filename);
        else if (open_errno == EACCES) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_ACCESS,
                                               "**fileaccess",
                                               "**fileaccess %s",
                                               fd->filename );
        }
        else if (open_errno == EROFS) {
            /* Read only file or file system and write access requested */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_READ_ONLY,
                                               "**ioneedrd", 0 );
        }
        else {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(open_errno));
        }
    }
    else *error_code = MPI_SUCCESS;
}
|
||||
/*
|
||||
*vim: ts=8 sts=4 sw=4 noexpandtab
|
||||
*/
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,558 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_read.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
#include "adio_extern.h"
|
||||
|
||||
#include "ad_bg_tuning.h"
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
void ADIOI_BG_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size;
|
||||
ADIO_Offset len;
|
||||
static char myname[] = "ADIOI_BG_READCONTIG";
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5034, 0, NULL);
|
||||
#endif
|
||||
#if BG_PROFILE
|
||||
/* timing */
|
||||
double io_time, io_time2;
|
||||
|
||||
if (bgmpio_timing) {
|
||||
io_time = MPI_Wtime();
|
||||
bgmpio_prof_cr[ BGMPIO_CIO_DATA_SIZE ] += len;
|
||||
}
|
||||
#endif
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
|
||||
ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */
|
||||
|
||||
#if BG_PROFILE
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* read from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bgmpio_timing2) bgmpio_prof_cr[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#else /* BG_PROFILE */
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* read from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#endif /* BG_PROFILE */
|
||||
|
||||
#if BG_PROFILE
|
||||
if (bgmpio_timing) bgmpio_prof_cr[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
|
||||
#endif
|
||||
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (err == -1) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__, MPI_ERR_IO,
|
||||
"**io", "**io %s", strerror(errno));
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, err);
|
||||
#endif
|
||||
|
||||
*error_code = MPI_SUCCESS;
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5035, 0, NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* ADIOI_BUFFERED_READ
 *
 * Copies req_len bytes starting at file offset req_off into
 * (char *)buf + userbuf_off, going through the read buffer readbuf, which
 * holds readbuf_len bytes of the file starting at offset readbuf_off.
 *
 * The macro takes no arguments; it reads and updates these variables of
 * the expansion site: fd, buf, userbuf_off, req_off, req_len, readbuf,
 * readbuf_off, readbuf_len, max_bufsize, end_offset, partial_read,
 * tmp_buf, err and err_flag.
 *
 * Steps:
 *   1. If req_off lies at or beyond the end of the buffered window, the
 *      window is restarted at req_off and refilled with up to max_bufsize
 *      bytes (never past end_offset).
 *   2. While the request extends past the end of the window, the bytes of
 *      the window from req_off onward (partial_read of them) are kept,
 *      readbuf is reallocated to partial_read + max_bufsize bytes, and the
 *      next piece of the file is read in behind the kept bytes.
 *   3. The requested bytes are copied out of readbuf.
 *
 * Each read is wrapped in a read lock only when fd->atomicity is not set
 * (NOTE(review): presumably the caller already holds a lock on the whole
 * region in atomic mode -- confirm at the expansion sites).  A read()
 * failure sets err_flag to 1; the macro itself does not stop on error.
 *
 * The expansion is a bare { ... } block, not a do { } while (0).
 */
#define ADIOI_BUFFERED_READ \
{ \
    if (req_off >= readbuf_off + readbuf_len) { \
        readbuf_off = req_off; \
        readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));\
        lseek(fd->fd_sys, readbuf_off, SEEK_SET);\
        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
        err = read(fd->fd_sys, readbuf, readbuf_len);\
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);\
        if (err == -1) err_flag = 1; \
    } \
    while (req_len > readbuf_off + readbuf_len - req_off) { \
        ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off));\
        partial_read = (int) (readbuf_off + readbuf_len - req_off); \
        tmp_buf = (char *) ADIOI_Malloc(partial_read); \
        memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
        ADIOI_Free(readbuf); \
        readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
        memcpy(readbuf, tmp_buf, partial_read); \
        ADIOI_Free(tmp_buf); \
        readbuf_off += readbuf_len-partial_read; \
        readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
                                  end_offset-readbuf_off+1)); \
        lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET);\
        if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
        err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read);\
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read);\
        if (err == -1) err_flag = 1; \
    } \
    ADIOI_Assert(req_len == (size_t)req_len); \
    memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
}
|
||||
|
||||
|
||||
void ADIOI_BG_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code)
|
||||
{
|
||||
/* offset is in units of etype relative to the filetype. */
|
||||
|
||||
|
||||
ADIOI_Flatlist_node *flat_buf, *flat_file;
|
||||
ADIO_Offset i_offset, new_brd_size, brd_size, size;
|
||||
int i, j, k, err=-1, st_index=0;
|
||||
ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
|
||||
unsigned num, bufsize;
|
||||
int n_etypes_in_filetype;
|
||||
ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
|
||||
ADIO_Offset abs_off_in_filetype=0;
|
||||
int filetype_size, etype_size, buftype_size, partial_read;
|
||||
MPI_Aint filetype_extent, buftype_extent;
|
||||
int buf_count, buftype_is_contig, filetype_is_contig;
|
||||
ADIO_Offset userbuf_off, req_len, sum;
|
||||
ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
|
||||
char *readbuf, *tmp_buf, *value;
|
||||
int err_flag=0, info_flag;
|
||||
unsigned max_bufsize, readbuf_len;
|
||||
static char myname[] = "ADIOI_BG_READSTRIDED";
|
||||
|
||||
if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
|
||||
/* if user has disabled data sieving on reads, use naive
|
||||
* approach instead.
|
||||
*/
|
||||
/*FPRINTF(stderr, "ADIOI_GEN_ReadStrided_naive(%d):\n", __LINE__);*/
|
||||
ADIOI_GEN_ReadStrided_naive(fd,
|
||||
buf,
|
||||
count,
|
||||
datatype,
|
||||
file_ptr_type,
|
||||
offset,
|
||||
status,
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
|
||||
|
||||
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
|
||||
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
|
||||
|
||||
MPI_Type_size(fd->filetype, &filetype_size);
|
||||
if ( ! filetype_size ) {
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, 0);
|
||||
#endif
|
||||
*error_code = MPI_SUCCESS;
|
||||
return;
|
||||
}
|
||||
|
||||
MPI_Type_extent(fd->filetype, &filetype_extent);
|
||||
MPI_Type_size(datatype, &buftype_size);
|
||||
MPI_Type_extent(datatype, &buftype_extent);
|
||||
etype_size = fd->etype_size;
|
||||
|
||||
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
|
||||
bufsize = buftype_size * count;
|
||||
|
||||
/* get max_bufsize from the info object. */
|
||||
|
||||
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
|
||||
ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
|
||||
&info_flag);
|
||||
max_bufsize = atoi(value);
|
||||
ADIOI_Free(value);
|
||||
|
||||
if (!buftype_is_contig && filetype_is_contig) {
|
||||
|
||||
/* noncontiguous in memory, contiguous in file. */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
|
||||
fd->disp + (ADIO_Offset)etype_size * offset;
|
||||
|
||||
start_off = off;
|
||||
end_offset = off + bufsize - 1;
|
||||
readbuf_off = off;
|
||||
readbuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
|
||||
|
||||
/* if atomicity is true, lock (exclusive) the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
lseek(fd->fd_sys, readbuf_off, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
|
||||
err = read(fd->fd_sys, readbuf, readbuf_len);
|
||||
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
for (j=0; j<count; j++)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<flat_buf->count; i++) {
|
||||
userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
|
||||
req_off = off;
|
||||
req_len = flat_buf->blocklens[i];
|
||||
ADIOI_BUFFERED_READ
|
||||
off += flat_buf->blocklens[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (fd->atomicity)
|
||||
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
|
||||
ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
|
||||
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
else { /* noncontiguous in file */
|
||||
|
||||
/* filetype already flattened in ADIO_Open */
|
||||
flat_file = ADIOI_Flatlist;
|
||||
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
|
||||
disp = fd->disp;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* Wei-keng reworked type processing to be a bit more efficient */
|
||||
offset = fd->fp_ind - disp;
|
||||
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
|
||||
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
|
||||
/* now offset is local to this extent */
|
||||
|
||||
/* find the block where offset is located, skip blocklens[i]==0 */
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
ADIO_Offset dist;
|
||||
if (flat_file->blocklens[i] == 0) continue;
|
||||
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
|
||||
/* frd_size is from offset to the end of block i */
|
||||
if (dist == 0) {
|
||||
i++;
|
||||
offset = flat_file->indices[i];
|
||||
frd_size = flat_file->blocklens[i];
|
||||
break;
|
||||
}
|
||||
if (dist > 0) {
|
||||
frd_size = dist;
|
||||
break;
|
||||
}
|
||||
}
|
||||
st_index = i; /* starting index in flat_file->indices[] */
|
||||
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
|
||||
}
|
||||
else {
|
||||
n_etypes_in_filetype = filetype_size/etype_size;
|
||||
n_filetypes = offset / n_etypes_in_filetype;
|
||||
etype_in_filetype = offset % n_etypes_in_filetype;
|
||||
size_in_filetype = etype_in_filetype * etype_size;
|
||||
|
||||
sum = 0;
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
sum += flat_file->blocklens[i];
|
||||
if (sum > size_in_filetype) {
|
||||
st_index = i;
|
||||
frd_size = sum - size_in_filetype;
|
||||
abs_off_in_filetype = flat_file->indices[i] +
|
||||
size_in_filetype - (sum - flat_file->blocklens[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* abs. offset in bytes in the file */
|
||||
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
|
||||
abs_off_in_filetype;
|
||||
}
|
||||
|
||||
start_off = offset;
|
||||
|
||||
/* Wei-keng Liao: read request is within a single flat_file contig
|
||||
* block e.g. with subarray types that actually describe the whole
|
||||
* array */
|
||||
if (buftype_is_contig && bufsize <= frd_size) {
|
||||
ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
|
||||
offset, status, error_code);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* update MPI-IO file pointer to point to the first byte that
|
||||
* can be accessed in the fileview. */
|
||||
fd->fp_ind = offset + bufsize;
|
||||
if (bufsize == frd_size) {
|
||||
do {
|
||||
st_index++;
|
||||
if (st_index == flat_file->count) {
|
||||
st_index = 0;
|
||||
n_filetypes++;
|
||||
}
|
||||
} while (flat_file->blocklens[st_index] == 0);
|
||||
fd->fp_ind = disp + flat_file->indices[st_index]
|
||||
+ n_filetypes*filetype_extent;
|
||||
}
|
||||
}
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
/* Calculate end_offset, the last byte-offset that will be accessed.
|
||||
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
|
||||
|
||||
st_frd_size = frd_size;
|
||||
st_n_filetypes = n_filetypes;
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
frd_size = ADIOI_MIN(st_frd_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
i_offset += frd_size;
|
||||
end_offset = off + frd_size - 1;
|
||||
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
|
||||
}
|
||||
|
||||
/* if atomicity is true, lock (exclusive) the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
/* initial read into readbuf */
|
||||
readbuf_off = offset;
|
||||
readbuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
|
||||
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
|
||||
err = read(fd->fd_sys, readbuf, readbuf_len);
|
||||
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);
|
||||
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
if (buftype_is_contig && !filetype_is_contig) {
|
||||
|
||||
/* contiguous in memory, noncontiguous in file. should be the most
|
||||
common case. */
|
||||
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
frd_size = ADIOI_MIN(st_frd_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
if (frd_size) {
|
||||
/* TYPE_UB and TYPE_LB can result in
|
||||
frd_size = 0. save system call in such cases */
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/
|
||||
|
||||
req_off = off;
|
||||
req_len = frd_size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_READ
|
||||
}
|
||||
i_offset += frd_size;
|
||||
|
||||
if (off + frd_size < disp + flat_file->indices[j] +
|
||||
flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
|
||||
off += frd_size;
|
||||
/* did not reach end of contiguous block in filetype.
|
||||
no more I/O needed. off is incremented by frd_size. */
|
||||
else {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* noncontiguous in memory as well as in file */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
k = num = buf_count = 0;
|
||||
i_offset = flat_buf->indices[0];
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
frd_size = st_frd_size;
|
||||
brd_size = flat_buf->blocklens[0];
|
||||
|
||||
while (num < bufsize) {
|
||||
size = ADIOI_MIN(frd_size, brd_size);
|
||||
if (size) {
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = read(fd->fd_sys, ((char *) buf) + i, size); */
|
||||
|
||||
req_off = off;
|
||||
req_len = size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_READ
|
||||
}
|
||||
|
||||
new_frd_size = frd_size;
|
||||
new_brd_size = brd_size;
|
||||
|
||||
if (size == frd_size) {
|
||||
/* reached end of contiguous block in file */
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
|
||||
new_frd_size = flat_file->blocklens[j];
|
||||
if (size != brd_size) {
|
||||
i_offset += size;
|
||||
new_brd_size -= size;
|
||||
}
|
||||
}
|
||||
|
||||
if (size == brd_size) {
|
||||
/* reached end of contiguous block in memory */
|
||||
|
||||
k = (k + 1)%flat_buf->count;
|
||||
buf_count++;
|
||||
i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
|
||||
flat_buf->indices[k]);
|
||||
new_brd_size = flat_buf->blocklens[k];
|
||||
if (size != frd_size) {
|
||||
off += size;
|
||||
new_frd_size -= size;
|
||||
}
|
||||
}
|
||||
ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
|
||||
num += size;
|
||||
frd_size = new_frd_size;
|
||||
brd_size = new_brd_size;
|
||||
}
|
||||
}
|
||||
|
||||
if (fd->atomicity)
|
||||
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
|
||||
ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
|
||||
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
/* This is a temporary way of filling in status. The right way is to
|
||||
keep track of how much data was actually read and placed in buf
|
||||
by ADIOI_BUFFERED_READ. */
|
||||
#endif
|
||||
|
||||
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
|
||||
}
|
@ -1,68 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_setsh.c
|
||||
* \brief Sets the shared file pointer for the BG ADIO driver (ADIOI_BG_Set_shared_fp)
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
|
||||
/* set the shared file pointer to "offset" etypes relative to the current
|
||||
view */
|
||||
|
||||
/*
|
||||
This looks very similar to ADIOI_GEN_Set_shared_fp, except this
|
||||
function avoids locking the file twice. The generic version does
|
||||
|
||||
Write lock
|
||||
ADIO_WriteContig
|
||||
Unlock
|
||||
|
||||
For BG, ADIOI_BG_WriteContig does a lock before writing to disable
|
||||
caching. To avoid the lock being called twice, this version for BG does
|
||||
|
||||
Write lock
|
||||
Lseek
|
||||
Write
|
||||
Unlock
|
||||
|
||||
*/
|
||||
|
||||
/*
 * ADIOI_BG_Set_shared_fp - store `offset` as the new shared file pointer.
 *
 * fd         - file whose shared file pointer is being set
 * offset     - new pointer value; written verbatim (sizeof(ADIO_Offset)
 *              bytes) at offset 0 of the shared-file-pointer file
 * error_code - out: MPI_SUCCESS, the ADIO_Open error if the pointer file
 *              could not be opened, or an MPI_ERR_IO code if write() failed
 *
 * The pointer file is opened lazily on first use.  The write is done with a
 * raw lseek/write under a single write lock instead of ADIO_WriteContig.
 */
void ADIOI_BG_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
{
    int err;
    MPI_Comm dupcommself;
    static char myname[] = "ADIOI_BG_SET_SHARED_FP";

    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
        MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
                                     fd->shared_fp_fname,
                                     fd->file_system, fd->fns,
                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
                                     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
                                     ADIO_PERM_NULL, error_code);
        /* error_code is only meaningful here, right after ADIO_Open wrote
         * it; checking it unconditionally would read whatever the caller
         * happened to leave in the out-parameter */
        if (*error_code != MPI_SUCCESS) return;
    }

    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
    err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(errno));
    }
    else *error_code = MPI_SUCCESS;
}
|
@ -1,164 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_tuning.c
|
||||
* \brief Defines ad_bg performance tuning
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 2008 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
/*---------------------------------------------------------------------
|
||||
* ad_bg_tuning.c
|
||||
*
|
||||
* defines global variables and functions for performance tuning and
|
||||
* functional debugging.
|
||||
*---------------------------------------------------------------------*/
|
||||
|
||||
#include "ad_bg_tuning.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#if !defined(PVFS2_SUPER_MAGIC)
|
||||
#define PVFS2_SUPER_MAGIC (0x20030528)
|
||||
#endif
|
||||
|
||||
|
||||
int bgmpio_timing;
|
||||
int bgmpio_timing2;
|
||||
int bgmpio_comm;
|
||||
int bgmpio_tunegather;
|
||||
int bgmpio_tuneblocking;
|
||||
long bglocklessmpio_f_type;
|
||||
|
||||
double bgmpio_prof_cw [BGMPIO_CIO_LAST];
|
||||
double bgmpio_prof_cr [BGMPIO_CIO_LAST];
|
||||
|
||||
/* set internal variables for tuning environment variables */
|
||||
/** \page mpiio_vars MPIIO Configuration
|
||||
\section env_sec Environment Variables
|
||||
* - BGMPIO_COMM - Define how data is exchanged on collective
|
||||
* reads and writes. Possible values:
|
||||
* - 0 - Use MPI_Alltoallv.
|
||||
* - 1 - Use MPI_Isend/MPI_Irecv.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - BGMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
|
||||
* Must also compile the library with BG_PROFILE defined. Possible values:
|
||||
* - 0 - Do not collect/report timing.
|
||||
* - 1 - Collect/report timing.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - BGMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
|
||||
* Must also compile the library with BG_PROFILE defined. Possible values:
|
||||
* - 0 - Do not collect/report averages.
|
||||
* - 1 - Collect/report averages.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - BGMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
|
||||
* for aggregator collective i/o. Possible values:
|
||||
* - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
|
||||
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
|
||||
* - Default is 1.
|
||||
*
|
||||
* - BGMPIO_TUNEBLOCKING - Tune how aggregate file domains are
|
||||
* calculated (block size). Possible values:
|
||||
* - 0 - Evenly calculate file domains across aggregators. Also use
|
||||
* MPI_Isend/MPI_Irecv to exchange domain information.
|
||||
* - 1 - Align file domains with the underlying file system's block size. Also use
|
||||
* MPI_Alltoallv to exchange domain information.
|
||||
* - Default is 1.
|
||||
*
|
||||
* - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
|
||||
* the ad_bglockless driver. NOTE: Using romio prefixes (such as
|
||||
* "bg:" or "bglockless:") on a file name will override this environment
|
||||
* variable. Possible values:
|
||||
* - 0xnnnnnnnn - Any valid file system type (or "magic number") from
|
||||
* statfs() field f_type.
|
||||
* - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
|
||||
*
|
||||
*/
|
||||
void ad_bg_get_env_vars() {
|
||||
char *x, *dummy;
|
||||
|
||||
bgmpio_comm = 0;
|
||||
x = getenv( "BGMPIO_COMM" );
|
||||
if (x) bgmpio_comm = atoi(x);
|
||||
bgmpio_timing = 0;
|
||||
x = getenv( "BGMPIO_TIMING" );
|
||||
if (x) bgmpio_timing = atoi(x);
|
||||
bgmpio_timing2 = 0;
|
||||
x = getenv( "BGMPIO_TIMING2" );
|
||||
if (x) bgmpio_timing2 = atoi(x);
|
||||
bgmpio_tunegather = 1;
|
||||
x = getenv( "BGMPIO_TUNEGATHER" );
|
||||
if (x) bgmpio_tunegather = atoi(x);
|
||||
bgmpio_tuneblocking = 1;
|
||||
x = getenv( "BGMPIO_TUNEBLOCKING" );
|
||||
if (x) bgmpio_tuneblocking = atoi(x);
|
||||
bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
|
||||
x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
|
||||
if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
|
||||
DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
|
||||
bglocklessmpio_f_type,bglocklessmpio_f_type);
|
||||
}
|
||||
|
||||
/* report timing breakdown for MPI I/O collective call */
|
||||
/*
 * Print the timing breakdown of a collective MPI I/O call.
 *
 * rw     - nonzero reports the write counters (bgmpio_prof_cw), zero the
 *          read counters (bgmpio_prof_cr)
 * fd     - file whose communicator (fd->comm) is used for the reductions
 * myrank - rank of the caller; only rank 0 prints
 * nprocs - divisor used to turn the summed counters into averages
 *
 * Does nothing unless bgmpio_timing is set.  When it is set, every caller
 * takes part in two MPI_Reduce calls (sum and max) rooted at rank 0.
 */
void ad_bg_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    double *local_prof;
    double prof_avg[ BGMPIO_CIO_LAST ];
    double prof_max[ BGMPIO_CIO_LAST ];
    const char *tag;
    int idx;

    if (!bgmpio_timing) return;

    local_prof = rw ? bgmpio_prof_cw : bgmpio_prof_cr;
    tag        = rw ? "W" : "R";

    MPI_Reduce( local_prof, prof_avg, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, fd->comm );
    MPI_Reduce( local_prof, prof_max, BGMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, fd->comm );

    /* only the reduction root holds meaningful results */
    if (myrank != 0) return;

    /* turn the sums into per-process averages */
    for (idx = 0; idx < BGMPIO_CIO_LAST; idx++)
        prof_avg[idx] /= nprocs;

    /* the POSIX and MPI-IO bandwidth entries are only derived when the
     * second timing level is enabled */
    if (!bgmpio_timing2) {
        prof_avg[ BGMPIO_CIO_B_POSI_RW ] = 0;
        prof_avg[ BGMPIO_CIO_B_MPIO_RW ] = 0;
    } else {
        prof_avg[ BGMPIO_CIO_B_POSI_RW ] =
            prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs / prof_max[ BGMPIO_CIO_T_POSI_RW ];
        prof_avg[ BGMPIO_CIO_B_MPIO_RW ] =
            prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs / prof_max[ BGMPIO_CIO_T_MPIO_RW ];
    }

    prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] =
        prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs / prof_max[ BGMPIO_CIO_T_MPIO_CRW ];

    printf("\tTIMING-1 %1s , ", tag );
    printf( "SZ: %12.4f , ",   prof_avg[ BGMPIO_CIO_DATA_SIZE ] * nprocs );
    printf( "SK-a: %10.3f , ", prof_avg[ BGMPIO_CIO_T_SEEK ] );
    printf( "SK-m: %10.3f , ", prof_max[ BGMPIO_CIO_T_SEEK ] );
    printf( "LC-a: %10.3f , ", prof_avg[ BGMPIO_CIO_T_LCOMP ] );
    printf( "GA-m: %10.3f , ", prof_max[ BGMPIO_CIO_T_GATHER ] );
    printf( "AN-a: %10.3f , ", prof_avg[ BGMPIO_CIO_T_PATANA ] );
    printf( "FD-a: %10.3f , ", prof_avg[ BGMPIO_CIO_T_FD_PART ] );
    printf( "MY-a: %10.3f , ", prof_avg[ BGMPIO_CIO_T_MYREQ ] );
    printf( "OT-m: %10.3f , ", prof_max[ BGMPIO_CIO_T_OTHREQ ] );
    printf( "EX-m: %10.3f , ", prof_max[ BGMPIO_CIO_T_DEXCH ] );
    printf("\tTIMING-2 %1s , ", tag );
    printf( "PXT-m: %10.3f , ", prof_avg[ BGMPIO_CIO_T_POSI_RW ] );
    printf( "MPT-m: %10.3f , ", prof_avg[ BGMPIO_CIO_T_MPIO_RW ] );
    printf("MPTC-m: %10.3f , ", prof_avg[ BGMPIO_CIO_T_MPIO_CRW ] );
    printf( "PXB: %10.3f , ",  prof_avg[ BGMPIO_CIO_B_POSI_RW ] );
    printf( "MPB: %10.3f , ",  prof_avg[ BGMPIO_CIO_B_MPIO_RW ] );
    printf( "MPBC: %10.3f , ", prof_avg[ BGMPIO_CIO_B_MPIO_CRW ] );
}
|
@ -1,96 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_tuning.h
|
||||
* \brief Declares the ad_bg performance-tuning globals, timing fields and timing macros
|
||||
*/
|
||||
|
||||
/*---------------------------------------------------------------------
|
||||
* ad_bg_tuning.h
|
||||
*
|
||||
* declares global variables and macros for performance tuning and
|
||||
* functional debugging.
|
||||
*---------------------------------------------------------------------*/
|
||||
|
||||
#ifndef AD_BG_TUNING_H_
#define AD_BG_TUNING_H_

#include "adio.h"

/* Abort the whole job (MPI_COMM_WORLD) when condition `a` is false, after
 * printing the source location to stderr.
 * NOTE(review): this expands to a bare `if { }`, so using it as the body of
 * an outer if/else without braces can capture the `else`. */
#define ADIOI_BG_assert( a ) if (!(a)) { \
    fprintf( stderr, "AD_BG_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
    MPI_Abort( MPI_COMM_WORLD, 1 ); \
}


/*-----------------------------------------
 *  Global variables for the control of
 *  1.  timing
 *  2.  select specific optimizations
 *-----------------------------------------*/

/* timing fields: indices into bgmpio_prof_cw / bgmpio_prof_cr.
 * T_ entries hold times, B_ entries hold derived bandwidths. */
enum {
    BGMPIO_CIO_DATA_SIZE=0,
    BGMPIO_CIO_T_SEEK,
    BGMPIO_CIO_T_LCOMP,     /* time for ADIOI_Calc_my_off_len(), local */
    BGMPIO_CIO_T_GATHER,    /* time for previous MPI_Allgather, now Allreduce */
    BGMPIO_CIO_T_PATANA,    /* time for a quick test if access is contiguous or not, local */
    BGMPIO_CIO_T_FD_PART,   /* time for file domain partitioning, local */
    BGMPIO_CIO_T_MYREQ,     /* time for ADIOI_BG_Calc_my_req(), local */
    BGMPIO_CIO_T_OTHREQ,    /* time for ADIOI_Calc_others_req(), short Alltoall */
    BGMPIO_CIO_T_DEXCH,     /* time for I/O data exchange */
    BGMPIO_CIO_T_POSI_RW,
    BGMPIO_CIO_B_POSI_RW,
    BGMPIO_CIO_T_MPIO_RW,   /* time for ADIOI_BG_WriteContig() */
    BGMPIO_CIO_B_MPIO_RW,
    BGMPIO_CIO_T_MPIO_CRW,  /* time for ADIOI_BG_WriteStridedColl() */
    BGMPIO_CIO_B_MPIO_CRW,
    BGMPIO_CIO_LAST         /* number of fields; sizes the profile arrays */
};

/* per-process profile counters for collective writes (cw) and reads (cr) */
extern double   bgmpio_prof_cw    [BGMPIO_CIO_LAST];
extern double   bgmpio_prof_cr    [BGMPIO_CIO_LAST];


/* corresponds to environment variables to select optimizations and timing level */
extern int      bgmpio_timing;
extern int      bgmpio_timing2;
extern int      bgmpio_comm;
extern int      bgmpio_tunegather;
extern int      bgmpio_tuneblocking;
extern long     bglocklessmpio_f_type;


/* set internal variables for tuning environment variables */
void ad_bg_get_env_vars(void);

/* report timing breakdown for MPI I/O collective call */
void ad_bg_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
/* the report routine under the name ad_bg_tuning.c defines it with */
void ad_bg_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs );

/* note:
 *   T := timing;
 * CIO := collective I/O
 *
 * NOTE(review): the three macros below test bgmpio_timing_cw_level, which
 * this header does not declare - confirm it exists before using them.
 */

/* zero every field of bgmpio_prof_c<RW> (RW is `w` or `r`) */
#define BGMPIO_T_CIO_RESET( LEVEL, RW ) \
    if (bgmpio_timing_cw_level >= LEVEL) { \
        int i; \
        for ( i = 0; i < BGMPIO_CIO_LAST; i ++ ) \
            bgmpio_prof_c##RW [ i ] = 0; \
    }

/* print the timing report through ad_bg_timing_crw_report */
#define BGMPIO_T_CIO_REPORT( LEVEL, RW, FD, MYRANK, NPROCS ) \
    if (bgmpio_timing_cw_level >= LEVEL) { \
        ad_bg_timing_crw_report ( RW, FD, MYRANK, NPROCS ); \
    }

/* take one MPI_Wtime() sample (optionally after a barrier on fd->comm, so a
 * variable named `fd` must be in scope): ISSET stores it in field VAR1,
 * ISGET replaces field VAR2 by the time elapsed since the value stored there */
#define BGMPIO_T_CIO_SET_GET( LEVEL, RW, DOBAR, ISSET, ISGET, VAR1, VAR2 ) \
    if (bgmpio_timing_cw_level >= LEVEL) { \
        if ( DOBAR ) MPI_Barrier( fd->comm ); \
        double temp = MPI_Wtime(); \
        if ( ISSET ) bgmpio_prof_c##RW [ VAR1 ] = temp; \
        if ( ISGET ) bgmpio_prof_c##RW [ VAR2 ] = temp - bgmpio_prof_c##RW [ VAR2 ] ; \
    }

#endif  /* AD_BG_TUNING_H_ */
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,611 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_write.c
|
||||
* \brief Contiguous and strided (data-sieving) write routines for the BG ADIO driver
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bg.h"
|
||||
#include "adio_extern.h"
|
||||
|
||||
#include "ad_bg_tuning.h"
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
void ADIOI_BG_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size;
|
||||
ADIO_Offset len;
|
||||
static char myname[] = "ADIOI_BG_WRITECONTIG";
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5036, 0, NULL);
|
||||
#endif
|
||||
#if BG_PROFILE
|
||||
/* timing */
|
||||
double io_time, io_time2;
|
||||
|
||||
if (bgmpio_timing) {
|
||||
io_time = MPI_Wtime();
|
||||
bgmpio_prof_cw[ BGMPIO_CIO_DATA_SIZE ] += len;
|
||||
}
|
||||
#endif
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
|
||||
ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */
|
||||
|
||||
#if BG_PROFILE
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* write from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bgmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bgmpio_timing2) bgmpio_prof_cw[ BGMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#else /* BG_PROFILE */
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* write from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#endif /* BG_PROFILE */
|
||||
|
||||
#if BG_PROFILE
|
||||
if (bgmpio_timing) bgmpio_prof_cw[ BGMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
|
||||
#endif
|
||||
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (err == -1) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__, MPI_ERR_IO,
|
||||
"**io",
|
||||
"**io %s", strerror(errno));
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, err);
|
||||
#endif
|
||||
|
||||
*error_code = MPI_SUCCESS;
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5037, 0, NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* ADIOI_BUFFERED_WRITE: copy req_len bytes from buf+userbuf_off to file
 * offset req_off through the read-modify-write buffer `writebuf`, which
 * covers file bytes [writebuf_off, writebuf_off + writebuf_len).
 *
 * When the request starts beyond the buffered window, the window is written
 * back and a new one is read in at req_off; when the request extends past
 * the window, the window is flushed and advanced until the whole request has
 * been copied.  Unless fd->atomicity is set, each window is write-locked
 * while it is held and unlocked after it is written back.
 *
 * Uses these names from the expansion site: fd, buf, writebuf, writebuf_off,
 * writebuf_len, write_sz, req_off, req_len, userbuf_off, max_bufsize,
 * end_offset, err, err_flag, error_code and myname.
 *
 * If re-reading a window fails, the macro releases the lock it has just
 * taken, sets *error_code and returns from the enclosing function.
 * NOTE(review): that early return does not free writebuf or release any
 * lock taken by the enclosing function - confirm the call sites. */
#define ADIOI_BUFFERED_WRITE \
{ \
    if (req_off >= writebuf_off + writebuf_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        writebuf_off = req_off; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = read(fd->fd_sys, writebuf, writebuf_len); \
        if (err == -1) { \
            if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
                                               MPIR_ERR_RECOVERABLE, myname, \
                                               __LINE__, MPI_ERR_IO, \
                                               "**ioRMWrdwr", 0); \
            return; \
        } \
    } \
    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
    while (write_sz != req_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        req_len -= write_sz; \
        userbuf_off += write_sz; \
        writebuf_off += writebuf_len; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = read(fd->fd_sys, writebuf, writebuf_len); \
        if (err == -1) { \
            if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
                                               MPIR_ERR_RECOVERABLE, myname, \
                                               __LINE__, MPI_ERR_IO, \
                                               "**ioRMWrdwr", 0); \
            return; \
        } \
        write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_len)); \
        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
    } \
}
|
||||
|
||||
|
||||
/* this macro is used when filetype is contig and buftype is not contig.
   it does not do a read-modify-write: a full window is simply written out
   and the next one is filled from the user buffer.  Unless fd->atomicity is
   set, each window is write-locked only around its own write() call.

   Uses these names from the expansion site: fd, buf, writebuf, writebuf_off,
   writebuf_len, write_sz, req_off, req_len, userbuf_off, max_bufsize,
   end_offset, err and err_flag.  A failed write() only sets err_flag. */
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
{ \
    if (req_off >= writebuf_off + writebuf_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        writebuf_off = req_off; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
    } \
    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
    while (write_sz != req_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        req_len -= write_sz; \
        userbuf_off += write_sz; \
        writebuf_off += writebuf_len; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_len)); \
        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
    } \
}
|
||||
|
||||
|
||||
|
||||
void ADIOI_BG_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code)
|
||||
{
|
||||
/* offset is in units of etype relative to the filetype. */
|
||||
|
||||
|
||||
|
||||
ADIOI_Flatlist_node *flat_buf, *flat_file;
|
||||
ADIO_Offset i_offset, sum, size_in_filetype;
|
||||
int i, j, k, err=-1, st_index=0;
|
||||
int n_etypes_in_filetype;
|
||||
ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
|
||||
ADIO_Offset abs_off_in_filetype=0;
|
||||
int filetype_size, etype_size, buftype_size;
|
||||
MPI_Aint filetype_extent, buftype_extent;
|
||||
int buf_count, buftype_is_contig, filetype_is_contig;
|
||||
ADIO_Offset userbuf_off;
|
||||
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
|
||||
char *writebuf, *value;
|
||||
unsigned bufsize, writebuf_len, max_bufsize, write_sz;
|
||||
int err_flag=0, info_flag;
|
||||
ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
|
||||
static char myname[] = "ADIOI_BG_WRITESTRIDED";
|
||||
|
||||
if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
|
||||
/* if user has disabled data sieving on reads, use naive
|
||||
* approach instead.
|
||||
*/
|
||||
/*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/
|
||||
ADIOI_GEN_WriteStrided_naive(fd,
|
||||
buf,
|
||||
count,
|
||||
datatype,
|
||||
file_ptr_type,
|
||||
offset,
|
||||
status,
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
|
||||
|
||||
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
|
||||
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
|
||||
|
||||
MPI_Type_size(fd->filetype, &filetype_size);
|
||||
if ( ! filetype_size ) {
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, 0);
|
||||
#endif
|
||||
*error_code = MPI_SUCCESS;
|
||||
return;
|
||||
}
|
||||
|
||||
MPI_Type_extent(fd->filetype, &filetype_extent);
|
||||
MPI_Type_size(datatype, &buftype_size);
|
||||
MPI_Type_extent(datatype, &buftype_extent);
|
||||
etype_size = fd->etype_size;
|
||||
|
||||
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
|
||||
bufsize = buftype_size * count;
|
||||
|
||||
/* get max_bufsize from the info object. */
|
||||
|
||||
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
|
||||
ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
|
||||
&info_flag);
|
||||
max_bufsize = atoi(value);
|
||||
ADIOI_Free(value);
|
||||
|
||||
if (!buftype_is_contig && filetype_is_contig) {
|
||||
|
||||
/* noncontiguous in memory, contiguous in file. */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
|
||||
fd->disp + etype_size * offset;
|
||||
|
||||
start_off = off;
|
||||
end_offset = off + bufsize - 1;
|
||||
writebuf_off = off;
|
||||
writebuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
|
||||
|
||||
/* if atomicity is true, lock the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
for (j=0; j<count; j++)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<flat_buf->count; i++) {
|
||||
userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
|
||||
req_off = off;
|
||||
req_len = flat_buf->blocklens[i];
|
||||
ADIOI_BUFFERED_WRITE_WITHOUT_READ
|
||||
off += flat_buf->blocklens[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* write the buffer out finally */
|
||||
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
err = write(fd->fd_sys, writebuf, writebuf_len);
|
||||
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
if (fd->atomicity)
|
||||
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
else { /* noncontiguous in file */
|
||||
|
||||
/* filetype already flattened in ADIO_Open */
|
||||
flat_file = ADIOI_Flatlist;
|
||||
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
|
||||
disp = fd->disp;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* Wei-keng reworked type processing to be a bit more efficient */
|
||||
offset = fd->fp_ind - disp;
|
||||
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
|
||||
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
|
||||
/* now offset is local to this extent */
|
||||
|
||||
/* find the block where offset is located, skip blocklens[i]==0 */
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
ADIO_Offset dist;
|
||||
if (flat_file->blocklens[i] == 0) continue;
|
||||
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
|
||||
/* fwr_size is from offset to the end of block i */
|
||||
if (dist == 0) {
|
||||
i++;
|
||||
offset = flat_file->indices[i];
|
||||
fwr_size = flat_file->blocklens[i];
|
||||
break;
|
||||
}
|
||||
if (dist > 0) {
|
||||
fwr_size = dist;
|
||||
break;
|
||||
}
|
||||
}
|
||||
st_index = i; /* starting index in flat_file->indices[] */
|
||||
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
|
||||
}
|
||||
else {
|
||||
int i;
|
||||
n_etypes_in_filetype = filetype_size/etype_size;
|
||||
n_filetypes = offset / n_etypes_in_filetype;
|
||||
etype_in_filetype = offset % n_etypes_in_filetype;
|
||||
size_in_filetype = etype_in_filetype * etype_size;
|
||||
|
||||
sum = 0;
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
sum += flat_file->blocklens[i];
|
||||
if (sum > size_in_filetype) {
|
||||
st_index = i;
|
||||
fwr_size = sum - size_in_filetype;
|
||||
abs_off_in_filetype = flat_file->indices[i] +
|
||||
size_in_filetype - (sum - flat_file->blocklens[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* abs. offset in bytes in the file */
|
||||
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
|
||||
abs_off_in_filetype;
|
||||
}
|
||||
|
||||
start_off = offset;
|
||||
/* Wei-keng Liao:write request is within single flat_file contig block*/
|
||||
/* this could happen, for example, with subarray types that are
|
||||
* actually fairly contiguous */
|
||||
if (buftype_is_contig && bufsize <= fwr_size) {
|
||||
ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
|
||||
offset, status, error_code);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* update MPI-IO file pointer to point to the first byte
|
||||
* that can be accessed in the fileview. */
|
||||
fd->fp_ind = offset + bufsize;
|
||||
if (bufsize == fwr_size) {
|
||||
do {
|
||||
st_index++;
|
||||
if (st_index == flat_file->count) {
|
||||
st_index = 0;
|
||||
n_filetypes++;
|
||||
}
|
||||
} while (flat_file->blocklens[st_index] == 0);
|
||||
fd->fp_ind = disp + flat_file->indices[st_index]
|
||||
+ (ADIO_Offset)n_filetypes*filetype_extent;
|
||||
}
|
||||
}
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
/* Calculate end_offset, the last byte-offset that will be accessed.
|
||||
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
|
||||
|
||||
st_fwr_size = fwr_size;
|
||||
st_n_filetypes = n_filetypes;
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
i_offset += fwr_size;
|
||||
end_offset = off + fwr_size - 1;
|
||||
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
|
||||
}
|
||||
|
||||
/* if atomicity is true, lock the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
/* initial read for the read-modify-write */
|
||||
writebuf_off = offset;
|
||||
writebuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
|
||||
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
|
||||
err = read(fd->fd_sys, writebuf, writebuf_len);
|
||||
if (err == -1) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__,
|
||||
MPI_ERR_IO,
|
||||
"ADIOI_BG_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
|
||||
return;
|
||||
}
|
||||
|
||||
if (buftype_is_contig && !filetype_is_contig) {
|
||||
|
||||
/* contiguous in memory, noncontiguous in file. should be the most
|
||||
common case. */
|
||||
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
if (fwr_size) {
|
||||
/* TYPE_UB and TYPE_LB can result in
|
||||
fwr_size = 0. save system call in such cases */
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/
|
||||
|
||||
req_off = off;
|
||||
req_len = fwr_size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_WRITE
|
||||
}
|
||||
i_offset += fwr_size;
|
||||
|
||||
if (off + fwr_size < disp + flat_file->indices[j] +
|
||||
flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
|
||||
off += fwr_size;
|
||||
/* did not reach end of contiguous block in filetype.
|
||||
no more I/O needed. off is incremented by fwr_size. */
|
||||
else {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
fwr_size = ADIOI_MIN(flat_file->blocklens[j],
|
||||
bufsize-i_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* noncontiguous in memory as well as in file */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
k = num = buf_count = 0;
|
||||
i_offset = flat_buf->indices[0];
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
fwr_size = st_fwr_size;
|
||||
bwr_size = flat_buf->blocklens[0];
|
||||
|
||||
while (num < bufsize) {
|
||||
size = ADIOI_MIN(fwr_size, bwr_size);
|
||||
if (size) {
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */
|
||||
|
||||
req_off = off;
|
||||
req_len = size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_WRITE
|
||||
}
|
||||
|
||||
new_fwr_size = fwr_size;
|
||||
new_bwr_size = bwr_size;
|
||||
|
||||
if (size == fwr_size) {
|
||||
/* reached end of contiguous block in file */
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
|
||||
new_fwr_size = flat_file->blocklens[j];
|
||||
if (size != bwr_size) {
|
||||
i_offset += size;
|
||||
new_bwr_size -= size;
|
||||
}
|
||||
}
|
||||
|
||||
if (size == bwr_size) {
|
||||
/* reached end of contiguous block in memory */
|
||||
|
||||
k = (k + 1)%flat_buf->count;
|
||||
buf_count++;
|
||||
i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
|
||||
flat_buf->indices[k];
|
||||
new_bwr_size = flat_buf->blocklens[k];
|
||||
if (size != fwr_size) {
|
||||
off += size;
|
||||
new_fwr_size -= size;
|
||||
}
|
||||
}
|
||||
num += size;
|
||||
fwr_size = new_fwr_size;
|
||||
bwr_size = new_bwr_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* write the buffer out finally */
|
||||
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
err = write(fd->fd_sys, writebuf, writebuf_len);
|
||||
|
||||
if (!(fd->atomicity))
|
||||
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
/* This is a temporary way of filling in status. The right way is to
|
||||
keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
|
||||
#endif
|
||||
|
||||
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
## -*- Mode: Makefile; -*-
|
||||
## vim: set ft=automake :
|
||||
##
|
||||
## (C) 2011 by Argonne National Laboratory.
|
||||
## See COPYRIGHT in top-level directory.
|
||||
##
|
||||
|
||||
if BUILD_AD_BGL

## Headers of the ad_bgl driver; listed as noinst, so they are not installed.
noinst_HEADERS +=                    \
    adio/ad_bgl/ad_bgl.h             \
    adio/ad_bgl/ad_bgl_aggrs.h       \
    adio/ad_bgl/ad_bgl_pset.h        \
    adio/ad_bgl/ad_bgl_tuning.h

## Sources of the ad_bgl driver, appended to the shared ROMIO source list
## only when the BUILD_AD_BGL conditional is true.
romio_other_sources +=               \
    adio/ad_bgl/ad_bgl_open.c        \
    adio/ad_bgl/ad_bgl_close.c       \
    adio/ad_bgl/ad_bgl_fcntl.c       \
    adio/ad_bgl/ad_bgl_flush.c       \
    adio/ad_bgl/ad_bgl_read.c        \
    adio/ad_bgl/ad_bgl_write.c       \
    adio/ad_bgl/ad_bgl_getsh.c       \
    adio/ad_bgl/ad_bgl_setsh.c       \
    adio/ad_bgl/ad_bgl.c             \
    adio/ad_bgl/ad_bgl_aggrs.c       \
    adio/ad_bgl/ad_bgl_pset.c        \
    adio/ad_bgl/ad_bgl_hints.c       \
    adio/ad_bgl/ad_bgl_rdcoll.c      \
    adio/ad_bgl/ad_bgl_wrcoll.c      \
    adio/ad_bgl/ad_bgl_tuning.c

endif BUILD_AD_BGL
|
||||
|
@ -1,97 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl.h
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#ifndef AD_BGL_INCLUDE
#define AD_BGL_INCLUDE

/* System headers, then the generic ADIO interface. */
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

/* Optional headers, pulled in only when configure detected them. */
#ifdef HAVE_SIGNAL_H
#include <signal.h>
#endif
#ifdef HAVE_AIO_H
#include <aio.h>
#endif

/* Compiled out: asynchronous-I/O helper. */
#if 0
int ADIOI_BGL_aio(ADIO_File fd, void *buf, int len, ADIO_Offset offset,
                  int wr, void *handle);
#endif

/* ---- open / close ---- */
void ADIOI_BGL_Open(ADIO_File fd, int *error_code);
void ADIOI_BGL_Close(ADIO_File fd, int *error_code);

/* ---- contiguous read / write ---- */
void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
                          MPI_Datatype datatype, int file_ptr_type,
                          ADIO_Offset offset, ADIO_Status *status,
                          int *error_code);
void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status,
                           int *error_code);

/* Compiled out: nonblocking contiguous I/O and its completion calls. */
#if 0
void ADIOI_BGL_IwriteContig(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Request *request,
                            int *error_code);
void ADIOI_BGL_IreadContig(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Request *request,
                           int *error_code);
int ADIOI_BGL_ReadDone(ADIO_Request *request, ADIO_Status *status,
                       int *error_code);
int ADIOI_BGL_WriteDone(ADIO_Request *request, ADIO_Status *status,
                        int *error_code);
void ADIOI_BGL_ReadComplete(ADIO_Request *request, ADIO_Status *status,
                            int *error_code);
void ADIOI_BGL_WriteComplete(ADIO_Request *request, ADIO_Status *status,
                             int *error_code);
#endif

/* ---- fcntl / hints ---- */
void ADIOI_BGL_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
                     int *error_code);
void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);

/* ---- strided (independent) read / write ---- */
void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
                            MPI_Datatype datatype, int file_ptr_type,
                            ADIO_Offset offset, ADIO_Status *status,
                            int *error_code);
void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
                           MPI_Datatype datatype, int file_ptr_type,
                           ADIO_Offset offset, ADIO_Status *status,
                           int *error_code);

/* ---- strided collective read / write ---- */
void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
                               MPI_Datatype datatype, int file_ptr_type,
                               ADIO_Offset offset, ADIO_Status *status,
                               int *error_code);
void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
                                MPI_Datatype datatype, int file_ptr_type,
                                ADIO_Offset offset, ADIO_Status *status,
                                int *error_code);

/* ---- shared file pointer ---- */
void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp,
                             int *error_code);
void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset,
                             int *error_code);

/* ---- flush ---- */
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code);

/* Included last, after the prototypes above. */
#include "ad_bgl_tuning.h"

#endif
|
@ -1,966 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_aggrs.c
|
||||
 * \brief The externally used function from this file is declared in ad_bgl_aggrs.h
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997-2001 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_cb_config_list.h"
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_pset.h"
|
||||
#include "ad_bgl_aggrs.h"
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_DBG_LOGGING
|
||||
#define AGG_DEBUG 1
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
static int aggrsInPsetSize=0;
|
||||
static int *aggrsInPset=NULL;
|
||||
|
||||
/* Comments copied from common:
|
||||
* This file contains four functions:
|
||||
*
|
||||
* ADIOI_Calc_aggregator()
|
||||
* ADIOI_Calc_file_domains()
|
||||
* ADIOI_Calc_my_req()
|
||||
* ADIOI_Calc_others_req()
|
||||
*
|
||||
* The last three of these were originally in ad_read_coll.c, but they are
|
||||
* also shared with ad_write_coll.c. I felt that they were better kept with
|
||||
* the rest of the shared aggregation code.
|
||||
*/
|
||||
|
||||
/* Discussion of values available from above:
|
||||
*
|
||||
* ADIO_Offset st_offsets[0..nprocs-1]
|
||||
* ADIO_Offset end_offsets[0..nprocs-1]
|
||||
* These contain a list of start and end offsets for each process in
|
||||
* the communicator. For example, an access at loc 10, size 10 would
|
||||
* have a start offset of 10 and end offset of 19.
|
||||
* int nprocs
|
||||
* number of processors in the collective I/O communicator
|
||||
* ADIO_Offset min_st_offset
|
||||
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
|
||||
* starting location of "file domain"; region that a given process will
|
||||
* perform aggregation for (i.e. actually do I/O)
|
||||
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
|
||||
* start + size - 1 roughly, but it can be less, or 0, in the case of
|
||||
* uneven distributions
|
||||
*/
|
||||
|
||||
/* forward declaration */
|
||||
static void
|
||||
ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
|
||||
const ADIOI_BGL_ConfInfo_t *confInfo,
|
||||
ADIOI_BGL_ProcInfo_t *all_procInfo,
|
||||
int *aggrsInPset );
|
||||
|
||||
/*
|
||||
* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
|
||||
* The parameters are
|
||||
* . the number of aggregators (proxies) : fd->hints->cb_nodes
|
||||
* . the ranks of the aggregators : fd->hints->ranklist
|
||||
* By compute these two parameters in a BGL-PSET-aware way, the default 2-phase collective IO of
|
||||
* ADIO can work more efficiently.
|
||||
*/
|
||||
int
|
||||
ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
|
||||
{
|
||||
int r, s;
|
||||
ADIOI_BGL_ProcInfo_t *procInfo, *all_procInfo;
|
||||
ADIOI_BGL_ConfInfo_t *confInfo;
|
||||
|
||||
MPI_Comm_size( fd->comm, &s );
|
||||
MPI_Comm_rank( fd->comm, &r );
|
||||
|
||||
/* Collect individual BGL personality information */
|
||||
confInfo = ADIOI_BGL_ConfInfo_new ();
|
||||
procInfo = ADIOI_BGL_ProcInfo_new ();
|
||||
ADIOI_BGL_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset );
|
||||
|
||||
/* Gather BGL personality infomation onto process 0 */
|
||||
// if (r == 0)
|
||||
all_procInfo = ADIOI_BGL_ProcInfo_new_n (s);
|
||||
if(s > aggrsInPsetSize)
|
||||
{
|
||||
if(aggrsInPset) ADIOI_Free(aggrsInPset);
|
||||
aggrsInPset = (int *) ADIOI_Malloc (s *sizeof(int));
|
||||
aggrsInPsetSize = s;
|
||||
}
|
||||
|
||||
|
||||
MPI_Gather( (void *)procInfo, sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE,
|
||||
(void *)all_procInfo, sizeof(ADIOI_BGL_ProcInfo_t), MPI_BYTE,
|
||||
0,
|
||||
fd->comm );
|
||||
|
||||
/* Compute a list of the ranks of chosen IO proxy CN on process 0 */
|
||||
if (r == 0) {
|
||||
ADIOI_BGL_compute_agg_ranklist_serial (fd, confInfo, all_procInfo, aggrsInPset);
|
||||
// ADIOI_BGL_ProcInfo_free (all_procInfo);
|
||||
}
|
||||
ADIOI_BGL_ProcInfo_free (all_procInfo);
|
||||
|
||||
/* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
|
||||
Declared in adio_cb_config_list.h */
|
||||
ADIOI_cb_bcast_rank_map(fd);
|
||||
|
||||
/* Broadcast the BGL-GPFS related file domain info */
|
||||
MPI_Bcast( (void *)aggrsInPset,
|
||||
fd->hints->cb_nodes * sizeof(int), MPI_BYTE,
|
||||
0,
|
||||
fd->comm );
|
||||
|
||||
ADIOI_BGL_persInfo_free( confInfo, procInfo );
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* the purpose of abstracting out this routine is to make it easy for trying different proxy-selection criteria.
|
||||
*/
|
||||
static int
|
||||
ADIOI_BGL_select_agg_in_pset (const ADIOI_BGL_ConfInfo_t *confInfo,
|
||||
ADIOI_BGL_ProcInfo_t *pset_procInfo,
|
||||
int nCN_in_pset,
|
||||
int *tmp_ranklist)
|
||||
{
|
||||
/* first implementation, based on their rank order. */
|
||||
|
||||
int i, j, k;
|
||||
|
||||
/* The number of aggregators in the PSET is proportional to the CNs in the PSET */
|
||||
int nAggrs = nCN_in_pset * confInfo->aggRatio;
|
||||
if (nAggrs < ADIOI_BGL_NAGG_PSET_MIN) nAggrs = ADIOI_BGL_NAGG_PSET_MIN;
|
||||
|
||||
/* for not virtual-node-mode, pick aggregators in this PSET based on the order of the global rank */
|
||||
if (!confInfo->isVNM)
|
||||
{
|
||||
for (i=0; i<nAggrs; i++) tmp_ranklist[i] = pset_procInfo[i].rank;
|
||||
}
|
||||
|
||||
/* for virtual-node-mode, first pick aggregators among CPU-0 */
|
||||
else
|
||||
{
|
||||
/* Try to pick from CPU-0 first, then CPU-1, then ... CPU-n */
|
||||
j = 0;
|
||||
for (k=0; k < confInfo->cpuidSize; k++){
|
||||
for (i=0; i< nCN_in_pset ; i++) {
|
||||
if (pset_procInfo[i].cpuid == k)
|
||||
tmp_ranklist[j++] = pset_procInfo[i].rank;
|
||||
if ( j >= nAggrs) break;
|
||||
}
|
||||
if ( j >= nAggrs) break;
|
||||
}
|
||||
}
|
||||
|
||||
return nAggrs;
|
||||
}
|
||||
|
||||
/*
|
||||
* Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
|
||||
* The first order of tmp_ranklist is : PSET number
|
||||
* The secondary order of the list is determined in ADIOI_BGL_select_agg_in_pset() and thus adjustable.
|
||||
*/
|
||||
static int
|
||||
ADIOI_BGL_compute_agg_ranklist_serial_do (const ADIOI_BGL_ConfInfo_t *confInfo,
|
||||
ADIOI_BGL_ProcInfo_t *all_procInfo,
|
||||
int *aggrsInPset,
|
||||
int *tmp_ranklist)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
/* a list of the numbers of all the PSETS */
|
||||
int *psetNumList = (int *) ADIOI_Malloc ( confInfo->nProcs * sizeof(int) );
|
||||
|
||||
/* sweep through all processes' records, collect the numbers of all the PSETS.
|
||||
* The reason for not doing MIN, MAX is that the owned PSETs may not have contiguous numbers */
|
||||
int n_psets=0;
|
||||
for (i=0; i<confInfo->nProcs; i++) {
|
||||
|
||||
ADIOI_BGL_ProcInfo_t *info_p = all_procInfo+i;
|
||||
|
||||
int exist = 0;
|
||||
for (j=n_psets-1; j>=0; j--)
|
||||
if (info_p->psetNum == psetNumList[j]) { exist=1; break; }
|
||||
|
||||
if (!exist) {
|
||||
psetNumList [n_psets] = info_p->psetNum;
|
||||
n_psets ++;
|
||||
}
|
||||
}
|
||||
|
||||
/* bucket sort: put the CN nodes into ordered buckets, each of which represents a PSET */
|
||||
|
||||
/* bucket space for bucket sort */
|
||||
ADIOI_BGL_ProcInfo_t *sorted_procInfo = ADIOI_BGL_ProcInfo_new_n ( n_psets * confInfo->virtualPsetSize );
|
||||
int *PsetIdx = (int *) ADIOI_Malloc ( n_psets * sizeof(int) );
|
||||
AD_BGL_assert ( (PsetIdx != NULL) );
|
||||
|
||||
/* initialize bucket pointer */
|
||||
for (i=0; i<n_psets; i++) {
|
||||
PsetIdx[i] = i*confInfo->virtualPsetSize;
|
||||
}
|
||||
|
||||
/* sort */
|
||||
for (i=0; i<confInfo->nProcs; i++) {
|
||||
int pset_id = all_procInfo[i].psetNum;
|
||||
|
||||
for (j=n_psets-1; j>=0; j--) if (pset_id == psetNumList[j]) break;
|
||||
AD_BGL_assert ( (j >= 0) ); /* got to find a PSET bucket */
|
||||
|
||||
sorted_procInfo[ PsetIdx[j] ++ ] = all_procInfo[i];
|
||||
}
|
||||
|
||||
ADIOI_Free(psetNumList);
|
||||
|
||||
/* select a number of CN aggregators from each Pset */
|
||||
int naggs = 0;
|
||||
for (i=0; i<n_psets; i++) {
|
||||
|
||||
/* the number of CN in this PSET -- may not be a full PSET */
|
||||
int nCN_in_pset = PsetIdx[i] - i*confInfo->virtualPsetSize;
|
||||
|
||||
/* select aggregators and put them into tmp_ranklist contiguously. */
|
||||
int local_naggs = ADIOI_BGL_select_agg_in_pset( confInfo,
|
||||
sorted_procInfo + i*confInfo->virtualPsetSize,
|
||||
nCN_in_pset,
|
||||
tmp_ranklist + naggs);
|
||||
aggrsInPset[i+1] = local_naggs;
|
||||
|
||||
naggs += local_naggs;
|
||||
}
|
||||
aggrsInPset[0] = n_psets;
|
||||
|
||||
/* leave */
|
||||
ADIOI_Free ( PsetIdx );
|
||||
ADIOI_BGL_ProcInfo_free ( sorted_procInfo );
|
||||
return naggs;
|
||||
}
|
||||
|
||||
/*
|
||||
* compute aggregators ranklist and put it into fd->hints struct
|
||||
*/
|
||||
static void
|
||||
ADIOI_BGL_compute_agg_ranklist_serial ( ADIO_File fd,
|
||||
const ADIOI_BGL_ConfInfo_t *confInfo,
|
||||
ADIOI_BGL_ProcInfo_t *all_procInfo,
|
||||
int *aggrsInPset )
|
||||
{
|
||||
# if AGG_DEBUG
|
||||
int i;
|
||||
# endif
|
||||
int naggs;
|
||||
int *tmp_ranklist;
|
||||
|
||||
/* compute the ranklist of IO aggregators and put into tmp_ranklist */
|
||||
tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
|
||||
|
||||
# if AGG_DEBUG
|
||||
for (i=0; i<confInfo->nProcs; i++) {
|
||||
DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].cpuid, all_procInfo[i].rank );
|
||||
}
|
||||
# endif
|
||||
|
||||
naggs =
|
||||
ADIOI_BGL_compute_agg_ranklist_serial_do (confInfo, all_procInfo, aggrsInPset, tmp_ranklist);
|
||||
|
||||
# define VERIFY 0
|
||||
# if VERIFY
|
||||
DBG_FPRINTF(stderr, "\tconfInfo = %3d,%3d,%3d,%3d,%3d,%3d,%.4f; naggs = %d\n",
|
||||
confInfo->PsetSize ,
|
||||
confInfo->numPsets ,
|
||||
confInfo->isVNM ,
|
||||
confInfo->virtualPsetSize ,
|
||||
confInfo->nProcs ,
|
||||
confInfo->nAggrs ,
|
||||
confInfo->aggRatio ,
|
||||
naggs );
|
||||
# endif
|
||||
|
||||
# if AGG_DEBUG
|
||||
for (i=0; i<naggs; i++) {
|
||||
DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
|
||||
}
|
||||
# endif
|
||||
|
||||
/* copy the ranklist of IO aggregators to fd->hints */
|
||||
if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
|
||||
|
||||
fd->hints->cb_nodes = naggs;
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
|
||||
|
||||
/* */
|
||||
ADIOI_Free( tmp_ranklist );
|
||||
return;
|
||||
}
|
||||
|
||||
/* Description from common/ad_aggregate.c. (Does it completely apply to bgl?)
|
||||
* ADIOI_Calc_aggregator()
|
||||
*
|
||||
* The intention here is to implement a function which provides basically
|
||||
* the same functionality as in Rajeev's original version of
|
||||
* ADIOI_Calc_my_req(). He used a ceiling division approach to assign the
|
||||
* file domains, and we use the same approach here when calculating the
|
||||
* location of an offset/len in a specific file domain. Further we assume
|
||||
* this same distribution when calculating the rank_index, which is later
|
||||
* used to map to a specific process rank in charge of the file domain.
|
||||
*
|
||||
* A better (i.e. more general) approach would be to use the list of file
|
||||
* domains only. This would be slower in the case where the
|
||||
* original ceiling division was used, but it would allow for arbitrary
|
||||
* distributions of regions to aggregators. We'd need to know the
|
||||
* nprocs_for_coll in that case though, which we don't have now.
|
||||
*
|
||||
* Note a significant difference between this function and Rajeev's old code:
|
||||
* this code doesn't necessarily return a rank in the range
|
||||
* 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a
|
||||
* result of the rank mapping; any set of ranks in the communicator could be
|
||||
* used now.
|
||||
*
|
||||
* Returns an integer representing a rank in the collective I/O communicator.
|
||||
*
|
||||
* The "len" parameter is also modified to indicate the amount of data
|
||||
* actually available in this file domain.
|
||||
*/
|
||||
/*
|
||||
* This is more general aggregator search function which does not base on the assumption
|
||||
* that each aggregator hosts the file domain with the same size
|
||||
*/
|
||||
int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
|
||||
ADIO_Offset off,
|
||||
ADIO_Offset min_off,
|
||||
ADIO_Offset *len,
|
||||
ADIO_Offset fd_size,
|
||||
ADIO_Offset *fd_start,
|
||||
ADIO_Offset *fd_end)
|
||||
{
|
||||
int rank_index, rank;
|
||||
ADIO_Offset avail_bytes;
|
||||
|
||||
AD_BGL_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
|
||||
|
||||
/* binary search --> rank_index is returned */
|
||||
int ub = fd->hints->cb_nodes;
|
||||
int lb = 0;
|
||||
/* get an index into our array of aggregators */
|
||||
/* Common code for striping - bgl doesn't use it but it's
|
||||
here to make diff'ing easier.
|
||||
rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1);
|
||||
|
||||
if (fd->hints->striping_unit > 0) {
|
||||
* wkliao: implementation for file domain alignment
|
||||
fd_start[] and fd_end[] have been aligned with file lock
|
||||
boundaries when returned from ADIOI_Calc_file_domains() so cannot
|
||||
just use simple arithmatic as above *
|
||||
rank_index = 0;
|
||||
while (off > fd_end[rank_index]) rank_index++;
|
||||
}
|
||||
bgl does it's own striping below
|
||||
*/
|
||||
rank_index = fd->hints->cb_nodes / 2;
|
||||
while ( off < fd_start[rank_index] || off > fd_end[rank_index] ) {
|
||||
if ( off > fd_end [rank_index] ) {
|
||||
lb = rank_index;
|
||||
rank_index = (rank_index + ub) / 2;
|
||||
}
|
||||
else
|
||||
if ( off < fd_start[rank_index] ) {
|
||||
ub = rank_index;
|
||||
rank_index = (rank_index + lb) / 2;
|
||||
}
|
||||
}
|
||||
/* we index into fd_end with rank_index, and fd_end was allocated to be no
|
||||
* bigger than fd->hins->cb_nodes. If we ever violate that, we're
|
||||
* overrunning arrays. Obviously, we should never ever hit this abort */
|
||||
if (rank_index >= fd->hints->cb_nodes || rank_index < 0) {
|
||||
FPRINTF(stderr, "Error in ADIOI_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size=%lld off=%lld\n",
|
||||
rank_index,fd->hints->cb_nodes,fd_size,off);
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
// DBG_FPRINTF ("ADIOI_BGL_Calc_aggregator: rank_index = %d\n", rank_index );
|
||||
|
||||
/*
|
||||
* remember here that even in Rajeev's original code it was the case that
|
||||
* different aggregators could end up with different amounts of data to
|
||||
* aggregate. here we use fd_end[] to make sure that we know how much
|
||||
* data this aggregator is working with.
|
||||
*
|
||||
* the +1 is to take into account the end vs. length issue.
|
||||
*/
|
||||
avail_bytes = fd_end[rank_index] + 1 - off;
|
||||
if (avail_bytes < *len && avail_bytes > 0) {
|
||||
/* this file domain only has part of the requested contig. region */
|
||||
|
||||
*len = avail_bytes;
|
||||
}
|
||||
|
||||
/* map our index to a rank */
|
||||
/* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
|
||||
rank = fd->hints->ranklist[rank_index];
|
||||
|
||||
return rank;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute a dynamic access range based file domain partition among I/O aggregators,
|
||||
* which align to the GPFS block size
|
||||
* Divide the I/O workload among "nprocs_for_coll" processes. This is
|
||||
* done by (logically) dividing the file into file domains (FDs); each
|
||||
* process may directly access only its own file domain.
|
||||
* Additional effort is to make sure that each I/O aggregator get
|
||||
* a file domain that aligns to the GPFS block size. So, there will
|
||||
* not be any false sharing of GPFS file blocks among multiple I/O nodes.
|
||||
*
|
||||
* The common version of this now accepts a min_fd_size and striping_unit.
|
||||
* It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
|
||||
* (e.g. we could pass striping unit instead of using fs_ptr->blksize).
|
||||
*/
|
||||
/*
 * ADIOI_BGL_GPFS_Calc_file_domains() - split the aggregate access range of
 * all processes into one file domain per aggregator, sizing the domains in
 * whole file system blocks.
 *
 * st_offsets, end_offsets  first/last byte accessed by each process
 *                          (nprocs entries each)
 * nprocs_for_coll          number of file domains to create
 * min_st_offset_ptr        out: smallest start offset over all processes
 * fd_start_ptr, fd_end_ptr out: arrays of nprocs_for_coll entries holding
 *                          the first/last byte of each domain; allocated
 *                          here with ADIOI_Malloc and not freed here
 * fd_size_ptr              out: size of the first file domain
 * fs_ptr                   ADIOI_BGL_fs providing blksize; a NULL pointer
 *                          or a zero blksize selects the 1M default
 */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int nprocs,
                                      int nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void *fs_ptr)
{
    ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
    int i, aggr;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5004, 0, NULL);
#endif

#   if AGG_DEBUG
    static char myname[] = "ADIOI_BGL_GPFS_Calc_file_domains";
    DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
                myname,__LINE__,nprocs_for_coll);
#   endif

    /* pick the block size: file system value if usable, else 1M */
    __blksize_t blksize = 1048576;
    if (fs_ptr) {
        ADIOI_BGL_fs *bgl_fs = (ADIOI_BGL_fs *) fs_ptr;
        if (bgl_fs->blksize)
            blksize = bgl_fs->blksize;
    }
#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
#   endif

    /* overall extent of the access: lowest start, highest end */
    min_st_offset  = st_offsets[0];
    max_end_offset = end_offsets[0];
    for (i = 1; i < nprocs; i++) {
        if (st_offsets[i] < min_st_offset)
            min_st_offset = st_offsets[i];
        if (end_offsets[i] > max_end_offset)
            max_end_offset = end_offsets[i];
    }

    /* Round the extent outwards to block multiples.  The *_rdoff values are
     * the distances between the rounded bounds and the real ones; they are
     * taken back off the first and last domain further down. */
    ADIO_Offset gpfs_ub       = (max_end_offset + blksize - 1) / blksize * blksize - 1;
    ADIO_Offset gpfs_lb       = min_st_offset / blksize * blksize;
    ADIO_Offset gpfs_ub_rdoff = gpfs_ub - max_end_offset;
    ADIO_Offset gpfs_lb_rdoff = min_st_offset - gpfs_lb;
    ADIO_Offset fd_gpfs_range = gpfs_ub - gpfs_lb + 1;

    int naggs = nprocs_for_coll;

    /* The common code additionally enforces a min_fd_size here.  That is
     * not implemented for BlueGene: the hint processing has already chosen
     * a number of aggregators per pset, and growing fd_size would leave
     * some psets with all of their aggregators busy and others with none. */

    fd_size       = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_start_ptr = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    *fd_end_ptr   = (ADIO_Offset *) ADIOI_Malloc(nprocs_for_coll * sizeof(ADIO_Offset));
    fd_start = *fd_start_ptr;
    fd_end   = *fd_end_ptr;

    /* Distribute whole blocks: every aggregator gets nb_cn_small blocks and
     * the last naggs_large aggregators get one block more. */
    ADIO_Offset n_gpfs_blk  = fd_gpfs_range / blksize;
    ADIO_Offset nb_cn_small = n_gpfs_blk / naggs;
    ADIO_Offset naggs_large = n_gpfs_blk % naggs;
    ADIO_Offset naggs_small = naggs - naggs_large;

    for (i = 0; i < naggs; i++) {
        ADIO_Offset nblocks = (i < naggs_small) ? nb_cn_small : nb_cn_small + 1;
        fd_size[i] = nblocks * blksize;
    }
    /* potential optimization: if n_gpfs_blk is smaller than naggs, slip in
     * some zero-sized file domains to spread the work across all psets. */

#   if AGG_DEBUG
    DBG_FPRINTF(stderr,"%s(%d): "
                "gpfs_ub %llu, "
                "gpfs_lb %llu, "
                "gpfs_ub_rdoff %llu, "
                "gpfs_lb_rdoff %llu, "
                "fd_gpfs_range %llu, "
                "n_gpfs_blk %llu, "
                "nb_cn_small %llu, "
                "naggs_large %llu, "
                "naggs_small %llu, "
                "\n",
                myname,__LINE__,
                gpfs_ub,
                gpfs_lb,
                gpfs_ub_rdoff,
                gpfs_lb_rdoff,
                fd_gpfs_range,
                n_gpfs_blk,
                nb_cn_small,
                naggs_large,
                naggs_small
                );
#   endif

    /* trim the rounding slack so the domains span exactly
     * [min_st_offset, max_end_offset] */
    fd_size[0]         -= gpfs_lb_rdoff;
    fd_size[naggs - 1] -= gpfs_ub_rdoff;

    /* lay the domains out back to back starting at min_st_offset */
    ADIO_Offset offset = min_st_offset;
    for (aggr = 0; aggr < naggs; aggr++) {
        fd_start[aggr] = offset;
        offset        += fd_size[aggr];
        fd_end[aggr]   = offset - 1;
    }

    *fd_size_ptr       = fd_size[0];
    *min_st_offset_ptr = min_st_offset;

#ifdef AGGREGATION_PROFILE
    MPE_Log_event (5005, 0, NULL);
#endif
    ADIOI_Free (fd_size);
}
|
||||
|
||||
/*
|
||||
* When a process is an IO aggregator, this will return its index in the aggrs list.
|
||||
* Otherwise, this will return -1
|
||||
*/
|
||||
/*
 * Look up myrank in fd->hints->ranklist (cb_nodes entries).
 * Returns the position of the first match, or -1 when myrank is not listed.
 */
int ADIOI_BGL_Aggrs_index( ADIO_File fd, int myrank )
{
    int pos = 0;

    while (pos < fd->hints->cb_nodes) {
        if (fd->hints->ranklist[pos] == myrank)
            return pos;
        pos++;
    }
    return -1;
}
|
||||
|
||||
/*
|
||||
* ADIOI_BGL_Calc_my_req() overrides ADIOI_Calc_my_req() because the default implementation
|
||||
* is specific to static file domain partitioning.
|
||||
*
|
||||
* ADIOI_Calc_my_req() - calculate what portions of the access requests
|
||||
* of this process are located in the file domains of various processes
|
||||
* (including this one)
|
||||
*/
|
||||
void ADIOI_BGL_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
|
||||
int contig_access_count, ADIO_Offset
|
||||
min_st_offset, ADIO_Offset *fd_start,
|
||||
ADIO_Offset *fd_end, ADIO_Offset fd_size,
|
||||
int nprocs,
|
||||
int *count_my_req_procs_ptr,
|
||||
int **count_my_req_per_proc_ptr,
|
||||
ADIOI_Access **my_req_ptr,
|
||||
int **buf_idx_ptr)
|
||||
/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets?
|
||||
They are used as memory buffer indices so it seems like the 2G limit is in effect */
|
||||
{
|
||||
int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
|
||||
int i, l, proc;
|
||||
ADIO_Offset fd_len, rem_len, curr_idx, off;
|
||||
ADIOI_Access *my_req;
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5024, 0, NULL);
|
||||
#endif
|
||||
|
||||
*count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
|
||||
count_my_req_per_proc = *count_my_req_per_proc_ptr;
|
||||
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
|
||||
process in process i's file domain. calloc initializes to zero.
|
||||
I'm allocating memory of size nprocs, so that I can do an
|
||||
MPI_Alltoall later on.*/
|
||||
|
||||
buf_idx = (int *) ADIOI_Malloc(nprocs*sizeof(int));
|
||||
/* buf_idx is relevant only if buftype_is_contig.
|
||||
buf_idx[i] gives the index into user_buf where data received
|
||||
from proc. i should be placed. This allows receives to be done
|
||||
without extra buffer. This can't be done if buftype is not contig. */
|
||||
|
||||
/* initialize buf_idx to -1 */
|
||||
for (i=0; i < nprocs; i++) buf_idx[i] = -1;
|
||||
|
||||
/* one pass just to calculate how much space to allocate for my_req;
|
||||
* contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
|
||||
*/
|
||||
for (i=0; i < contig_access_count; i++) {
|
||||
/* short circuit offset/len processing if len == 0
|
||||
* (zero-byte read/write */
|
||||
if (len_list[i] == 0)
|
||||
continue;
|
||||
off = offset_list[i];
|
||||
fd_len = len_list[i];
|
||||
/* note: we set fd_len to be the total size of the access. then
|
||||
* ADIOI_Calc_aggregator() will modify the value to return the
|
||||
* amount that was available from the file domain that holds the
|
||||
* first part of the access.
|
||||
*/
|
||||
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
|
||||
fd_start, fd_end);
|
||||
count_my_req_per_proc[proc]++;
|
||||
|
||||
/* figure out how much data is remaining in the access (i.e. wasn't
|
||||
* part of the file domain that had the starting byte); we'll take
|
||||
* care of this data (if there is any) in the while loop below.
|
||||
*/
|
||||
rem_len = len_list[i] - fd_len;
|
||||
|
||||
while (rem_len > 0) {
|
||||
off += fd_len; /* point to first remaining byte */
|
||||
fd_len = rem_len; /* save remaining size, pass to calc */
|
||||
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
|
||||
fd_size, fd_start, fd_end);
|
||||
|
||||
count_my_req_per_proc[proc]++;
|
||||
rem_len -= fd_len; /* reduce remaining length by amount from fd */
|
||||
}
|
||||
}
|
||||
|
||||
/* now allocate space for my_req, offset, and len */
|
||||
|
||||
*my_req_ptr = (ADIOI_Access *)
|
||||
ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
|
||||
my_req = *my_req_ptr;
|
||||
|
||||
count_my_req_procs = 0;
|
||||
for (i=0; i < nprocs; i++) {
|
||||
if (count_my_req_per_proc[i]) {
|
||||
my_req[i].offsets = (ADIO_Offset *)
|
||||
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
|
||||
my_req[i].lens = (int *)
|
||||
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
|
||||
count_my_req_procs++;
|
||||
}
|
||||
my_req[i].count = 0; /* will be incremented where needed
|
||||
later */
|
||||
}
|
||||
|
||||
/* now fill in my_req */
|
||||
curr_idx = 0;
|
||||
for (i=0; i<contig_access_count; i++) {
|
||||
/* short circuit offset/len processing if len == 0
|
||||
* (zero-byte read/write */
|
||||
if (len_list[i] == 0)
|
||||
continue;
|
||||
off = offset_list[i];
|
||||
fd_len = len_list[i];
|
||||
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
|
||||
fd_start, fd_end);
|
||||
|
||||
/* for each separate contiguous access from this process */
|
||||
if (buf_idx[proc] == -1)
|
||||
{
|
||||
ADIOI_Assert(curr_idx == (int) curr_idx);
|
||||
buf_idx[proc] = (int) curr_idx;
|
||||
}
|
||||
|
||||
l = my_req[proc].count;
|
||||
curr_idx += fd_len;
|
||||
|
||||
rem_len = len_list[i] - fd_len;
|
||||
|
||||
/* store the proc, offset, and len information in an array
|
||||
* of structures, my_req. Each structure contains the
|
||||
* offsets and lengths located in that process's FD,
|
||||
* and the associated count.
|
||||
*/
|
||||
my_req[proc].offsets[l] = off;
|
||||
ADIOI_Assert(fd_len == (int) fd_len);
|
||||
my_req[proc].lens[l] = (int) fd_len;
|
||||
my_req[proc].count++;
|
||||
|
||||
while (rem_len > 0) {
|
||||
off += fd_len;
|
||||
fd_len = rem_len;
|
||||
proc = ADIOI_BGL_Calc_aggregator(fd, off, min_st_offset, &fd_len,
|
||||
fd_size, fd_start, fd_end);
|
||||
|
||||
if (buf_idx[proc] == -1)
|
||||
{
|
||||
ADIOI_Assert(curr_idx == (int) curr_idx);
|
||||
buf_idx[proc] = (int) curr_idx;
|
||||
}
|
||||
|
||||
l = my_req[proc].count;
|
||||
curr_idx += fd_len;
|
||||
rem_len -= fd_len;
|
||||
|
||||
my_req[proc].offsets[l] = off;
|
||||
ADIOI_Assert(fd_len == (int) fd_len);
|
||||
my_req[proc].lens[l] = (int) fd_len;
|
||||
my_req[proc].count++;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef AGG_DEBUG
|
||||
for (i=0; i<nprocs; i++) {
|
||||
if (count_my_req_per_proc[i] > 0) {
|
||||
DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
|
||||
my_req[i].count);
|
||||
for (l=0; l < my_req[i].count; l++) {
|
||||
DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %d\n", l,
|
||||
my_req[i].offsets[l], l, my_req[i].lens[l]);
|
||||
}
|
||||
}
|
||||
DBG_FPRINTF(stderr, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
*count_my_req_procs_ptr = count_my_req_procs;
|
||||
*buf_idx_ptr = buf_idx;
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5025, 0, NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* ADIOI_Calc_others_req (copied to bgl and switched to all to all for performance)
|
||||
*
|
||||
* param[in] count_my_req_procs Number of processes whose file domain my
|
||||
* request touches.
|
||||
* param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
|
||||
* contig. requests of this process in
|
||||
* process i's file domain.
|
||||
* param[in] my_req A structure defining my request
|
||||
* param[in] nprocs Number of nodes in the block
|
||||
* param[in] myrank Rank of this node
|
||||
* param[out] count_others_req_proc_ptr Number of processes whose requests lie in
|
||||
* my process's file domain (including my
|
||||
* process itself)
|
||||
* param[out] others_req_ptr Array of other process' requests that lie
|
||||
* in my process's file domain
|
||||
*/
|
||||
void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
|
||||
int *count_my_req_per_proc,
|
||||
ADIOI_Access *my_req,
|
||||
int nprocs, int myrank,
|
||||
int *count_others_req_procs_ptr,
|
||||
ADIOI_Access **others_req_ptr)
|
||||
{
|
||||
/* determine what requests of other processes lie in this process's
|
||||
file domain */
|
||||
|
||||
/* count_others_req_procs = number of processes whose requests lie in
|
||||
this process's file domain (including this process itself)
|
||||
count_others_req_per_proc[i] indicates how many separate contiguous
|
||||
requests of proc. i lie in this process's file domain. */
|
||||
|
||||
int *count_others_req_per_proc, count_others_req_procs;
|
||||
int i;
|
||||
ADIOI_Access *others_req;
|
||||
|
||||
/* Parameters for MPI_Alltoallv */
|
||||
int *scounts, *sdispls, *rcounts, *rdispls;
|
||||
|
||||
/* Parameters for MPI_Alltoallv. These are the buffers, which
|
||||
* are later computed to be the lowest address of all buffers
|
||||
* to be sent/received for offsets and lengths. Initialize to
|
||||
* the highest possible address which is the current minimum.
|
||||
*/
|
||||
void *sendBufForOffsets=(void*)0xFFFFFFFF,
|
||||
*sendBufForLens =(void*)0xFFFFFFFF,
|
||||
*recvBufForOffsets=(void*)0xFFFFFFFF,
|
||||
*recvBufForLens =(void*)0xFFFFFFFF;
|
||||
|
||||
/* first find out how much to send/recv and from/to whom */
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5026, 0, NULL);
|
||||
#endif
|
||||
/* Send 1 int to each process. count_my_req_per_proc[i] is the number of
|
||||
* requests that my process will do to the file domain owned by process[i].
|
||||
* Receive 1 int from each process. count_others_req_per_proc[i] is the number of
|
||||
* requests that process[i] will do to the file domain owned by my process.
|
||||
*/
|
||||
count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
|
||||
/* cora2a1=timebase(); */
|
||||
MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
|
||||
count_others_req_per_proc, 1, MPI_INT, fd->comm);
|
||||
/* total_cora2a+=timebase()-cora2a1; */
|
||||
|
||||
/* Allocate storage for an array of other nodes' accesses of our
|
||||
* node's file domain. Also allocate storage for the alltoallv
|
||||
* parameters.
|
||||
*/
|
||||
*others_req_ptr = (ADIOI_Access *)
|
||||
ADIOI_Malloc(nprocs*sizeof(ADIOI_Access));
|
||||
others_req = *others_req_ptr;
|
||||
|
||||
scounts = ADIOI_Malloc(nprocs*sizeof(int));
|
||||
sdispls = ADIOI_Malloc(nprocs*sizeof(int));
|
||||
rcounts = ADIOI_Malloc(nprocs*sizeof(int));
|
||||
rdispls = ADIOI_Malloc(nprocs*sizeof(int));
|
||||
|
||||
/* If process[i] has any requests in my file domain,
|
||||
* initialize an ADIOI_Access structure that will describe each request
|
||||
* from process[i]. The offsets, lengths, and buffer pointers still need
|
||||
* to be obtained to complete the setting of this structure.
|
||||
*/
|
||||
count_others_req_procs = 0;
|
||||
for (i=0; i<nprocs; i++) {
|
||||
if (count_others_req_per_proc[i]) {
|
||||
others_req[i].count = count_others_req_per_proc[i];
|
||||
|
||||
others_req[i].offsets = (ADIO_Offset *)
|
||||
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
|
||||
others_req[i].lens = (int *)
|
||||
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int));
|
||||
|
||||
if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
|
||||
recvBufForOffsets = others_req[i].offsets;
|
||||
if ( (MPIR_Upint)others_req[i].lens < (MPIR_Upint)recvBufForLens )
|
||||
recvBufForLens = others_req[i].lens;
|
||||
|
||||
others_req[i].mem_ptrs = (MPI_Aint *)
|
||||
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(MPI_Aint));
|
||||
|
||||
count_others_req_procs++;
|
||||
}
|
||||
else
|
||||
{
|
||||
others_req[i].count = 0;
|
||||
others_req[i].offsets = NULL;
|
||||
others_req[i].lens = NULL;
|
||||
}
|
||||
}
|
||||
/* If no recv buffer was allocated in the loop above, make it NULL */
|
||||
if ( recvBufForOffsets == (void*)0xFFFFFFFF) recvBufForOffsets = NULL;
|
||||
if ( recvBufForLens == (void*)0xFFFFFFFF) recvBufForLens = NULL;
|
||||
|
||||
/* Now send the calculated offsets and lengths to respective processes */
|
||||
|
||||
/************************/
|
||||
/* Exchange the offsets */
|
||||
/************************/
|
||||
|
||||
/* Determine the lowest sendBufForOffsets/Lens */
|
||||
for (i=0; i<nprocs; i++)
|
||||
{
|
||||
if ( (my_req[i].count) &&
|
||||
((MPIR_Upint)my_req[i].offsets <= (MPIR_Upint)sendBufForOffsets) )
|
||||
sendBufForOffsets = my_req[i].offsets;
|
||||
|
||||
if ( (my_req[i].count) &&
|
||||
((MPIR_Upint)my_req[i].lens <= (MPIR_Upint)sendBufForLens) )
|
||||
sendBufForLens = my_req[i].lens;
|
||||
}
|
||||
|
||||
/* If no send buffer was found in the loop above, make it NULL */
|
||||
if ( sendBufForOffsets == (void*)0xFFFFFFFF) sendBufForOffsets = NULL;
|
||||
if ( sendBufForLens == (void*)0xFFFFFFFF) sendBufForLens = NULL;
|
||||
|
||||
/* Calculate the displacements from the sendBufForOffsets/Lens */
|
||||
for (i=0; i<nprocs; i++)
|
||||
{
|
||||
// Send these offsets to process i.
|
||||
scounts[i] = count_my_req_per_proc[i];
|
||||
if ( scounts[i] == 0 )
|
||||
sdispls[i] = 0;
|
||||
else
|
||||
sdispls[i] = (int)
|
||||
( ( (MPIR_Upint)my_req[i].offsets -
|
||||
(MPIR_Upint)sendBufForOffsets ) /
|
||||
(MPIR_Upint)sizeof(ADIO_Offset) );
|
||||
|
||||
// Receive these offsets from process i.
|
||||
rcounts[i] = count_others_req_per_proc[i];
|
||||
if ( rcounts[i] == 0 )
|
||||
rdispls[i] = 0;
|
||||
else
|
||||
rdispls[i] = (int)
|
||||
( ( (MPIR_Upint)others_req[i].offsets -
|
||||
(MPIR_Upint)recvBufForOffsets ) /
|
||||
(MPIR_Upint)sizeof(ADIO_Offset) );
|
||||
}
|
||||
|
||||
/* Exchange the offsets */
|
||||
MPI_Alltoallv(sendBufForOffsets,
|
||||
scounts, sdispls, ADIO_OFFSET,
|
||||
recvBufForOffsets,
|
||||
rcounts, rdispls, ADIO_OFFSET,
|
||||
fd->comm);
|
||||
|
||||
/************************/
|
||||
/* Exchange the lengths */
|
||||
/************************/
|
||||
|
||||
for (i=0; i<nprocs; i++)
|
||||
{
|
||||
// Send these lengths to process i.
|
||||
scounts[i] = count_my_req_per_proc[i];
|
||||
if ( scounts[i] == 0 )
|
||||
sdispls[i] = 0;
|
||||
else
|
||||
sdispls[i] = (int)
|
||||
( ( (MPIR_Upint)my_req[i].lens -
|
||||
(MPIR_Upint)sendBufForLens ) /
|
||||
(MPIR_Upint) sizeof(int) );
|
||||
|
||||
// Receive these offsets from process i.
|
||||
rcounts[i] = count_others_req_per_proc[i];
|
||||
if ( rcounts[i] == 0 )
|
||||
rdispls[i] = 0;
|
||||
else
|
||||
rdispls[i] = (int)
|
||||
( ( (MPIR_Upint)others_req[i].lens -
|
||||
(MPIR_Upint)recvBufForLens ) /
|
||||
(MPIR_Upint) sizeof(int) );
|
||||
}
|
||||
|
||||
/* Exchange the lengths */
|
||||
MPI_Alltoallv(sendBufForLens,
|
||||
scounts, sdispls, MPI_INT,
|
||||
recvBufForLens,
|
||||
rcounts, rdispls, MPI_INT,
|
||||
fd->comm);
|
||||
|
||||
/* Clean up */
|
||||
ADIOI_Free(count_others_req_per_proc);
|
||||
ADIOI_Free (scounts);
|
||||
ADIOI_Free (sdispls);
|
||||
ADIOI_Free (rcounts);
|
||||
ADIOI_Free (rdispls);
|
||||
|
||||
*count_others_req_procs_ptr = count_others_req_procs;
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5027, 0, NULL);
|
||||
#endif
|
||||
}
|
@ -1,108 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_aggrs.h
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/*
|
||||
* File: ad_bgl_aggrs.h
|
||||
*
|
||||
* Declares functions specific for BG/L - GPFS parallel I/O solution. The implemented optimizations are:
|
||||
* . Aligned file-domain partitioning, integrated in 7/28/2005
|
||||
*
|
||||
* In addition, following optimizations are planned:
|
||||
* . Integrating multiple file-domain partitioning schemes
|
||||
* (corresponding to Alok Chouhdary's persistent file domain work).
|
||||
*/
|
||||
|
||||
#ifndef AD_BGL_AGGRS_H_
#define AD_BGL_AGGRS_H_

#include "adio.h"
#include <sys/stat.h>

/* file system magic numbers, defined here only if not already available */
#if !defined(GPFS_SUPER_MAGIC)
#define GPFS_SUPER_MAGIC (0x47504653)
#endif

#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif

/* "fsync aggregation" flags stored in ADIOI_BGL_fs.fsync_aggr */
#define ADIOI_BGL_FSYNC_AGGREGATION_DISABLED 0x00
#define ADIOI_BGL_FSYNC_AGGREGATION_ENABLED  0x01
#define ADIOI_BGL_FSYNC_AGGREGATOR           0x10 /* This rank is an aggregator */

/* File system (BGL) specific information -
   hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
typedef struct ADIOI_BGL_fs_s {
    __blksize_t blksize;
    int         fsync_aggr;  /* ADIOI_BGL_FSYNC_* flags (above) */
} ADIOI_BGL_fs;

/* Generate a list of I/O aggregators that utilizes the BGL pset
   organization. */
int ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);

/* Overrides ADIOI_Calc_file_domains() to apply 'aligned file domain
   partitioning'. */
void ADIOI_BGL_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
                                      ADIO_Offset *end_offsets,
                                      int nprocs,
                                      int nprocs_for_coll,
                                      ADIO_Offset *min_st_offset_ptr,
                                      ADIO_Offset **fd_start_ptr,
                                      ADIO_Offset **fd_end_ptr,
                                      ADIO_Offset *fd_size_ptr,
                                      void *fs_ptr);

/* A utility function for debugging. */
int ADIOI_BGL_Aggrs_index(ADIO_File fd, int myrank );

/* Overrides ADIOI_Calc_aggregator(), because the default implementation is
   specific to static file domain partitioning. */
int ADIOI_BGL_Calc_aggregator(ADIO_File fd,
                              ADIO_Offset off,
                              ADIO_Offset min_off,
                              ADIO_Offset *len,
                              ADIO_Offset fd_size,
                              ADIO_Offset *fd_start,
                              ADIO_Offset *fd_end);

/* Overrides ADIOI_Calc_my_req(), because the default implementation is
   specific to static file domain partitioning. */
void ADIOI_BGL_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
                             int contig_access_count, ADIO_Offset min_st_offset,
                             ADIO_Offset *fd_start, ADIO_Offset *fd_end,
                             ADIO_Offset fd_size, int nprocs,
                             int *count_my_req_procs_ptr,
                             int **count_my_req_per_proc_ptr,
                             ADIOI_Access **my_req_ptr,
                             int **buf_idx_ptr);

/*
 * ADIOI_Calc_others_req
 *
 * param[in]  count_my_req_procs        Number of processes whose file domain my
 *                                        request touches.
 * param[in]  count_my_req_per_proc     count_my_req_per_proc[i] gives the no. of
 *                                        contig. requests of this process in
 *                                        process i's file domain.
 * param[in]  my_req                    A structure defining my request
 * param[in]  nprocs                    Number of nodes in the block
 * param[in]  myrank                    Rank of this node
 * param[out] count_others_req_proc_ptr Number of processes whose requests lie in
 *                                        my process's file domain (including my
 *                                        process itself)
 * param[out] others_req_ptr            Array of other process' requests that lie
 *                                        in my process's file domain
 */
void ADIOI_BGL_Calc_others_req(ADIO_File fd, int count_my_req_procs,
                               int *count_my_req_per_proc,
                               ADIOI_Access *my_req,
                               int nprocs, int myrank,
                               int *count_others_req_procs_ptr,
                               ADIOI_Access **others_req_ptr);

#endif /* AD_BGL_AGGRS_H_ */
|
@ -1,58 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_fcntl.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "adio_extern.h"
|
||||
/* #ifdef MPISGI
|
||||
#include "mpisgi2.h"
|
||||
#endif */
|
||||
|
||||
/*
 * ADIOI_BGL_Fcntl() - file control operations for the BGL driver.
 *
 * flag selects the operation:
 *   ADIO_FCNTL_GET_FSIZE      store the file size in fcntl_struct->fsize
 *   ADIO_FCNTL_SET_DISKSPACE  preallocate fcntl_struct->diskspace via
 *                             ADIOI_GEN_Prealloc()
 *   ADIO_FCNTL_SET_ATOMICITY  set fd->atomicity to 0 or 1
 * Any other flag yields an MPI_ERR_ARG error code.
 *
 * error_code receives MPI_SUCCESS or an error code created with
 * MPIO_Err_create_code() (for SET_DISKSPACE, whatever ADIOI_GEN_Prealloc
 * stores).
 */
void ADIOI_BGL_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
                     int *error_code)
{
    static char myname[] = "ADIOI_BGL_FCNTL";

    switch(flag) {
    case ADIO_FCNTL_GET_FSIZE:
    {
        int size_errno;

        /* seeking to the end yields the size ... */
        fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END);
        /* Capture errno right away: the repositioning lseek below may
         * change it before a failure of the call above is reported. */
        size_errno = errno;

        /* ... then put the system file pointer back where ADIO thinks it
         * is, if that position is known */
        if (fd->fp_sys_posn != -1)
            lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET);

        if (fcntl_struct->fsize == -1) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(size_errno));
        }
        else *error_code = MPI_SUCCESS;
        break;
    }

    case ADIO_FCNTL_SET_DISKSPACE:
        ADIOI_GEN_Prealloc(fd, fcntl_struct->diskspace, error_code);
        break;

    case ADIO_FCNTL_SET_ATOMICITY:
        /* normalize any non-zero request to 1 */
        fd->atomicity = (fcntl_struct->atomicity == 0) ? 0 : 1;
        *error_code = MPI_SUCCESS;
        break;

    /* --BEGIN ERROR HANDLING-- */
    default:
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_ARG,
                                           "**flag", "**flag %d", flag);
    /* --END ERROR HANDLING-- */
    }
}
|
@ -1,90 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_flush.c
|
||||
* \brief Scalable flush based on underlying filesystem and psets
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_aggrs.h"
|
||||
|
||||
/*
 * ADIOI_BGL_Flush() - fsync the file, optionally in aggregated form.
 *
 * With fsync aggregation enabled in the file's ADIOI_BGL_fs, all ranks of
 * fd->comm synchronize, only the ranks flagged as fsync aggregators call
 * fsync(), and the outcome is shared with an allreduce.  Otherwise every
 * caller issues its own fsync().
 *
 * error_code receives MPI_SUCCESS, or an MPI_ERR_IO error code built from
 * errno when the flush failed.
 */
void ADIOI_BGL_Flush(ADIO_File fd, int *error_code)
{
    static char myname[] = "ADIOI_BGL_FLUSH";
    ADIOI_BGL_fs *bgl_fs = (ADIOI_BGL_fs *) fd->fs_ptr;
    int err = 0;

    if (bgl_fs->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATION_ENABLED) {
        int rank;

        /* Barrier so we can collectively do fewer fsync's */
        MPI_Barrier(fd->comm);

        MPI_Comm_rank(fd->comm, &rank);

        /* Every rank marked as an "fsync aggregator" does the fsync.  The
           results are combined with an allreduce rather than a bcast, so
           this works for any number of aggregators. */
        if (bgl_fs->fsync_aggr & ADIOI_BGL_FSYNC_AGGREGATOR) {
            err = fsync(fd->fd_sys);
            DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
            /* from here on err carries an errno value (0 = success), not
               the fsync return code */
            err = (err == -1) ? errno : 0;
        }

        /* Just pick an errno (using unsigned MPI_MAX) from any failures */
        MPI_Allreduce( MPI_IN_PLACE, (unsigned*)&err, 1, MPI_UNSIGNED, MPI_MAX, fd->comm);
        DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);

        /* translate a non-zero reduced errno back into the
           "-1 plus errno" convention used below */
        if (err) {
            errno = err;
            err = -1;
        }
    }
    else {
        /* Non-aggregated fsync */
#ifdef USE_DBG_LOGGING
        int rank;
#endif
        err = fsync(fd->fd_sys);
#ifdef USE_DBG_LOGGING
        MPI_Comm_rank(fd->comm, &rank);

        if (rank == 0) {
            DBG_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
        }
        else {
            DBGV_FPRINTF(stderr,"no aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
        }
#endif
    }

    if (err != -1) {
        *error_code = MPI_SUCCESS;
        return;
    }

    /* --BEGIN ERROR HANDLING-- */
    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                       myname, __LINE__, MPI_ERR_IO,
                                       "**io",
                                       "**io %s", strerror(errno));
    DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, errno);
    /* --END ERROR HANDLING-- */
}
|
||||
|
@ -1,84 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_getsh.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
|
||||
/* returns the current location of the shared_fp in terms of the
|
||||
no. of etypes relative to the current view, and also increments the
|
||||
shared_fp by the number of etypes to be accessed (incr) in the read
|
||||
or write following this function. */
|
||||
|
||||
/**
 * Fetch the shared file pointer and advance it by incr.
 *
 * The shared file pointer lives in a small helper file (fd->shared_fp_fd)
 * that is opened on first use.  Under a write lock on its first
 * sizeof(ADIO_Offset) bytes, the current value is read into *shared_fp and
 * the value plus incr is written back.
 *
 * \param fd          file whose shared file pointer is requested
 * \param incr        amount added to the stored pointer
 * \param shared_fp   receives the pointer value before the increment
 * \param error_code  MPI_SUCCESS, the error from ADIO_Open, or an
 *                    MPI_ERR_IO class code if lseek/read/write returned -1
 */
void ADIOI_BGL_Get_shared_fp(ADIO_File fd, int incr, ADIO_Offset *shared_fp,
                             int *error_code)
{
    static char myname[] = "ADIOI_BGL_GET_SHARED_FP";
    ADIO_Offset updated_fp;
    int rc;

    if (fd->shared_fp_fd != ADIO_FILE_NULL) {
        /* helper file already open: lock it, rewind and read the value */
        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

        rc = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
        if (rc == 0) {
            rc = read(fd->shared_fp_fd->fd_sys, shared_fp,
                      sizeof(ADIO_Offset));
        }
        if (rc == -1) {
            ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(errno));
            return;
        }
    }
    else {
        /* first use: open the helper file on a private communicator */
        MPI_Comm self_dup;

        MPI_Comm_dup(MPI_COMM_SELF, &self_dup);
        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF,
                                     self_dup,
                                     fd->shared_fp_fname,
                                     fd->file_system,
                                     fd->fns,
                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
                                     0,
                                     MPI_BYTE,
                                     MPI_BYTE,
                                     MPI_INFO_NULL,
                                     ADIO_PERM_NULL,
                                     error_code);
        if (*error_code != MPI_SUCCESS) return;

        *shared_fp = 0;
        ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
        /* The result of this read is intentionally not checked: if the file
           is empty the read may fail (reading beyond end of file), and the
           0 stored in *shared_fp above is then the correct value. */
        rc = read(fd->shared_fp_fd->fd_sys, shared_fp, sizeof(ADIO_Offset));
    }

    updated_fp = *shared_fp + incr;

    /* store the advanced pointer, then drop the lock before reporting */
    rc = lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
    if (rc == 0) {
        rc = write(fd->shared_fp_fd->fd_sys, &updated_fp, sizeof(ADIO_Offset));
    }
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

    if (rc != -1) {
        *error_code = MPI_SUCCESS;
        return;
    }

    *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                       myname, __LINE__, MPI_ERR_IO,
                                       "**io",
                                       "**io %s", strerror(errno));
}
|
@ -1,542 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_hints.c
|
||||
* \brief BlueGene hint processing
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_extern.h"
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_pset.h"
|
||||
#include "ad_bgl_aggrs.h"
|
||||
|
||||
#define ADIOI_BGL_CB_BUFFER_SIZE_DFLT "16777216"
|
||||
#define ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT "4194304"
|
||||
#define ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT "4194304"
|
||||
#define ADIOI_BGL_NAGG_IN_PSET_HINT_NAME "bgl_nodes_pset"
|
||||
/** \page mpiio_vars MPIIO Configuration
|
||||
*
|
||||
* BlueGene MPIIO configuration and performance tuning. Used by ad_bgl and ad_bglockless ADIO's.
|
||||
*
|
||||
* \section hint_sec Hints
|
||||
* - bgl_nodes_pset - Specify how many aggregators to use per pset.
|
||||
* This hint will override the cb_nodes hint based on BlueGene psets.
|
||||
* - N - Use N nodes per pset as aggregators.
|
||||
* - Default is based on partition configuration and cb_nodes.
|
||||
*
|
||||
* The following default key/value pairs may differ from other platform defaults.
|
||||
*
|
||||
* - key = cb_buffer_size value = 16777216
|
||||
* - key = romio_cb_read value = enable
|
||||
* - key = romio_cb_write value = enable
|
||||
* - key = ind_rd_buffer_size value = 4194304
|
||||
* - key = ind_wr_buffer_size value = 4194304
|
||||
*/
|
||||
|
||||
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
|
||||
extern int
|
||||
ADIOI_BGL_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
|
||||
|
||||
/**
 * Establish the hints (fd->info and fd->hints) for a BlueGene file.
 *
 * If fd->info is null a new info object is created.  On the first call for
 * a file (fd->hints->initialized is 0) fd->info and fd->hints are filled
 * with the BlueGene defaults.  Recognized keys found in users_info then
 * override those defaults.  If the defaults were just installed or the
 * per-pset aggregator hint was supplied, the aggregator rank list is
 * regenerated with ADIOI_BGL_gen_agg_ranklist().
 *
 * \param fd          file whose hints are being set
 * \param users_info  hints supplied by the user, or MPI_INFO_NULL
 * \param error_code  MPI_SUCCESS, or the code produced by
 *                    MPIO_ERR_CREATE_CODE_INFO_NOT_SAME when a hint value
 *                    differs from the one broadcast by rank 0
 */
void ADIOI_BGL_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
    MPI_Info info;
    char *value;
    int flag, intval, tmp_val, nprocs=0;
    static char myname[] = "ADIOI_BGL_SETINFO";

    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here.
     */

    /* Scratch buffer for hint values.  It is heap allocated, so every
     * return path below must release it with ADIOI_Free(). */
    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    AD_BGL_assert ((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized
     */
    if (!fd->hints->initialized) {

        did_anything = 1;

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_BGL_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_BGL_CB_BUFFER_SIZE_DFLT);

        /* the default here is to enable collective buffering for both
         * reads and writes
         */
        ADIOI_Info_set(info, "romio_cb_read", "enable");
        fd->hints->cb_read = ADIOI_HINT_ENABLE;
        ADIOI_Info_set(info, "romio_cb_write", "enable");
        fd->hints->cb_write = ADIOI_HINT_ENABLE;

        if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O */
        MPI_Comm_size(fd->comm, &nprocs);
        ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = -1;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* bgl is not implementing file realms (ADIOI_IOStridedColl),
           so initialize them to disabled. */
        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_BGL_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_BGL_IND_WR_BUFFER_SIZE_DFLT);

        if(fd->file_system == ADIO_UFS)
        {
            /* default for ufs/pvfs is to disable data sieving */
            ADIOI_Info_set(info, "romio_ds_read", "disable");
            fd->hints->ds_read = ADIOI_HINT_DISABLE;
            ADIOI_Info_set(info, "romio_ds_write", "disable");
            fd->hints->ds_write = ADIOI_HINT_DISABLE;
        }
        else
        {
            /* default is to let romio automatically decide when to use data
             * sieving
             */
            ADIOI_Info_set(info, "romio_ds_read", "automatic");
            fd->hints->ds_read = ADIOI_HINT_AUTO;
            ADIOI_Info_set(info, "romio_ds_write", "automatic");
            fd->hints->ds_write = ADIOI_HINT_AUTO;
        }

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_get(users_info, "cb_buffer_size", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval=atoi(value)) > 0)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "cb_buffer_size",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "cb_buffer_size", value);
            fd->hints->cb_buffer_size = intval;

        }
#if 0
        /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
        /* aligning file realms to certain sizes (e.g. stripe sizes)
         * may benefit I/O performance */
        ADIOI_Info_get(users_info, "romio_cb_fr_alignment", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval=atoi(value)) > 0)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_fr_alignment",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "romio_cb_fr_alignment", value);
            fd->hints->cb_fr_alignment = intval;

        }

        /* for collective I/O, try to be smarter about when to do data sieving
         * using a specific threshold for the datatype size/extent
         * (percentage 0-100%) */
        ADIOI_Info_get(users_info, "romio_cb_ds_threshold", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval=atoi(value)) > 0)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_ds_threshold",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "romio_cb_ds_threshold", value);
            fd->hints->cb_ds_threshold = intval;

        }
        /* NOTE: this disabled block used to store the parsed value in
         * cb_read while checking cb_alltoall; it now sets cb_alltoall. */
        ADIOI_Info_get(users_info, "romio_cb_alltoall", MPI_MAX_INFO_VAL, value,
                       &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_alltoall", value);
                fd->hints->cb_alltoall = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_cb_alltoall", value);
                fd->hints->cb_alltoall = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                ADIOI_Info_set(info, "romio_cb_alltoall", value);
                fd->hints->cb_alltoall = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_alltoall;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_alltoall) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_alltoall",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */
        }
#endif
        /* new hints for enabling/disabling coll. buffering on
         * reads/writes
         */
        ADIOI_Info_get(users_info, "romio_cb_read", MPI_MAX_INFO_VAL, value,
                       &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_read", value);
                fd->hints->cb_read = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                /* romio_cb_read overrides no_indep_rw */
                ADIOI_Info_set(info, "romio_cb_read", value);
                ADIOI_Info_set(info, "romio_no_indep_rw", "false");
                fd->hints->cb_read = ADIOI_HINT_DISABLE;
                /* no_indep_rw is a 0/1 flag everywhere else in this
                 * function; "false" is 0, not a hint-state constant */
                fd->hints->no_indep_rw = 0;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                ADIOI_Info_set(info, "romio_cb_read", value);
                fd->hints->cb_read = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_read;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_read) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_read",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */
        }
        ADIOI_Info_get(users_info, "romio_cb_write", MPI_MAX_INFO_VAL, value,
                       &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_write", value);
                fd->hints->cb_write = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE"))
            {
                /* romio_cb_write overrides no_indep_rw, too */
                ADIOI_Info_set(info, "romio_cb_write", value);
                ADIOI_Info_set(info, "romio_no_indep_rw", "false");
                fd->hints->cb_write = ADIOI_HINT_DISABLE;
                /* 0/1 flag, see the romio_cb_read case above */
                fd->hints->no_indep_rw = 0;
            }
            else if (!strcmp(value, "automatic") ||
                     !strcmp(value, "AUTOMATIC"))
            {
                ADIOI_Info_set(info, "romio_cb_write", value);
                fd->hints->cb_write = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_write;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_write) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_write",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */
        }

#if 0
        /* bgl is not implementing file realms (ADIOI_IOStridedColl) ... */
        /* enable/disable persistent file realms for collective I/O */
        /* may want to check for no_indep_rdwr hint as well */
        ADIOI_Info_get(users_info, "romio_cb_pfr", MPI_MAX_INFO_VAL, value,
                       &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_cb_pfr", value);
                fd->hints->cb_pfr = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_cb_pfr", value);
                fd->hints->cb_pfr = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                ADIOI_Info_set(info, "romio_cb_pfr", value);
                fd->hints->cb_pfr = ADIOI_HINT_AUTO;
            }

            tmp_val = fd->hints->cb_pfr;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->cb_pfr) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_pfr",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */
        }

        /* file realm assignment types ADIOI_FR_AAR(0),
           ADIOI_FR_FSZ(-1), ADIOI_FR_USR_REALMS(-2), all others specify
           a regular fr size in bytes. probably not the best way... */
        ADIOI_Info_get(users_info, "romio_cb_fr_type", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval=atoi(value)) >= -2)) {
            tmp_val = intval;

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != intval) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_cb_fr_type",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */

            ADIOI_Info_set(info, "romio_cb_fr_type", value);
            fd->hints->cb_fr_type = intval;

        }
#endif
        /* new hint for specifying no indep. read/write will be performed */
        ADIOI_Info_get(users_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL, value,
                       &flag);
        if (flag) {
            if (!strcmp(value, "true") || !strcmp(value, "TRUE")) {
                /* if 'no_indep_rw' set, also hint that we will do
                 * collective buffering: if we aren't doing independent io,
                 * then we have to do collective */
                ADIOI_Info_set(info, "romio_no_indep_rw", value);
                ADIOI_Info_set(info, "romio_cb_write", "enable");
                ADIOI_Info_set(info, "romio_cb_read", "enable");
                fd->hints->no_indep_rw = 1;
                fd->hints->cb_read = 1;
                fd->hints->cb_write = 1;
                tmp_val = 1;
            }
            else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) {
                ADIOI_Info_set(info, "romio_no_indep_rw", value);
                fd->hints->no_indep_rw = 0;
                tmp_val = 0;
            }
            else {
                /* default is above */
                tmp_val = 0;
            }

            MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
            /* --BEGIN ERROR HANDLING-- */
            if (tmp_val != fd->hints->no_indep_rw) {
                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
                                                   "romio_no_indep_rw",
                                                   error_code);
                ADIOI_Free(value);
                return;
            }
            /* --END ERROR HANDLING-- */
        }
        /* new hints for enabling/disabling data sieving on
         * reads/writes
         */
        ADIOI_Info_get(users_info, "romio_ds_read", MPI_MAX_INFO_VAL, value,
                       &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                ADIOI_Info_set(info, "romio_ds_read", value);
                fd->hints->ds_read = ADIOI_HINT_AUTO;
            }
            /* otherwise ignore */
        }
        ADIOI_Info_get(users_info, "romio_ds_write", MPI_MAX_INFO_VAL, value,
                       &flag);
        if (flag) {
            if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) {
                ADIOI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_ENABLE;
            }
            else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) {
                ADIOI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_DISABLE;
            }
            else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC"))
            {
                ADIOI_Info_set(info, "romio_ds_write", value);
                fd->hints->ds_write = ADIOI_HINT_AUTO;
            }
            /* otherwise ignore */
        }

        ADIOI_Info_get(users_info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "ind_wr_buffer_size", value);
            fd->hints->ind_wr_buffer_size = intval;
        }

        ADIOI_Info_get(users_info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {
            ADIOI_Info_set(info, "ind_rd_buffer_size", value);
            fd->hints->ind_rd_buffer_size = intval;
        }

        memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
                       value, &flag);
        if ( flag && ((intval = atoi(value)) > 0) ) {
            ADIOI_Info_set(info, "romio_min_fdomain_size", value);
            fd->hints->min_fdomain_size = intval;
        }
        /* Now we use striping unit in common code so we should
           process hints for it. */
        ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
                       value, &flag);
        if ( flag && ((intval = atoi(value)) > 0) ) {
            ADIOI_Info_set(info, "striping_unit", value);
            fd->hints->striping_unit = intval;
        }

        /* per-pset aggregator count; stored in cb_nodes and passed to
         * ADIOI_BGL_gen_agg_ranklist() below */
        memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {

            did_anything = 1;
            ADIOI_Info_set(info, ADIOI_BGL_NAGG_IN_PSET_HINT_NAME, value);
            fd->hints->cb_nodes = intval;
        }
    }

    /* associate CB aggregators to certain CNs in every involved PSET */
    if (did_anything) {
        ADIOI_BGL_gen_agg_ranklist(fd, fd->hints->cb_nodes);
    }
    /* ignore deferred open hints and do not enable it for bluegene: need all
     * processors in the open path so we can stat-and-broadcast the blocksize
     */
    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
    fd->hints->no_indep_rw = 0;
    fd->hints->deferred_open = 0;

    /* BobC commented this out, but since hint processing runs on both bgl and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* disable data sieving for fs that do not
           support file locking */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
                       value, &flag);
        if (flag) {
            /* get rid of this value if it is set */
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: leave ind_wr_buffer_size alone; used for other cases
         * as well. -- Rob Ross, 04/22/2003
         */
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
|
@ -1,304 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_open.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_aggrs.h"
|
||||
|
||||
#include <sys/statfs.h>
|
||||
#include <sys/vfs.h>
|
||||
|
||||
/* COPIED FROM ad_fstype.c since it is static in that file
|
||||
|
||||
ADIO_FileSysType_parentdir - determines a string pathname for the
|
||||
parent directory of a given filename.
|
||||
|
||||
Input Parameters:
|
||||
. filename - pointer to file name character array
|
||||
|
||||
Output Parameters:
|
||||
. dirnamep - pointer to location in which to store a pointer to a string
|
||||
|
||||
Note that the caller should free the memory located at the pointer returned
|
||||
after the string is no longer needed.
|
||||
*/
|
||||
|
||||
#ifndef PATH_MAX
|
||||
#define PATH_MAX 65535
|
||||
#endif
|
||||
|
||||
/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that
|
||||
here. We assume that S_ISLNK is *always* defined as a macro. If
|
||||
that is not universally true, then add a test to the romio
|
||||
configure that tries to link a program that references S_ISLNK */
|
||||
#if !defined(S_ISLNK)
|
||||
# if defined(S_IFLNK)
|
||||
/* Check for the link bit */
|
||||
# define S_ISLNK(mode) ((mode) & S_IFLNK)
|
||||
# else
|
||||
/* no way to check if it is a link, so say false */
|
||||
# define S_ISLNK(mode) 0
|
||||
# endif
|
||||
#endif /* !(S_ISLNK) */
|
||||
|
||||
/* ADIO_FileSysType_parentdir
|
||||
*
|
||||
* Returns pointer to string in dirnamep; that string is allocated with
|
||||
* strdup and must be free()'d.
|
||||
*/
|
||||
static void ADIO_FileSysType_parentdir(char *filename, char **dirnamep)
|
||||
{
|
||||
int err;
|
||||
char *dir = NULL, *slash;
|
||||
struct stat statbuf;
|
||||
|
||||
err = lstat(filename, &statbuf);
|
||||
|
||||
if (err || (!S_ISLNK(statbuf.st_mode))) {
|
||||
/* no such file, or file is not a link; these are the "normal"
|
||||
* cases where we can just return the parent directory.
|
||||
*/
|
||||
dir = ADIOI_Strdup(filename);
|
||||
}
|
||||
else {
|
||||
/* filename is a symlink. we've presumably already tried
|
||||
* to stat it and found it to be missing (dangling link),
|
||||
* but this code doesn't care if the target is really there
|
||||
* or not.
|
||||
*/
|
||||
int namelen;
|
||||
char *linkbuf;
|
||||
|
||||
linkbuf = ADIOI_Malloc(PATH_MAX+1);
|
||||
namelen = readlink(filename, linkbuf, PATH_MAX+1);
|
||||
if (namelen == -1) {
|
||||
/* something strange has happened between the time that
|
||||
* we determined that this was a link and the time that
|
||||
* we attempted to read it; punt and use the old name.
|
||||
*/
|
||||
dir = ADIOI_Strdup(filename);
|
||||
}
|
||||
else {
|
||||
/* successfully read the link */
|
||||
linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */
|
||||
dir = ADIOI_Strdup(linkbuf);
|
||||
ADIOI_Free(linkbuf);
|
||||
}
|
||||
}
|
||||
|
||||
slash = strrchr(dir, '/');
|
||||
if (!slash) ADIOI_Strncpy(dir, ".", 2);
|
||||
else {
|
||||
if (slash == dir) *(dir + 1) = '\0';
|
||||
else *slash = '\0';
|
||||
}
|
||||
|
||||
*dirnamep = dir;
|
||||
return;
|
||||
}
|
||||
|
||||
/* scaleable_stat
 *
 * Collective over fd->comm.  Rank 0 queries the file's block size (stat64)
 * and file system type (statfs, falling back to the parent directory if
 * the file itself cannot be queried) and broadcasts both to all ranks, so
 * that only one process touches the file system.  The results are stored
 * in the ADIOI_BGL_fs structure at fd->fs_ptr: blksize, and the fsync_aggr
 * flags when the file system type is GPFS_SUPER_MAGIC or
 * bglocklessmpio_f_type (rank 0 is then also marked as the fsync
 * aggregator).
 */
static void scaleable_stat(ADIO_File fd)
{
    struct stat64 bgl_stat;
    struct statfs bgl_statfs;
    int rank, rc;
    char * dir;
    long buf[2];
    MPI_Comm_rank(fd->comm, &rank);

    /* Start from defined values so that a failing stat64/statfs on rank 0
     * never broadcasts uninitialized data: keep the block size already
     * stored in fs_ptr and use a bogus file system magic number. */
    buf[0] = ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize;
    buf[1] = -1;

    if (rank == 0) {
        /* Get the (real) underlying file system block size */
        rc = stat64(fd->filename, &bgl_stat);
        if (rc >= 0)
        {
            buf[0] = bgl_stat.st_blksize;
            DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                         fd->filename,bgl_stat.st_blksize);
        }
        else
        {
            DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);
        }
        /* Get the (real) underlying file system type so we can
         * plan our fsync scaling strategy */
        rc = statfs(fd->filename,&bgl_statfs);
        if (rc >= 0)
        {
            DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",
                         fd->filename,bgl_statfs.f_type);
            buf[1] = bgl_statfs.f_type;
        }
        else
        {
            DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",
                         fd->filename,rc,errno);
            ADIO_FileSysType_parentdir(fd->filename, &dir);
            rc = statfs(dir,&bgl_statfs);
            if (rc >= 0)
            {
                DBGV_FPRINTF(stderr,"Successful statfs '%s'.  Magic number=%#X\n",dir,bgl_statfs.f_type);
                buf[1] = bgl_statfs.f_type;
            }
            else
            {
                /* Hmm.  Guess we'll assume the worst-case, that it's not GPFS
                 * or BGLOCKLESSMPIO_F_TYPE (default PVFS2) below */
                buf[1] = -1; /* bogus magic number */
                DBGV_FPRINTF(stderr,"Statfs '%s' failed with rc=%d, errno=%d\n",dir,rc,errno);
            }
            /* dir was allocated with ADIOI_Strdup, so release it through
             * the ADIOI allocator rather than plain free() */
            ADIOI_Free(dir);
        }
    }
    /* now we can broadcast the stat/statfs data to everyone else */
    MPI_Bcast(buf, 2, MPI_LONG, 0, fd->comm);
    bgl_stat.st_blksize = buf[0];
    bgl_statfs.f_type = buf[1];

    /* data from stat64 */
    /* store the blksize in the file system specific storage */
    ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = bgl_stat.st_blksize;

    /* data from statfs */
    if ((bgl_statfs.f_type == GPFS_SUPER_MAGIC) ||
        (bgl_statfs.f_type == bglocklessmpio_f_type))
    {
        ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BGL_FSYNC_AGGREGATION_ENABLED;

        /* Only one rank is an "fsync aggregator" because only one
         * fsync is needed */
        if (rank == 0)
        {
            ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr |=
                ADIOI_BGL_FSYNC_AGGREGATOR;
            DBG_FPRINTF(stderr,"fsync aggregator %d\n",rank);
        }
        /* else: aggregation enabled but this rank is not an aggregator */
    }
    /* else: other filesystems default to no fsync aggregation */
}
|
||||
|
||||
|
||||
/**
 * Open the file named fd->filename and set up the BlueGene specific state.
 *
 * The ADIO access mode bits in fd->access_mode are translated to open(2)
 * flags, and the permission bits come from fd->perm or, when that is
 * ADIO_PERM_NULL, are derived from the process umask.  On success the
 * ADIOI_BGL_fs structure is allocated at fd->fs_ptr, given defaults
 * (1 MiB block size, no fsync aggregation) and then filled in by
 * scaleable_stat().  With ADIO_APPEND the file pointers are positioned at
 * end of file.
 *
 * \param fd          file to open; fd->fd_sys receives the descriptor
 * \param error_code  MPI_SUCCESS, or an error code chosen from the errno
 *                    that open() reported
 */
void ADIOI_BGL_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode;
    /* errno as left by open(); saved because the logging calls that follow
     * may overwrite the global errno before it is examined */
    int open_errno;
    static char myname[] = "ADIOI_BGL_OPEN";

    /* set internal variables for tuning environment variables */
    ad_bgl_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* umask() can only be read by setting it: set, then restore */
        old_mask = umask(022);
        umask(old_mask);
        perm = old_mask ^ 0666;
    }
    else perm = fd->perm;

    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
    open_errno = errno;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,open_errno);
    fd->fd_direct = -1;

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {
        /* Initialize the ad_bgl file system specific information */
        AD_BGL_assert(fd->fs_ptr == NULL);
        fd->fs_ptr = (ADIOI_BGL_fs*) ADIOI_Malloc(sizeof(ADIOI_BGL_fs));

        ((ADIOI_BGL_fs*)fd->fs_ptr)->blksize = 1048576; /* default to 1M */

        /* default is no fsync aggregation */
        ((ADIOI_BGL_fs*)fd->fs_ptr)->fsync_aggr =
            ADIOI_BGL_FSYNC_AGGREGATION_DISABLED;


#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        scaleable_stat(fd);
#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif
    }

    if (fd->fd_sys == -1) {
        /* leave the errno of the failed open() visible to the caller */
        errno = open_errno;

        if (open_errno == ENAMETOOLONG)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_BAD_FILE,
                                               "**filenamelong",
                                               "**filenamelong %s %d",
                                               fd->filename,
                                               /* %d consumes an int, not a size_t */
                                               (int) strlen(fd->filename));
        else if (open_errno == ENOENT)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_NO_SUCH_FILE,
                                               "**filenoexist",
                                               "**filenoexist %s",
                                               fd->filename);
        else if (open_errno == ENOTDIR || open_errno == ELOOP)
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE,
                                               myname, __LINE__,
                                               MPI_ERR_BAD_FILE,
                                               "**filenamedir",
                                               "**filenamedir %s",
                                               fd->filename);
        else if (open_errno == EACCES) {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_ACCESS,
                                               "**fileaccess",
                                               "**fileaccess %s",
                                               fd->filename );
        }
        else if (open_errno == EROFS) {
            /* Read only file or file system and write access requested */
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_READ_ONLY,
                                               "**ioneedrd", 0 );
        }
        else {
            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                               MPIR_ERR_RECOVERABLE, myname,
                                               __LINE__, MPI_ERR_IO, "**io",
                                               "**io %s", strerror(open_errno));
        }
    }
    else *error_code = MPI_SUCCESS;
}
|
||||
/*
|
||||
*vim: ts=8 sts=4 sw=4 noexpandtab
|
||||
*/
|
@ -1,109 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_pset.c
|
||||
* \brief Definition of functions associated to structs ADIOI_BGL_ProcInfo_t and ADIOI_BGL_ConfInfo_t
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_pset.h"
|
||||
#include "mpidimpl.h"
|
||||
|
||||
ADIOI_BGL_ProcInfo_t *
|
||||
ADIOI_BGL_ProcInfo_new()
|
||||
{
|
||||
ADIOI_BGL_ProcInfo_t *p = (ADIOI_BGL_ProcInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BGL_ProcInfo_t));
|
||||
AD_BGL_assert ((p != NULL));
|
||||
return p;
|
||||
}
|
||||
|
||||
ADIOI_BGL_ProcInfo_t *
|
||||
ADIOI_BGL_ProcInfo_new_n( int n )
|
||||
{
|
||||
ADIOI_BGL_ProcInfo_t *p = (ADIOI_BGL_ProcInfo_t *) ADIOI_Malloc (n * sizeof(ADIOI_BGL_ProcInfo_t));
|
||||
AD_BGL_assert ((p != NULL));
|
||||
return p;
|
||||
}
|
||||
|
||||
void
|
||||
ADIOI_BGL_ProcInfo_free( ADIOI_BGL_ProcInfo_t *info )
|
||||
{
|
||||
if (info != NULL) ADIOI_Free (info);
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
ADIOI_BGL_ProcInfo_set(ADIOI_BGL_ProcInfo_t *info, const DCMF_Hardware_t *hw, int r)
|
||||
{
|
||||
info->psetNum = hw->idOfPset;
|
||||
info->xInPset = hw->xCoord;
|
||||
info->yInPset = hw->yCoord;
|
||||
info->zInPset = hw->zCoord;
|
||||
info->cpuid = hw->tCoord;
|
||||
info->rank = r;
|
||||
info->rankInPset = hw->rankInPset;
|
||||
}
|
||||
|
||||
|
||||
ADIOI_BGL_ConfInfo_t *
|
||||
ADIOI_BGL_ConfInfo_new ()
|
||||
{
|
||||
ADIOI_BGL_ConfInfo_t *p = (ADIOI_BGL_ConfInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BGL_ConfInfo_t));
|
||||
AD_BGL_assert ((p != NULL));
|
||||
return p;
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
ADIOI_BGL_ConfInfo_set(ADIOI_BGL_ConfInfo_t *info, const DCMF_Hardware_t *hw, int s, int n_aggrs)
|
||||
{
|
||||
info->PsetSize = hw->sizeOfPset;
|
||||
info->numPsets = (hw->xSize * hw->ySize *
|
||||
hw->zSize) / hw->sizeOfPset;
|
||||
info->isVNM = (hw->tSize != 1);
|
||||
info->cpuidSize = hw->tSize;
|
||||
info->virtualPsetSize = hw->sizeOfPset * hw->tSize;
|
||||
info->nProcs = s;
|
||||
|
||||
/* More complicated logic maybe needed for nAggrs specification */
|
||||
info->nAggrs = n_aggrs;
|
||||
if ( info->nAggrs <=0 || MIN(info->nProcs, info->virtualPsetSize) < info->nAggrs )
|
||||
info->nAggrs = ADIOI_BGL_NAGG_PSET_DFLT;
|
||||
if ( info->nAggrs > info->virtualPsetSize ) info->nAggrs = info->virtualPsetSize;
|
||||
|
||||
info->aggRatio = 1. * info->nAggrs / info->virtualPsetSize;
|
||||
if (info->aggRatio > 1) info->aggRatio = 1.;
|
||||
}
|
||||
|
||||
void
|
||||
ADIOI_BGL_ConfInfo_free( ADIOI_BGL_ConfInfo_t *info )
|
||||
{
|
||||
if (info != NULL) ADIOI_Free (info);
|
||||
}
|
||||
|
||||
void
|
||||
ADIOI_BGL_persInfo_init(ADIOI_BGL_ConfInfo_t *conf,
|
||||
ADIOI_BGL_ProcInfo_t *proc,
|
||||
int s, int r, int n_aggrs)
|
||||
{
|
||||
DCMF_Hardware_t hw;
|
||||
DCMF_Hardware(&hw);
|
||||
|
||||
ADIOI_BGL_ConfInfo_set (conf, &hw, s, n_aggrs);
|
||||
ADIOI_BGL_ProcInfo_set (proc, &hw, r);
|
||||
}
|
||||
|
||||
void
|
||||
ADIOI_BGL_persInfo_free( ADIOI_BGL_ConfInfo_t *conf, ADIOI_BGL_ProcInfo_t *proc )
|
||||
{
|
||||
ADIOI_BGL_ConfInfo_free( conf );
|
||||
ADIOI_BGL_ProcInfo_free( proc );
|
||||
}
|
@ -1,82 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_pset.h
|
||||
* \brief Declares ADIOI_BGL_ProcInfo_t and ADIOI_BGL_ConfInfo_t and their public interfaces
|
||||
*/
|
||||
|
||||
/* File: ad_bgl_pset.h
|
||||
*
|
||||
* Defines two structures that keep BG/L PSET specific information and their public interfaces:
|
||||
* . ADIOI_BGL_ProcInfo_t object keeps specific information to each process
|
||||
* . ADIOI_BGL_ConfInfo_t object keeps general information for the whole communicator, only kept
|
||||
* on process 0.
|
||||
*/
|
||||
|
||||
#ifndef AD_BGL_PSET_H_
#define AD_BGL_PSET_H_

/* Keeps specific information to each process, will be exchanged among processes */
typedef struct {

    int psetNum;     /* which PSET I am in */
    int rank;        /* my rank */
    int xInPset;     /* my relative coordinates in my PSET */
    int yInPset;
    int zInPset;
    int cpuid;       /* my CPU id -- for virtual node mode (t coord)*/
    int rankInPset;  /* my relative rank in my PSET */

    int __pad;       /* pad to 16 byte alignment */

} ADIOI_BGL_ProcInfo_t __attribute__((aligned(16)));


/* Keeps general information for the whole communicator, only on process 0 */
typedef struct {

    int PsetSize;         /* size of a pset as reported by the hardware */
    int nAggrs;           /* number of aggregators per pset */
    int numPsets;         /* (x size * y size * z size) / PsetSize */
    int isVNM;            /* non-zero when the t size is not 1 */
    int virtualPsetSize;  /* PsetSize * t size */
    int nProcs;           /* number of processes */
    float aggRatio;       /* nAggrs / virtualPsetSize, at most 1 */
    int cpuidSize;        /* how many cpu ids? (t size) */

} ADIOI_BGL_ConfInfo_t __attribute__((aligned(16)));


/* Every argument and the whole expansion are parenthesized so operands
 * containing lower-precedence operators (e.g. MIN(x & 1, 2)) are compared
 * as written.  As with any function-like macro, an argument may be
 * evaluated twice; do not pass expressions with side effects. */
#undef MIN
#define MIN(a,b) (((a) < (b)) ? (a) : (b))


/* Default is to choose 8 aggregator nodes in each 32 CN pset.
   Also defines default ratio of aggregator nodes in each pset.
   For Virtual Node Mode, the ratio is 8/64 */
#define ADIOI_BGL_NAGG_PSET_MIN  1
#define ADIOI_BGL_NAGG_PSET_DFLT 8
#define ADIOI_BGL_PSET_SIZE_DFLT 32


/* public funcs for ADIOI_BGL_ProcInfo_t objects */
    ADIOI_BGL_ProcInfo_t * ADIOI_BGL_ProcInfo_new();
    ADIOI_BGL_ProcInfo_t * ADIOI_BGL_ProcInfo_new_n( int n );
    void ADIOI_BGL_ProcInfo_free( ADIOI_BGL_ProcInfo_t *info );


/* public funcs for ADIOI_BGL_ConfInfo_t objects */
    ADIOI_BGL_ConfInfo_t * ADIOI_BGL_ConfInfo_new ();
    void ADIOI_BGL_ConfInfo_free( ADIOI_BGL_ConfInfo_t *info );


/* public funcs for a pair of ADIOI_BGL_ConfInfo_t and ADIOI_BGL_ProcInfo_t objects */
    void ADIOI_BGL_persInfo_init( ADIOI_BGL_ConfInfo_t *conf,
                                  ADIOI_BGL_ProcInfo_t *proc,
                                  int s, int r, int n_aggrs );
    void ADIOI_BGL_persInfo_free( ADIOI_BGL_ConfInfo_t *conf,
                                  ADIOI_BGL_ProcInfo_t *proc );


#endif  /* AD_BGL_PSET_H_ */
|
@ -1,549 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_read.c
|
||||
* \brief Contiguous and strided (data-sieving) read routines for the ad_bgl driver
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "adio_extern.h"
|
||||
|
||||
#include "ad_bgl_tuning.h"
|
||||
|
||||
void ADIOI_BGL_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size;
|
||||
ADIO_Offset len;
|
||||
static char myname[] = "ADIOI_BGL_READCONTIG";
|
||||
#if BGL_PROFILE
|
||||
/* timing */
|
||||
double io_time, io_time2;
|
||||
|
||||
if (bglmpio_timing) {
|
||||
io_time = MPI_Wtime();
|
||||
bglmpio_prof_cr[ BGLMPIO_CIO_DATA_SIZE ] += len;
|
||||
}
|
||||
#endif
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
|
||||
ADIOI_Assert(len == (unsigned int) len); /* read takes an unsigned int parm */
|
||||
|
||||
#if BGL_PROFILE
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* read from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bglmpio_timing2) bglmpio_prof_cr[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#else /* BGL_PROFILE */
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* read from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
else ADIOI_READ_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = read(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#endif /* BGL_PROFILE */
|
||||
|
||||
#if BGL_PROFILE
|
||||
if (bglmpio_timing) bglmpio_prof_cr[ BGLMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
|
||||
#endif
|
||||
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (err == -1) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__, MPI_ERR_IO,
|
||||
"**io", "**io %s", strerror(errno));
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, err);
|
||||
#endif
|
||||
|
||||
*error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * ADIOI_BUFFERED_READ -- copy req_len bytes found at file offset req_off
 * into buf + userbuf_off, going through the sieving buffer readbuf, which
 * holds readbuf_len bytes starting at file offset readbuf_off.
 *
 * The macro works on the caller's locals: fd, buf, readbuf, readbuf_off,
 * readbuf_len, max_bufsize, end_offset, req_off, req_len, userbuf_off,
 * partial_read, tmp_buf, err and err_flag.  It expands to a braced block
 * and is invoked without a trailing semicolon.
 */
#define ADIOI_BUFFERED_READ \
{ \
    /* request begins at or past the end of the window: restart it at req_off */ \
    if (readbuf_off + readbuf_len <= req_off) { \
        readbuf_off = req_off; \
        readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1)); \
        lseek(fd->fd_sys, readbuf_off, SEEK_SET); \
        if (!(fd->atomicity)) { ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len); } \
        err = read(fd->fd_sys, readbuf, readbuf_len); \
        if (!(fd->atomicity)) { ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len); } \
        if (err == -1) { err_flag = 1; } \
    } \
    /* request runs past the window: keep the bytes from req_off onwards at */ \
    /* the front of a new buffer and read more of the file behind them */ \
    while (req_len > readbuf_off + readbuf_len - req_off) { \
        ADIOI_Assert((readbuf_off + readbuf_len - req_off) == (int) (readbuf_off + readbuf_len - req_off)); \
        partial_read = (int) (readbuf_off + readbuf_len - req_off); \
        tmp_buf = (char *) ADIOI_Malloc(partial_read); \
        memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \
        ADIOI_Free(readbuf); \
        readbuf = (char *) ADIOI_Malloc(partial_read + max_bufsize); \
        memcpy(readbuf, tmp_buf, partial_read); \
        ADIOI_Free(tmp_buf); \
        readbuf_off += readbuf_len-partial_read; \
        readbuf_len = (unsigned) (partial_read + ADIOI_MIN(max_bufsize, \
                                                           end_offset-readbuf_off+1)); \
        lseek(fd->fd_sys, readbuf_off+partial_read, SEEK_SET); \
        if (!(fd->atomicity)) { ADIOI_READ_LOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read); } \
        err = read(fd->fd_sys, readbuf+partial_read, readbuf_len-partial_read); \
        if (!(fd->atomicity)) { ADIOI_UNLOCK(fd, readbuf_off+partial_read, SEEK_SET, readbuf_len-partial_read); } \
        if (err == -1) { err_flag = 1; } \
    } \
    /* the whole request is now inside the window: hand it to the user */ \
    ADIOI_Assert(req_len == (size_t)req_len); \
    memcpy((char *)buf + userbuf_off, readbuf+req_off-readbuf_off, req_len); \
}
|
||||
|
||||
|
||||
void ADIOI_BGL_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code)
|
||||
{
|
||||
/* offset is in units of etype relative to the filetype. */
|
||||
|
||||
|
||||
ADIOI_Flatlist_node *flat_buf, *flat_file;
|
||||
ADIO_Offset i_offset, new_brd_size, brd_size, size;
|
||||
int i, j, k, err=-1, st_index=0;
|
||||
ADIO_Offset frd_size=0, new_frd_size, st_frd_size;
|
||||
unsigned num, bufsize;
|
||||
int n_etypes_in_filetype;
|
||||
ADIO_Offset n_filetypes, etype_in_filetype, st_n_filetypes, size_in_filetype;
|
||||
ADIO_Offset abs_off_in_filetype=0;
|
||||
int filetype_size, etype_size, buftype_size, partial_read;
|
||||
MPI_Aint filetype_extent, buftype_extent;
|
||||
int buf_count, buftype_is_contig, filetype_is_contig;
|
||||
ADIO_Offset userbuf_off, req_len, sum;
|
||||
ADIO_Offset off, req_off, disp, end_offset=0, readbuf_off, start_off;
|
||||
char *readbuf, *tmp_buf, *value;
|
||||
int err_flag=0, info_flag;
|
||||
unsigned max_bufsize, readbuf_len;
|
||||
static char myname[] = "ADIOI_BGL_READSTRIDED";
|
||||
|
||||
if (fd->hints->ds_read == ADIOI_HINT_DISABLE) {
|
||||
/* if user has disabled data sieving on reads, use naive
|
||||
* approach instead.
|
||||
*/
|
||||
/*FPRINTF(stderr, "ADIOI_GEN_ReadStrided_naive(%d):\n", __LINE__);*/
|
||||
ADIOI_GEN_ReadStrided_naive(fd,
|
||||
buf,
|
||||
count,
|
||||
datatype,
|
||||
file_ptr_type,
|
||||
offset,
|
||||
status,
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
|
||||
|
||||
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
|
||||
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
|
||||
|
||||
MPI_Type_size(fd->filetype, &filetype_size);
|
||||
if ( ! filetype_size ) {
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, 0);
|
||||
#endif
|
||||
*error_code = MPI_SUCCESS;
|
||||
return;
|
||||
}
|
||||
|
||||
MPI_Type_extent(fd->filetype, &filetype_extent);
|
||||
MPI_Type_size(datatype, &buftype_size);
|
||||
MPI_Type_extent(datatype, &buftype_extent);
|
||||
etype_size = fd->etype_size;
|
||||
|
||||
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
|
||||
bufsize = buftype_size * count;
|
||||
|
||||
/* get max_bufsize from the info object. */
|
||||
|
||||
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
|
||||
ADIOI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value,
|
||||
&info_flag);
|
||||
max_bufsize = atoi(value);
|
||||
ADIOI_Free(value);
|
||||
|
||||
if (!buftype_is_contig && filetype_is_contig) {
|
||||
|
||||
/* noncontiguous in memory, contiguous in file. */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
|
||||
fd->disp + (ADIO_Offset)etype_size * offset;
|
||||
|
||||
start_off = off;
|
||||
end_offset = off + bufsize - 1;
|
||||
readbuf_off = off;
|
||||
readbuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
|
||||
|
||||
/* if atomicity is true, lock (exclusive) the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
lseek(fd->fd_sys, readbuf_off, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
|
||||
err = read(fd->fd_sys, readbuf, readbuf_len);
|
||||
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, readbuf_off, SEEK_SET, readbuf_len);
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
for (j=0; j<count; j++)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<flat_buf->count; i++) {
|
||||
userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
|
||||
req_off = off;
|
||||
req_len = flat_buf->blocklens[i];
|
||||
ADIOI_BUFFERED_READ
|
||||
off += flat_buf->blocklens[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (fd->atomicity)
|
||||
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
|
||||
ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
|
||||
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
else { /* noncontiguous in file */
|
||||
|
||||
/* filetype already flattened in ADIO_Open */
|
||||
flat_file = ADIOI_Flatlist;
|
||||
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
|
||||
disp = fd->disp;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* Wei-keng reworked type processing to be a bit more efficient */
|
||||
offset = fd->fp_ind - disp;
|
||||
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
|
||||
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
|
||||
/* now offset is local to this extent */
|
||||
|
||||
/* find the block where offset is located, skip blocklens[i]==0 */
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
ADIO_Offset dist;
|
||||
if (flat_file->blocklens[i] == 0) continue;
|
||||
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
|
||||
/* frd_size is from offset to the end of block i */
|
||||
if (dist == 0) {
|
||||
i++;
|
||||
offset = flat_file->indices[i];
|
||||
frd_size = flat_file->blocklens[i];
|
||||
break;
|
||||
}
|
||||
if (dist > 0) {
|
||||
frd_size = dist;
|
||||
break;
|
||||
}
|
||||
}
|
||||
st_index = i; /* starting index in flat_file->indices[] */
|
||||
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
|
||||
}
|
||||
else {
|
||||
n_etypes_in_filetype = filetype_size/etype_size;
|
||||
n_filetypes = offset / n_etypes_in_filetype;
|
||||
etype_in_filetype = offset % n_etypes_in_filetype;
|
||||
size_in_filetype = etype_in_filetype * etype_size;
|
||||
|
||||
sum = 0;
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
sum += flat_file->blocklens[i];
|
||||
if (sum > size_in_filetype) {
|
||||
st_index = i;
|
||||
frd_size = sum - size_in_filetype;
|
||||
abs_off_in_filetype = flat_file->indices[i] +
|
||||
size_in_filetype - (sum - flat_file->blocklens[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* abs. offset in bytes in the file */
|
||||
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
|
||||
abs_off_in_filetype;
|
||||
}
|
||||
|
||||
start_off = offset;
|
||||
|
||||
/* Wei-keng Liao: read request is within a single flat_file contig
|
||||
* block e.g. with subarray types that actually describe the whole
|
||||
* array */
|
||||
if (buftype_is_contig && bufsize <= frd_size) {
|
||||
ADIO_ReadContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
|
||||
offset, status, error_code);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* update MPI-IO file pointer to point to the first byte that
|
||||
* can be accessed in the fileview. */
|
||||
fd->fp_ind = offset + bufsize;
|
||||
if (bufsize == frd_size) {
|
||||
do {
|
||||
st_index++;
|
||||
if (st_index == flat_file->count) {
|
||||
st_index = 0;
|
||||
n_filetypes++;
|
||||
}
|
||||
} while (flat_file->blocklens[st_index] == 0);
|
||||
fd->fp_ind = disp + flat_file->indices[st_index]
|
||||
+ n_filetypes*filetype_extent;
|
||||
}
|
||||
}
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
/* Calculate end_offset, the last byte-offset that will be accessed.
|
||||
e.g., if start_offset=0 and 100 bytes to be read, end_offset=99*/
|
||||
|
||||
st_frd_size = frd_size;
|
||||
st_n_filetypes = n_filetypes;
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
frd_size = ADIOI_MIN(st_frd_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
i_offset += frd_size;
|
||||
end_offset = off + frd_size - 1;
|
||||
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
off = disp + flat_file->indices[j] + n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
|
||||
}
|
||||
|
||||
/* if atomicity is true, lock (exclusive) the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
/* initial read into readbuf */
|
||||
readbuf_off = offset;
|
||||
readbuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
readbuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-readbuf_off+1));
|
||||
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_READ_LOCK(fd, offset, SEEK_SET, readbuf_len);
|
||||
err = read(fd->fd_sys, readbuf, readbuf_len);
|
||||
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, offset, SEEK_SET, readbuf_len);
|
||||
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
if (buftype_is_contig && !filetype_is_contig) {
|
||||
|
||||
/* contiguous in memory, noncontiguous in file. should be the most
|
||||
common case. */
|
||||
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
frd_size = ADIOI_MIN(st_frd_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
if (frd_size) {
|
||||
/* TYPE_UB and TYPE_LB can result in
|
||||
frd_size = 0. save system call in such cases */
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = read(fd->fd_sys, ((char *) buf) + i, frd_size);*/
|
||||
|
||||
req_off = off;
|
||||
req_len = frd_size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_READ
|
||||
}
|
||||
i_offset += frd_size;
|
||||
|
||||
if (off + frd_size < disp + flat_file->indices[j] +
|
||||
flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
|
||||
off += frd_size;
|
||||
/* did not reach end of contiguous block in filetype.
|
||||
no more I/O needed. off is incremented by frd_size. */
|
||||
else {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* noncontiguous in memory as well as in file */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
k = num = buf_count = 0;
|
||||
i_offset = flat_buf->indices[0];
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
frd_size = st_frd_size;
|
||||
brd_size = flat_buf->blocklens[0];
|
||||
|
||||
while (num < bufsize) {
|
||||
size = ADIOI_MIN(frd_size, brd_size);
|
||||
if (size) {
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = read(fd->fd_sys, ((char *) buf) + i, size); */
|
||||
|
||||
req_off = off;
|
||||
req_len = size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_READ
|
||||
}
|
||||
|
||||
new_frd_size = frd_size;
|
||||
new_brd_size = brd_size;
|
||||
|
||||
if (size == frd_size) {
|
||||
/* reached end of contiguous block in file */
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
|
||||
new_frd_size = flat_file->blocklens[j];
|
||||
if (size != brd_size) {
|
||||
i_offset += size;
|
||||
new_brd_size -= size;
|
||||
}
|
||||
}
|
||||
|
||||
if (size == brd_size) {
|
||||
/* reached end of contiguous block in memory */
|
||||
|
||||
k = (k + 1)%flat_buf->count;
|
||||
buf_count++;
|
||||
i_offset = ((ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
|
||||
flat_buf->indices[k]);
|
||||
new_brd_size = flat_buf->blocklens[k];
|
||||
if (size != frd_size) {
|
||||
off += size;
|
||||
new_frd_size -= size;
|
||||
}
|
||||
}
|
||||
ADIOI_Assert(((ADIO_Offset)num + size) == (unsigned)(num + size));
|
||||
num += size;
|
||||
frd_size = new_frd_size;
|
||||
brd_size = new_brd_size;
|
||||
}
|
||||
}
|
||||
|
||||
if (fd->atomicity)
|
||||
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
|
||||
ADIOI_Free(readbuf); /* malloced in the buffered_read macro */
|
||||
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
/* This is a temporary way of filling in status. The right way is to
|
||||
keep track of how much data was actually read and placed in buf
|
||||
by ADIOI_BUFFERED_READ. */
|
||||
#endif
|
||||
|
||||
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
|
||||
}
|
@ -1,68 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_setsh.c
|
||||
* \brief Sets the shared file pointer for the ad_bgl driver, taking the file lock only once
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
|
||||
/* set the shared file pointer to "offset" etypes relative to the current
|
||||
view */
|
||||
|
||||
/*
|
||||
This looks very similar to ADIOI_GEN_Set_shared_fp, except this
|
||||
function avoids locking the file twice. The generic version does
|
||||
|
||||
Write lock
|
||||
ADIO_WriteContig
|
||||
Unlock
|
||||
|
||||
For BGL, ADIOI_BGL_WriteContig does a lock before writing to disable
|
||||
caching. To avoid the lock being called twice, this version for BGL does
|
||||
|
||||
Write lock
|
||||
Lseek
|
||||
Write
|
||||
Unlock
|
||||
|
||||
*/
|
||||
|
||||
/**
 * Store offset as the new shared file pointer value.
 *
 * The value is written at position 0 of the shared-file-pointer file, which
 * is opened on first use.  The write is bracketed by a single write lock
 * and unlock on that file.
 *
 * \param fd          ADIO file handle owning the shared file pointer.
 * \param offset      value to store, in etypes relative to the current view.
 * \param error_code  MPI_SUCCESS, the error reported by ADIO_Open when the
 *                    shared-file-pointer file could not be opened, or an
 *                    MPI_ERR_IO code when write() returned -1.
 */
void ADIOI_BGL_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
{
    int err;
    int write_errno = 0;
    MPI_Comm dupcommself;
    static char myname[] = "ADIOI_BGL_SET_SHARED_FP";

    if (fd->shared_fp_fd == ADIO_FILE_NULL) {
        MPI_Comm_dup(MPI_COMM_SELF, &dupcommself);
        fd->shared_fp_fd = ADIO_Open(MPI_COMM_SELF, dupcommself,
                                     fd->shared_fp_fname,
                                     fd->file_system, fd->fns,
                                     ADIO_CREATE | ADIO_RDWR | ADIO_DELETE_ON_CLOSE,
                                     0, MPI_BYTE, MPI_BYTE, MPI_INFO_NULL,
                                     ADIO_PERM_NULL, error_code);
        /* *error_code is only meaningful right after ADIO_Open has set it;
         * testing it when the file was already open would act on whatever
         * value the caller happened to pass in */
        if (*error_code != MPI_SUCCESS) return;
    }

    ADIOI_WRITE_LOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));
    lseek(fd->shared_fp_fd->fd_sys, 0, SEEK_SET);
    err = write(fd->shared_fp_fd->fd_sys, &offset, sizeof(ADIO_Offset));
    /* save errno before the unlock call below can overwrite it */
    if (err == -1) write_errno = errno;
    ADIOI_UNLOCK(fd->shared_fp_fd, 0, SEEK_SET, sizeof(ADIO_Offset));

    if (err == -1) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(write_errno));
    }
    else *error_code = MPI_SUCCESS;
}
|
@ -1,163 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_tuning.c
|
||||
* \brief defines ad_bgl performance tuning
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 2008 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
/*---------------------------------------------------------------------
|
||||
* ad_bgl_tuning.c
|
||||
*
|
||||
* defines global variables and functions for performance tuning and
|
||||
* functional debugging.
|
||||
*---------------------------------------------------------------------*/
|
||||
|
||||
#include "ad_bgl_tuning.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#if !defined(PVFS2_SUPER_MAGIC)
|
||||
#define PVFS2_SUPER_MAGIC (0x20030528)
|
||||
#endif
|
||||
|
||||
/* Global tuning/profiling switches for the ad_bgl driver. */
/* NOTE(review): the names suggest these mirror the BGLMPIO_* and */
/* BGLOCKLESSMPIO_F_TYPE environment variables -- confirm where they are set. */
/* non-zero: ADIOI_BGL_ReadContig records overall timing and data size (BGL_PROFILE builds) */
int bglmpio_timing;
|
||||
/* non-zero: ADIOI_BGL_ReadContig also records seek and read() timings (BGL_PROFILE builds) */
int bglmpio_timing2;
|
||||
int bglmpio_comm;
|
||||
int bglmpio_tunegather;
|
||||
int bglmpio_tuneblocking;
|
||||
long bglocklessmpio_f_type;
|
||||
|
||||
/* profiling accumulators, one slot per BGLMPIO_CIO_* index */
/* presumably cw = collective write -- TODO confirm */
double bglmpio_prof_cw [BGLMPIO_CIO_LAST];
|
||||
/* accumulated by ADIOI_BGL_ReadContig when profiling is enabled */
double bglmpio_prof_cr [BGLMPIO_CIO_LAST];
|
||||
|
||||
/* set internal variables for tuning environment variables */
|
||||
/** \page mpiio_vars MPIIO Configuration
|
||||
\section env_sec Environment Variables
|
||||
* - BGLMPIO_COMM - Define how data is exchanged on collective
|
||||
* reads and writes. Possible values:
|
||||
* - 0 - Use MPI_Alltoallv.
|
||||
* - 1 - Use MPI_Isend/MPI_Irecv.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - BGLMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
|
||||
* Must also compile the library with BGL_PROFILE defined. Possible values:
|
||||
* - 0 - Do not collect/report timing.
|
||||
* - 1 - Collect/report timing.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - BGLMPIO_TIMING2 - collect additional averages for MPI I/O collective calls.
|
||||
* Must also compile the library with BGL_PROFILE defined. Possible values:
|
||||
* - 0 - Do not collect/report averages.
|
||||
* - 1 - Collect/report averages.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - BGLMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
|
||||
* for aggregator collective i/o. Possible values:
|
||||
* - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
|
||||
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
|
||||
* - Default is 1.
|
||||
*
|
||||
* - BGLMPIO_TUNEBLOCKING - Tune how aggregate file domains are
|
||||
* calculated (block size). Possible values:
|
||||
* - 0 - Evenly calculate file domains across aggregators. Also use
|
||||
* MPI_Isend/MPI_Irecv to exchange domain information.
|
||||
* - 1 - Align file domains with the underlying file system's block size. Also use
|
||||
* MPI_Alltoallv to exchange domain information.
|
||||
* - Default is 1.
|
||||
*
|
||||
* - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
|
||||
* the ad_bglockless driver. NOTE: Using romio prefixes (such as
|
||||
* "bgl:" or "bglockless:") on a file name will override this environment
|
||||
* variable. Possible values:
|
||||
* - 0xnnnnnnnn - Any valid file system type (or "magic number") from
|
||||
* statfs() field f_type.
|
||||
* - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
|
||||
*
|
||||
*/
|
||||
void ad_bgl_get_env_vars() {
|
||||
char *x, *dummy;
|
||||
|
||||
bglmpio_comm = 0;
|
||||
x = getenv( "BGLMPIO_COMM" );
|
||||
if (x) bglmpio_comm = atoi(x);
|
||||
bglmpio_timing = 0;
|
||||
x = getenv( "BGLMPIO_TIMING" );
|
||||
if (x) bglmpio_timing = atoi(x);
|
||||
bglmpio_timing2 = 0;
|
||||
x = getenv( "BGLMPIO_TIMING2" );
|
||||
if (x) bglmpio_timing2 = atoi(x);
|
||||
bglmpio_tunegather = 1;
|
||||
x = getenv( "BGLMPIO_TUNEGATHER" );
|
||||
if (x) bglmpio_tunegather = atoi(x);
|
||||
bglmpio_tuneblocking = 1;
|
||||
x = getenv( "BGLMPIO_TUNEBLOCKING" );
|
||||
if (x) bglmpio_tuneblocking = atoi(x);
|
||||
bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
|
||||
x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
|
||||
if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
|
||||
DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
|
||||
bglocklessmpio_f_type,bglocklessmpio_f_type);
|
||||
}
|
||||
|
||||
/* report timing breakdown for MPI I/O collective call */
|
||||
/*
 * Print the timing breakdown of a collective I/O call.
 *
 *   rw      non-zero selects the write profile (bglmpio_prof_cw), zero the
 *           read profile (bglmpio_prof_cr)
 *   fd      open file; its communicator is used for the reductions
 *   myrank  rank of the caller; only rank 0 prints
 *   nprocs  divisor used to turn the reduced sums into averages
 *
 * Nothing happens unless bglmpio_timing is set.  When it is set the call is
 * collective: every rank takes part in both MPI_Reduce operations.
 *
 * Bandwidth figures are reported as 0 when the corresponding maximum time
 * is not positive, instead of dividing by zero and printing inf/nan.
 *
 * NOTE(review): the "PXT-m"/"MPT-m"/"MPTC-m" labels print averaged values,
 * and no newline is ever emitted -- both kept as found; confirm intent.
 */
void ad_bgl_wr_timing_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    double *prof_src;
    double prof_avg[ BGLMPIO_CIO_LAST ];
    double prof_max[ BGLMPIO_CIO_LAST ];
    double total_bytes;
    int i;

    if (!bglmpio_timing) return;

    prof_src = rw ? bglmpio_prof_cw : bglmpio_prof_cr;

    MPI_Reduce( prof_src, prof_avg, BGLMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, fd->comm );
    MPI_Reduce( prof_src, prof_max, BGLMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, fd->comm );

    /* the reduced arrays are only meaningful on the root */
    if (myrank != 0) return;

    for (i = 0; i < BGLMPIO_CIO_LAST; i++) prof_avg[i] /= nprocs;

    /* average * nprocs gives back the total number of bytes moved */
    total_bytes = prof_avg[ BGLMPIO_CIO_DATA_SIZE ] * nprocs;

    if (bglmpio_timing2) {
        prof_avg[ BGLMPIO_CIO_B_POSI_RW ] =
            (prof_max[ BGLMPIO_CIO_T_POSI_RW ] > 0)
                ? total_bytes / prof_max[ BGLMPIO_CIO_T_POSI_RW ] : 0;
        prof_avg[ BGLMPIO_CIO_B_MPIO_RW ] =
            (prof_max[ BGLMPIO_CIO_T_MPIO_RW ] > 0)
                ? total_bytes / prof_max[ BGLMPIO_CIO_T_MPIO_RW ] : 0;
    } else {
        prof_avg[ BGLMPIO_CIO_B_POSI_RW ] = 0;
        prof_avg[ BGLMPIO_CIO_B_MPIO_RW ] = 0;
    }

    prof_avg[ BGLMPIO_CIO_B_MPIO_CRW ] =
        (prof_max[ BGLMPIO_CIO_T_MPIO_CRW ] > 0)
            ? total_bytes / prof_max[ BGLMPIO_CIO_T_MPIO_CRW ] : 0;

    printf("\tTIMING-1 %1s , ", (rw ? "W" : "R") );
    printf( "SZ: %12.4f , ", total_bytes );
    printf( "SK-a: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_SEEK ] );
    printf( "SK-m: %10.3f , ", prof_max[ BGLMPIO_CIO_T_SEEK ] );
    printf( "LC-a: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_LCOMP ] );
    printf( "GA-m: %10.3f , ", prof_max[ BGLMPIO_CIO_T_GATHER ] );
    printf( "AN-a: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_PATANA ] );
    printf( "FD-a: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_FD_PART ] );
    printf( "MY-a: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_MYREQ ] );
    printf( "OT-m: %10.3f , ", prof_max[ BGLMPIO_CIO_T_OTHREQ ] );
    printf( "EX-m: %10.3f , ", prof_max[ BGLMPIO_CIO_T_DEXCH ] );
    printf("\tTIMING-2 %1s , ", (rw ? "W" : "R") );
    printf( "PXT-m: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_POSI_RW ] );
    printf( "MPT-m: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_MPIO_RW ] );
    printf("MPTC-m: %10.3f , ", prof_avg[ BGLMPIO_CIO_T_MPIO_CRW ] );
    printf( "PXB: %10.3f , ", prof_avg[ BGLMPIO_CIO_B_POSI_RW ] );
    printf( "MPB: %10.3f , ", prof_avg[ BGLMPIO_CIO_B_MPIO_RW ] );
    printf( "MPBC: %10.3f , ", prof_avg[ BGLMPIO_CIO_B_MPIO_CRW ] );
}
|
@ -1,95 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_tuning.h
|
||||
 * \brief declares the ad_bgl performance-tuning globals, timing fields and timing macros
|
||||
*/
|
||||
|
||||
/*---------------------------------------------------------------------
|
||||
* ad_bgl_tuning.h
|
||||
*
|
||||
* declares global variables and macros for performance tuning and
|
||||
* functional debugging.
|
||||
*---------------------------------------------------------------------*/
|
||||
|
||||
#ifndef AD_BGL_TUNING_H_
|
||||
#define AD_BGL_TUNING_H_
|
||||
|
||||
#include "adio.h"
|
||||
|
||||
/*
 * Abort the whole job when condition `a` is false: report the source
 * location on stderr, then call MPI_Abort on MPI_COMM_WORLD with code 1.
 * The expansion is a bare `if` statement, so avoid using it as the
 * unbraced body of an outer if/else.
 */
#define AD_BGL_assert( a )                                                  \
    if (!(a)) {                                                             \
        fprintf( stderr, "AD_BGL_assert, file=%s, line=%d\n", __FILE__, __LINE__ ); \
        MPI_Abort( MPI_COMM_WORLD, 1 );                                     \
    }
|
||||
|
||||
/*-----------------------------------------
|
||||
* Global variables for the control of
|
||||
* 1. timing
|
||||
* 2. select specific optimizations
|
||||
*-----------------------------------------*/
|
||||
|
||||
/* timing fields */
|
||||
/*
 * Indices into the bglmpio_prof_cw / bglmpio_prof_cr profile arrays.
 * T_ entries hold times, B_ entries hold derived bandwidths, and
 * BGLMPIO_CIO_LAST is the number of slots.
 */
enum {
    BGLMPIO_CIO_DATA_SIZE = 0,  /* amount of data moved */
    BGLMPIO_CIO_T_SEEK,         /* seek time */
    BGLMPIO_CIO_T_LCOMP,        /* ADIOI_Calc_my_off_len(), local */
    BGLMPIO_CIO_T_GATHER,       /* formerly MPI_Allgather, now Allreduce */
    BGLMPIO_CIO_T_PATANA,       /* quick contiguous-access test, local */
    BGLMPIO_CIO_T_FD_PART,      /* file domain partitioning, local */
    BGLMPIO_CIO_T_MYREQ,        /* ADIOI_BGL_Calc_my_req(), local */
    BGLMPIO_CIO_T_OTHREQ,       /* ADIOI_Calc_others_req(), short Alltoall */
    BGLMPIO_CIO_T_DEXCH,        /* I/O data exchange */
    BGLMPIO_CIO_T_POSI_RW,
    BGLMPIO_CIO_B_POSI_RW,
    BGLMPIO_CIO_T_MPIO_RW,      /* ADIOI_BGL_WriteContig() */
    BGLMPIO_CIO_B_MPIO_RW,
    BGLMPIO_CIO_T_MPIO_CRW,     /* ADIOI_BGL_WriteStridedColl() */
    BGLMPIO_CIO_B_MPIO_CRW,
    BGLMPIO_CIO_LAST            /* slot count; keep as the final entry */
};
|
||||
|
||||
extern double bglmpio_prof_cw [BGLMPIO_CIO_LAST];
|
||||
extern double bglmpio_prof_cr [BGLMPIO_CIO_LAST];
|
||||
|
||||
|
||||
/* corresponds to environment variables to select optimizations and timing level */
|
||||
extern int bglmpio_timing;
|
||||
extern int bglmpio_timing2;
|
||||
extern int bglmpio_comm;
|
||||
extern int bglmpio_tunegather;
|
||||
extern int bglmpio_tuneblocking;
|
||||
extern long bglocklessmpio_f_type;
|
||||
|
||||
|
||||
/* set internal variables for tuning environment variables */
|
||||
void ad_bgl_get_env_vars();
|
||||
|
||||
/* report timing breakdown for MPI I/O collective call */
|
||||
void ad_bgl_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
|
||||
|
||||
/* note:
|
||||
* T := timing;
|
||||
* CIO := collective I/O
|
||||
*/
|
||||
/*
 * Zero every slot of the collective-I/O profile array selected by RW
 * (`w` -> bglmpio_prof_cw, `r` -> bglmpio_prof_cr) when the timing level
 * is at least LEVEL.
 *
 * The loop bound is BGLMPIO_CIO_LAST, the size both profile arrays are
 * declared with; the previous bound BGLMPIO_T_LAST is not defined here.
 *
 * NOTE(review): bglmpio_timing_cw_level is not declared in this header
 * either -- confirm where it is supposed to come from.
 */
#define BGLMPIO_T_CIO_RESET( LEVEL, RW ) \
    if (bglmpio_timing_cw_level >= LEVEL) { \
        int i; \
        for ( i = 0; i < BGLMPIO_CIO_LAST; i ++ ) \
            bglmpio_prof_c##RW [ i ] = 0; \
    }
|
||||
|
||||
/*
 * Emit the collective-I/O timing report through
 * ad_bgl_timing_crw_report() when the timing level is at least LEVEL.
 *
 * NOTE(review): ad_bgl_tuning.c defines ad_bgl_wr_timing_report(); no
 * definition named ad_bgl_timing_crw_report is visible -- confirm.
 */
#define BGLMPIO_T_CIO_REPORT( LEVEL, RW, FD, MYRANK, NPROCS )   \
    if (bglmpio_timing_cw_level >= LEVEL) {                     \
        ad_bgl_timing_crw_report ( RW, FD, MYRANK, NPROCS );    \
    }
|
||||
|
||||
/*
 * Timestamp helper for the collective-I/O profile selected by RW, active
 * when the timing level is at least LEVEL:
 *   DOBAR  - barrier on fd->comm before reading the clock
 *   ISSET  - store the current time in slot VAR1
 *   ISGET  - replace slot VAR2 with (current time - slot VAR2)
 * Expects a variable named `fd` to be in scope at the point of use.
 */
#define BGLMPIO_T_CIO_SET_GET( LEVEL, RW, DOBAR, ISSET, ISGET, VAR1, VAR2 ) \
    if (bglmpio_timing_cw_level >= LEVEL) {                                 \
        if ( DOBAR ) MPI_Barrier( fd->comm );                               \
        double temp = MPI_Wtime();                                          \
        if ( ISSET ) bglmpio_prof_c##RW [ VAR1 ] = temp;                    \
        if ( ISGET ) bglmpio_prof_c##RW [ VAR2 ] = temp - bglmpio_prof_c##RW [ VAR2 ] ; \
    }
|
||||
|
||||
#endif /* AD_BGL_TUNING_H_ */
|
@ -1,611 +0,0 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_write.c
|
||||
 * \brief contiguous and strided (data-sieving) write routines for the ad_bgl driver
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "adio_extern.h"
|
||||
|
||||
#include "ad_bgl_tuning.h"
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
void ADIOI_BGL_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size;
|
||||
ADIO_Offset len;
|
||||
static char myname[] = "ADIOI_BGL_WRITECONTIG";
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5036, 0, NULL);
|
||||
#endif
|
||||
#if BGL_PROFILE
|
||||
/* timing */
|
||||
double io_time, io_time2;
|
||||
|
||||
if (bglmpio_timing) {
|
||||
io_time = MPI_Wtime();
|
||||
bglmpio_prof_cw[ BGLMPIO_CIO_DATA_SIZE ] += len;
|
||||
}
|
||||
#endif
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
len = (ADIO_Offset)datatype_size * (ADIO_Offset)count;
|
||||
ADIOI_Assert(len == (unsigned int) len); /* write takes an unsigned int parm */
|
||||
|
||||
#if BGL_PROFILE
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* write from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_SEEK ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
if (bglmpio_timing2) io_time2 = MPI_Wtime();
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
if (bglmpio_timing2) bglmpio_prof_cw[ BGLMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time2);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#else /* BGL_PROFILE */
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
if (fd->fp_sys_posn != offset)
|
||||
lseek(fd->fd_sys, offset, SEEK_SET);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_sys_posn = offset + err;
|
||||
/* individual file pointer not updated */
|
||||
}
|
||||
else { /* write from curr. location of ind. file pointer */
|
||||
offset = fd->fp_ind;
|
||||
if (fd->fp_sys_posn != fd->fp_ind)
|
||||
lseek(fd->fd_sys, fd->fp_ind, SEEK_SET);
|
||||
ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, len);
|
||||
err = write(fd->fd_sys, buf, (unsigned int)len);
|
||||
ADIOI_UNLOCK(fd, offset, SEEK_SET, len);
|
||||
fd->fp_ind += err;
|
||||
fd->fp_sys_posn = fd->fp_ind;
|
||||
}
|
||||
|
||||
#endif /* BGL_PROFILE */
|
||||
|
||||
#if BGL_PROFILE
|
||||
if (bglmpio_timing) bglmpio_prof_cw[ BGLMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
|
||||
#endif
|
||||
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (err == -1) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__, MPI_ERR_IO,
|
||||
"**io",
|
||||
"**io %s", strerror(errno));
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, err);
|
||||
#endif
|
||||
|
||||
*error_code = MPI_SUCCESS;
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5037, 0, NULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*
 * Data-sieving write step used by ADIOI_BGL_WriteStrided: copy req_len
 * bytes from buf+userbuf_off into the staging buffer `writebuf` at file
 * offset req_off.  Whenever the request falls outside (or runs past) the
 * staged window, the window is written back, moved forward and re-read
 * from the file (read-modify-write).  When atomicity is off each window is
 * individually write-locked; otherwise the caller holds one lock over
 * [start_off, end_offset].
 *
 * Relies on these names in the expanding scope: fd, buf, err, err_flag,
 * error_code, myname, req_off, req_len, userbuf_off, write_sz, writebuf,
 * writebuf_off, writebuf_len, max_bufsize, start_off, end_offset.
 *
 * If a re-read fails the macro returns from the enclosing function; it now
 * first drops the lock that is still held and frees writebuf, which were
 * previously leaked on that path.
 */
#define ADIOI_BUFFERED_WRITE \
{ \
    if (req_off >= writebuf_off + writebuf_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        writebuf_off = req_off; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = read(fd->fd_sys, writebuf, writebuf_len); \
        if (err == -1) { \
            if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
            else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); \
            ADIOI_Free(writebuf); \
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
                                               MPIR_ERR_RECOVERABLE, myname, \
                                               __LINE__, MPI_ERR_IO, \
                                               "**ioRMWrdwr", 0); \
            return; \
        } \
    } \
    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off));\
    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz);\
    while (write_sz != req_len) { \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = write(fd->fd_sys, writebuf, writebuf_len); \
        if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        if (err == -1) err_flag = 1; \
        req_len -= write_sz; \
        userbuf_off += write_sz; \
        writebuf_off += writebuf_len; \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\
        if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET); \
        err = read(fd->fd_sys, writebuf, writebuf_len); \
        if (err == -1) { \
            if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
            else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); \
            ADIOI_Free(writebuf); \
            *error_code = MPIO_Err_create_code(MPI_SUCCESS, \
                                               MPIR_ERR_RECOVERABLE, myname, \
                                               __LINE__, MPI_ERR_IO, \
                                               "**ioRMWrdwr", 0); \
            return; \
        } \
        write_sz = ADIOI_MIN(req_len, writebuf_len); \
        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
    } \
}
|
||||
|
||||
|
||||
/* this macro is used when filetype is contig and buftype is not contig.
   it does not do a read-modify-write; when atomicity is off it takes a
   write lock around each flush of the staging buffer */
|
||||
/*
 * Staging-buffer write step without the read-back: copies req_len bytes
 * from buf+userbuf_off into writebuf at file offset req_off and flushes
 * the staged window to the file each time the request leaves or overruns
 * it.  A write() failure only raises err_flag; the macro never returns
 * from the enclosing function.
 */
#define ADIOI_BUFFERED_WRITE_WITHOUT_READ                                     \
{                                                                             \
    if (req_off >= writebuf_off + writebuf_len) {                             \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET);                            \
        if (!(fd->atomicity))                                                 \
            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);       \
        err = write(fd->fd_sys, writebuf, writebuf_len);                      \
        if (!(fd->atomicity))                                                 \
            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);           \
        if (err == -1) err_flag = 1;                                          \
        writebuf_off = req_off;                                               \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); \
    }                                                                         \
    write_sz = (unsigned) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
    ADIOI_Assert((ADIO_Offset)write_sz == ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
    memcpy(writebuf+req_off-writebuf_off, (char *)buf +userbuf_off, write_sz); \
    while (write_sz != req_len) {                                             \
        lseek(fd->fd_sys, writebuf_off, SEEK_SET);                            \
        if (!(fd->atomicity))                                                 \
            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);       \
        err = write(fd->fd_sys, writebuf, writebuf_len);                      \
        if (!(fd->atomicity))                                                 \
            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);           \
        if (err == -1) err_flag = 1;                                          \
        req_len -= write_sz;                                                  \
        userbuf_off += write_sz;                                              \
        writebuf_off += writebuf_len;                                         \
        writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1)); \
        write_sz = ADIOI_MIN(req_len, writebuf_len);                          \
        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);                \
    }                                                                         \
}
|
||||
|
||||
|
||||
|
||||
void ADIOI_BGL_WriteStrided(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code)
|
||||
{
|
||||
/* offset is in units of etype relative to the filetype. */
|
||||
|
||||
|
||||
|
||||
ADIOI_Flatlist_node *flat_buf, *flat_file;
|
||||
ADIO_Offset i_offset, sum, size_in_filetype;
|
||||
int i, j, k, err=-1, st_index=0;
|
||||
int n_etypes_in_filetype;
|
||||
ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
|
||||
ADIO_Offset abs_off_in_filetype=0;
|
||||
int filetype_size, etype_size, buftype_size;
|
||||
MPI_Aint filetype_extent, buftype_extent;
|
||||
int buf_count, buftype_is_contig, filetype_is_contig;
|
||||
ADIO_Offset userbuf_off;
|
||||
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
|
||||
char *writebuf, *value;
|
||||
unsigned bufsize, writebuf_len, max_bufsize, write_sz;
|
||||
int err_flag=0, info_flag;
|
||||
ADIO_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size, req_len;
|
||||
static char myname[] = "ADIOI_BGL_WRITESTRIDED";
|
||||
|
||||
if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
|
||||
/* if user has disabled data sieving on reads, use naive
|
||||
* approach instead.
|
||||
*/
|
||||
/*FPRINTF(stderr, "ADIOI_GEN_WriteStrided_naive(%d):\n", __LINE__);*/
|
||||
ADIOI_GEN_WriteStrided_naive(fd,
|
||||
buf,
|
||||
count,
|
||||
datatype,
|
||||
file_ptr_type,
|
||||
offset,
|
||||
status,
|
||||
error_code);
|
||||
return;
|
||||
}
|
||||
/*FPRINTF(stderr, "%s(%d):\n",myname, __LINE__);*/
|
||||
|
||||
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
|
||||
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
|
||||
|
||||
MPI_Type_size(fd->filetype, &filetype_size);
|
||||
if ( ! filetype_size ) {
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, 0);
|
||||
#endif
|
||||
*error_code = MPI_SUCCESS;
|
||||
return;
|
||||
}
|
||||
|
||||
MPI_Type_extent(fd->filetype, &filetype_extent);
|
||||
MPI_Type_size(datatype, &buftype_size);
|
||||
MPI_Type_extent(datatype, &buftype_extent);
|
||||
etype_size = fd->etype_size;
|
||||
|
||||
ADIOI_Assert((buftype_size * count) == ((ADIO_Offset)(unsigned)buftype_size * (ADIO_Offset)count));
|
||||
bufsize = buftype_size * count;
|
||||
|
||||
/* get max_bufsize from the info object. */
|
||||
|
||||
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
|
||||
ADIOI_Info_get(fd->info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL, value,
|
||||
&info_flag);
|
||||
max_bufsize = atoi(value);
|
||||
ADIOI_Free(value);
|
||||
|
||||
if (!buftype_is_contig && filetype_is_contig) {
|
||||
|
||||
/* noncontiguous in memory, contiguous in file. */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
|
||||
fd->disp + etype_size * offset;
|
||||
|
||||
start_off = off;
|
||||
end_offset = off + bufsize - 1;
|
||||
writebuf_off = off;
|
||||
writebuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
|
||||
|
||||
/* if atomicity is true, lock the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
for (j=0; j<count; j++)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<flat_buf->count; i++) {
|
||||
userbuf_off = (ADIO_Offset)j*(ADIO_Offset)buftype_extent + flat_buf->indices[i];
|
||||
req_off = off;
|
||||
req_len = flat_buf->blocklens[i];
|
||||
ADIOI_BUFFERED_WRITE_WITHOUT_READ
|
||||
off += flat_buf->blocklens[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* write the buffer out finally */
|
||||
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
err = write(fd->fd_sys, writebuf, writebuf_len);
|
||||
if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
if (fd->atomicity)
|
||||
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
else { /* noncontiguous in file */
|
||||
|
||||
/* filetype already flattened in ADIO_Open */
|
||||
flat_file = ADIOI_Flatlist;
|
||||
while (flat_file->type != fd->filetype) flat_file = flat_file->next;
|
||||
disp = fd->disp;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* Wei-keng reworked type processing to be a bit more efficient */
|
||||
offset = fd->fp_ind - disp;
|
||||
n_filetypes = (offset - flat_file->indices[0]) / filetype_extent;
|
||||
offset -= (ADIO_Offset)n_filetypes * filetype_extent;
|
||||
/* now offset is local to this extent */
|
||||
|
||||
/* find the block where offset is located, skip blocklens[i]==0 */
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
ADIO_Offset dist;
|
||||
if (flat_file->blocklens[i] == 0) continue;
|
||||
dist = flat_file->indices[i] + flat_file->blocklens[i] - offset;
|
||||
/* fwr_size is from offset to the end of block i */
|
||||
if (dist == 0) {
|
||||
i++;
|
||||
offset = flat_file->indices[i];
|
||||
fwr_size = flat_file->blocklens[i];
|
||||
break;
|
||||
}
|
||||
if (dist > 0) {
|
||||
fwr_size = dist;
|
||||
break;
|
||||
}
|
||||
}
|
||||
st_index = i; /* starting index in flat_file->indices[] */
|
||||
offset += disp + (ADIO_Offset)n_filetypes*filetype_extent;
|
||||
}
|
||||
else {
|
||||
int i;
|
||||
n_etypes_in_filetype = filetype_size/etype_size;
|
||||
n_filetypes = offset / n_etypes_in_filetype;
|
||||
etype_in_filetype = offset % n_etypes_in_filetype;
|
||||
size_in_filetype = etype_in_filetype * etype_size;
|
||||
|
||||
sum = 0;
|
||||
for (i=0; i<flat_file->count; i++) {
|
||||
sum += flat_file->blocklens[i];
|
||||
if (sum > size_in_filetype) {
|
||||
st_index = i;
|
||||
fwr_size = sum - size_in_filetype;
|
||||
abs_off_in_filetype = flat_file->indices[i] +
|
||||
size_in_filetype - (sum - flat_file->blocklens[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* abs. offset in bytes in the file */
|
||||
offset = disp + (ADIO_Offset) n_filetypes*filetype_extent +
|
||||
abs_off_in_filetype;
|
||||
}
|
||||
|
||||
start_off = offset;
|
||||
/* Wei-keng Liao:write request is within single flat_file contig block*/
|
||||
/* this could happen, for example, with subarray types that are
|
||||
* actually fairly contiguous */
|
||||
if (buftype_is_contig && bufsize <= fwr_size) {
|
||||
ADIO_WriteContig(fd, buf, bufsize, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
|
||||
offset, status, error_code);
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
/* update MPI-IO file pointer to point to the first byte
|
||||
* that can be accessed in the fileview. */
|
||||
fd->fp_ind = offset + bufsize;
|
||||
if (bufsize == fwr_size) {
|
||||
do {
|
||||
st_index++;
|
||||
if (st_index == flat_file->count) {
|
||||
st_index = 0;
|
||||
n_filetypes++;
|
||||
}
|
||||
} while (flat_file->blocklens[st_index] == 0);
|
||||
fd->fp_ind = disp + flat_file->indices[st_index]
|
||||
+ (ADIO_Offset)n_filetypes*filetype_extent;
|
||||
}
|
||||
}
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
/* Calculate end_offset, the last byte-offset that will be accessed.
|
||||
e.g., if start_offset=0 and 100 bytes to be write, end_offset=99*/
|
||||
|
||||
st_fwr_size = fwr_size;
|
||||
st_n_filetypes = n_filetypes;
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
i_offset += fwr_size;
|
||||
end_offset = off + fwr_size - 1;
|
||||
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset);
|
||||
}
|
||||
|
||||
/* if atomicity is true, lock the region to be accessed */
|
||||
if (fd->atomicity)
|
||||
ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
/* initial read for the read-modify-write */
|
||||
writebuf_off = offset;
|
||||
writebuf = (char *) ADIOI_Malloc(max_bufsize);
|
||||
writebuf_len = (unsigned)(ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));
|
||||
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
|
||||
err = read(fd->fd_sys, writebuf, writebuf_len);
|
||||
if (err == -1) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__,
|
||||
MPI_ERR_IO,
|
||||
"ADIOI_BGL_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
|
||||
return;
|
||||
}
|
||||
|
||||
if (buftype_is_contig && !filetype_is_contig) {
|
||||
|
||||
/* contiguous in memory, noncontiguous in file. should be the most
|
||||
common case. */
|
||||
|
||||
i_offset = 0;
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
|
||||
while (i_offset < bufsize) {
|
||||
if (fwr_size) {
|
||||
/* TYPE_UB and TYPE_LB can result in
|
||||
fwr_size = 0. save system call in such cases */
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = write(fd->fd_sys, ((char *) buf) + i_offset, fwr_size);*/
|
||||
|
||||
req_off = off;
|
||||
req_len = fwr_size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_WRITE
|
||||
}
|
||||
i_offset += fwr_size;
|
||||
|
||||
if (off + fwr_size < disp + flat_file->indices[j] +
|
||||
flat_file->blocklens[j] + n_filetypes*(ADIO_Offset)filetype_extent)
|
||||
off += fwr_size;
|
||||
/* did not reach end of contiguous block in filetype.
|
||||
no more I/O needed. off is incremented by fwr_size. */
|
||||
else {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
fwr_size = ADIOI_MIN(flat_file->blocklens[j],
|
||||
bufsize-i_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* noncontiguous in memory as well as in file */
|
||||
|
||||
ADIOI_Flatten_datatype(datatype);
|
||||
flat_buf = ADIOI_Flatlist;
|
||||
while (flat_buf->type != datatype) flat_buf = flat_buf->next;
|
||||
|
||||
k = num = buf_count = 0;
|
||||
i_offset = flat_buf->indices[0];
|
||||
j = st_index;
|
||||
off = offset;
|
||||
n_filetypes = st_n_filetypes;
|
||||
fwr_size = st_fwr_size;
|
||||
bwr_size = flat_buf->blocklens[0];
|
||||
|
||||
while (num < bufsize) {
|
||||
size = ADIOI_MIN(fwr_size, bwr_size);
|
||||
if (size) {
|
||||
/* lseek(fd->fd_sys, off, SEEK_SET);
|
||||
err = write(fd->fd_sys, ((char *) buf) + i_offset, size); */
|
||||
|
||||
req_off = off;
|
||||
req_len = size;
|
||||
userbuf_off = i_offset;
|
||||
ADIOI_BUFFERED_WRITE
|
||||
}
|
||||
|
||||
new_fwr_size = fwr_size;
|
||||
new_bwr_size = bwr_size;
|
||||
|
||||
if (size == fwr_size) {
|
||||
/* reached end of contiguous block in file */
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
while (flat_file->blocklens[j]==0) {
|
||||
j = (j+1) % flat_file->count;
|
||||
n_filetypes += (j == 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
off = disp + flat_file->indices[j] +
|
||||
n_filetypes*(ADIO_Offset)filetype_extent;
|
||||
|
||||
new_fwr_size = flat_file->blocklens[j];
|
||||
if (size != bwr_size) {
|
||||
i_offset += size;
|
||||
new_bwr_size -= size;
|
||||
}
|
||||
}
|
||||
|
||||
if (size == bwr_size) {
|
||||
/* reached end of contiguous block in memory */
|
||||
|
||||
k = (k + 1)%flat_buf->count;
|
||||
buf_count++;
|
||||
i_offset = (ADIO_Offset)buftype_extent*(ADIO_Offset)(buf_count/flat_buf->count) +
|
||||
flat_buf->indices[k];
|
||||
new_bwr_size = flat_buf->blocklens[k];
|
||||
if (size != fwr_size) {
|
||||
off += size;
|
||||
new_fwr_size -= size;
|
||||
}
|
||||
}
|
||||
num += size;
|
||||
fwr_size = new_fwr_size;
|
||||
bwr_size = new_bwr_size;
|
||||
}
|
||||
}
|
||||
|
||||
/* write the buffer out finally */
|
||||
lseek(fd->fd_sys, writebuf_off, SEEK_SET);
|
||||
if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
err = write(fd->fd_sys, writebuf, writebuf_len);
|
||||
|
||||
if (!(fd->atomicity))
|
||||
ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
|
||||
else ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
||||
fd->fp_sys_posn = -1; /* set it to null. */
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
/* This is a temporary way of filling in status. The right way is to
|
||||
keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
|
||||
#endif
|
||||
|
||||
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
## -*- Mode: Makefile; -*-
|
||||
## vim: set ft=automake :
|
||||
##
|
||||
## (C) 2011 by Argonne National Laboratory.
|
||||
## See COPYRIGHT in top-level directory.
|
||||
##
|
||||
|
||||
if BUILD_AD_BGLOCKLESS

## Private header of the bglockless driver (listed in noinst_HEADERS, so it
## is not installed).
noinst_HEADERS += \
    adio/ad_bglockless/ad_bglockless.h

## Driver sources, appended one file per statement.
romio_other_sources += adio/ad_bglockless/ad_bglockless.c
romio_other_sources += adio/ad_bglockless/ad_bglockless_features.c

endif BUILD_AD_BGLOCKLESS
|
||||
|
@ -1,44 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2001 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "../ad_bg/ad_bg.h"
|
||||
#include "ad_bglockless.h"
|
||||
|
||||
/* adioi.h has the ADIOI_Fns_struct define */
|
||||
#include "adioi.h"
|
||||
|
||||
struct ADIOI_Fns_struct ADIO_BGLOCKLESS_operations = {
|
||||
ADIOI_BG_Open, /* Open */
|
||||
ADIOI_GEN_OpenColl, /* Collective open */
|
||||
ADIOI_GEN_ReadContig, /* ReadContig */
|
||||
ADIOI_GEN_WriteContig, /* WriteContig */
|
||||
ADIOI_BG_ReadStridedColl, /* ReadStridedColl */
|
||||
ADIOI_BG_WriteStridedColl, /* WriteStridedColl */
|
||||
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
|
||||
ADIOI_GEN_Fcntl, /* Fcntl */
|
||||
ADIOI_BG_SetInfo, /* SetInfo */
|
||||
ADIOI_GEN_ReadStrided, /* ReadStrided */
|
||||
ADIOI_NOLOCK_WriteStrided, /* WriteStrided */
|
||||
ADIOI_BG_Close, /* Close */
|
||||
#ifdef ROMIO_HAVE_WORKING_AIO
|
||||
ADIOI_GEN_IreadContig, /* IreadContig */
|
||||
ADIOI_GEN_IwriteContig, /* IwriteContig */
|
||||
#else
|
||||
ADIOI_FAKE_IreadContig, /* IreadContig */
|
||||
ADIOI_FAKE_IwriteContig, /* IwriteContig */
|
||||
#endif
|
||||
ADIOI_GEN_IODone, /* ReadDone */
|
||||
ADIOI_GEN_IODone, /* WriteDone */
|
||||
ADIOI_GEN_IOComplete, /* ReadComplete */
|
||||
ADIOI_GEN_IOComplete, /* WriteComplete */
|
||||
ADIOI_GEN_IreadStrided, /* IreadStrided */
|
||||
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
|
||||
ADIOI_BG_Flush, /* Flush */
|
||||
ADIOI_GEN_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_BGLOCKLESS_Feature /* Features */
|
||||
};
|
@ -1,14 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2008 Uchicago Argonne LLC
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#ifndef AD_BGLOCKLESS_INCLUDE
|
||||
#define AD_PVFS2_INCLUDE
|
||||
|
||||
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag);
|
||||
|
||||
#endif
|
||||
|
@ -1,15 +0,0 @@
|
||||
#include "adio.h"
|
||||
|
||||
int ADIOI_BGLOCKLESS_Feature(ADIO_File fd, int flag)
{
    /*
     * Only ADIO_SCALABLE_OPEN is reported as supported.  Everything else --
     * including ADIO_SHARED_FP, ADIO_LOCKS, ADIO_SEQUENTIAL and
     * ADIO_DATA_SIEVING_WRITES, as well as any unknown flag -- yields 0.
     * The fd argument is not consulted.
     */
    if (flag == ADIO_SCALABLE_OPEN)
        return 1;

    return 0;
}
|
26
ompi/mca/io/romio/romio/adio/ad_gpfs/Makefile.mk
Обычный файл
26
ompi/mca/io/romio/romio/adio/ad_gpfs/Makefile.mk
Обычный файл
@ -0,0 +1,26 @@
|
||||
## -*- Mode: Makefile; -*-
|
||||
## vim: set ft=automake :
|
||||
##
|
||||
## (C) 2012 by Argonne National Laboratory.
|
||||
## See COPYRIGHT in top-level directory.
|
||||
##
|
||||
|
||||
if BUILD_AD_GPFS

## Private headers of the GPFS driver.
noinst_HEADERS += adio/ad_gpfs/ad_gpfs_aggrs.h
noinst_HEADERS += adio/ad_gpfs/ad_gpfs.h
noinst_HEADERS += adio/ad_gpfs/ad_gpfs_tuning.h

## Driver sources, appended in the same order as before.
romio_other_sources += adio/ad_gpfs/ad_gpfs_aggrs.c
romio_other_sources += adio/ad_gpfs/ad_gpfs_close.c
romio_other_sources += adio/ad_gpfs/ad_gpfs_flush.c
romio_other_sources += adio/ad_gpfs/ad_gpfs_tuning.c
romio_other_sources += adio/ad_gpfs/ad_gpfs.c
romio_other_sources += adio/ad_gpfs/ad_gpfs_open.c
romio_other_sources += adio/ad_gpfs/ad_gpfs_hints.c
romio_other_sources += adio/ad_gpfs/ad_gpfs_rdcoll.c
romio_other_sources += adio/ad_gpfs/ad_gpfs_wrcoll.c

endif BUILD_AD_GPFS
|
@ -2,7 +2,7 @@
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl.c
|
||||
* \file ad_gpfs.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
@ -11,34 +11,28 @@
|
||||
* Copyright (C) 2001 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_gpfs.h"
|
||||
|
||||
/* adioi.h has the ADIOI_Fns_struct define */
|
||||
#include "adioi.h"
|
||||
|
||||
struct ADIOI_Fns_struct ADIO_BGL_operations = {
|
||||
ADIOI_BGL_Open, /* Open */
|
||||
struct ADIOI_Fns_struct ADIO_GPFS_operations = {
|
||||
ADIOI_GPFS_Open, /* Open */
|
||||
ADIOI_GEN_OpenColl, /* Collective open */
|
||||
ADIOI_BGL_ReadContig, /* ReadContig */
|
||||
ADIOI_BGL_WriteContig, /* WriteContig */
|
||||
#if BGL_OPTIM_STEP1_2
|
||||
ADIOI_BGL_ReadStridedColl, /* ReadStridedColl */
|
||||
ADIOI_BGL_WriteStridedColl, /* WriteStridedColl */
|
||||
#else
|
||||
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
|
||||
ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
|
||||
#endif
|
||||
ADIOI_GEN_ReadContig, /* ReadContig */
|
||||
ADIOI_GEN_WriteContig, /* WriteContig */
|
||||
ADIOI_GPFS_ReadStridedColl, /* ReadStridedColl */
|
||||
ADIOI_GPFS_WriteStridedColl, /* WriteStridedColl */
|
||||
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
|
||||
ADIOI_BGL_Fcntl, /* Fcntl */
|
||||
#if BGL_OPTIM_STEP1_1
|
||||
ADIOI_BGL_SetInfo, /* SetInfo */
|
||||
ADIOI_GEN_Fcntl, /* Fcntl */
|
||||
#if defined(BGQPLATFORM) || defined(PEPLATFORM)
|
||||
ADIOI_GPFS_SetInfo, /* SetInfo for BlueGene or PE */
|
||||
#else
|
||||
ADIOI_GEN_SetInfo, /* SetInfo */
|
||||
ADIOI_GEN_SetInfo, /* SetInfo for any platform besides BlueGene or PE */
|
||||
#endif
|
||||
ADIOI_BGL_ReadStrided, /* ReadStrided */
|
||||
ADIOI_BGL_WriteStrided, /* WriteStrided */
|
||||
ADIOI_BGL_Close, /* Close */
|
||||
ADIOI_GEN_ReadStrided, /* ReadStrided */
|
||||
ADIOI_GEN_WriteStrided, /* WriteStrided */
|
||||
ADIOI_GPFS_Close, /* Close */
|
||||
#ifdef ROMIO_HAVE_WORKING_AIO
|
||||
#warning Consider BG support for NFS before enabling this.
|
||||
ADIOI_GEN_IreadContig, /* IreadContig */
|
||||
@ -53,8 +47,17 @@ struct ADIOI_Fns_struct ADIO_BGL_operations = {
|
||||
ADIOI_GEN_IOComplete, /* WriteComplete */
|
||||
ADIOI_GEN_IreadStrided, /* IreadStrided */
|
||||
ADIOI_GEN_IwriteStrided, /* IwriteStrided */
|
||||
ADIOI_BGL_Flush, /* Flush */
|
||||
ADIOI_GPFS_Flush, /* Flush */
|
||||
ADIOI_GEN_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_GEN_Feature, /* Features */
|
||||
#ifdef BGQPLATFORM
|
||||
"GPFS+BGQ: IBM GPFS for Blue Gene",
|
||||
#elif PEPLATFORM
|
||||
"GPFS+PE: IBM GPFS for PE",
|
||||
#else
|
||||
"GPFS: IBM GPFS",
|
||||
#endif
|
||||
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
|
||||
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
|
||||
};
|
71
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs.h
Обычный файл
71
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs.h
Обычный файл
@ -0,0 +1,71 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_gpfs.h
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#ifndef AD_GPFS_INCLUDE
#define AD_GPFS_INCLUDE

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include "adio.h"

/* Optional system headers, pulled in only when configure found them. */
#if defined(HAVE_SIGNAL_H)
#include <signal.h>
#endif
#if defined(HAVE_AIO_H)
#include <aio.h>
#endif

/* ---- open / close ---------------------------------------------------- */

void ADIOI_GPFS_Open(ADIO_File fd, int *error_code);

void ADIOI_GPFS_Close(ADIO_File fd, int *error_code);

/* ---- contiguous transfers -------------------------------------------- */

void ADIOI_GPFS_ReadContig(ADIO_File fd,
                           void *buf,
                           int count,
                           MPI_Datatype datatype,
                           int file_ptr_type,
                           ADIO_Offset offset,
                           ADIO_Status *status,
                           int *error_code);

void ADIOI_GPFS_WriteContig(ADIO_File fd,
                            const void *buf,
                            int count,
                            MPI_Datatype datatype,
                            int file_ptr_type,
                            ADIO_Offset offset,
                            ADIO_Status *status,
                            int *error_code);

/* ---- hints ------------------------------------------------------------ */

void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);

/* ---- strided (independent) transfers ---------------------------------- */

void ADIOI_GPFS_WriteStrided(ADIO_File fd,
                             const void *buf,
                             int count,
                             MPI_Datatype datatype,
                             int file_ptr_type,
                             ADIO_Offset offset,
                             ADIO_Status *status,
                             int *error_code);

void ADIOI_GPFS_ReadStrided(ADIO_File fd,
                            void *buf,
                            int count,
                            MPI_Datatype datatype,
                            int file_ptr_type,
                            ADIO_Offset offset,
                            ADIO_Status *status,
                            int *error_code);

/* ---- strided (collective) transfers ----------------------------------- */

void ADIOI_GPFS_ReadStridedColl(ADIO_File fd,
                                void *buf,
                                int count,
                                MPI_Datatype datatype,
                                int file_ptr_type,
                                ADIO_Offset offset,
                                ADIO_Status *status,
                                int *error_code);

void ADIOI_GPFS_WriteStridedColl(ADIO_File fd,
                                 const void *buf,
                                 int count,
                                 MPI_Datatype datatype,
                                 int file_ptr_type,
                                 ADIO_Offset offset,
                                 ADIO_Status *status,
                                 int *error_code);

/* ---- flush ------------------------------------------------------------ */

void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code);

/* Tuning declarations are included last, after the driver prototypes. */
#include "ad_gpfs_tuning.h"

#endif
|
@ -2,8 +2,8 @@
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_aggrs.c
|
||||
* \brief The externally used function from this file is is declared in ad_bg_aggrs.h
|
||||
* \file ad_gpfs_aggrs.c
|
||||
* \brief The externally used function from this file is declared in ad_gpfs_aggrs.h
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
@ -12,25 +12,24 @@
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
/*#define TRACE_ON */
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_cb_config_list.h"
|
||||
#include "ad_bg.h"
|
||||
#include "ad_bg_pset.h"
|
||||
#include "ad_bg_aggrs.h"
|
||||
#include "ad_gpfs.h"
|
||||
#include "ad_gpfs_aggrs.h"
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
#include "mpidi_macros.h"
|
||||
|
||||
#ifdef USE_DBG_LOGGING
|
||||
#define AGG_DEBUG 1
|
||||
#endif
|
||||
|
||||
static int aggrsInPsetSize=0;
|
||||
static int *aggrsInPset=NULL;
|
||||
#ifndef TRACE_ERR
|
||||
# define TRACE_ERR(format...)
|
||||
#endif
|
||||
|
||||
/* Comments copied from common:
|
||||
* This file contains four functions:
|
||||
@ -63,260 +62,6 @@ static int *aggrsInPset=NULL;
|
||||
* uneven distributions
|
||||
*/
|
||||
|
||||
/* forward declaration */
|
||||
static void
|
||||
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
|
||||
const ADIOI_BG_ConfInfo_t *confInfo,
|
||||
ADIOI_BG_ProcInfo_t *all_procInfo,
|
||||
int *aggrsInPset );
|
||||
|
||||
/*
|
||||
* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
|
||||
* The parameters are
|
||||
* . the number of aggregators (proxies) : fd->hints->cb_nodes
|
||||
* . the ranks of the aggregators : fd->hints->ranklist
|
||||
* By compute these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
|
||||
* ADIO can work more efficiently.
|
||||
*/
|
||||
int
|
||||
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
|
||||
{
|
||||
int r, s;
|
||||
ADIOI_BG_ProcInfo_t *procInfo, *all_procInfo;
|
||||
ADIOI_BG_ConfInfo_t *confInfo;
|
||||
TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
|
||||
|
||||
MPI_Comm_size( fd->comm, &s );
|
||||
MPI_Comm_rank( fd->comm, &r );
|
||||
|
||||
/* Collect individual BG personality information */
|
||||
confInfo = ADIOI_BG_ConfInfo_new ();
|
||||
procInfo = ADIOI_BG_ProcInfo_new ();
|
||||
ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
|
||||
|
||||
/* Gather BG personality infomation onto process 0 */
|
||||
/* if (r == 0) */
|
||||
all_procInfo = ADIOI_BG_ProcInfo_new_n (s);
|
||||
if(s > aggrsInPsetSize)
|
||||
{
|
||||
if(aggrsInPset) ADIOI_Free(aggrsInPset);
|
||||
aggrsInPset = (int *) ADIOI_Malloc (s *sizeof(int));
|
||||
aggrsInPsetSize = s;
|
||||
}
|
||||
|
||||
|
||||
MPI_Gather( (void *)procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
|
||||
(void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
|
||||
0,
|
||||
fd->comm );
|
||||
|
||||
/* Compute a list of the ranks of chosen IO proxy CN on process 0 */
|
||||
if (r == 0) {
|
||||
ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo, aggrsInPset);
|
||||
/* ADIOI_BG_ProcInfo_free (all_procInfo);*/
|
||||
}
|
||||
ADIOI_BG_ProcInfo_free (all_procInfo);
|
||||
|
||||
/* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
|
||||
Declared in adio_cb_config_list.h */
|
||||
ADIOI_cb_bcast_rank_map(fd);
|
||||
|
||||
/* Broadcast the BG-GPFS related file domain info */
|
||||
MPI_Bcast( (void *)aggrsInPset,
|
||||
fd->hints->cb_nodes * sizeof(int), MPI_BYTE,
|
||||
0,
|
||||
fd->comm );
|
||||
|
||||
ADIOI_BG_persInfo_free( confInfo, procInfo );
|
||||
TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* There are some number of bridge nodes (randomly) distributed through the job
|
||||
* We need to split the nodes among the bridge nodes */
|
||||
/* Maybe find which bridge node is closer (manhattan distance) and try to
|
||||
* distribute evenly.
|
||||
*/
|
||||
/*
|
||||
* Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
|
||||
* The first order of tmp_ranklist is : PSET number
|
||||
* The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
|
||||
*/
|
||||
/* Pairs a process index (rank) with the bridge rank recorded for it. */
typedef struct
{
    int rank;
    int bridge;
} sortstruct;

/*
 * qsort() comparator: orders sortstruct entries by ascending bridge value.
 *
 * Returns a negative, zero or positive value as p1's bridge is less than,
 * equal to or greater than p2's.  The result is built from two comparisons
 * rather than a subtraction so it cannot overflow for extreme values, and
 * the inputs are accessed through const pointers as the qsort contract
 * requires.
 */
static int intsort(const void *p1, const void *p2)
{
    const sortstruct *lhs = (const sortstruct *) p1;
    const sortstruct *rhs = (const sortstruct *) p2;

    return (lhs->bridge > rhs->bridge) - (lhs->bridge < rhs->bridge);
}
|
||||
|
||||
static int
|
||||
ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
|
||||
ADIOI_BG_ProcInfo_t *all_procInfo,
|
||||
int *aggrsInPset,
|
||||
int *tmp_ranklist)
|
||||
{
|
||||
TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
|
||||
/* BES: This should be done in the init routines probably. */
|
||||
int i, j;
|
||||
int aggTotal;
|
||||
int distance, numAggs;
|
||||
int *aggList;
|
||||
/* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
|
||||
* bridge node */
|
||||
|
||||
sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
|
||||
for(i=0; i < confInfo->nProcs; i++)
|
||||
{
|
||||
bridgelist[i].bridge = all_procInfo[i].bridgeRank;
|
||||
bridgelist[i].rank = i;
|
||||
TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
|
||||
}
|
||||
|
||||
/* This list contains rank->bridge info. Now, we need to sort this list. */
|
||||
qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
|
||||
|
||||
/* In this array, we can pick an appropriate number of midpoints based on
|
||||
* our bridgenode index and the number of aggregators */
|
||||
|
||||
numAggs = confInfo->aggRatio * confInfo->ioMaxSize /*virtualPsetSize*/;
|
||||
if(numAggs == 1)
|
||||
aggTotal = 1;
|
||||
else
|
||||
/* the number of aggregators is (numAggs per bridgenode) plus each
|
||||
* bridge node is an aggregator */
|
||||
aggTotal = confInfo->numBridgeRanks * (numAggs+1);
|
||||
|
||||
distance = (confInfo->ioMaxSize /*virtualPsetSize*/ / numAggs);
|
||||
TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d numAggs: %d distance: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, distance, aggTotal);
|
||||
aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
|
||||
|
||||
|
||||
/* For each bridge node, determine who the aggregators will be */
|
||||
/* basically, the n*distance and bridge node */
|
||||
if(aggTotal == 1) /* special case when we only have one bridge node */
|
||||
aggList[0] = bridgelist[0].bridge;
|
||||
else
|
||||
{
|
||||
for(i=0; i < confInfo->numBridgeRanks; i++)
|
||||
{
|
||||
aggList[i]=bridgelist[i*confInfo->ioMaxSize /*virtualPsetSize*/].bridge;
|
||||
TRACE_ERR("aggList[%d]: %d\n", i, aggList[i]);
|
||||
|
||||
for(j = 0; j < numAggs; j++)
|
||||
{
|
||||
/* Sets up a list of nodes which will act as aggregators. numAggs
|
||||
* per bridge node total. The list of aggregators is
|
||||
* bridgeNodes
|
||||
* bridgeNode[0]aggr[0]
|
||||
* bridgeNode[0]aggr[1]...
|
||||
* bridgeNode[0]aggr[N]...
|
||||
* ...
|
||||
* bridgeNode[N]aggr[0]..
|
||||
* bridgeNode[N]aggr[N]
|
||||
*/
|
||||
aggList[i*numAggs+j+confInfo->numBridgeRanks] = bridgelist[i*confInfo->ioMaxSize /*virtualPsetSize*/ + j*distance+1].rank;
|
||||
TRACE_ERR("(post bridge) agglist[%d] -> %d\n", confInfo->numBridgeRanks +i*numAggs+j, aggList[i*numAggs+j+confInfo->numBridgeRanks]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(tmp_ranklist, aggList, (numAggs*confInfo->numBridgeRanks+numAggs)*sizeof(int));
|
||||
for(i=0;i<aggTotal;i++)
|
||||
{
|
||||
TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
|
||||
}
|
||||
|
||||
|
||||
ADIOI_Free (bridgelist);
|
||||
ADIOI_Free (aggList);
|
||||
|
||||
TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
|
||||
return aggTotal;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* compute aggregators ranklist and put it into fd->hints struct
|
||||
*/
|
||||
static void
|
||||
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
|
||||
const ADIOI_BG_ConfInfo_t *confInfo,
|
||||
ADIOI_BG_ProcInfo_t *all_procInfo,
|
||||
int *aggrsInPset )
|
||||
{
|
||||
TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
|
||||
int i;
|
||||
int naggs;
|
||||
int size;
|
||||
int *tmp_ranklist;
|
||||
|
||||
/* compute the ranklist of IO aggregators and put into tmp_ranklist */
|
||||
tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
|
||||
|
||||
# if AGG_DEBUG
|
||||
for (i=0; i<confInfo->nProcs; i++) {
|
||||
DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].coreID, all_procInfo[i].rank );
|
||||
}
|
||||
# endif
|
||||
|
||||
naggs=
|
||||
ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, aggrsInPset, tmp_ranklist);
|
||||
|
||||
# define VERIFY 1
|
||||
# if VERIFY
|
||||
DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, tsize: %3d, ratio: %.4f; naggs = %d\n",
|
||||
confInfo->ioMinSize ,
|
||||
confInfo->ioMaxSize ,
|
||||
confInfo->nAggrs ,
|
||||
confInfo->numBridgeRanks ,
|
||||
confInfo->nProcs ,
|
||||
confInfo->ioMaxSize /*virtualPsetSize*/ ,
|
||||
confInfo->cpuIDsize,
|
||||
confInfo->aggRatio ,
|
||||
naggs );
|
||||
# endif
|
||||
MPI_Comm_size( fd->comm, &size );
|
||||
/* This fix is for when the bridgenode rnk is not part of the particular
|
||||
* subcomm associated with this MPI File operation. I don't know if
|
||||
* this is the best/right answer but it passes the test cases at least.
|
||||
* I don't know how common file IO in subcomms is anyway... */
|
||||
for(i=0;i<naggs;i++)
|
||||
{
|
||||
if(tmp_ranklist[i] > size)
|
||||
{
|
||||
TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
|
||||
i, tmp_ranklist[i], fd->comm);
|
||||
tmp_ranklist[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
# if AGG_DEBUG
|
||||
for (i=0; i<naggs; i++) {
|
||||
DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
|
||||
}
|
||||
# endif
|
||||
|
||||
/* copy the ranklist of IO aggregators to fd->hints */
|
||||
if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
|
||||
|
||||
fd->hints->cb_nodes = naggs;
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
|
||||
|
||||
/* */
|
||||
ADIOI_Free( tmp_ranklist );
|
||||
TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Description from common/ad_aggregate.c. (Does it completely apply to bg?)
|
||||
* ADIOI_Calc_aggregator()
|
||||
*
|
||||
@ -349,7 +94,7 @@ ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
|
||||
* This is more general aggregator search function which does not base on the assumption
|
||||
* that each aggregator hosts the file domain with the same size
|
||||
*/
|
||||
int ADIOI_BG_Calc_aggregator(ADIO_File fd,
|
||||
int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
|
||||
ADIO_Offset off,
|
||||
ADIO_Offset min_off,
|
||||
ADIO_Offset *len,
|
||||
@ -359,9 +104,9 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
|
||||
{
|
||||
int rank_index, rank;
|
||||
ADIO_Offset avail_bytes;
|
||||
TRACE_ERR("Entering ADIOI_BG_Calc_aggregator\n");
|
||||
TRACE_ERR("Entering ADIOI_GPFS_Calc_aggregator\n");
|
||||
|
||||
ADIOI_BG_assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
|
||||
ADIOI_Assert ( (off <= fd_end[fd->hints->cb_nodes-1] && off >= min_off && fd_start[0] >= min_off ) );
|
||||
|
||||
/* binary search --> rank_index is returned */
|
||||
int ub = fd->hints->cb_nodes;
|
||||
@ -401,7 +146,7 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
|
||||
rank_index,fd->hints->cb_nodes,fd_size,off);
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
/* DBG_FPRINTF ("ADIOI_BG_Calc_aggregator: rank_index = %d\n",
|
||||
/* DBG_FPRINTF ("ADIOI_GPFS_Calc_aggregator: rank_index = %d\n",
|
||||
rank_index ); */
|
||||
|
||||
/*
|
||||
@ -422,7 +167,7 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
|
||||
/* map our index to a rank */
|
||||
/* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
|
||||
rank = fd->hints->ranklist[rank_index];
|
||||
TRACE_ERR("Leaving ADIOI_BG_Calc_aggregator\n");
|
||||
TRACE_ERR("Leaving ADIOI_GPFS_Calc_aggregator\n");
|
||||
|
||||
return rank;
|
||||
}
|
||||
@ -441,7 +186,8 @@ int ADIOI_BG_Calc_aggregator(ADIO_File fd,
|
||||
* It doesn't seem necessary here (using GPFS block sizes) but keep it in mind
|
||||
* (e.g. we could pass striping unit instead of using fs_ptr->blksize).
|
||||
*/
|
||||
void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
|
||||
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
|
||||
ADIO_Offset *st_offsets,
|
||||
ADIO_Offset *end_offsets,
|
||||
int nprocs,
|
||||
int nprocs_for_coll,
|
||||
@ -453,20 +199,23 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
|
||||
{
|
||||
ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, *fd_size;
|
||||
int i, aggr;
|
||||
TRACE_ERR("Entering ADIOI_BG_GPFS_Calc_file_domains\n");
|
||||
TRACE_ERR("Entering ADIOI_GPFS_Calc_file_domains\n");
|
||||
blksize_t blksize;
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5004, 0, NULL);
|
||||
#endif
|
||||
|
||||
# if AGG_DEBUG
|
||||
static char myname[] = "ADIOI_BG_GPFS_Calc_file_domains";
|
||||
static char myname[] = "ADIOI_GPFS_Calc_file_domains";
|
||||
DBG_FPRINTF(stderr, "%s(%d): %d aggregator(s)\n",
|
||||
myname,__LINE__,nprocs_for_coll);
|
||||
# endif
|
||||
__blksize_t blksize = 1048576; /* default to 1M */
|
||||
if(fs_ptr && ((ADIOI_BG_fs*)fs_ptr)->blksize) /* ignore null ptr or 0 blksize */
|
||||
blksize = ((ADIOI_BG_fs*)fs_ptr)->blksize;
|
||||
if (fd->blksize <= 0)
|
||||
/* default to 1M if blksize unset */
|
||||
fd->blksize = 1048576;
|
||||
blksize = fd->blksize;
|
||||
|
||||
# if AGG_DEBUG
|
||||
DBG_FPRINTF(stderr,"%s(%d): Blocksize=%ld\n",myname,__LINE__,blksize);
|
||||
# endif
|
||||
@ -509,14 +258,144 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
|
||||
fd_start = *fd_start_ptr;
|
||||
fd_end = *fd_end_ptr;
|
||||
|
||||
/* each process will have a file domain of some number of gpfs blocks, but
|
||||
* the division of blocks is not likely to be even. Some file domains will
|
||||
* be "large" and others "small"
|
||||
*
|
||||
* Example: consider 17 blocks distributed over 3 aggregators.
|
||||
* nb_cn_small = 17/3 = 5
|
||||
* naggs_large = 17 - 3*(17/3) = 17 - 15 = 2
|
||||
* naggs_small = 3 - 2 = 1
|
||||
*
|
||||
* and you end up with file domains of {5-blocks, 6-blocks, 6-blocks}
|
||||
*
|
||||
* what about (relatively) small files? say, a file of 1000 blocks
|
||||
* distributed over 2064 aggregators:
|
||||
* nb_cn_small = 1000/2064 = 0
|
||||
* naggs_large = 1000 - 2064*(1000/2064) = 1000
|
||||
* naggs_small = 2064 - 1000 = 1064
|
||||
* and you end up with domains of {0, 0, 0, ... 1, 1, 1 ...}
|
||||
*
|
||||
* it might be a good idea instead of having all the zeros up front, to
|
||||
* "mix" those zeros into the fd_size array. that way, no pset/bridge-set
|
||||
* is left with zero work. In fact, even if the small file domains aren't
|
||||
* zero, it's probably still a good idea to mix the "small" file domains
|
||||
* across the fd_size array to keep the io nodes in balance */
|
||||
|
||||
|
||||
ADIO_Offset n_gpfs_blk = fd_gpfs_range / blksize;
|
||||
ADIO_Offset nb_cn_small = n_gpfs_blk/naggs;
|
||||
ADIO_Offset naggs_large = n_gpfs_blk - naggs * (n_gpfs_blk/naggs);
|
||||
ADIO_Offset naggs_small = naggs - naggs_large;
|
||||
|
||||
for (i=0; i<naggs; i++)
|
||||
if (i < naggs_small) fd_size[i] = nb_cn_small * blksize;
|
||||
else fd_size[i] = (nb_cn_small+1) * blksize;
|
||||
#ifdef BGQPLATFORM
|
||||
if (gpfsmpio_balancecontig == 1) {
|
||||
/* File domains blocks are assigned to aggregators in a breadth-first
|
||||
* fashion relative to the ions - additionally, file domains on the
|
||||
* aggregators sharing the same bridgeset and ion have contiguous
|
||||
* offsets. */
|
||||
|
||||
// initialize everything to small
|
||||
for (i=0; i<naggs; i++)
|
||||
fd_size[i] = nb_cn_small * blksize;
|
||||
|
||||
// go thru and distribute the large across the bridges
|
||||
|
||||
/* bridgelistoffset: agg rank list offsets using the bridgelist - each
|
||||
* entry is created by adding up the indexes for the aggs from all
|
||||
* previous bridges */
|
||||
int *bridgelistoffset =
|
||||
(int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
|
||||
/* tmpbridgelistnum: copy of the bridgelistnum whose entries can be
|
||||
* decremented to keep track of bridge assignments during the actual
|
||||
* large block assignments to the agg rank list*/
|
||||
int *tmpbridgelistnum =
|
||||
(int *) ADIOI_Malloc(fd->hints->fs_hints.bg.numbridges*sizeof(int));
|
||||
|
||||
int j;
|
||||
for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++) {
|
||||
int k, bridgerankoffset = 0;
|
||||
for (k=0;k<j;k++) {
|
||||
bridgerankoffset += fd->hints->fs_hints.bg.bridgelistnum[k];
|
||||
}
|
||||
bridgelistoffset[j] = bridgerankoffset;
|
||||
}
|
||||
|
||||
for (j=0;j<fd->hints->fs_hints.bg.numbridges;j++)
|
||||
tmpbridgelistnum[j] = fd->hints->fs_hints.bg.bridgelistnum[j];
|
||||
int bridgeiter = 0;
|
||||
|
||||
/* distribute the large blocks across the aggs going breadth-first
|
||||
* across the bridgelist - this distributes the fd sizes across the
|
||||
* ions, so later in the file domain assignment when it iterates thru
|
||||
* the ranklist the offsets will be contiguous within the bridge and
|
||||
* ion as well */
|
||||
for (j=0;j<naggs_large;j++) {
|
||||
int foundbridge = 0;
|
||||
int numbridgelistpasses = 0;
|
||||
while (!foundbridge) {
|
||||
if (tmpbridgelistnum[bridgeiter] > 0) {
|
||||
foundbridge = 1;
|
||||
/*
|
||||
printf("bridgeiter is %d tmpbridgelistnum[bridgeiter] is %d bridgelistoffset[bridgeiter] is %d\n",bridgeiter,tmpbridgelistnum[bridgeiter],bridgelistoffset[bridgeiter]);
|
||||
printf("naggs is %d bridgeiter is %d bridgelistoffset[bridgeiter] is %d tmpbridgelistnum[bridgeiter] is %d\n",naggs, bridgeiter,bridgelistoffset[bridgeiter],tmpbridgelistnum[bridgeiter]);
|
||||
printf("naggs is %d bridgeiter is %d setting fd_size[%d]\n",naggs, bridgeiter,bridgelistoffset[bridgeiter]+(fd->hints->bridgelistnum[bridgeiter]-tmpbridgelistnum[bridgeiter]));
|
||||
*/
|
||||
int currentbridgelistnum =
|
||||
(fd->hints->fs_hints.bg.bridgelistnum[bridgeiter]-
|
||||
tmpbridgelistnum[bridgeiter]);
|
||||
int currentfdsizeindex = bridgelistoffset[bridgeiter] +
|
||||
currentbridgelistnum;
|
||||
fd_size[currentfdsizeindex] = (nb_cn_small+1) * blksize;
|
||||
tmpbridgelistnum[bridgeiter]--;
|
||||
}
|
||||
if (bridgeiter == (fd->hints->fs_hints.bg.numbridges-1)) {
|
||||
/* guard against infinite loop - should only ever make 1 pass
|
||||
* thru bridgelist */
|
||||
ADIOI_Assert(numbridgelistpasses == 0);
|
||||
numbridgelistpasses++;
|
||||
bridgeiter = 0;
|
||||
}
|
||||
else
|
||||
bridgeiter++;
|
||||
}
|
||||
}
|
||||
ADIOI_Free(tmpbridgelistnum);
|
||||
ADIOI_Free(bridgelistoffset);
|
||||
|
||||
} else {
|
||||
/* BG/L- and BG/P-style distribution of file domains: simple allocation of
|
||||
* file domains to each aggregator */
|
||||
for (i=0; i<naggs; i++) {
|
||||
if (i < naggs_large) {
|
||||
fd_size[i] = (nb_cn_small+1) * blksize;
|
||||
} else {
|
||||
fd_size[i] = nb_cn_small * blksize;
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef balancecontigtrace
|
||||
int myrank;
|
||||
MPI_Comm_rank(fd->comm,&myrank);
|
||||
if (myrank == 0) {
|
||||
fprintf(stderr,"naggs_small is %d nb_cn_small is %d\n",naggs_small,nb_cn_small);
|
||||
for (i=0; i<naggs; i++) {
|
||||
fprintf(stderr,"fd_size[%d] set to %d agg rank is %d\n",i,fd_size[i],fd->hints->ranklist[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#else // not BGQ platform
|
||||
for (i=0; i<naggs; i++) {
|
||||
if (i < naggs_large) {
|
||||
fd_size[i] = (nb_cn_small+1) * blksize;
|
||||
} else {
|
||||
fd_size[i] = nb_cn_small * blksize;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
# if AGG_DEBUG
|
||||
DBG_FPRINTF(stderr,"%s(%d): "
|
||||
@ -561,30 +440,18 @@ void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
|
||||
MPE_Log_event (5005, 0, NULL);
|
||||
#endif
|
||||
ADIOI_Free (fd_size);
|
||||
TRACE_ERR("Leaving ADIOI_BG_GPFS_Calc_file_domains\n");
|
||||
TRACE_ERR("Leaving ADIOI_GPFS_Calc_file_domains\n");
|
||||
}
|
||||
|
||||
/*
|
||||
* When a process is an IO aggregator, this will return its index in the aggrs list.
|
||||
* Otherwise, this will return -1
|
||||
*/
|
||||
int ADIOI_BG_Aggrs_index( ADIO_File fd, int myrank )
{
    /* Linear scan of fd->hints->ranklist for myrank; the position of the
     * first match is returned, or -1 when myrank is not in the list. */
    int idx = 0;

    while (idx < fd->hints->cb_nodes) {
        if (fd->hints->ranklist[idx] == myrank)
            return idx;
        ++idx;
    }

    return -1;
}
|
||||
|
||||
/*
|
||||
* ADIOI_BG_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
|
||||
* ADIOI_GPFS_Calc_my_req() overrides ADIOI_Calc_my_req for the default implementation
|
||||
* is specific for static file domain partitioning.
|
||||
*
|
||||
* ADIOI_Calc_my_req() - calculate what portions of the access requests
|
||||
* of this process are located in the file domains of various processes
|
||||
* (including this one)
|
||||
*/
|
||||
void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
|
||||
void ADIOI_GPFS_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
|
||||
int contig_access_count, ADIO_Offset
|
||||
min_st_offset, ADIO_Offset *fd_start,
|
||||
ADIO_Offset *fd_end, ADIO_Offset fd_size,
|
||||
@ -600,12 +467,11 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
int i, l, proc;
|
||||
ADIO_Offset fd_len, rem_len, curr_idx, off;
|
||||
ADIOI_Access *my_req;
|
||||
TRACE_ERR("Entering ADIOI_BG_Calc_my_req\n");
|
||||
TRACE_ERR("Entering ADIOI_GPFS_Calc_my_req\n");
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5024, 0, NULL);
|
||||
#endif
|
||||
|
||||
*count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs,sizeof(int));
|
||||
count_my_req_per_proc = *count_my_req_per_proc_ptr;
|
||||
/* count_my_req_per_proc[i] gives the no. of contig. requests of this
|
||||
@ -638,7 +504,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
* first part of the access.
|
||||
*/
|
||||
/* BES */
|
||||
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
|
||||
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
|
||||
fd_start, fd_end);
|
||||
count_my_req_per_proc[proc]++;
|
||||
|
||||
@ -651,7 +517,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
while (rem_len > 0) {
|
||||
off += fd_len; /* point to first remaining byte */
|
||||
fd_len = rem_len; /* save remaining size, pass to calc */
|
||||
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len,
|
||||
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
|
||||
fd_size, fd_start, fd_end);
|
||||
|
||||
count_my_req_per_proc[proc]++;
|
||||
@ -670,8 +536,8 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
if (count_my_req_per_proc[i]) {
|
||||
my_req[i].offsets = (ADIO_Offset *)
|
||||
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
|
||||
my_req[i].lens = (int *)
|
||||
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(int));
|
||||
my_req[i].lens =
|
||||
ADIOI_Malloc(count_my_req_per_proc[i] * sizeof(ADIO_Offset));
|
||||
count_my_req_procs++;
|
||||
}
|
||||
my_req[i].count = 0; /* will be incremented where needed
|
||||
@ -687,7 +553,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
continue;
|
||||
off = offset_list[i];
|
||||
fd_len = len_list[i];
|
||||
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
|
||||
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size,
|
||||
fd_start, fd_end);
|
||||
|
||||
/* for each separate contiguous access from this process */
|
||||
@ -708,14 +574,13 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
* and the associated count.
|
||||
*/
|
||||
my_req[proc].offsets[l] = off;
|
||||
ADIOI_Assert(fd_len == (int) fd_len);
|
||||
my_req[proc].lens[l] = (int) fd_len;
|
||||
my_req[proc].lens[l] = fd_len;
|
||||
my_req[proc].count++;
|
||||
|
||||
while (rem_len > 0) {
|
||||
off += fd_len;
|
||||
fd_len = rem_len;
|
||||
proc = ADIOI_BG_Calc_aggregator(fd, off, min_st_offset, &fd_len,
|
||||
proc = ADIOI_GPFS_Calc_aggregator(fd, off, min_st_offset, &fd_len,
|
||||
fd_size, fd_start, fd_end);
|
||||
|
||||
if (buf_idx[proc] == -1)
|
||||
@ -729,8 +594,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
rem_len -= fd_len;
|
||||
|
||||
my_req[proc].offsets[l] = off;
|
||||
ADIOI_Assert(fd_len == (int) fd_len);
|
||||
my_req[proc].lens[l] = (int) fd_len;
|
||||
my_req[proc].lens[l] = fd_len;
|
||||
my_req[proc].count++;
|
||||
}
|
||||
}
|
||||
@ -743,7 +607,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
DBG_FPRINTF(stderr, "data needed from %d (count = %d):\n", i,
|
||||
my_req[i].count);
|
||||
for (l=0; l < my_req[i].count; l++) {
|
||||
DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %d\n", l,
|
||||
DBG_FPRINTF(stderr, " off[%d] = %lld, len[%d] = %lld\n", l,
|
||||
my_req[i].offsets[l], l, my_req[i].lens[l]);
|
||||
}
|
||||
}
|
||||
@ -756,7 +620,7 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5025, 0, NULL);
|
||||
#endif
|
||||
TRACE_ERR("Leaving ADIOI_BG_Calc_my_req\n");
|
||||
TRACE_ERR("Leaving ADIOI_GPFS_Calc_my_req\n");
|
||||
}
|
||||
|
||||
/*
|
||||
@ -776,14 +640,14 @@ void ADIOI_BG_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *l
|
||||
* param[out] others_req_ptr Array of other process' requests that lie
|
||||
* in my process's file domain
|
||||
*/
|
||||
void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs,
|
||||
void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
|
||||
int *count_my_req_per_proc,
|
||||
ADIOI_Access *my_req,
|
||||
int nprocs, int myrank,
|
||||
int *count_others_req_procs_ptr,
|
||||
ADIOI_Access **others_req_ptr)
|
||||
{
|
||||
TRACE_ERR("Entering ADIOI_BG_Calc_others_req\n");
|
||||
TRACE_ERR("Entering ADIOI_GPFS_Calc_others_req\n");
|
||||
/* determine what requests of other processes lie in this process's
|
||||
file domain */
|
||||
|
||||
@ -820,7 +684,7 @@ void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs,
|
||||
*/
|
||||
count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs*sizeof(int));
|
||||
/* cora2a1=timebase(); */
|
||||
for(i=0;i<nprocs;i++)
|
||||
/*for(i=0;i<nprocs;i++) ?*/
|
||||
MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
|
||||
count_others_req_per_proc, 1, MPI_INT, fd->comm);
|
||||
|
||||
@ -852,8 +716,8 @@ for(i=0;i<nprocs;i++)
|
||||
|
||||
others_req[i].offsets = (ADIO_Offset *)
|
||||
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
|
||||
others_req[i].lens = (int *)
|
||||
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(int));
|
||||
others_req[i].lens =
|
||||
ADIOI_Malloc(count_others_req_per_proc[i]*sizeof(ADIO_Offset));
|
||||
|
||||
if ( (MPIR_Upint)others_req[i].offsets < (MPIR_Upint)recvBufForOffsets )
|
||||
recvBufForOffsets = others_req[i].offsets;
|
||||
@ -903,7 +767,6 @@ for(i=0;i<nprocs;i++)
|
||||
if ( sendBufForLens == (void*)0xFFFFFFFFFFFFFFFF) sendBufForLens = NULL;
|
||||
|
||||
/* Calculate the displacements from the sendBufForOffsets/Lens */
|
||||
MPI_Barrier(fd->comm);
|
||||
for (i=0; i<nprocs; i++)
|
||||
{
|
||||
/* Send these offsets to process i.*/
|
||||
@ -911,7 +774,7 @@ for(i=0;i<nprocs;i++)
|
||||
if ( scounts[i] == 0 )
|
||||
sdispls[i] = 0;
|
||||
else
|
||||
sdispls[i] = (int)
|
||||
sdispls[i] = (int)
|
||||
( ( (MPIR_Upint)my_req[i].offsets -
|
||||
(MPIR_Upint)sendBufForOffsets ) /
|
||||
(MPIR_Upint)sizeof(ADIO_Offset) );
|
||||
@ -948,7 +811,7 @@ for(i=0;i<nprocs;i++)
|
||||
sdispls[i] = (int)
|
||||
( ( (MPIR_Upint)my_req[i].lens -
|
||||
(MPIR_Upint)sendBufForLens ) /
|
||||
(MPIR_Upint) sizeof(int) );
|
||||
(MPIR_Upint) sizeof(ADIO_Offset) );
|
||||
|
||||
/* Receive these offsets from process i. */
|
||||
rcounts[i] = count_others_req_per_proc[i];
|
||||
@ -958,14 +821,14 @@ for(i=0;i<nprocs;i++)
|
||||
rdispls[i] = (int)
|
||||
( ( (MPIR_Upint)others_req[i].lens -
|
||||
(MPIR_Upint)recvBufForLens ) /
|
||||
(MPIR_Upint) sizeof(int) );
|
||||
(MPIR_Upint) sizeof(ADIO_Offset) );
|
||||
}
|
||||
|
||||
/* Exchange the lengths */
|
||||
MPI_Alltoallv(sendBufForLens,
|
||||
scounts, sdispls, MPI_INT,
|
||||
scounts, sdispls, ADIO_OFFSET,
|
||||
recvBufForLens,
|
||||
rcounts, rdispls, MPI_INT,
|
||||
rcounts, rdispls, ADIO_OFFSET,
|
||||
fd->comm);
|
||||
|
||||
/* Clean up */
|
||||
@ -979,5 +842,5 @@ for(i=0;i<nprocs;i++)
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5027, 0, NULL);
|
||||
#endif
|
||||
TRACE_ERR("Leaving ADIOI_BG_Calc_others_req\n");
|
||||
TRACE_ERR("Leaving ADIOI_GPFS_Calc_others_req\n");
|
||||
}
|
@ -1,104 +1,86 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_aggrs.h
|
||||
* \brief ???
|
||||
*/
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_gpfs_aggrs.h
|
||||
* \brief Declarations of the file-domain and aggregation routines optimized for GPFS
|
||||
*/
|
||||
|
||||
/*
|
||||
* File: ad_gpfs_aggrs.h
|
||||
*
|
||||
* Declares functions optimized specifically for GPFS parallel I/O solution.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef AD_GPFS_AGGRS_H_
|
||||
#define AD_GPFS_AGGRS_H_
|
||||
|
||||
#include "adio.h"
|
||||
#include <sys/stat.h>
|
||||
|
||||
/*
|
||||
* File: ad_bg_aggrs.h
|
||||
*
|
||||
* Declares functions specific for BG/L - GPFS parallel I/O solution. The implemented optimizations are:
|
||||
* . Aligned file-domain partitioning, integrated in 7/28/2005
|
||||
*
|
||||
* In addition, following optimizations are planned:
|
||||
* . Integrating multiple file-domain partitioning schemes
|
||||
* (corresponding to Alok Choudhary's persistent file domain work).
|
||||
*/
|
||||
|
||||
#ifndef AD_BG_AGGRS_H_
|
||||
#define AD_BG_AGGRS_H_
|
||||
|
||||
#include "adio.h"
|
||||
#include <sys/stat.h>
|
||||
|
||||
#if !defined(GPFS_SUPER_MAGIC)
|
||||
#define GPFS_SUPER_MAGIC (0x47504653)
|
||||
#ifdef HAVE_GPFS_H
|
||||
#include <gpfs.h>
|
||||
#endif
|
||||
|
||||
/* File system (BG) specific information -
|
||||
hung off of ADIOI_FileD file descriptor (fd->fs_ptr) at open */
|
||||
typedef struct ADIOI_BG_fs_s {
|
||||
__blksize_t blksize;
|
||||
int fsync_aggr; /* "fsync aggregation" flags (below) */
|
||||
#define ADIOI_BG_FSYNC_AGGREGATION_DISABLED 0x00
|
||||
#define ADIOI_BG_FSYNC_AGGREGATION_ENABLED 0x01
|
||||
#define ADIOI_BG_FSYNC_AGGREGATOR 0x10 /* This rank is an aggregator */
|
||||
} ADIOI_BG_fs;
|
||||
|
||||
/* generate a list of I/O aggregators that utilizes BG-PSET organization. */
|
||||
int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
|
||||
|
||||
/* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
|
||||
void ADIOI_BG_GPFS_Calc_file_domains(ADIO_Offset *st_offsets,
|
||||
ADIO_Offset *end_offsets,
|
||||
int nprocs,
|
||||
int nprocs_for_coll,
|
||||
ADIO_Offset *min_st_offset_ptr,
|
||||
ADIO_Offset **fd_start_ptr,
|
||||
ADIO_Offset **fd_end_ptr,
|
||||
ADIO_Offset *fd_size_ptr,
|
||||
void *fs_ptr);
|
||||
|
||||
/* a utility function for debugging */
|
||||
int ADIOI_BG_Aggrs_index(ADIO_File fd, int myrank );
|
||||
|
||||
/* overriding ADIOI_Calc_aggregator() for the default implementation is specific for
|
||||
static file domain partitioning */
|
||||
int ADIOI_BG_Calc_aggregator(ADIO_File fd,
|
||||
ADIO_Offset off,
|
||||
ADIO_Offset min_off,
|
||||
ADIO_Offset *len,
|
||||
ADIO_Offset fd_size,
|
||||
ADIO_Offset *fd_start,
|
||||
ADIO_Offset *fd_end);
|
||||
|
||||
/* overriding ADIOI_Calc_my_req for the default implementation is specific for
|
||||
static file domain partitioning */
|
||||
void ADIOI_BG_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
|
||||
int contig_access_count, ADIO_Offset
|
||||
min_st_offset, ADIO_Offset *fd_start,
|
||||
ADIO_Offset *fd_end, ADIO_Offset fd_size,
|
||||
int nprocs,
|
||||
int *count_my_req_procs_ptr,
|
||||
int **count_my_req_per_proc_ptr,
|
||||
ADIOI_Access **my_req_ptr,
|
||||
int **buf_idx_ptr);
|
||||
|
||||
/*
|
||||
* ADIOI_Calc_others_req
|
||||
*
|
||||
* param[in] count_my_req_procs Number of processes whose file domain my
|
||||
* request touches.
|
||||
* param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
|
||||
* contig. requests of this process in
|
||||
* process i's file domain.
|
||||
* param[in] my_req A structure defining my request
|
||||
* param[in] nprocs Number of nodes in the block
|
||||
* param[in] myrank Rank of this node
|
||||
* param[out] count_others_req_procs_ptr Number of processes whose requests lie in
|
||||
* my process's file domain (including my
|
||||
* process itself)
|
||||
* param[out] others_req_ptr Array of other process' requests that lie
|
||||
* in my process's file domain
|
||||
*/
|
||||
void ADIOI_BG_Calc_others_req(ADIO_File fd, int count_my_req_procs,
|
||||
int *count_my_req_per_proc,
|
||||
ADIOI_Access *my_req,
|
||||
int nprocs, int myrank,
|
||||
int *count_others_req_procs_ptr,
|
||||
ADIOI_Access **others_req_ptr);
|
||||
|
||||
|
||||
#endif /* AD_BG_AGGRS_H_ */
|
||||
|
||||
|
||||
/* overriding ADIOI_Calc_file_domains() to apply 'aligned file domain partitioning'. */
|
||||
void ADIOI_GPFS_Calc_file_domains(ADIO_File fd,
|
||||
ADIO_Offset *st_offsets,
|
||||
ADIO_Offset *end_offsets,
|
||||
int nprocs,
|
||||
int nprocs_for_coll,
|
||||
ADIO_Offset *min_st_offset_ptr,
|
||||
ADIO_Offset **fd_start_ptr,
|
||||
ADIO_Offset **fd_end_ptr,
|
||||
ADIO_Offset *fd_size_ptr,
|
||||
void *fs_ptr);
|
||||
|
||||
/* overriding ADIOI_Calc_aggregator() for the default implementation is specific for
|
||||
static file domain partitioning */
|
||||
int ADIOI_GPFS_Calc_aggregator(ADIO_File fd,
|
||||
ADIO_Offset off,
|
||||
ADIO_Offset min_off,
|
||||
ADIO_Offset *len,
|
||||
ADIO_Offset fd_size,
|
||||
ADIO_Offset *fd_start,
|
||||
ADIO_Offset *fd_end);
|
||||
|
||||
/* overriding ADIOI_Calc_my_req for the default implementation is specific for
|
||||
static file domain partitioning */
|
||||
void ADIOI_GPFS_Calc_my_req ( ADIO_File fd, ADIO_Offset *offset_list, ADIO_Offset *len_list,
|
||||
int contig_access_count, ADIO_Offset
|
||||
min_st_offset, ADIO_Offset *fd_start,
|
||||
ADIO_Offset *fd_end, ADIO_Offset fd_size,
|
||||
int nprocs,
|
||||
int *count_my_req_procs_ptr,
|
||||
int **count_my_req_per_proc_ptr,
|
||||
ADIOI_Access **my_req_ptr,
|
||||
int **buf_idx_ptr);
|
||||
|
||||
/*
|
||||
* ADIOI_Calc_others_req
|
||||
*
|
||||
* param[in] count_my_req_procs Number of processes whose file domain my
|
||||
* request touches.
|
||||
* param[in] count_my_req_per_proc count_my_req_per_proc[i] gives the no. of
|
||||
* contig. requests of this process in
|
||||
* process i's file domain.
|
||||
* param[in] my_req A structure defining my request
|
||||
* param[in] nprocs Number of nodes in the block
|
||||
* param[in] myrank Rank of this node
|
||||
* param[out] count_others_req_procs_ptr Number of processes whose requests lie in
|
||||
* my process's file domain (including my
|
||||
* process itself)
|
||||
* param[out] others_req_ptr Array of other process' requests that lie
|
||||
* in my process's file domain
|
||||
*/
|
||||
void ADIOI_GPFS_Calc_others_req(ADIO_File fd, int count_my_req_procs,
|
||||
int *count_my_req_per_proc,
|
||||
ADIOI_Access *my_req,
|
||||
int nprocs, int myrank,
|
||||
int *count_others_req_procs_ptr,
|
||||
ADIOI_Access **others_req_ptr);
|
||||
|
||||
|
||||
#endif /* AD_GPFS_AGGRS_H_ */
|
@ -2,7 +2,7 @@
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_close.c
|
||||
* \file ad_gpfs_close.c
|
||||
* \brief Close routine for the GPFS ADIO driver
|
||||
*/
|
||||
|
||||
@ -12,18 +12,22 @@
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_aggrs.h"
|
||||
#include "ad_gpfs.h"
|
||||
#include "ad_gpfs_tuning.h"
|
||||
#include <unistd.h>
|
||||
|
||||
void ADIOI_BGL_Close(ADIO_File fd, int *error_code)
|
||||
void ADIOI_GPFS_Close(ADIO_File fd, int *error_code)
|
||||
{
|
||||
int err, derr=0;
|
||||
static char myname[] = "ADIOI_BGL_CLOSE";
|
||||
static char myname[] = "ADIOI_GPFS_CLOSE";
|
||||
|
||||
#ifdef PROFILE
|
||||
MPE_Log_event(9, 0, "start close");
|
||||
#endif
|
||||
|
||||
if (fd->null_fd >= 0)
|
||||
close(fd->null_fd);
|
||||
|
||||
err = close(fd->fd_sys);
|
||||
if (fd->fd_direct >= 0)
|
||||
{
|
68
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_flush.c
Обычный файл
68
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_flush.c
Обычный файл
@ -0,0 +1,68 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_gpfs_flush.c
|
||||
* \brief Scalable flush for GPFS
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_gpfs.h"
|
||||
|
||||
/*
 * ADIOI_GPFS_Flush - flush a file with a single fsync() on behalf of the
 * whole communicator.
 *
 * fd          file handle; fd->comm, fd->fd_sys, fd->filename and
 *             fd->hints->ranklist[0] are read.
 * error_code  set to MPI_SUCCESS when the fsync() succeeded, otherwise to
 *             an MPI_ERR_IO-class code built by MPIO_Err_create_code().
 *
 * Every rank of fd->comm has to call this routine: it performs a barrier
 * and a broadcast on that communicator.
 */
void ADIOI_GPFS_Flush(ADIO_File fd, int *error_code)
{
    int err=0;
    int sync_errno;
    int rank;
    static char myname[] = "ADIOI_GPFS_FLUSH";

    MPI_Comm_rank(fd->comm, &rank);

    /* the old logic about who is an fsync aggregator and who is not fell down
     * when deferred open was enabled.  Instead, make this look more like
     * ad_pvfs2_flush.  If one day the I/O aggregators have something they need
     * to flush, we can consult the 'fd->hints->ranklist[]' array.  For now, a
     * flush from one process should suffice */

    /* ensure all other processes are done writing. On many platforms MPI_Reduce
     * is fastest because it has the lightest constraints. On Blue Gene, BARRIER
     * is optimized */
    MPI_Barrier(fd->comm);

    if (rank == fd->hints->ranklist[0]) {
        err = fsync(fd->fd_sys);
        /* Grab errno right away: the debug output below may modify it
         * before we get a chance to look at it. */
        sync_errno = errno;
        DBG_FPRINTF(stderr,"aggregation:fsync %s, err=%#X, errno=%#X\n",fd->filename, err, sync_errno);
        /* We want errno, not the return code if it failed */
        if (err == -1) err = sync_errno;
        else err = 0;
    }

    /* err is an int, so it is broadcast as MPI_INT (the datatype has to
     * match the type of the buffer). */
    MPI_Bcast(&err, 1, MPI_INT, fd->hints->ranklist[0], fd->comm);
    DBGV_FPRINTF(stderr,"aggregation result:fsync %s, errno %#X,\n",fd->filename, err);

    /* --BEGIN ERROR HANDLING-- */
    if (err) /* if it's non-zero, it must be an errno */
    {
        /* keep the errno value in a local so that later calls cannot
         * change what gets reported */
        sync_errno = err;
        errno = sync_errno;
        err = -1;

        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**io",
                                           "**io %s", strerror(sync_errno));
        DBGT_FPRINTF(stderr,"fsync %s, err=%#X, errno=%#X\n",fd->filename, err, sync_errno);
        return;
    }
    /* --END ERROR HANDLING-- */

    *error_code = MPI_SUCCESS;
}
|
||||
|
288
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_hints.c
Обычный файл
288
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_hints.c
Обычный файл
@ -0,0 +1,288 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_gpfs_hints.c
|
||||
* \brief GPFS hint processing - for now, only used for BlueGene and PE platforms
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_extern.h"
|
||||
#include "hint_fns.h"
|
||||
|
||||
#include "ad_gpfs.h"
|
||||
|
||||
#define ADIOI_GPFS_CB_BUFFER_SIZE_DFLT "16777216"
|
||||
#define ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT "4194304"
|
||||
#define ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT "4194304"
|
||||
|
||||
#ifdef BGQPLATFORM
|
||||
#define ADIOI_BG_NAGG_IN_PSET_HINT_NAME "bg_nodes_pset"
|
||||
#endif
|
||||
|
||||
/** \page mpiio_vars MPIIO Configuration
|
||||
*
|
||||
* GPFS MPIIO configuration and performance tuning. Used by ad_gpfs ADIO.
|
||||
*
|
||||
* Used for BlueGene and PE platforms, which each have their own aggregator selection
|
||||
* algorithms that ignore user provided cb_config_list.
|
||||
*
|
||||
* \section hint_sec Hints
|
||||
* - bg_nodes_pset - BlueGene only - specify how many aggregators to use per pset.
|
||||
* This hint will override the cb_nodes hint based on BlueGene psets.
|
||||
* - N - Use N nodes per pset as aggregators.
|
||||
* - Default is based on partition configuration and cb_nodes.
|
||||
*
|
||||
* The following default key/value pairs may differ from other platform defaults.
|
||||
*
|
||||
* - key = cb_buffer_size value = 16777216
|
||||
* - key = romio_cb_read value = enable
|
||||
* - key = romio_cb_write value = enable
|
||||
* - key = ind_rd_buffer_size value = 4194304
|
||||
* - key = ind_wr_buffer_size value = 4194304
|
||||
*/
|
||||
|
||||
#ifdef BGQPLATFORM
|
||||
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
|
||||
extern int
|
||||
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
|
||||
#elif PEPLATFORM
|
||||
extern int
|
||||
ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
|
||||
#endif
|
||||
|
||||
/*
 * ADIOI_GPFS_SetInfo - install default and user-supplied hints for a file.
 *
 * fd          file handle; fd->info is created when it is MPI_INFO_NULL,
 *             and fd->info / fd->hints are updated.
 * users_info  hints supplied by the user, or MPI_INFO_NULL.
 * error_code  also handed to the ADIOI_Info_check_and_install_*() helpers;
 *             unconditionally set to MPI_SUCCESS before returning.
 */
void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
   Initialize fd->info to default values.
   Initialize fd->hints to default values.
   Examine the info object passed by the user. If it contains values that
   ROMIO understands, override the default. */

    MPI_Info info;
    char *value;
    int flag, intval, nprocs=0;
    static char myname[] = "ADIOI_GPFS_SETINFO";

    /* set when defaults were installed or a platform hint changed, i.e.
     * when the aggregator rank list has to be (re)generated below */
    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here.
     */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Assert ((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized
     */
    if (!fd->hints->initialized) {

        ad_gpfs_get_env_vars();
        did_anything = 1;

        /* buffer size for collective I/O */
        ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
        fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);

        /* default for this driver: collective buffering is enabled for
         * both reads and writes
         */
        ADIOI_Info_set(info, "romio_cb_read", "enable");
        fd->hints->cb_read = ADIOI_HINT_ENABLE;
        ADIOI_Info_set(info, "romio_cb_write", "enable");
        fd->hints->cb_write = ADIOI_HINT_ENABLE;

        if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
        fd->hints->cb_config_list = NULL;

        /* number of processes that perform I/O in collective I/O */
        MPI_Comm_size(fd->comm, &nprocs);
        ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
        ADIOI_Info_set(info, "cb_nodes", value);
        fd->hints->cb_nodes = -1;

        /* hint indicating that no indep. I/O will be performed on this file */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;

        /* gpfs is not implementing file realms (ADIOI_IOStridedColl),
           so initialize this hint to disabled. */
        /* hint instructing the use of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_pfr", "disable");
        fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

        /* hint guiding the assignment of persistent file realms */
        ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
        fd->hints->cb_fr_type = ADIOI_FR_AAR;

        /* hint to align file realms with a certain byte value */
        ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
        fd->hints->cb_fr_alignment = 1;

        /* hint to set a threshold percentage for a datatype's size/extent at
         * which data sieving should be done in collective I/O */
        ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
        fd->hints->cb_ds_threshold = 0;

        /* hint to switch between point-to-point or all-to-all for two-phase */
        ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
        fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

        /* deferred_open derived from no_indep_rw and cb_{read,write} */
        fd->hints->deferred_open = 0;

        /* buffer size for data sieving in independent reads */
        ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
        fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);

        /* buffer size for data sieving in independent writes */
        ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
        fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);


        ADIOI_Info_set(info, "romio_ds_read", "automatic");
        fd->hints->ds_read = ADIOI_HINT_AUTO;
        ADIOI_Info_set(info, "romio_ds_write", "automatic");
        fd->hints->ds_write = ADIOI_HINT_AUTO;

        /* still to do: tune this a bit for a variety of file systems. there's
         * no good default value so just leave it unset */
        fd->hints->min_fdomain_size = 0;
        fd->hints->striping_unit = 0;

        fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
        ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
                &(fd->hints->cb_buffer_size), myname, error_code);
        /* new hints for enabling/disabling coll. buffering on
         * reads/writes
         */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
                &(fd->hints->cb_read), myname, error_code);
        if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
            /* romio_cb_read overrides no_indep_rw */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
        }
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
                &(fd->hints->cb_write), myname, error_code);
        if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
            /* romio_cb_write overrides no_indep_rw */
            ADIOI_Info_set(info, "romio_no_indep_rw", "false");
            fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
        }
        /* Has the user indicated all I/O will be done collectively? */
        ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
                &(fd->hints->no_indep_rw), myname, error_code);
        if (fd->hints->no_indep_rw == 1) {
            /* if 'no_indep_rw' set, also hint that we will do
             * collective buffering: if we aren't doing independent io,
             * then we have to do collective  */
            ADIOI_Info_set(info, "romio_cb_write", "enable");
            ADIOI_Info_set(info, "romio_cb_read", "enable");
            fd->hints->cb_read = 1;
            fd->hints->cb_write = 1;
        }

        /* new hints for enabling/disabling data sieving on
         * reads/writes
         */
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
                &(fd->hints->ds_read), myname, error_code);
        ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
                &(fd->hints->ds_write), myname, error_code);

        ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
                &(fd->hints->ind_wr_buffer_size), myname, error_code);
        ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
                &(fd->hints->ind_rd_buffer_size), myname, error_code);

        /* only strictly positive values of romio_min_fdomain_size are
         * accepted */
        memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
                value, &flag);
        if ( flag && ((intval = atoi(value)) > 0) ) {
            ADIOI_Info_set(info, "romio_min_fdomain_size", value);
            fd->hints->min_fdomain_size = intval;
        }
        /* Now we use striping unit in common code so we should
           process hints for it. */
        ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
                &(fd->hints->striping_unit), myname, error_code);

#ifdef BGQPLATFORM
        /* a strictly positive value of this hint replaces cb_nodes and
         * triggers regeneration of the aggregator rank list below */
        memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
                value, &flag);
        if (flag && ((intval = atoi(value)) > 0)) {

            did_anything = 1;
            ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
            fd->hints->cb_nodes = intval;
        }
#endif
    }

    /* special CB aggregator assignment */
    if (did_anything) {
#ifdef BGQPLATFORM
        ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
        ADIOI_PE_gen_agg_ranklist(fd);
#endif
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collective buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE)
                && (fd->hints->cb_write != ADIOI_HINT_DISABLE)
                && fd->hints->no_indep_rw ) ) {
        fd->hints->deferred_open = 1;
    } else {
        /* setting romio_no_indep_rw enable and romio_cb_{read,write}
         * disable at the same time doesn't make sense. honor
         * romio_cb_{read,write} and force the no_indep_rw hint to
         * 'disable' */
        ADIOI_Info_set(info, "romio_no_indep_rw", "false");
        fd->hints->no_indep_rw = 0;
        fd->hints->deferred_open = 0;
    }

    /* BobC commented this out, but since hint processing runs on both bg and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
        /* disable data sieving for fs that do not
           support file locking */
        ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
                value, &flag);
        if (flag) {
            /* get rid of this value if it is set */
            ADIOI_Info_delete(info, "ind_wr_buffer_size");
        }
        /* note: leave ind_wr_buffer_size alone; used for other cases
         * as well. -- Rob Ross, 04/22/2003
         */
        ADIOI_Info_set(info, "romio_ds_write", "disable");
        fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}
|
156
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_open.c
Обычный файл
156
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_open.c
Обычный файл
@ -0,0 +1,156 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_gpfs_open.c
|
||||
* \brief Open routine for the GPFS ADIO driver
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
#include "ad_gpfs.h"
|
||||
#include "ad_gpfs_tuning.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
|
||||
#ifdef HAVE_GPFS_H
|
||||
#include <gpfs.h>
|
||||
#endif
|
||||
#ifdef HAVE_GPFS_FCNTL_H
|
||||
#include <gpfs_fcntl.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_GPFS_FCNTL_H
|
||||
static void gpfs_free_all_locks(int fd)
|
||||
{
|
||||
int rc;
|
||||
struct {
|
||||
gpfsFcntlHeader_t header;
|
||||
gpfsFreeRange_t release;
|
||||
} release_all;
|
||||
|
||||
release_all.header.totalLength = sizeof(release_all);
|
||||
release_all.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
|
||||
release_all.header.fcntlReserved = 0;
|
||||
|
||||
release_all.release.structLen = sizeof(release_all.release);
|
||||
release_all.release.structType = GPFS_FREE_RANGE;
|
||||
release_all.release.start = 0;
|
||||
release_all.release.length = 0;
|
||||
|
||||
rc = gpfs_fcntl(fd, &release_all);
|
||||
if (rc != 0) {
|
||||
DBGV_FPRINTF(stderr,"GPFS fcntl release failed with rc=%d, errno=%d\n",
|
||||
rc,errno);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
 * ADIOI_GPFS_Open - GPFS-specific open of fd->filename via POSIX open().
 *
 * \param fd          ADIO file handle.  filename, access_mode, perm, comm and
 *                    hints are read; fd_sys, fd_direct, null_fd and (on a
 *                    successful open) blksize are written, plus fp_ind and
 *                    fp_sys_posn when ADIO_APPEND is requested.
 * \param error_code  MPI_SUCCESS when open() succeeded; otherwise the code
 *                    produced by ADIOI_Err_create_code() from the errno of
 *                    the failed open().
 */
void ADIOI_GPFS_Open(ADIO_File fd, int *error_code)
{
    int perm, old_mask, amode, rank, rc;
    int open_errno;
    static char myname[] = "ADIOI_GPFS_OPEN";

    /* set internal variables for tuning environment variables */
    ad_gpfs_get_env_vars();

    if (fd->perm == ADIO_PERM_NULL) {
        /* umask() can only be queried by setting it, so set and restore */
        old_mask = umask(022);
        umask(old_mask);
        /* Default to rw for everyone minus the bits the umask removes.
         * Masking (rather than XOR-ing) keeps a umask such as 027 or 077
         * from turning on execute bits in the requested mode. */
        perm = 0666 & ~old_mask;
    }
    else perm = fd->perm;

    /* translate ADIO access-mode flags into open(2) flags */
    amode = 0;
    if (fd->access_mode & ADIO_CREATE)
        amode = amode | O_CREAT;
    if (fd->access_mode & ADIO_RDONLY)
        amode = amode | O_RDONLY;
    if (fd->access_mode & ADIO_WRONLY)
        amode = amode | O_WRONLY;
    if (fd->access_mode & ADIO_RDWR)
        amode = amode | O_RDWR;
    if (fd->access_mode & ADIO_EXCL)
        amode = amode | O_EXCL;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    fd->fd_sys = open(fd->filename, amode, perm);
    /* Save errno immediately: the logging, debug output and the /dev/null
     * open below may all overwrite it before the error code is built. */
    open_errno = errno;
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif
    DBG_FPRINTF(stderr,"open('%s',%#X,%#X) rc=%d, errno=%d\n",fd->filename,amode,perm,fd->fd_sys,open_errno);
    fd->fd_direct = -1;

    /* GPFSMPIO_DEVNULLIO: keep a /dev/null descriptor around, else mark unused */
    if (gpfsmpio_devnullio == 1) {
        fd->null_fd = open("/dev/null", O_RDWR);
    } else {
        fd->null_fd = -1;
    }

    if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);

    if(fd->fd_sys != -1)
    {

        fd->blksize = 1048576; /* default to 1M */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_a, 0, NULL);
#endif
        /* in this fs-specific routine, we might not be called over entire
         * communicator (deferred open).  Collect statistics on one process.
         * ADIOI_GEN_Opencoll (common-code caller) will take care of the
         * broadcast */

        MPI_Comm_rank(fd->comm, &rank);
        if ((rank == fd->hints->ranklist[0]) || (fd->comm == MPI_COMM_SELF)) {
            struct stat64 gpfs_statbuf;
            /* Get the (real) underlying file system block size */
            rc = stat64(fd->filename, &gpfs_statbuf);
            if (rc >= 0)
            {
                fd->blksize = gpfs_statbuf.st_blksize;
                DBGV_FPRINTF(stderr,"Successful stat '%s'.  Blocksize=%ld\n",
                             fd->filename,gpfs_statbuf.st_blksize);
            }
            else
            {
                DBGV_FPRINTF(stderr,"Stat '%s' failed with rc=%d, errno=%d\n",
                             fd->filename,rc,errno);
            }
        }
        /* all other ranks have incorrect fd->blocksize, but ADIOI_GEN_Opencoll
         * will take care of that in both standard and deferred-open case */

#ifdef ADIOI_MPE_LOGGING
        MPE_Log_event(ADIOI_MPE_stat_b, 0, NULL);
#endif

#ifdef HAVE_GPFS_FCNTL_H
        /* in parallel workload, might be helpful to immediately release block
         * tokens.  Or, system call overhead will outweigh any benefits... */
        if (getenv("ROMIO_GPFS_FREE_LOCKS")!=NULL)
            gpfs_free_all_locks(fd->fd_sys);

#endif
    }

    if (fd->fd_sys == -1) {
        /* report the failure of the primary open(), not of any later call */
        *error_code = ADIOI_Err_create_code(myname, fd->filename, open_errno);
    }
    else *error_code = MPI_SUCCESS;
}
|
||||
/*
|
||||
*vim: ts=8 sts=4 sw=4 noexpandtab
|
||||
*/
|
@ -2,7 +2,7 @@
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_rdcoll.c
|
||||
* \file ad_gpfs_rdcoll.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
@ -15,9 +15,8 @@
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_extern.h"
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_pset.h"
|
||||
#include "ad_bgl_aggrs.h"
|
||||
#include "ad_gpfs.h"
|
||||
#include "ad_gpfs_aggrs.h"
|
||||
|
||||
#ifdef PROFILE
|
||||
#include "mpe.h"
|
||||
@ -87,7 +86,9 @@ extern void ADIOI_Calc_my_off_len(ADIO_File fd, int bufcount, MPI_Datatype
|
||||
ADIO_Offset *end_offset_ptr, int
|
||||
*contig_access_count_ptr);
|
||||
|
||||
void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
|
||||
|
||||
void ADIOI_GPFS_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code)
|
||||
@ -112,20 +113,19 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
ADIO_Offset start_offset, end_offset, orig_fp, fd_size, min_st_offset, off;
|
||||
ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
|
||||
*fd_end = NULL, *end_offsets = NULL;
|
||||
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
|
||||
ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
|
||||
int ii;
|
||||
ADIO_Offset *len_list = NULL;
|
||||
int *buf_idx = NULL;
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_RESET( 0, r )
|
||||
#endif
|
||||
|
||||
GPFSMPIO_T_CIO_RESET( r);
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
int bufsize, size;
|
||||
MPI_Count bufsize, size;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
/* From common code - not implemented for bgl. */
|
||||
/* From common code - not implemented for bg. */
|
||||
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
|
||||
ADIOI_IOStridedColl (fd, buf, count, ADIOI_READ, datatype,
|
||||
file_ptr_type, offset, status, error_code);
|
||||
@ -143,9 +143,8 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
nprocs_for_coll = fd->hints->cb_nodes;
|
||||
orig_fp = fd->fp_ind;
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 0, BGLMPIO_CIO_LCOMP, BGLMPIO_CIO_LAST )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST)
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST )
|
||||
|
||||
/* only check for interleaving if cb_read isn't disabled */
|
||||
if (fd->hints->cb_read != ADIOI_HINT_DISABLE) {
|
||||
@ -157,11 +156,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
|
||||
ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
|
||||
&offset_list, &len_list, &start_offset,
|
||||
&end_offset, &contig_access_count);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
|
||||
#endif
|
||||
&end_offset, &contig_access_count);
|
||||
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP )
|
||||
|
||||
#ifdef RDCOLL_DEBUG
|
||||
for (i=0; i<contig_access_count; i++) {
|
||||
@ -177,24 +174,24 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
|
||||
end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
|
||||
|
||||
if (bglmpio_tunegather) {
|
||||
bgl_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
bgl_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
if (gpfsmpio_tunegather) {
|
||||
gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
gpfs_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
for (ii=0; ii<nprocs; ii++) {
|
||||
bgl_offsets0[ii*2] = 0;
|
||||
bgl_offsets0[ii*2+1] = 0;
|
||||
gpfs_offsets0[ii*2] = 0;
|
||||
gpfs_offsets0[ii*2+1] = 0;
|
||||
}
|
||||
bgl_offsets0[myrank*2] = start_offset;
|
||||
bgl_offsets0[myrank*2+1] = end_offset;
|
||||
gpfs_offsets0[myrank*2] = start_offset;
|
||||
gpfs_offsets0[myrank*2+1] = end_offset;
|
||||
|
||||
MPI_Allreduce( bgl_offsets0, bgl_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
|
||||
MPI_Allreduce( gpfs_offsets0, gpfs_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
|
||||
|
||||
for (ii=0; ii<nprocs; ii++) {
|
||||
st_offsets [ii] = bgl_offsets[ii*2] ;
|
||||
end_offsets[ii] = bgl_offsets[ii*2+1];
|
||||
st_offsets [ii] = gpfs_offsets[ii*2] ;
|
||||
end_offsets[ii] = gpfs_offsets[ii*2+1];
|
||||
}
|
||||
ADIOI_Free( bgl_offsets0 );
|
||||
ADIOI_Free( bgl_offsets );
|
||||
ADIOI_Free( gpfs_offsets0 );
|
||||
ADIOI_Free( gpfs_offsets );
|
||||
} else {
|
||||
MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
|
||||
ADIO_OFFSET, fd->comm);
|
||||
@ -202,9 +199,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
ADIO_OFFSET, fd->comm);
|
||||
}
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_PATANA, BGLMPIO_CIO_GATHER )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER )
|
||||
|
||||
/* are the accesses of different processes interleaved? */
|
||||
for (i=1; i<nprocs; i++)
|
||||
@ -246,9 +241,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
return;
|
||||
}
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_FD_PART, BGLMPIO_CIO_PATANA )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA )
|
||||
|
||||
/* We're going to perform aggregation of I/O. Here we call
|
||||
* ADIOI_Calc_file_domains() to determine what processes will handle I/O
|
||||
@ -266,8 +259,8 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
* needs to be mapped to an actual rank in the communicator later.
|
||||
*
|
||||
*/
|
||||
if (bglmpio_tuneblocking)
|
||||
ADIOI_BGL_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
|
||||
if (gpfsmpio_tuneblocking)
|
||||
ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
|
||||
nprocs_for_coll, &min_st_offset,
|
||||
&fd_start, &fd_end, &fd_size, fd->fs_ptr);
|
||||
else
|
||||
@ -277,9 +270,39 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
fd->hints->min_fdomain_size, &fd_size,
|
||||
fd->hints->striping_unit);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
|
||||
if (gpfsmpio_p2pcontig==1) {
|
||||
/* For some simple yet common(?) workloads, full-on two-phase I/O is
|
||||
* overkill. We can establish sub-groups of processes and their
|
||||
* aggregator, and then these sub-groups will carry out a simplified
|
||||
* two-phase over that sub-group.
|
||||
*
|
||||
* First verify that the filetype is contig and the offsets are
|
||||
* increasing in rank order*/
|
||||
int x, inOrderAndNoGaps = 1;
|
||||
for (x=0;x<(nprocs-1);x++) {
|
||||
if (end_offsets[x] != (st_offsets[x+1]-1))
|
||||
inOrderAndNoGaps = 0;
|
||||
}
|
||||
if (inOrderAndNoGaps && buftype_is_contig) {
|
||||
/* if these conditions exist then execute the P2PContig code else
|
||||
* execute the original code */
|
||||
ADIOI_P2PContigReadAggregation(fd, buf,
|
||||
error_code, st_offsets, end_offsets, fd_start, fd_end);
|
||||
|
||||
/* NOTE: we are skipping the rest of two-phase in this path */
|
||||
GPFSMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
|
||||
|
||||
ADIOI_Free(offset_list);
|
||||
ADIOI_Free(len_list);
|
||||
ADIOI_Free(st_offsets);
|
||||
ADIOI_Free(end_offsets);
|
||||
ADIOI_Free(fd_start);
|
||||
ADIOI_Free(fd_end);
|
||||
goto fn_exit;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/* calculate where the portions of the access requests of this process
|
||||
* are located in terms of the file domains. this could be on the same
|
||||
@ -293,8 +316,8 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
* buf_idx[] - array of locations into which data can be directly moved;
|
||||
* this is only valid for contiguous buffer case
|
||||
*/
|
||||
if (bglmpio_tuneblocking)
|
||||
ADIOI_BGL_Calc_my_req(fd, offset_list, len_list, contig_access_count,
|
||||
if (gpfsmpio_tuneblocking)
|
||||
ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
|
||||
min_st_offset, fd_start, fd_end, fd_size,
|
||||
nprocs, &count_my_req_procs,
|
||||
&count_my_req_per_proc, &my_req,
|
||||
@ -306,9 +329,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
&count_my_req_per_proc, &my_req,
|
||||
&buf_idx);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_OTHREQ, BGLMPIO_CIO_MYREQ )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ )
|
||||
|
||||
/* perform a collective communication in order to distribute the
|
||||
* data calculated above. fills in the following:
|
||||
@ -317,11 +338,11 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
* count_others_req_per_proc[] - number of separate contiguous
|
||||
* requests from proc i lie in this process's file domain.
|
||||
*/
|
||||
if (bglmpio_tuneblocking)
|
||||
ADIOI_BGL_Calc_others_req(fd, count_my_req_procs,
|
||||
count_my_req_per_proc, my_req,
|
||||
nprocs, myrank, &count_others_req_procs,
|
||||
&others_req);
|
||||
if (gpfsmpio_tuneblocking)
|
||||
ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
|
||||
count_my_req_per_proc, my_req,
|
||||
nprocs, myrank, &count_others_req_procs,
|
||||
&others_req);
|
||||
|
||||
else
|
||||
ADIOI_Calc_others_req(fd, count_my_req_procs,
|
||||
@ -329,9 +350,7 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
nprocs, myrank, &count_others_req_procs,
|
||||
&others_req);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 1, 1, BGLMPIO_CIO_DEXCH, BGLMPIO_CIO_OTHREQ )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ )
|
||||
|
||||
/* my_req[] and count_my_req_per_proc aren't needed at this point, so
|
||||
* let's free the memory
|
||||
@ -354,12 +373,10 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
len_list, contig_access_count, min_st_offset,
|
||||
fd_size, fd_start, fd_end, buf_idx, error_code);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 1, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_DEXCH )
|
||||
BGLMPIO_T_CIO_SET_GET( 0, r, 0, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_MPIO_CRW )
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH )
|
||||
GPFSMPIO_T_CIO_SET_GET( r, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW )
|
||||
|
||||
BGLMPIO_T_CIO_REPORT( 0, r, fd, myrank )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_REPORT( 0, fd, myrank, nprocs)
|
||||
|
||||
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
|
||||
|
||||
@ -381,8 +398,9 @@ void ADIOI_BGL_ReadStridedColl(ADIO_File fd, void *buf, int count,
|
||||
ADIOI_Free(fd_start);
|
||||
ADIOI_Free(fd_end);
|
||||
|
||||
fn_exit:
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPI_Type_size(datatype, &size);
|
||||
MPI_Type_size_x(datatype, &size);
|
||||
bufsize = size * count;
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
/* This is a temporary way of filling in status. The right way is to
|
||||
@ -470,7 +488,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
|
||||
MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm);
|
||||
|
||||
if (ntimes) read_buf = (char *) ADIOI_Malloc(coll_bufsize);
|
||||
read_buf = fd->io_buf;
|
||||
|
||||
curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
|
||||
/* its use is explained below. calloc initializes to 0. */
|
||||
@ -627,6 +645,9 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
MPE_Log_event(14, 0, "end computation");
|
||||
#endif
|
||||
if (flag) {
|
||||
char round[50];
|
||||
sprintf(round, "two-phase-round=%d", m);
|
||||
setenv("LIBIOLOG_EXTRA_INFO", round, 1);
|
||||
ADIOI_Assert(size == (int)size);
|
||||
ADIO_ReadContig(fd, read_buf+for_curr_iter, (int)size, MPI_BYTE,
|
||||
ADIO_EXPLICIT_OFFSET, off, &status, error_code);
|
||||
@ -644,17 +665,17 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
#ifdef PROFILE
|
||||
MPE_Log_event(7, 0, "start communication");
|
||||
#endif
|
||||
if (bglmpio_comm == 1)
|
||||
if (gpfsmpio_comm == 1)
|
||||
ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
|
||||
send_size, recv_size, count,
|
||||
start_pos, partial_send, recd_from_proc, nprocs,
|
||||
myrank,
|
||||
buftype_is_contig, contig_access_count,
|
||||
min_st_offset, fd_size, fd_start, fd_end,
|
||||
others_req,
|
||||
m, buftype_extent, buf_idx);
|
||||
else
|
||||
if (bglmpio_comm == 0) {
|
||||
others_req,
|
||||
m, buftype_extent, buf_idx);
|
||||
else
|
||||
if (gpfsmpio_comm == 0) {
|
||||
ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
|
||||
send_size, recv_size, count,
|
||||
start_pos, partial_send, recd_from_proc, nprocs,
|
||||
@ -675,9 +696,10 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
ADIOI_Assert((((ADIO_Offset)(MPIR_Upint)read_buf)+real_size-for_next_iter) == (ADIO_Offset)(MPIR_Upint)(read_buf+real_size-for_next_iter));
|
||||
ADIOI_Assert((for_next_iter+coll_bufsize) == (size_t)(for_next_iter+coll_bufsize));
|
||||
memcpy(tmp_buf, read_buf+real_size-for_next_iter, for_next_iter);
|
||||
ADIOI_Free(read_buf);
|
||||
read_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
|
||||
memcpy(read_buf, tmp_buf, for_next_iter);
|
||||
ADIOI_Free(fd->io_buf);
|
||||
fd->io_buf = (char *) ADIOI_Malloc(for_next_iter+coll_bufsize);
|
||||
memcpy(fd->io_buf, tmp_buf, for_next_iter);
|
||||
read_buf = fd->io_buf;
|
||||
ADIOI_Free(tmp_buf);
|
||||
}
|
||||
|
||||
@ -692,7 +714,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
for (m=ntimes; m<max_ntimes; m++)
|
||||
/* nothing to send, but check for recv. */
|
||||
|
||||
if (bglmpio_comm == 1)
|
||||
if (gpfsmpio_comm == 1)
|
||||
ADIOI_R_Exchange_data(fd, buf, flat_buf, offset_list, len_list,
|
||||
send_size, recv_size, count,
|
||||
start_pos, partial_send, recd_from_proc, nprocs,
|
||||
@ -702,7 +724,7 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
others_req, m,
|
||||
buftype_extent, buf_idx);
|
||||
else /* strncmp( env_switch, "alltoall", 8 ) == 0 */
|
||||
if (bglmpio_comm == 0)
|
||||
if (gpfsmpio_comm == 0)
|
||||
ADIOI_R_Exchange_data_alltoallv(fd, buf, flat_buf, offset_list, len_list,
|
||||
send_size, recv_size, count,
|
||||
start_pos, partial_send, recd_from_proc, nprocs,
|
||||
@ -716,7 +738,6 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
MPE_Log_event(8, 0, "end communication");
|
||||
#endif
|
||||
|
||||
if (ntimes) ADIOI_Free(read_buf);
|
||||
ADIOI_Free(curr_offlen_ptr);
|
||||
ADIOI_Free(count);
|
||||
ADIOI_Free(partial_send);
|
||||
@ -724,6 +745,8 @@ static void ADIOI_Read_and_exch(ADIO_File fd, void *buf, MPI_Datatype
|
||||
ADIOI_Free(recv_size);
|
||||
ADIOI_Free(recd_from_proc);
|
||||
ADIOI_Free(start_pos);
|
||||
|
||||
unsetenv("LIBIOLOG_EXTRA_INFO");
|
||||
}
|
||||
|
||||
static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
@ -807,8 +830,8 @@ static void ADIOI_R_Exchange_data(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
tmp = others_req[i].lens[k];
|
||||
others_req[i].lens[k] = partial_send[i];
|
||||
}
|
||||
MPI_Type_hindexed(count[i],
|
||||
&(others_req[i].lens[start_pos[i]]),
|
||||
ADIOI_Type_create_hindexed_x(count[i],
|
||||
&(others_req[i].lens[start_pos[i]]),
|
||||
&(others_req[i].mem_ptrs[start_pos[i]]),
|
||||
MPI_BYTE, &send_type);
|
||||
/* absolute displacement; use MPI_BOTTOM in send */
|
||||
@ -968,7 +991,7 @@ static void ADIOI_Fill_user_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
* longer than the single region that processor "p" is responsible
|
||||
* for.
|
||||
*/
|
||||
p = ADIOI_BGL_Calc_aggregator(fd,
|
||||
p = ADIOI_GPFS_Calc_aggregator(fd,
|
||||
off,
|
||||
min_st_offset,
|
||||
&len,
|
||||
@ -1101,7 +1124,8 @@ static void ADIOI_R_Exchange_data_alltoallv(
|
||||
DBG_FPRINTF(stderr, "\ttails = %4d, %4d\n", stail, rtail );
|
||||
if (nprocs_send) {
|
||||
DBG_FPRINTF(stderr, "\tall_send_buf = [%d]%2d,",0,all_send_buf[0]);
|
||||
for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf [i*131072] ); }
|
||||
/* someone at some point found it useful to look at the 128th kilobyte of data from each processor, but this segfaults in many situations if "all debugging" enabled */
|
||||
//for (i=1; i<nprocs; i++) if(all_send_buf[(i-1)*131072]!=all_send_buf[i*131072]){ DBG_FPRINTF(stderr, "\t\t[%d]%2d,", i, all_send_buf [i*131072] ); }
|
||||
}
|
||||
#endif
|
||||
|
277
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_tuning.c
Обычный файл
277
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_tuning.c
Обычный файл
@ -0,0 +1,277 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_gpfs_tuning.c
|
||||
* \brief Defines ad_gpfs performance tuning
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 2008 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
/*---------------------------------------------------------------------
|
||||
* ad_gpfs_tuning.c
|
||||
*
|
||||
* defines global variables and functions for performance tuning and
|
||||
* functional debugging.
|
||||
*---------------------------------------------------------------------*/
|
||||
|
||||
#include "ad_gpfs_tuning.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#if !defined(PVFS2_SUPER_MAGIC)
|
||||
#define PVFS2_SUPER_MAGIC (0x20030528)
|
||||
#endif
|
||||
|
||||
|
||||
int gpfsmpio_timing;
|
||||
int gpfsmpio_timing2;
|
||||
int gpfsmpio_timing_cw_level;
|
||||
int gpfsmpio_comm;
|
||||
int gpfsmpio_tunegather;
|
||||
int gpfsmpio_tuneblocking;
|
||||
long bglocklessmpio_f_type;
|
||||
int gpfsmpio_bg_nagg_pset;
|
||||
int gpfsmpio_pthreadio;
|
||||
int gpfsmpio_p2pcontig;
|
||||
int gpfsmpio_balancecontig;
|
||||
int gpfsmpio_devnullio;
|
||||
int gpfsmpio_bridgeringagg;
|
||||
|
||||
double gpfsmpio_prof_cw [GPFSMPIO_CIO_LAST+1];
|
||||
double gpfsmpio_prof_cr [GPFSMPIO_CIO_LAST+1];
|
||||
|
||||
/* set internal variables for tuning environment variables */
|
||||
/** \page mpiio_vars MPIIO Configuration
|
||||
\section env_sec Environment Variables
|
||||
* - GPFSMPIO_COMM - Define how data is exchanged on collective
|
||||
* reads and writes. Possible values:
|
||||
* - 0 - Use MPI_Alltoallv.
|
||||
* - 1 - Use MPI_Isend/MPI_Irecv.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
|
||||
* Possible values:
|
||||
* - 0 - Do not collect/report timing.
|
||||
* - 1 - Collect/report timing.
|
||||
* - Default is 0.
|
||||
*
|
||||
* - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
|
||||
* for aggregator collective i/o. Possible values:
|
||||
* - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
|
||||
* - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
|
||||
* - Default is 1.
|
||||
*
|
||||
* - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
|
||||
* calculated (block size). Possible values:
|
||||
* - 0 - Evenly calculate file domains across aggregators. Also use
|
||||
* MPI_Isend/MPI_Irecv to exchange domain information.
|
||||
* - 1 - Align file domains with the underlying file system's block size. Also use
|
||||
* MPI_Alltoallv to exchange domain information.
|
||||
* - Default is 1.
|
||||
*
|
||||
* - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
|
||||
* the ad_bglockless driver. NOTE: Using romio prefixes (such as
|
||||
* "bg:" or "bglockless:") on a file name will override this environment
|
||||
* variable. Possible values:
|
||||
* - 0xnnnnnnnn - Any valid file system type (or "magic number") from
|
||||
* statfs() field f_type.
|
||||
* - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
|
||||
*
|
||||
* - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
|
||||
* compute group (compute nodes + i/o nodes). Possible values:
|
||||
* - any integer
|
||||
* - Default is 8
|
||||
*
|
||||
* - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous io where a
|
||||
* pthread is spawned to do the posix writes while the main thread does the
|
||||
* data aggregation - useful for large files where multiple rounds are
|
||||
* required (more than the cb_buffer_size of data per aggregator). User
|
||||
* must ensure there is hw resource available for the thread to run. I
|
||||
* am sure there is a better way to do this involving comm threads - this is
|
||||
* just a start. NOTE: For some reason the stats collected when this is
|
||||
* enabled misses some of the data so the data sizes are off a bit - this is
|
||||
* a statistical issue only, the data is still accurately written out
|
||||
*
|
||||
* - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
|
||||
* aggregator and the procs that feed it. Performance could be enhanced by a
|
||||
* one-sided put algorithm. Current implementation allows only 1 round of
|
||||
* data. Useful/allowed only when:
|
||||
* 1.) The datatype is contiguous.
|
||||
* 2.) The offsets are increasing in rank-order.
|
||||
* 3.) There are no gaps between the offsets.
|
||||
* 4.) No single rank has a data size which spans multiple file domains.
|
||||
*
|
||||
* - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ. File domain blocks are assigned
|
||||
* to aggregators in a breadth-first fashion relative to the ions - additionally,
|
||||
* file domains on the aggregators sharing the same bridgeset and ion have contiguous
|
||||
* offsets. The breadth-first assignment improves performance in the case of
|
||||
* a relatively small file of size less than the gpfs block size multiplied
|
||||
* by the number of ions. Files: ad_gpfs_aggrs.c ad_bg_aggrs.c. Possible Values
|
||||
* - 0 - assign file domain blocks in the traditional manner
|
||||
* - 1 - if there are variable sized file domain blocks, spread them out
|
||||
* (balance) across bridge nodes
|
||||
*
|
||||
* - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
|
||||
* system. When experimenting with different two-phase I/O strategies, it's
|
||||
* helpful to remove the highly variable file system from the experiment.
|
||||
* - 0 (disabled) or 1 (enabled)
|
||||
* - Default is 0
|
||||
*
|
||||
* - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ. Aggregator placement
|
||||
* optimization which forms a 5-d ring around the bridge node starting at
|
||||
* GPFSMPIO_BRIDGERINGAGG hops away. Experimental performance results
|
||||
* suggest best value is 1 and only in conjunction with GPFSMPIO_P2PCONTIG
|
||||
* and GPFSMPIO_BALANCECONTIG. The number of aggregators selected is still
|
||||
* GPFSMPIO_NAGG_PSET however the bridge node itself is NOT selected.
|
||||
*
|
||||
*/
|
||||
|
||||
void ad_gpfs_get_env_vars() {
|
||||
char *x, *dummy;
|
||||
|
||||
gpfsmpio_comm = 0;
|
||||
x = getenv( "GPFSMPIO_COMM" );
|
||||
if (x) gpfsmpio_comm = atoi(x);
|
||||
gpfsmpio_timing = 0;
|
||||
x = getenv( "GPFSMPIO_TIMING" );
|
||||
if (x) gpfsmpio_timing = atoi(x);
|
||||
gpfsmpio_tunegather = 1;
|
||||
x = getenv( "GPFSMPIO_TUNEGATHER" );
|
||||
if (x) gpfsmpio_tunegather = atoi(x);
|
||||
gpfsmpio_tuneblocking = 1;
|
||||
x = getenv( "GPFSMPIO_TUNEBLOCKING" );
|
||||
if (x) gpfsmpio_tuneblocking = atoi(x);
|
||||
bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
|
||||
x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
|
||||
if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
|
||||
DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
|
||||
bglocklessmpio_f_type,bglocklessmpio_f_type);
|
||||
/* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
|
||||
* when we know a bit more about what "largest possible value" and
|
||||
* "smallest possible value" should be */
|
||||
gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
|
||||
x = getenv("GPFSMPIO_NAGG_PSET");
|
||||
if (x) gpfsmpio_bg_nagg_pset = atoi(x);
|
||||
|
||||
gpfsmpio_pthreadio = 0;
|
||||
x = getenv( "GPFSMPIO_PTHREADIO" );
|
||||
if (x) gpfsmpio_pthreadio = atoi(x);
|
||||
|
||||
gpfsmpio_p2pcontig = 0;
|
||||
x = getenv( "GPFSMPIO_P2PCONTIG" );
|
||||
if (x) gpfsmpio_p2pcontig = atoi(x);
|
||||
|
||||
gpfsmpio_balancecontig = 0;
|
||||
x = getenv( "GPFSMPIO_BALANCECONTIG" );
|
||||
if (x) gpfsmpio_balancecontig = atoi(x);
|
||||
|
||||
gpfsmpio_devnullio = 0;
|
||||
x = getenv( "GPFSMPIO_DEVNULLIO" );
|
||||
if (x) gpfsmpio_devnullio = atoi(x);
|
||||
|
||||
gpfsmpio_bridgeringagg = 0;
|
||||
x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
|
||||
if (x) gpfsmpio_bridgeringagg = atoi(x);
|
||||
}
|
||||
|
||||
/* report timing breakdown for MPI I/O collective call */
|
||||
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
|
||||
{
|
||||
int i;
|
||||
|
||||
if (gpfsmpio_timing) {
|
||||
/* Timing across the whole communicator is a little bit interesting,
|
||||
* but what is *more* interesting is if we single out the aggregators
|
||||
* themselves. non-aggregators spend a lot of time in "exchange" not
|
||||
* exchanging data, but blocked because they are waiting for
|
||||
* aggregators to finish writing. If we focus on just the aggregator
|
||||
* processes we will get a more clear picture about the data exchange
|
||||
* vs. i/o time breakdown */
|
||||
|
||||
/* if deferred open enabled, we could use the aggregator communicator */
|
||||
MPI_Comm agg_comm;
|
||||
int nr_aggs, agg_rank;
|
||||
MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
|
||||
if(agg_comm != MPI_COMM_NULL) {
|
||||
MPI_Comm_size(agg_comm, &nr_aggs);
|
||||
MPI_Comm_rank(agg_comm, &agg_rank);
|
||||
}
|
||||
|
||||
double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
|
||||
if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;
|
||||
|
||||
double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
|
||||
double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];
|
||||
|
||||
if( agg_comm != MPI_COMM_NULL) {
|
||||
MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
|
||||
MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
|
||||
}
|
||||
if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {
|
||||
|
||||
for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;
|
||||
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] =
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ];
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] =
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW ];
|
||||
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];
|
||||
|
||||
fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
|
||||
fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
|
||||
fprintf(stderr,"SEEK-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ] );
|
||||
fprintf(stderr,"SEEK-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ] );
|
||||
fprintf(stderr,"LOCAL-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ] );
|
||||
fprintf(stderr,"GATHER-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ] );
|
||||
fprintf(stderr,"PATTERN-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ] );
|
||||
fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ] );
|
||||
fprintf(stderr,"MYREQ-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ] );
|
||||
fprintf(stderr,"OTHERREQ-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ] );
|
||||
fprintf(stderr,"EXCHANGE-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ] );
|
||||
fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] );
|
||||
fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP] );
|
||||
fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET] );
|
||||
fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT] );
|
||||
fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE] );
|
||||
fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ] );
|
||||
fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
|
||||
gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ] );
|
||||
fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ] );
|
||||
fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
|
||||
fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] );
|
||||
fprintf(stderr,"MPI-BW-avg: %10.3f , ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] );
|
||||
fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ",
|
||||
gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
|
||||
}
|
||||
if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
|
||||
}
|
||||
|
||||
}
|
114
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_tuning.h
Обычный файл
114
ompi/mca/io/romio/romio/adio/ad_gpfs/ad_gpfs_tuning.h
Обычный файл
@ -0,0 +1,114 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_gpfs_tuning.h
|
||||
* \brief Global variables and macros for performance tuning and functional debugging.
|
||||
*/
|
||||
|
||||
/*---------------------------------------------------------------------
|
||||
* ad_gpfs_tuning.h
|
||||
*
|
||||
* declares global variables and macros for performance tuning and
|
||||
* functional debugging.
|
||||
*---------------------------------------------------------------------*/
|
||||
|
||||
#ifndef AD_GPFS_TUNING_H_
|
||||
#define AD_GPFS_TUNING_H_
|
||||
|
||||
#include "adio.h"
|
||||
|
||||
|
||||
/*-----------------------------------------
|
||||
* Global variables for the control of
|
||||
* 1. timing
|
||||
* 2. select specific optimizations
|
||||
*-----------------------------------------*/
|
||||
|
||||
/* timing fields */
/* Indices into the gpfsmpio_prof_cw / gpfsmpio_prof_cr profiling arrays.
 * T_* slots hold times; B_* slots are filled in by
 * ad_gpfs_timing_crw_report() as (total data size) / (max of the matching
 * T_* slot). */
|
||||
enum {
|
||||
GPFSMPIO_CIO_DATA_SIZE=0, /* data size; summed over aggregators in the report */
|
||||
GPFSMPIO_CIO_T_SEEK, /* reported as SEEK-avg / SEEK-max */
|
||||
GPFSMPIO_CIO_T_LCOMP, /* time for ADIOI_Calc_my_off_len(), local */
|
||||
GPFSMPIO_CIO_T_GATHER, /* time for previous MPI_Allgather, now Allreduce */
|
||||
GPFSMPIO_CIO_T_PATANA, /* time for a quick test if access is contiguous or not, local */
|
||||
GPFSMPIO_CIO_T_FD_PART, /* time for file domain partitioning, local */
|
||||
GPFSMPIO_CIO_T_MYREQ, /* time for ADIOI_Calc_my_req(), local */
|
||||
GPFSMPIO_CIO_T_OTHREQ, /* time for ADIOI_Calc_others_req(), short Alltoall */
|
||||
GPFSMPIO_CIO_T_DEXCH, /* time for I/O data exchange */
|
||||
/* the next DEXCH_* timers capture finer-grained portions of T_DEXCH */
|
||||
GPFSMPIO_CIO_T_DEXCH_RECV_EXCH,/* time for each process to exchange receive
|
||||
size info with everyone else */
|
||||
GPFSMPIO_CIO_T_DEXCH_SETUP, /* time for setup portion of I/O data exchange */
|
||||
GPFSMPIO_CIO_T_DEXCH_NET, /* time for network portion of I/O data exchange */
|
||||
GPFSMPIO_CIO_T_DEXCH_SORT, /* time to sort requests in I/O data exchange */
|
||||
GPFSMPIO_CIO_T_DEXCH_SIEVE, /* time for read portion of RMW in two phase */
|
||||
GPFSMPIO_CIO_T_POSI_RW, /* reported as POSIX-TIME-avg / POSIX-TIME-max */
|
||||
GPFSMPIO_CIO_B_POSI_RW, /* derived: data size / max T_POSI_RW (POSIX-BW-avg) */
|
||||
GPFSMPIO_CIO_T_MPIO_RW, /* time for ADIOI_WriteContig() */
|
||||
GPFSMPIO_CIO_B_MPIO_RW, /* derived: data size / max T_MPIO_RW (MPI-BW-avg) */
|
||||
GPFSMPIO_CIO_T_MPIO_CRW, /* time for ADIOI_GPFS_WriteStridedColl() */
|
||||
GPFSMPIO_CIO_B_MPIO_CRW, /* derived: data size / max T_MPIO_CRW (MPI-BW-collective-avg) */
|
||||
GPFSMPIO_CIO_LAST /* slot count; also passed to GPFSMPIO_T_CIO_SET_GET as a "don't care" index */
|
||||
};
|
||||
|
||||
/* +1 because GPFSMPIO_CIO_LAST is actually used to say "zero this counter" */
|
||||
extern double gpfsmpio_prof_cw [GPFSMPIO_CIO_LAST+1];
|
||||
extern double gpfsmpio_prof_cr [GPFSMPIO_CIO_LAST+1];
|
||||
|
||||
|
||||
/* corresponds to environment variables to select optimizations and timing level */
|
||||
extern int gpfsmpio_timing;
|
||||
extern int gpfsmpio_timing_cw_level;
|
||||
extern int gpfsmpio_comm;
|
||||
extern int gpfsmpio_tunegather;
|
||||
extern int gpfsmpio_tuneblocking;
|
||||
extern long bglocklessmpio_f_type;
|
||||
extern int gpfsmpio_pthreadio;
|
||||
extern int gpfsmpio_p2pcontig;
|
||||
extern int gpfsmpio_balancecontig;
|
||||
extern int gpfsmpio_devnullio;
|
||||
extern int gpfsmpio_bridgeringagg;
|
||||
|
||||
/* Default is, well, kind of complicated. Blue Gene /L and /P had "psets": one
|
||||
* i/o node and all compute nodes wired to it. On Blue Gene /Q that
|
||||
* relationship is a lot more fluid. There are still I/O nodes, and compute
|
||||
* nodes are assigned to an i/o node, but there are two routes to the i/o node,
|
||||
* via compute nodes designated as "bridge nodes". In this code, what we used
|
||||
* to call a "pset" is actually "compute nodes associated with and including a
|
||||
* bridge node". So, "nAgg" is roughly "number of aggregators per bridge", but
|
||||
* look closely at ADIOI_BG_persInfo_init() for the details */
|
||||
|
||||
#define ADIOI_BG_NAGG_PSET_DFLT 16
|
||||
|
||||
extern int gpfsmpio_bg_nagg_pset;
|
||||
|
||||
|
||||
/* set internal variables for tuning environment variables */
|
||||
void ad_gpfs_get_env_vars(void);
|
||||
|
||||
/* report timing breakdown for MPI I/O collective call */
|
||||
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs );
|
||||
|
||||
/* note:
|
||||
* T := timing;
|
||||
* CIO := collective I/O
|
||||
*/
|
||||
#define GPFSMPIO_T_CIO_RESET( RW ) \
|
||||
{ \
|
||||
int _i; \
|
||||
for ( _i = 0; _i < GPFSMPIO_CIO_LAST; _i ++ ) \
|
||||
gpfsmpio_prof_c##RW [ _i ] = 0; \
|
||||
}
|
||||
|
||||
/* Print the collective-I/O timing breakdown by forwarding to
 * ad_gpfs_timing_crw_report().  RW is nonzero for the write profile and zero
 * for the read profile.  The expansion already ends with a semicolon.
 * The last line deliberately carries no line continuation, so the macro
 * definition ends here regardless of what follows it in the file. */
#define GPFSMPIO_T_CIO_REPORT( RW, FD, MYRANK, NPROCS ) \
|
||||
ad_gpfs_timing_crw_report ( RW, FD, MYRANK, NPROCS );
|
||||
|
||||
#define GPFSMPIO_T_CIO_SET_GET(RW, ISSET, ISGET, VAR1, VAR2 ) \
|
||||
{\
|
||||
double temp = MPI_Wtime(); \
|
||||
if ( ISSET ) gpfsmpio_prof_c##RW [ VAR1 ] = temp; \
|
||||
if ( ISGET ) gpfsmpio_prof_c##RW [ VAR2 ] = temp - gpfsmpio_prof_c##RW [ VAR2 ] ;\
|
||||
}
|
||||
|
||||
#endif /* AD_GPFS_TUNING_H_ */
|
@ -2,7 +2,7 @@
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bgl_wrcoll.c
|
||||
* \file ad_gpfs_wrcoll.c
|
||||
* \brief ???
|
||||
*/
|
||||
|
||||
@ -14,9 +14,12 @@
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_extern.h"
|
||||
#include "ad_bgl.h"
|
||||
#include "ad_bgl_pset.h"
|
||||
#include "ad_bgl_aggrs.h"
|
||||
#include "ad_gpfs.h"
|
||||
#include "ad_gpfs_aggrs.h"
|
||||
|
||||
#ifdef BGQPLATFORM
|
||||
#include <mpix.h>
|
||||
#endif
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
@ -25,6 +28,16 @@
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#ifdef HAVE_GPFS_H
|
||||
#include <gpfs.h>
|
||||
#endif
|
||||
#ifdef HAVE_GPFS_FCNTL_H
|
||||
#include <gpfs_fcntl.h>
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
/* prototypes of functions used for collective writes only. */
|
||||
static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
datatype, int nprocs, int myrank, ADIOI_Access
|
||||
@ -33,7 +46,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
min_st_offset, ADIO_Offset fd_size,
|
||||
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
|
||||
int *buf_idx, int *error_code);
|
||||
static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
|
||||
static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf,
|
||||
ADIOI_Flatlist_node *flat_buf, ADIO_Offset
|
||||
*offset_list, ADIO_Offset *len_list, int *send_size,
|
||||
int *recv_size, ADIO_Offset off, int size,
|
||||
@ -45,10 +58,10 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, void *buf, char *write_buf,
|
||||
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
|
||||
ADIOI_Access *others_req,
|
||||
int *send_buf_idx, int *curr_to_proc,
|
||||
int *done_to_proc, int *hole, int iter,
|
||||
int *done_to_proc, int *hole, int iter,
|
||||
MPI_Aint buftype_extent, int *buf_idx, int *error_code);
|
||||
static void ADIOI_W_Exchange_data_alltoallv(
|
||||
ADIO_File fd, void *buf,
|
||||
ADIO_File fd, const void *buf,
|
||||
char *write_buf, /* 1 */
|
||||
ADIOI_Flatlist_node *flat_buf,
|
||||
ADIO_Offset *offset_list,
|
||||
@ -66,7 +79,7 @@ static void ADIOI_W_Exchange_data_alltoallv(
|
||||
int *done_to_proc, int *hole, /* 4 */
|
||||
int iter, MPI_Aint buftype_extent, int *buf_idx,
|
||||
int *error_code);
|
||||
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
|
||||
*flat_buf, char **send_buf, ADIO_Offset
|
||||
*offset_list, ADIO_Offset *len_list, int *send_size,
|
||||
MPI_Request *requests, int *sent_to_proc,
|
||||
@ -77,7 +90,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
int *send_buf_idx, int *curr_to_proc,
|
||||
int *done_to_proc, int iter,
|
||||
MPI_Aint buftype_extent);
|
||||
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
|
||||
*flat_buf, char **send_buf, ADIO_Offset
|
||||
*offset_list, ADIO_Offset *len_list, int *send_size,
|
||||
MPI_Request *requests, int *sent_to_proc,
|
||||
@ -93,7 +106,7 @@ static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
|
||||
int nprocs, int nprocs_recv, int total_elements);
|
||||
|
||||
|
||||
void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_GPFS_WriteStridedColl(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code)
|
||||
@ -118,30 +131,16 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off;
|
||||
ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL,
|
||||
*fd_end = NULL, *end_offsets = NULL;
|
||||
ADIO_Offset *bgl_offsets0 = NULL, *bgl_offsets = NULL;
|
||||
ADIO_Offset *gpfs_offsets0 = NULL, *gpfs_offsets = NULL;
|
||||
int ii;
|
||||
|
||||
int *buf_idx = NULL;
|
||||
ADIO_Offset *len_list = NULL;
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_RESET( 0, w )
|
||||
#endif
|
||||
#if 0
|
||||
/* From common code - not implemented for bgl.*/
|
||||
int old_error, tmp_error;
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_RESET( w )
|
||||
#ifdef PROFILE
|
||||
MPE_Log_event(13, 0, "start computation");
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
/* From common code - not implemented for bgl. */
|
||||
if (fd->hints->cb_pfr != ADIOI_HINT_DISABLE) {
|
||||
ADIOI_IOStridedColl (fd, buf, count, ADIOI_WRITE, datatype,
|
||||
file_ptr_type, offset, status, error_code);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
MPI_Comm_size(fd->comm, &nprocs);
|
||||
MPI_Comm_rank(fd->comm, &myrank);
|
||||
|
||||
@ -151,9 +150,8 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
nprocs_for_coll = fd->hints->cb_nodes;
|
||||
orig_fp = fd->fp_ind;
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 0, BGLMPIO_CIO_LCOMP, BGLMPIO_CIO_LAST )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_MPIO_CRW, GPFSMPIO_CIO_LAST)
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 1, 0, GPFSMPIO_CIO_T_LCOMP, GPFSMPIO_CIO_LAST )
|
||||
|
||||
|
||||
/* only check for interleaving if cb_write isn't disabled */
|
||||
@ -168,9 +166,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
&offset_list, &len_list, &start_offset,
|
||||
&end_offset, &contig_access_count);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_GATHER, BGLMPIO_CIO_LCOMP )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_GATHER, GPFSMPIO_CIO_T_LCOMP )
|
||||
|
||||
/* each process communicates its start and end offsets to other
|
||||
processes. The result is an array each of start and end offsets stored
|
||||
@ -179,24 +175,24 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
|
||||
end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs*sizeof(ADIO_Offset));
|
||||
|
||||
if (bglmpio_tunegather) {
|
||||
bgl_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
bgl_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
if (gpfsmpio_tunegather) {
|
||||
gpfs_offsets0 = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
gpfs_offsets = (ADIO_Offset *) ADIOI_Malloc(2*nprocs*sizeof(ADIO_Offset));
|
||||
for (ii=0; ii<nprocs; ii++) {
|
||||
bgl_offsets0[ii*2] = 0;
|
||||
bgl_offsets0[ii*2+1] = 0;
|
||||
gpfs_offsets0[ii*2] = 0;
|
||||
gpfs_offsets0[ii*2+1] = 0;
|
||||
}
|
||||
bgl_offsets0[myrank*2] = start_offset;
|
||||
bgl_offsets0[myrank*2+1] = end_offset;
|
||||
gpfs_offsets0[myrank*2] = start_offset;
|
||||
gpfs_offsets0[myrank*2+1] = end_offset;
|
||||
|
||||
MPI_Allreduce( bgl_offsets0, bgl_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
|
||||
MPI_Allreduce( gpfs_offsets0, gpfs_offsets, nprocs*2, ADIO_OFFSET, MPI_MAX, fd->comm );
|
||||
|
||||
for (ii=0; ii<nprocs; ii++) {
|
||||
st_offsets [ii] = bgl_offsets[ii*2] ;
|
||||
end_offsets[ii] = bgl_offsets[ii*2+1];
|
||||
st_offsets [ii] = gpfs_offsets[ii*2] ;
|
||||
end_offsets[ii] = gpfs_offsets[ii*2+1];
|
||||
}
|
||||
ADIOI_Free( bgl_offsets0 );
|
||||
ADIOI_Free( bgl_offsets );
|
||||
ADIOI_Free( gpfs_offsets0 );
|
||||
ADIOI_Free( gpfs_offsets );
|
||||
} else {
|
||||
MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
|
||||
ADIO_OFFSET, fd->comm);
|
||||
@ -204,9 +200,7 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
ADIO_OFFSET, fd->comm);
|
||||
}
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_PATANA, BGLMPIO_CIO_GATHER )
|
||||
#endif
|
||||
GPFSMPIO_T_CIO_SET_GET(w, 1, 1, GPFSMPIO_CIO_T_PATANA, GPFSMPIO_CIO_T_GATHER )
|
||||
|
||||
/* are the accesses of different processes interleaved? */
|
||||
for (i=1; i<nprocs; i++)
|
||||
@ -250,16 +244,14 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
return;
|
||||
}
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_FD_PART, BGLMPIO_CIO_PATANA )
|
||||
#endif
|
||||
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_FD_PART, GPFSMPIO_CIO_T_PATANA )
|
||||
|
||||
/* Divide the I/O workload among "nprocs_for_coll" processes. This is
|
||||
done by (logically) dividing the file into file domains (FDs); each
|
||||
process may directly access only its own file domain. */
|
||||
|
||||
if (bglmpio_tuneblocking)
|
||||
ADIOI_BGL_GPFS_Calc_file_domains(st_offsets, end_offsets, nprocs,
|
||||
if (gpfsmpio_tuneblocking)
|
||||
ADIOI_GPFS_Calc_file_domains(fd, st_offsets, end_offsets, nprocs,
|
||||
nprocs_for_coll, &min_st_offset,
|
||||
&fd_start, &fd_end, &fd_size, fd->fs_ptr);
|
||||
else
|
||||
@ -269,15 +261,42 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
fd->hints->min_fdomain_size, &fd_size,
|
||||
fd->hints->striping_unit);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 1, 1, BGLMPIO_CIO_MYREQ, BGLMPIO_CIO_FD_PART )
|
||||
#endif
|
||||
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_MYREQ, GPFSMPIO_CIO_T_FD_PART );
|
||||
|
||||
if (gpfsmpio_p2pcontig==1) {
|
||||
/* For some simple yet common(?) workloads, full-on two-phase I/O is overkill. We can establish sub-groups of processes and their aggregator, and then these sub-groups will carry out a simplified two-phase over that sub-group.
|
||||
*
|
||||
* First verify that the filetype is contig and the offsets are
|
||||
* increasing in rank order*/
|
||||
int i, inOrderAndNoGaps = 1;
|
||||
for (i=0;i<(nprocs-1);i++) {
|
||||
if (end_offsets[i] != (st_offsets[i+1]-1))
|
||||
inOrderAndNoGaps = 0;
|
||||
}
|
||||
if (inOrderAndNoGaps && buftype_is_contig) {
|
||||
/* if these conditions exist then execute the P2PContig code else
|
||||
* execute the original code */
|
||||
ADIOI_P2PContigWriteAggregation(fd, buf,
|
||||
error_code, st_offsets, end_offsets, fd_start, fd_end);
|
||||
/* NOTE: we are skipping the rest of two-phase in this path */
|
||||
GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
|
||||
|
||||
ADIOI_Free(offset_list);
|
||||
ADIOI_Free(len_list);
|
||||
ADIOI_Free(st_offsets);
|
||||
ADIOI_Free(end_offsets);
|
||||
ADIOI_Free(fd_start);
|
||||
ADIOI_Free(fd_end);
|
||||
|
||||
goto fn_exit;
|
||||
}
|
||||
}
|
||||
|
||||
/* calculate what portions of the access requests of this process are
|
||||
located in what file domains */
|
||||
|
||||
if (bglmpio_tuneblocking)
|
||||
ADIOI_BGL_Calc_my_req(fd, offset_list, len_list, contig_access_count,
|
||||
if (gpfsmpio_tuneblocking)
|
||||
ADIOI_GPFS_Calc_my_req(fd, offset_list, len_list, contig_access_count,
|
||||
min_st_offset, fd_start, fd_end, fd_size,
|
||||
nprocs, &count_my_req_procs,
|
||||
&count_my_req_per_proc, &my_req,
|
||||
@ -287,12 +306,10 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
min_st_offset, fd_start, fd_end, fd_size,
|
||||
nprocs, &count_my_req_procs,
|
||||
&count_my_req_per_proc, &my_req,
|
||||
&buf_idx);
|
||||
&buf_idx);
|
||||
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_OTHREQ, GPFSMPIO_CIO_T_MYREQ )
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_OTHREQ, BGLMPIO_CIO_MYREQ )
|
||||
#endif
|
||||
|
||||
/* based on everyone's my_req, calculate what requests of other
|
||||
processes lie in this process's file domain.
|
||||
count_others_req_procs = number of processes whose requests lie in
|
||||
@ -300,8 +317,8 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
count_others_req_per_proc[i] indicates how many separate contiguous
|
||||
requests of proc. i lie in this process's file domain. */
|
||||
|
||||
if (bglmpio_tuneblocking)
|
||||
ADIOI_BGL_Calc_others_req(fd, count_my_req_procs,
|
||||
if (gpfsmpio_tuneblocking)
|
||||
ADIOI_GPFS_Calc_others_req(fd, count_my_req_procs,
|
||||
count_my_req_per_proc, my_req,
|
||||
nprocs, myrank,
|
||||
&count_others_req_procs, &others_req);
|
||||
@ -309,11 +326,9 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
ADIOI_Calc_others_req(fd, count_my_req_procs,
|
||||
count_my_req_per_proc, my_req,
|
||||
nprocs, myrank,
|
||||
&count_others_req_procs, &others_req);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 1, 1, BGLMPIO_CIO_DEXCH, BGLMPIO_CIO_OTHREQ )
|
||||
#endif
|
||||
&count_others_req_procs, &others_req);
|
||||
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 1, 1, GPFSMPIO_CIO_T_DEXCH, GPFSMPIO_CIO_T_OTHREQ )
|
||||
|
||||
ADIOI_Free(count_my_req_per_proc);
|
||||
for (i=0; i < nprocs; i++) {
|
||||
@ -330,54 +345,11 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
len_list, contig_access_count, min_st_offset,
|
||||
fd_size, fd_start, fd_end, buf_idx, error_code);
|
||||
|
||||
#if BGL_PROFILE
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 1, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_DEXCH )
|
||||
BGLMPIO_T_CIO_SET_GET( 0, w, 0, 0, 1, BGLMPIO_CIO_LAST, BGLMPIO_CIO_T_MPIO_CRW )
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_DEXCH )
|
||||
GPFSMPIO_T_CIO_SET_GET( w, 0, 1, GPFSMPIO_CIO_LAST, GPFSMPIO_CIO_T_MPIO_CRW )
|
||||
|
||||
BGLMPIO_T_CIO_REPORT( 0, w, fd, myrank )
|
||||
#endif
|
||||
#if 0
|
||||
/* From common code - not implemented for bgl.
|
||||
*
|
||||
* If this collective write is followed by an independent write,
|
||||
* it's possible to have those subsequent writes on other processes
|
||||
* race ahead and sneak in before the read-modify-write completes.
|
||||
* We carry out a collective communication at the end here so no one
|
||||
* can start independent i/o before collective I/O completes.
|
||||
*
|
||||
* need to do some gymnastics with the error codes so that if something
|
||||
* went wrong, all processes report error, but if a process has a more
|
||||
* specific error code, we can still have that process report the
|
||||
* additional information */
|
||||
GPFSMPIO_T_CIO_REPORT( 1, fd, myrank, nprocs)
|
||||
|
||||
old_error = *error_code;
|
||||
if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO;
|
||||
|
||||
/* optimization: if only one process performing i/o, we can perform
|
||||
* a less-expensive Bcast */
|
||||
#ifdef ADIOI_MPE_LOGGING
|
||||
MPE_Log_event( ADIOI_MPE_postwrite_a, 0, NULL );
|
||||
#endif
|
||||
if (fd->hints->cb_nodes == 1)
|
||||
MPI_Bcast(error_code, 1, MPI_INT,
|
||||
fd->hints->ranklist[0], fd->comm);
|
||||
else {
|
||||
tmp_error = *error_code;
|
||||
MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
|
||||
MPI_MAX, fd->comm);
|
||||
}
|
||||
#ifdef ADIOI_MPE_LOGGING
|
||||
MPE_Log_event( ADIOI_MPE_postwrite_b, 0, NULL );
|
||||
#endif
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
MPE_Log_event (5012, 0, NULL);
|
||||
#endif
|
||||
|
||||
if ( (old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO) )
|
||||
*error_code = old_error;
|
||||
|
||||
|
||||
#endif
|
||||
/* free all memory allocated for collective I/O */
|
||||
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
|
||||
|
||||
@ -398,11 +370,12 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
ADIOI_Free(fd_start);
|
||||
ADIOI_Free(fd_end);
|
||||
|
||||
fn_exit:
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
if (status) {
|
||||
int bufsize, size;
|
||||
MPI_Count bufsize, size;
|
||||
/* Don't set status if it isn't needed */
|
||||
MPI_Type_size(datatype, &size);
|
||||
MPI_Type_size_x(datatype, &size);
|
||||
bufsize = size * count;
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
}
|
||||
@ -416,6 +389,100 @@ void ADIOI_BGL_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Issue a GPFS_ACCESS_RANGE gpfs_fcntl() directive on descriptor fd for the
 * byte range starting at offset and spanning length bytes, flagged as a
 * write access (isWrite = 1).
 *
 * When HAVE_GPFS_FCNTL_H is not defined the body is compiled out and the
 * function does nothing (rc stays 0).  Otherwise a non-zero return from
 * gpfs_fcntl() trips ADIOI_Assert. */
static void gpfs_wr_access_start(int fd, ADIO_Offset offset, ADIO_Offset length)
|
||||
{
|
||||
int rc=0;
|
||||
#ifdef HAVE_GPFS_FCNTL_H
|
||||
struct {
|
||||
gpfsFcntlHeader_t header;
|
||||
gpfsAccessRange_t access;
|
||||
} take_locks;
|
||||
|
||||
take_locks.header.totalLength = sizeof(take_locks);
|
||||
take_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
|
||||
take_locks.header.fcntlReserved = 0;
|
||||
|
||||
take_locks.access.structLen = sizeof(take_locks.access);
|
||||
take_locks.access.structType = GPFS_ACCESS_RANGE;
|
||||
take_locks.access.start = offset;
|
||||
take_locks.access.length = length;
|
||||
take_locks.access.isWrite = 1;
|
||||
|
||||
rc = gpfs_fcntl(fd, &take_locks);
|
||||
#endif
|
||||
ADIOI_Assert(rc == 0);
|
||||
}
|
||||
|
||||
/* Issue a GPFS_FREE_RANGE gpfs_fcntl() directive on descriptor fd for the
 * byte range starting at offset and spanning length bytes.
 *
 * When HAVE_GPFS_FCNTL_H is not defined the body is compiled out and the
 * function does nothing (rc stays 0).  Otherwise a non-zero return from
 * gpfs_fcntl() trips ADIOI_Assert. */
static void gpfs_wr_access_end(int fd, ADIO_Offset offset, ADIO_Offset length)
|
||||
{
|
||||
int rc=0;
|
||||
#ifdef HAVE_GPFS_FCNTL_H
|
||||
struct {
|
||||
gpfsFcntlHeader_t header;
|
||||
gpfsFreeRange_t free;
|
||||
} free_locks;
|
||||
|
||||
|
||||
free_locks.header.totalLength = sizeof(free_locks);
|
||||
free_locks.header.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
|
||||
free_locks.header.fcntlReserved = 0;
|
||||
|
||||
free_locks.free.structLen = sizeof(free_locks.free);
|
||||
free_locks.free.structType = GPFS_FREE_RANGE;
|
||||
free_locks.free.start = offset;
|
||||
free_locks.free.length = length;
|
||||
|
||||
rc = gpfs_fcntl(fd, &free_locks);
|
||||
#endif
|
||||
ADIOI_Assert(rc == 0);
|
||||
}
|
||||
|
||||
#ifdef BGQPLATFORM
|
||||
/* Compute the file range covered by all aggregators that share this rank's
 * I/O node (as reported by MPIX_IO_node_id()).
 *
 * Returns 0, leaving *start and *end untouched, when my_start or my_end is
 * -1.  Otherwise returns 1 with *start set to the smallest fd_start[i] and
 * *end to the largest fd_end[i] over the aggregators in fd->hints->ranklist
 * whose I/O node id matches this rank's.
 *
 * NOTE(review): the MPI_Allgather below runs on fd->comm before the early
 * return, so this call is collective over fd->comm -- confirm that every
 * rank of fd->comm reaches it, not only ranks that have I/O to do.
 * NOTE(review): `rank` is fetched but not otherwise used in this function. */
/* my_start, my_end: this process's file domain. could be -1,-1 for "no i/o"
|
||||
* fd_start, fd_end: arrays of length fd->hints->cb_nodes specifying all file domains */
|
||||
static int gpfs_find_access_for_ion(ADIO_File fd,
|
||||
ADIO_Offset my_start, ADIO_Offset my_end,
|
||||
ADIO_Offset *fd_start, ADIO_Offset *fd_end,
|
||||
ADIO_Offset *start, ADIO_Offset *end)
|
||||
{
|
||||
int my_ionode = MPIX_IO_node_id();
|
||||
int *rank_to_ionode;
|
||||
int i, nprocs, rank;
|
||||
ADIO_Offset group_start=LLONG_MAX, group_end=0;
|
||||
|
||||
MPI_Comm_size(fd->comm, &nprocs);
|
||||
MPI_Comm_rank(fd->comm, &rank);
|
||||
|
||||
rank_to_ionode = ADIOI_Calloc(nprocs, sizeof(int));
|
||||
MPI_Allgather(&my_ionode, 1, MPI_INT, rank_to_ionode, 1, MPI_INT, fd->comm);
|
||||
|
||||
/* rank_to_ionode now contains a mapping from MPI rank to IO node */
|
||||
/* fd->hints->ranklist[] contains a list of MPI ranks that are aggregators */
|
||||
/* fd_start[] and fd_end[] contain a list of file domains. */
|
||||
|
||||
/* what we really want to do is take all the file domains associated
|
||||
* with a given i/o node and find the begin/end of that range.
|
||||
*
|
||||
* Because gpfs_fcntl hints are expected to be released, we'll pass this
|
||||
* start/end back to the caller, who will both declare and free this range
|
||||
*/
|
||||
if (my_start == -1 || my_end == -1) {
|
||||
ADIOI_Free(rank_to_ionode);
|
||||
return 0; /* no work to do */
|
||||
}
|
||||
|
||||
for (i=0; i<fd->hints->cb_nodes; i++ ){
|
||||
if (my_ionode == rank_to_ionode[fd->hints->ranklist[i]] ) {
|
||||
group_start = ADIOI_MIN(fd_start[i], group_start);
|
||||
group_end = ADIOI_MAX(fd_end[i], group_end);
|
||||
}
|
||||
}
|
||||
*start = group_start;
|
||||
*end = group_end;
|
||||
ADIOI_Free(rank_to_ionode);
|
||||
return 1;
|
||||
}
|
||||
#endif // BGQPLATFORM
|
||||
|
||||
|
||||
/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
|
||||
@ -444,7 +511,7 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
ADIO_Offset size=0;
|
||||
int hole, i, j, m, ntimes, max_ntimes, buftype_is_contig;
|
||||
ADIO_Offset st_loc=-1, end_loc=-1, off, done, req_off;
|
||||
char *write_buf=NULL;
|
||||
char *write_buf=NULL, *write_buf2=NULL;
|
||||
int *curr_offlen_ptr, *count, *send_size, req_len, *recv_size;
|
||||
int *partial_recv, *sent_to_proc, *start_pos, flag;
|
||||
int *send_buf_idx, *curr_to_proc, *done_to_proc;
|
||||
@ -454,6 +521,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
int info_flag, coll_bufsize;
|
||||
char *value;
|
||||
static char myname[] = "ADIOI_EXCH_AND_WRITE";
|
||||
pthread_t io_thread;
|
||||
void *thread_ret;
|
||||
ADIOI_IO_ThreadFuncData io_thread_args;
|
||||
|
||||
*error_code = MPI_SUCCESS; /* changed below if error */
|
||||
/* only I/O errors are currently reported */
|
||||
@ -468,6 +538,11 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
coll_bufsize = atoi(value);
|
||||
ADIOI_Free(value);
|
||||
|
||||
if (gpfsmpio_pthreadio == 1){
|
||||
/* ROMIO will spawn an additional thread. both threads use separate
|
||||
* halves of the collective buffer*/
|
||||
coll_bufsize = coll_bufsize/2;
|
||||
}
|
||||
|
||||
for (i=0; i < nprocs; i++) {
|
||||
if (others_req[i].count) {
|
||||
@ -491,11 +566,35 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
if ((st_loc==-1) && (end_loc==-1)) {
|
||||
ntimes = 0; /* this process does no writing. */
|
||||
}
|
||||
if (ntimes > 0) { /* only set the gpfs hint if we have io - ie this rank is
|
||||
an aggregator -- otherwise will fail for deferred open */
|
||||
if (getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
|
||||
gpfs_wr_access_start(fd->fd_sys, st_loc, end_loc - st_loc);
|
||||
}
|
||||
}
|
||||
|
||||
ADIO_Offset st_loc_ion=0, end_loc_ion=0, needs_gpfs_access_cleanup=0;
|
||||
#ifdef BGQPLATFORM
|
||||
if (ntimes > 0) { /* only set the gpfs hint if we have io - ie this rank is
|
||||
an aggregator -- otherwise will fail for deferred open */
|
||||
|
||||
if (getenv("ROMIO_GPFS_DECLARE_ION_ACCESS")!=NULL) {
|
||||
if (gpfs_find_access_for_ion(fd, st_loc, end_loc, fd_start, fd_end,
|
||||
&st_loc_ion, &end_loc_ion)) {
|
||||
gpfs_wr_access_start(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
|
||||
needs_gpfs_access_cleanup=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX,
|
||||
fd->comm);
|
||||
|
||||
if (ntimes) write_buf = (char *) ADIOI_Malloc(coll_bufsize);
|
||||
write_buf = fd->io_buf;
|
||||
if (gpfsmpio_pthreadio == 1) {
|
||||
write_buf2 = fd->io_buf + coll_bufsize;
|
||||
}
|
||||
|
||||
curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
|
||||
/* its use is explained below. calloc initializes to 0. */
|
||||
@ -552,6 +651,9 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
done = 0;
|
||||
off = st_loc;
|
||||
|
||||
if(gpfsmpio_pthreadio == 1)
|
||||
io_thread = pthread_self();
|
||||
|
||||
#ifdef PROFILE
|
||||
MPE_Log_event(14, 0, "end computation");
|
||||
#endif
|
||||
@ -642,22 +744,22 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
MPE_Log_event(14, 0, "end computation");
|
||||
MPE_Log_event(7, 0, "start communication");
|
||||
#endif
|
||||
if (bglmpio_comm == 1)
|
||||
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
if (gpfsmpio_comm == 1)
|
||||
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
buftype_is_contig, contig_access_count,
|
||||
min_st_offset, fd_size, fd_start, fd_end,
|
||||
others_req, send_buf_idx, curr_to_proc,
|
||||
done_to_proc, &hole, m, buftype_extent, buf_idx,
|
||||
error_code);
|
||||
else
|
||||
if (bglmpio_comm == 0)
|
||||
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
if (gpfsmpio_comm == 0)
|
||||
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
buftype_is_contig, contig_access_count,
|
||||
min_st_offset, fd_size, fd_start, fd_end,
|
||||
others_req, send_buf_idx, curr_to_proc,
|
||||
@ -673,15 +775,52 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
if (count[i]) flag = 1;
|
||||
|
||||
if (flag) {
|
||||
char round[50];
|
||||
sprintf(round, "two-phase-round=%d", m);
|
||||
setenv("LIBIOLOG_EXTRA_INFO", round, 1);
|
||||
ADIOI_Assert(size == (int)size);
|
||||
ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE, ADIO_EXPLICIT_OFFSET,
|
||||
off, &status, error_code);
|
||||
if (*error_code != MPI_SUCCESS) return;
|
||||
if (gpfsmpio_pthreadio == 1) {
|
||||
/* there is no such thing as "invalid pthread identifier", so
|
||||
* we'll use pthread_self() instead. Before we do I/O we want
|
||||
* to complete I/O from any previous iteration -- but only a
|
||||
* previous iteration that had I/O work to do (i.e. set 'flag')
|
||||
*/
|
||||
if(!pthread_equal(io_thread, pthread_self())) {
|
||||
pthread_join(io_thread, &thread_ret);
|
||||
*error_code = *(int *)thread_ret;
|
||||
if (*error_code != MPI_SUCCESS) return;
|
||||
io_thread = pthread_self();
|
||||
|
||||
}
|
||||
io_thread_args.fd = fd;
|
||||
/* do a little pointer shuffling: background I/O works from one
|
||||
* buffer while two-phase machinery fills up another */
|
||||
io_thread_args.buf = write_buf;
|
||||
ADIOI_SWAP(write_buf, write_buf2, char*);
|
||||
io_thread_args.io_kind = ADIOI_WRITE;
|
||||
io_thread_args.size = size;
|
||||
io_thread_args.offset = off;
|
||||
io_thread_args.status = status;
|
||||
io_thread_args.error_code = *error_code;
|
||||
if ( (pthread_create(&io_thread, NULL,
|
||||
ADIOI_IO_Thread_Func, &(io_thread_args))) != 0)
|
||||
io_thread = pthread_self();
|
||||
} else {
|
||||
ADIO_WriteContig(fd, write_buf, (int)size, MPI_BYTE,
|
||||
ADIO_EXPLICIT_OFFSET, off, &status, error_code);
|
||||
if (*error_code != MPI_SUCCESS) return;
|
||||
}
|
||||
}
|
||||
|
||||
off += size;
|
||||
done += size;
|
||||
}
|
||||
if (gpfsmpio_pthreadio == 1) {
|
||||
if ( !pthread_equal(io_thread, pthread_self()) ) {
|
||||
pthread_join(io_thread, &thread_ret);
|
||||
*error_code = *(int *)thread_ret;
|
||||
}
|
||||
}
|
||||
|
||||
for (i=0; i<nprocs; i++) count[i] = recv_size[i] = 0;
|
||||
#ifdef PROFILE
|
||||
@ -689,22 +828,22 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
#endif
|
||||
for (m=ntimes; m<max_ntimes; m++)
|
||||
/* nothing to recv, but check for send. */
|
||||
if (bglmpio_comm == 1)
|
||||
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
if (gpfsmpio_comm == 1)
|
||||
ADIOI_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
buftype_is_contig, contig_access_count,
|
||||
min_st_offset, fd_size, fd_start, fd_end,
|
||||
others_req, send_buf_idx,
|
||||
curr_to_proc, done_to_proc, &hole, m,
|
||||
buftype_extent, buf_idx, error_code);
|
||||
else
|
||||
if (bglmpio_comm == 0)
|
||||
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
if (gpfsmpio_comm == 0)
|
||||
ADIOI_W_Exchange_data_alltoallv(fd, buf, write_buf, flat_buf, offset_list,
|
||||
len_list, send_size, recv_size, off, size, count,
|
||||
start_pos, partial_recv,
|
||||
sent_to_proc, nprocs, myrank,
|
||||
buftype_is_contig, contig_access_count,
|
||||
min_st_offset, fd_size, fd_start, fd_end,
|
||||
others_req, send_buf_idx,
|
||||
@ -715,7 +854,6 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
MPE_Log_event(8, 0, "end communication");
|
||||
#endif
|
||||
|
||||
if (ntimes) ADIOI_Free(write_buf);
|
||||
ADIOI_Free(curr_offlen_ptr);
|
||||
ADIOI_Free(count);
|
||||
ADIOI_Free(partial_recv);
|
||||
@ -726,6 +864,17 @@ static void ADIOI_Exch_and_write(ADIO_File fd, const void *buf, MPI_Datatype
|
||||
ADIOI_Free(send_buf_idx);
|
||||
ADIOI_Free(curr_to_proc);
|
||||
ADIOI_Free(done_to_proc);
|
||||
|
||||
if (ntimes != 0 && getenv("ROMIO_GPFS_DECLARE_ACCESS")!=NULL) {
|
||||
gpfs_wr_access_end(fd->fd_sys, st_loc, end_loc-st_loc);
|
||||
}
|
||||
|
||||
if (needs_gpfs_access_cleanup) {
|
||||
gpfs_wr_access_end(fd->fd_sys, st_loc_ion, end_loc_ion-st_loc_ion);
|
||||
needs_gpfs_access_cleanup=0;
|
||||
}
|
||||
|
||||
unsetenv("LIBIOLOG_EXTRA_INFO");
|
||||
}
|
||||
|
||||
|
||||
@ -783,8 +932,8 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
|
||||
tmp_len[i] = others_req[i].lens[k];
|
||||
others_req[i].lens[k] = partial_recv[i];
|
||||
}
|
||||
MPI_Type_hindexed(count[i],
|
||||
&(others_req[i].lens[start_pos[i]]),
|
||||
ADIOI_Type_create_hindexed_x(count[i],
|
||||
&(others_req[i].lens[start_pos[i]]),
|
||||
&(others_req[i].mem_ptrs[start_pos[i]]),
|
||||
MPI_BYTE, recv_types+j);
|
||||
/* absolute displacements; use MPI_BOTTOM in recv */
|
||||
@ -799,15 +948,12 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
|
||||
|
||||
sum = 0;
|
||||
for (i=0; i<nprocs; i++) sum += count[i];
|
||||
/* valgrind-detcted optimization: if there is no work on this process we do
|
||||
* not need to search for holes */
|
||||
if (sum) {
|
||||
srt_off = (ADIO_Offset *) ADIOI_Malloc((sum)*sizeof(ADIO_Offset));
|
||||
srt_len = (int *) ADIOI_Malloc((sum)*sizeof(int));
|
||||
srt_off = (ADIO_Offset *) ADIOI_Malloc((sum+1)*sizeof(ADIO_Offset));
|
||||
srt_len = (int *) ADIOI_Malloc((sum+1)*sizeof(int));
|
||||
/* +1 to avoid a 0-size malloc */
|
||||
|
||||
ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
|
||||
nprocs, nprocs_recv, sum);
|
||||
}
|
||||
ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
|
||||
nprocs, nprocs_recv, sum);
|
||||
|
||||
/* for partial recvs, restore original lengths */
|
||||
for (i=0; i<nprocs; i++)
|
||||
@ -824,28 +970,28 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
|
||||
* #835). Missing these holes would result in us writing more data than
|
||||
* received by everyone else. */
|
||||
*hole = 0;
|
||||
if (sum) {
|
||||
if (off != srt_off[0]) /* hole at the front */
|
||||
if (off != srt_off[0]) /* hole at the front */
|
||||
*hole = 1;
|
||||
else { /* coalesce the sorted offset-length pairs */
|
||||
for (i=1; i<sum; i++) {
|
||||
if (srt_off[i] <= srt_off[0] + srt_len[0]) {
|
||||
int new_len = srt_off[i] + srt_len[i] - srt_off[0];
|
||||
if (new_len > srt_len[0]) srt_len[0] = new_len;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
if (i < sum || size != srt_len[0]) /* hole in middle or end */
|
||||
*hole = 1;
|
||||
else { /* coalesce the sorted offset-length pairs */
|
||||
for (i=1; i<sum; i++) {
|
||||
if (srt_off[i] <= srt_off[0] + srt_len[0]) {
|
||||
int new_len = srt_off[i] + srt_len[i] - srt_off[0];
|
||||
if (new_len > srt_len[0]) srt_len[0] = new_len;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
if (i < sum || size != srt_len[0]) /* hole in middle or end */
|
||||
*hole = 1;
|
||||
}
|
||||
}
|
||||
|
||||
ADIOI_Free(srt_off);
|
||||
ADIOI_Free(srt_len);
|
||||
}
|
||||
|
||||
if (nprocs_recv) {
|
||||
if (*hole) {
|
||||
const char * stuff = "data-sieve-in-two-phase";
|
||||
setenv("LIBIOLOG_EXTRA_INFO", stuff, 1);
|
||||
ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
|
||||
ADIO_EXPLICIT_OFFSET, off, &status, &err);
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
@ -857,6 +1003,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
|
||||
return;
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
unsetenv("LIBIOLOG_EXTRA_INFO");
|
||||
}
|
||||
}
|
||||
|
||||
@ -1027,7 +1174,7 @@ static void ADIOI_W_Exchange_data(ADIO_File fd, const void *buf, char *write_buf
|
||||
ADIOI_BUF_INCR \
|
||||
}
|
||||
|
||||
static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
static void ADIOI_Fill_send_buffer(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
|
||||
*flat_buf, char **send_buf, ADIO_Offset
|
||||
*offset_list, ADIO_Offset *len_list, int *send_size,
|
||||
MPI_Request *requests, int *sent_to_proc,
|
||||
@ -1079,7 +1226,7 @@ static void ADIOI_Fill_send_buffer(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
* longer than the single region that processor "p" is responsible
|
||||
* for.
|
||||
*/
|
||||
p = ADIOI_BGL_Calc_aggregator(fd,
|
||||
p = ADIOI_GPFS_Calc_aggregator(fd,
|
||||
off,
|
||||
min_st_offset,
|
||||
&len,
|
||||
@ -1140,7 +1287,7 @@ static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
|
||||
{
|
||||
typedef struct {
|
||||
ADIO_Offset *off_list;
|
||||
int *len_list;
|
||||
ADIO_Offset *len_list;
|
||||
int nelem;
|
||||
} heap_struct;
|
||||
|
||||
@ -1256,7 +1403,7 @@ static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
|
||||
|
||||
|
||||
static void ADIOI_W_Exchange_data_alltoallv(
|
||||
ADIO_File fd, void *buf,
|
||||
ADIO_File fd, const void *buf,
|
||||
char *write_buf, /* 1 */
|
||||
ADIOI_Flatlist_node *flat_buf,
|
||||
ADIO_Offset *offset_list,
|
||||
@ -1287,11 +1434,15 @@ static void ADIOI_W_Exchange_data_alltoallv(
|
||||
int *srt_len, sum;
|
||||
ADIO_Offset *srt_off;
|
||||
static char myname[] = "ADIOI_W_EXCHANGE_DATA";
|
||||
double io_time;
|
||||
|
||||
|
||||
io_time = MPI_Wtime();
|
||||
/* exchange recv_size info so that each process knows how much to
|
||||
send to whom. */
|
||||
MPI_Alltoall(recv_size, 1, MPI_INT, send_size, 1, MPI_INT, fd->comm);
|
||||
|
||||
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] += MPI_Wtime() - io_time;
|
||||
io_time = MPI_Wtime();
|
||||
|
||||
nprocs_recv = 0;
|
||||
for (i=0; i<nprocs; i++) if (recv_size[i]) { nprocs_recv++; }
|
||||
@ -1334,14 +1485,23 @@ static void ADIOI_W_Exchange_data_alltoallv(
|
||||
min_st_offset, fd_size, fd_start, fd_end,
|
||||
send_buf_idx, curr_to_proc, done_to_proc, iter,
|
||||
buftype_extent);
|
||||
ADIOI_Free(send_buf);
|
||||
}
|
||||
|
||||
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SETUP] += MPI_Wtime() - io_time;
|
||||
|
||||
io_time = MPI_Wtime();
|
||||
/* alltoallv */
|
||||
MPI_Alltoallv(
|
||||
all_send_buf, send_size, sdispls, MPI_BYTE,
|
||||
all_recv_buf, recv_size, rdispls, MPI_BYTE,
|
||||
fd->comm );
|
||||
|
||||
ADIOI_Free( all_send_buf );
|
||||
ADIOI_Free(sdispls);
|
||||
|
||||
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_NET] += MPI_Wtime() - io_time;
|
||||
io_time = MPI_Wtime();
|
||||
/* data sieving pre-read */
|
||||
/* To avoid a read-modify-write, check if there are holes in the
|
||||
data to be written. For this, merge the (sorted) offset lists
|
||||
@ -1373,6 +1533,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
|
||||
ADIOI_Free(srt_off);
|
||||
ADIOI_Free(srt_len);
|
||||
|
||||
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SORT] += MPI_Wtime() - io_time;
|
||||
io_time = MPI_Wtime();
|
||||
if (nprocs_recv) {
|
||||
if (*hole) {
|
||||
ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
|
||||
@ -1388,7 +1550,8 @@ static void ADIOI_W_Exchange_data_alltoallv(
|
||||
/* --END ERROR HANDLING-- */
|
||||
}
|
||||
}
|
||||
|
||||
gpfsmpio_prof_cw[GPFSMPIO_CIO_T_DEXCH_SIEVE] += MPI_Wtime() - io_time;
|
||||
|
||||
/* scatter all_recv_buf into 4M cb_buffer */
|
||||
tmp_len = (int *) ADIOI_Malloc(nprocs*sizeof(int));
|
||||
for (i=0; i<nprocs; i++)
|
||||
@ -1419,14 +1582,12 @@ static void ADIOI_W_Exchange_data_alltoallv(
|
||||
}
|
||||
|
||||
ADIOI_Free( tmp_len );
|
||||
ADIOI_Free( all_send_buf );
|
||||
ADIOI_Free( all_recv_buf );
|
||||
ADIOI_Free(sdispls);
|
||||
ADIOI_Free(rdispls);
|
||||
return;
|
||||
}
|
||||
|
||||
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlist_node
|
||||
static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, const void *buf, ADIOI_Flatlist_node
|
||||
*flat_buf, char **send_buf, ADIO_Offset
|
||||
*offset_list, ADIO_Offset *len_list, int *send_size,
|
||||
MPI_Request *requests, int *sent_to_proc,
|
||||
@ -1478,7 +1639,7 @@ static void ADIOI_Fill_send_buffer_nosend(ADIO_File fd, void *buf, ADIOI_Flatlis
|
||||
* longer than the single region that processor "p" is responsible
|
||||
* for.
|
||||
*/
|
||||
p = ADIOI_BGL_Calc_aggregator(fd,
|
||||
p = ADIOI_GPFS_Calc_aggregator(fd,
|
||||
off,
|
||||
min_st_offset,
|
||||
&len,
|
18
ompi/mca/io/romio/romio/adio/ad_gpfs/bg/Makefile.mk
Обычный файл
18
ompi/mca/io/romio/romio/adio/ad_gpfs/bg/Makefile.mk
Обычный файл
@ -0,0 +1,18 @@
|
||||
## -*- Mode: Makefile; -*-
|
||||
## vim: set ft=automake :
|
||||
##
|
||||
## (C) 2012 by Argonne National Laboratory.
|
||||
## See COPYRIGHT in top-level directory.
|
||||
##
|
||||
|
||||
## Blue Gene specific pieces of the GPFS ADIO driver; only added to the
## build when the BUILD_AD_BG automake conditional is enabled.
if BUILD_AD_BG

## aggregator-selection and pset (personality) sources
romio_other_sources +=                  \
    adio/ad_gpfs/bg/ad_bg_aggrs.c       \
    adio/ad_gpfs/bg/ad_bg_pset.c

## matching private headers (not installed)
noinst_HEADERS +=                       \
    adio/ad_gpfs/bg/ad_bg_aggrs.h       \
    adio/ad_gpfs/bg/ad_bg_pset.h

endif BUILD_AD_BG
|
675
ompi/mca/io/romio/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
Обычный файл
675
ompi/mca/io/romio/romio/adio/ad_gpfs/bg/ad_bg_aggrs.c
Обычный файл
@ -0,0 +1,675 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_aggrs.c
|
||||
* \brief The externally used function from this file is declared in ad_bg_aggrs.h
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997-2001 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
/*#define TRACE_ON */
|
||||
|
||||
// Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
|
||||
// #define balancecontigtrace 1
|
||||
// #define bridgeringaggtrace 1
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_cb_config_list.h"
|
||||
#include "../ad_gpfs.h"
|
||||
#include "ad_bg_pset.h"
|
||||
#include "ad_bg_aggrs.h"
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef USE_DBG_LOGGING
|
||||
#define AGG_DEBUG 1
|
||||
#endif
|
||||
|
||||
#ifndef TRACE_ERR
|
||||
# define TRACE_ERR(format...)
|
||||
#endif
|
||||
|
||||
/* Comments copied from common:
|
||||
* This file contains four functions:
|
||||
*
|
||||
* ADIOI_Calc_aggregator()
|
||||
* ADIOI_Calc_file_domains()
|
||||
* ADIOI_Calc_my_req()
|
||||
* ADIOI_Calc_others_req()
|
||||
*
|
||||
* The last three of these were originally in ad_read_coll.c, but they are
|
||||
* also shared with ad_write_coll.c. I felt that they were better kept with
|
||||
* the rest of the shared aggregation code.
|
||||
*/
|
||||
|
||||
/* Discussion of values available from above:
|
||||
*
|
||||
* ADIO_Offset st_offsets[0..nprocs-1]
|
||||
* ADIO_Offset end_offsets[0..nprocs-1]
|
||||
* These contain a list of start and end offsets for each process in
|
||||
* the communicator. For example, an access at loc 10, size 10 would
|
||||
* have a start offset of 10 and end offset of 19.
|
||||
* int nprocs
|
||||
* number of processors in the collective I/O communicator
|
||||
* ADIO_Offset min_st_offset
|
||||
* ADIO_Offset fd_start[0..nprocs_for_coll-1]
|
||||
* starting location of "file domain"; region that a given process will
|
||||
* perform aggregation for (i.e. actually do I/O)
|
||||
* ADIO_Offset fd_end[0..nprocs_for_coll-1]
|
||||
* start + size - 1 roughly, but it can be less, or 0, in the case of
|
||||
* uneven distributions
|
||||
*/
|
||||
|
||||
/* forward declaration */
|
||||
static void
|
||||
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
|
||||
const ADIOI_BG_ConfInfo_t *confInfo,
|
||||
ADIOI_BG_ProcInfo_t *all_procInfo);
|
||||
|
||||
/*
|
||||
* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
|
||||
* The parameters are
|
||||
* . the number of aggregators (proxies) : fd->hints->cb_nodes
|
||||
* . the ranks of the aggregators : fd->hints->ranklist
|
||||
* By computing these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
|
||||
* ADIO can work more efficiently.
|
||||
*/
|
||||
int
|
||||
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
|
||||
{
|
||||
int r, s;
|
||||
ADIOI_BG_ProcInfo_t *procInfo, *all_procInfo;
|
||||
ADIOI_BG_ConfInfo_t *confInfo;
|
||||
TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
|
||||
|
||||
MPI_Comm_size( fd->comm, &s );
|
||||
MPI_Comm_rank( fd->comm, &r );
|
||||
|
||||
/* Collect individual BG personality information */
|
||||
confInfo = ADIOI_BG_ConfInfo_new ();
|
||||
procInfo = ADIOI_BG_ProcInfo_new ();
|
||||
ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
|
||||
|
||||
/* Gather BG personality infomation onto process 0 */
|
||||
/* if (r == 0) */
|
||||
all_procInfo = ADIOI_BG_ProcInfo_new_n (s);
|
||||
|
||||
MPI_Gather( (void *)procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
|
||||
(void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
|
||||
0,
|
||||
fd->comm );
|
||||
|
||||
/* Compute a list of the ranks of chosen IO proxy CN on process 0 */
|
||||
if (r == 0) {
|
||||
ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo);
|
||||
/* ADIOI_BG_ProcInfo_free (all_procInfo);*/
|
||||
}
|
||||
ADIOI_BG_ProcInfo_free (all_procInfo);
|
||||
|
||||
/* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
|
||||
Declared in adio_cb_config_list.h */
|
||||
ADIOI_cb_bcast_rank_map(fd);
|
||||
if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
|
||||
bridgelistnum and numbridges to all
|
||||
ranks */
|
||||
if (r != 0) {
|
||||
fd->hints->fs_hints.bg.bridgelist =
|
||||
ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
|
||||
if (fd->hints->fs_hints.bg.bridgelist == NULL) {
|
||||
/* NEED TO HANDLE ENOMEM */
|
||||
}
|
||||
}
|
||||
MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
|
||||
fd->comm);
|
||||
|
||||
if (r != 0) {
|
||||
fd->hints->fs_hints.bg.bridgelistnum =
|
||||
ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
|
||||
if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
|
||||
/* NEED TO HANDLE ENOMEM */
|
||||
}
|
||||
}
|
||||
MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
|
||||
MPI_INT, 0, fd->comm);
|
||||
|
||||
MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
|
||||
fd->comm);
|
||||
|
||||
}
|
||||
|
||||
|
||||
ADIOI_BG_persInfo_free( confInfo, procInfo );
|
||||
TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* There are some number of bridge nodes (randomly) distributed through the job
|
||||
* We need to split the nodes among the bridge nodes */
|
||||
/* Maybe find which bridge node is closer (manhattan distance) and try to
|
||||
* distribute evenly.
|
||||
*/
|
||||
/*
|
||||
* Pick IO aggregators based on the underlying PSET organization and store the ranks of the proxy CNs in tmp_ranklist.
|
||||
* The first order of tmp_ranklist is : PSET number
|
||||
* The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
|
||||
*/
|
||||
/* Pairs a rank with the rank of the bridge node it is associated with.
 * Arrays of these are ordered by `bridge` using intsort(). */
typedef struct
{
    int rank;
    int bridge;
} sortstruct;

/* Per-bridge bookkeeping: how many aggregators have been assigned to the
 * bridge identified by bridgeRank so far. */
typedef struct
{
    int bridgeRank;
    int numAggsAssigned;
} bridgeAggAssignment;

/*
 * qsort() comparator ordering sortstruct entries by ascending `bridge`.
 *
 * Returns a negative, zero or positive value as p1's bridge is less than,
 * equal to or greater than p2's.  The result comes from two comparisons
 * rather than a subtraction, so it cannot overflow for any pair of int
 * values, and the const qualifier of the qsort() arguments is kept.
 */
static int intsort(const void *p1, const void *p2)
{
    const sortstruct *i1 = (const sortstruct *)p1;
    const sortstruct *i2 = (const sortstruct *)p2;

    return (i1->bridge > i2->bridge) - (i1->bridge < i2->bridge);
}
|
||||
|
||||
static int
|
||||
ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
|
||||
ADIOI_BG_ProcInfo_t *all_procInfo,
|
||||
int *tmp_ranklist)
|
||||
{
|
||||
TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
|
||||
/* BES: This should be done in the init routines probably. */
|
||||
int i, j;
|
||||
int aggTotal;
|
||||
int *aggList;
|
||||
|
||||
if (gpfsmpio_bridgeringagg > 0) {
|
||||
|
||||
int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
|
||||
/* the number of aggregators is (numAggs per bridgenode) */
|
||||
if(numAggs == 1)
|
||||
aggTotal = 1;
|
||||
else
|
||||
aggTotal = confInfo->numBridgeRanks * numAggs;
|
||||
|
||||
aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
|
||||
if(aggTotal == 1) { /* special case when we only have one bridge node */
|
||||
|
||||
sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
|
||||
for(i=0; i < confInfo->nProcs; i++)
|
||||
{
|
||||
bridgelist[i].bridge = all_procInfo[i].bridgeRank;
|
||||
bridgelist[i].rank = i;
|
||||
TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
|
||||
}
|
||||
|
||||
/* This list contains rank->bridge info. Now, we need to sort this list. */
|
||||
qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
|
||||
|
||||
aggList[0] = bridgelist[0].bridge;
|
||||
ADIOI_Free(bridgelist);
|
||||
|
||||
}
|
||||
else { // aggTotal > 1
|
||||
|
||||
int currentAggListSize = 0;
|
||||
int numBridgesWithAggAssignments = 0;
|
||||
bridgeAggAssignment *aggAssignments = (bridgeAggAssignment *)ADIOI_Malloc(confInfo->numBridgeRanks * sizeof(bridgeAggAssignment));
|
||||
|
||||
int partitionSize = all_procInfo[0].numNodesInPartition;
|
||||
int *nodesAssigned = (int *)ADIOI_Malloc(partitionSize * sizeof(int));
|
||||
for (i=0;i<partitionSize;i++)
|
||||
nodesAssigned[i] = 0;
|
||||
|
||||
int currentNumHops = gpfsmpio_bridgeringagg;
|
||||
int allAggsAssigned = 0;
|
||||
|
||||
/* Iterate thru the process infos and select aggregators starting at currentNumHops
|
||||
away. Increase the currentNumHops until all bridges have numAggs assigned to them.
|
||||
*/
|
||||
while (!allAggsAssigned) {
|
||||
/* track whether any aggs are selected durng this round */
|
||||
int startingCurrentAggListSize = currentAggListSize;
|
||||
int numIterForHopsWithNoAggs = 0;
|
||||
for (i=0;i<confInfo->nProcs;i++) {
|
||||
if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
|
||||
if (nodesAssigned[all_procInfo[i].nodeRank] == 0) { // node is not assigned as an agg yet
|
||||
int foundBridge = 0;
|
||||
for (j=0;(j<numBridgesWithAggAssignments && !foundBridge);j++) {
|
||||
if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
|
||||
foundBridge = 1;
|
||||
if (aggAssignments[j].numAggsAssigned < numAggs) {
|
||||
aggAssignments[j].numAggsAssigned++;
|
||||
nodesAssigned[all_procInfo[i].nodeRank] = 1;
|
||||
aggList[currentAggListSize] = all_procInfo[i].rank;
|
||||
currentAggListSize++;
|
||||
#ifdef bridgeringaggtrace
|
||||
printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!foundBridge) {
|
||||
aggAssignments[numBridgesWithAggAssignments].bridgeRank = all_procInfo[i].bridgeRank;
|
||||
aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
|
||||
numBridgesWithAggAssignments++;
|
||||
nodesAssigned[all_procInfo[i].nodeRank] = 1;
|
||||
aggList[currentAggListSize] = all_procInfo[i].rank;
|
||||
currentAggListSize++;
|
||||
#ifdef bridgeringaggtrace
|
||||
printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
|
||||
allAggsAssigned = 1;
|
||||
for (i=0;(i<numBridgesWithAggAssignments && allAggsAssigned);i++) {
|
||||
if (aggAssignments[i].numAggsAssigned < numAggs)
|
||||
allAggsAssigned = 0;
|
||||
}
|
||||
}
|
||||
currentNumHops++;
|
||||
/* If 3 rounds go by without selecting an agg abort to avoid
|
||||
infinite loop.
|
||||
*/
|
||||
if (startingCurrentAggListSize == currentAggListSize)
|
||||
numIterForHopsWithNoAggs++;
|
||||
else
|
||||
numIterForHopsWithNoAggs = 0;
|
||||
ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
|
||||
}
|
||||
|
||||
ADIOI_Free(aggAssignments);
|
||||
ADIOI_Free(nodesAssigned);
|
||||
|
||||
} // else aggTotal > 1
|
||||
|
||||
memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
|
||||
} // gpfsmpio_bridgeringagg > 0
|
||||
|
||||
else { // gpfsmpio_bridgeringagg unset - default code
|
||||
|
||||
int distance, numAggs;
|
||||
|
||||
/* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
|
||||
* bridge node */
|
||||
|
||||
sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
|
||||
for(i=0; i < confInfo->nProcs; i++)
|
||||
{
|
||||
bridgelist[i].bridge = all_procInfo[i].bridgeRank;
|
||||
bridgelist[i].rank = i;
|
||||
TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
|
||||
}
|
||||
|
||||
/* This list contains rank->bridge info. Now, we need to sort this list. */
|
||||
qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
|
||||
|
||||
/* In this array, we can pick an appropriate number of midpoints based on
|
||||
* our bridgenode index and the number of aggregators */
|
||||
|
||||
numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
|
||||
if(numAggs == 1)
|
||||
aggTotal = 1;
|
||||
else
|
||||
/* the number of aggregators is (numAggs per bridgenode) plus each
|
||||
* bridge node is an aggregator */
|
||||
aggTotal = confInfo->numBridgeRanks * (numAggs+1);
|
||||
|
||||
if(aggTotal>confInfo->nProcs) aggTotal=confInfo->nProcs;
|
||||
|
||||
TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks, confInfo->ioMinSize, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, aggTotal);
|
||||
aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
|
||||
|
||||
|
||||
/* For each bridge node, determine who the aggregators will be */
|
||||
/* basically, the n*distance and bridge node */
|
||||
if(aggTotal == 1) /* special case when we only have one bridge node */
|
||||
aggList[0] = bridgelist[0].bridge;
|
||||
else
|
||||
{
|
||||
int lastBridge = bridgelist[confInfo->nProcs-1].bridge;
|
||||
int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
|
||||
int psetSize = 0;
|
||||
int procIndex;
|
||||
for(procIndex=confInfo->nProcs-1; procIndex>=0; procIndex--)
|
||||
{
|
||||
TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n",procIndex, bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
|
||||
if(lastBridge == bridgelist[procIndex].bridge)
|
||||
{
|
||||
psetSize++;
|
||||
if(procIndex) continue;
|
||||
else procIndex--;/* procIndex == 0 */
|
||||
}
|
||||
/* Sets up a list of nodes which will act as aggregators. numAggs
|
||||
* per bridge node total. The list of aggregators is
|
||||
* bridgeNode 0
|
||||
* bridgeNode 1
|
||||
* bridgeNode ...
|
||||
* bridgeNode N
|
||||
* bridgeNode[0]aggr[0]
|
||||
* bridgeNode[0]aggr[1]...
|
||||
* bridgeNode[0]aggr[N]...
|
||||
* ...
|
||||
* bridgeNode[N]aggr[0]..
|
||||
* bridgeNode[N]aggr[N]
|
||||
*/
|
||||
aggList[nextBridge]=lastBridge;
|
||||
distance = psetSize/numAggs;
|
||||
TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n",nextBridge, aggList[nextBridge],distance,psetSize);
|
||||
if(numAggs>1)
|
||||
{
|
||||
for(j = 0; j < numAggs; j++)
|
||||
{
|
||||
ADIOI_Assert(nextAggr<aggTotal);
|
||||
aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
|
||||
TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
|
||||
if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
|
||||
{
|
||||
aggList[nextAggr] = bridgelist[procIndex+psetSize].rank; /* take the last one in the pset */
|
||||
TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+psetSize,aggList[nextAggr]);
|
||||
}
|
||||
nextAggr++;
|
||||
}
|
||||
}
|
||||
if(procIndex<0) break;
|
||||
lastBridge = bridgelist[procIndex].bridge;
|
||||
psetSize = 1;
|
||||
nextBridge++;
|
||||
}
|
||||
}
|
||||
|
||||
TRACE_ERR("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",numAggs,confInfo->numBridgeRanks,numAggs,(numAggs*confInfo->numBridgeRanks+numAggs),aggTotal);
|
||||
memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
|
||||
for(i=0;i<aggTotal;i++)
|
||||
{
|
||||
TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
|
||||
}
|
||||
|
||||
|
||||
ADIOI_Free (bridgelist);
|
||||
|
||||
TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
|
||||
}
|
||||
|
||||
ADIOI_Free (aggList);
|
||||
return aggTotal;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* compute aggregators ranklist and put it into fd->hints struct
|
||||
*/
|
||||
static void
|
||||
ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
|
||||
const ADIOI_BG_ConfInfo_t *confInfo,
|
||||
ADIOI_BG_ProcInfo_t *all_procInfo)
|
||||
{
|
||||
TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
|
||||
int i;
|
||||
int naggs;
|
||||
int size;
|
||||
int *tmp_ranklist;
|
||||
|
||||
/* compute the ranklist of IO aggregators and put into tmp_ranklist */
|
||||
tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
|
||||
|
||||
# if AGG_DEBUG
|
||||
for (i=0; i<confInfo->nProcs; i++) {
|
||||
DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].coreID, all_procInfo[i].rank );
|
||||
}
|
||||
# endif
|
||||
|
||||
naggs=
|
||||
ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, tmp_ranklist);
|
||||
|
||||
# define VERIFY 1
|
||||
# if VERIFY
|
||||
DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, tsize: %3d, ratio: %.4f; naggs = %d\n",
|
||||
confInfo->ioMinSize ,
|
||||
confInfo->ioMaxSize ,
|
||||
confInfo->nAggrs ,
|
||||
confInfo->numBridgeRanks ,
|
||||
confInfo->nProcs ,
|
||||
confInfo->ioMaxSize /*virtualPsetSize*/ ,
|
||||
confInfo->cpuIDsize,
|
||||
confInfo->aggRatio ,
|
||||
naggs );
|
||||
# endif
|
||||
MPI_Comm_size( fd->comm, &size );
|
||||
/* This fix is for when the bridgenode rnk is not part of the particular
|
||||
* subcomm associated with this MPI File operation. I don't know if
|
||||
* this is the best/right answer but it passes the test cases at least.
|
||||
* I don't know how common file IO in subcomms is anyway... */
|
||||
for(i=0;i<naggs;i++)
|
||||
{
|
||||
if(tmp_ranklist[i] > size)
|
||||
{
|
||||
TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
|
||||
i, tmp_ranklist[i], fd->comm);
|
||||
tmp_ranklist[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
# if AGG_DEBUG
|
||||
for (i=0; i<naggs; i++) {
|
||||
DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
|
||||
}
|
||||
# endif
|
||||
if (gpfsmpio_balancecontig == 1) {
|
||||
/* what comes out of this code block is the agg ranklist sorted by
|
||||
* bridge set and ion id with associated bridge info stored in the
|
||||
* hints structure for later access during file domain assignment */
|
||||
|
||||
// sort the agg ranklist by ions and bridges
|
||||
|
||||
int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // resorted agg rank list
|
||||
/* list of all bridge ranks */
|
||||
int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
|
||||
/* each entry here is the number of aggregators associated with the
|
||||
* bridge rank of the same index in bridgelist */
|
||||
int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
/* list of all ion IDs corresponding with bridgelist entries of same index */
|
||||
int *ionlist = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
|
||||
int numbridges = 0;
|
||||
|
||||
for (i=0;i<naggs;i++)
|
||||
bridgelistnum[i] = 0;
|
||||
|
||||
/* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
|
||||
* agg rank on that ion. */
|
||||
int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
for (i=0;i<naggs;i++)
|
||||
summarybridgeminionaggrank[i] = -1;
|
||||
|
||||
/* build the bridgelist, ionlist and bridgelistnum data by going thru each agg
|
||||
* entry and find the associated bridge list index - at the end we will
|
||||
* know how many aggs belong to each bridge in each ion */
|
||||
for (i=0;i<naggs;i++) {
|
||||
int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
|
||||
int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
|
||||
int foundrank = 0;
|
||||
int summaryranklistbridgeindex = 0;
|
||||
int j;
|
||||
for (j=0;(j<numbridges && !foundrank);j++) {
|
||||
if (bridgelist[j] == aggbridgerank) {
|
||||
foundrank = 1;
|
||||
summaryranklistbridgeindex = j;
|
||||
}
|
||||
else
|
||||
summaryranklistbridgeindex++;
|
||||
}
|
||||
if (!foundrank) {
|
||||
bridgelist[summaryranklistbridgeindex] = aggbridgerank;
|
||||
ionlist[summaryranklistbridgeindex] = aggionid;
|
||||
|
||||
if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
|
||||
summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
|
||||
else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
|
||||
summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
|
||||
numbridges++;
|
||||
}
|
||||
|
||||
bridgelistnum[summaryranklistbridgeindex]++;
|
||||
}
|
||||
|
||||
/* at this point summarybridgeminionaggrank has the agg rank of the bridge for entries,
|
||||
* need to make each entry the minimum bridge rank for the entire ion. */
|
||||
for (i=0;i<numbridges;i++) {
|
||||
int aggIonId = ionlist[i];
|
||||
int j;
|
||||
for (j=0;j<numbridges;j++) {
|
||||
if (ionlist[j] == aggIonId) {
|
||||
if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
|
||||
summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// resort by io node minimum bridge rank
|
||||
int x;
|
||||
for (x=0;x<numbridges;x++) {
|
||||
for (i=0;i<(numbridges-1);i++) {
|
||||
if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i+1]) {
|
||||
int tmpminionaggrank = summarybridgeminionaggrank[i];
|
||||
summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i+1];
|
||||
summarybridgeminionaggrank[i+1] = tmpminionaggrank;
|
||||
int tmpionid = ionlist[i];
|
||||
ionlist[i] = ionlist[i+1];
|
||||
ionlist[i+1] = tmpionid;
|
||||
int tmpbridgerank = bridgelist[i];
|
||||
bridgelist[i] = bridgelist[i+1];
|
||||
bridgelist[i+1] = tmpbridgerank;
|
||||
int tmpbridgeranknum = bridgelistnum[i];
|
||||
bridgelistnum[i] = bridgelistnum[i+1];
|
||||
bridgelistnum[i+1] = tmpbridgeranknum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// for each io node make sure bridgelist is in rank order
|
||||
int startSortIndex = -1;
|
||||
int endSortIndex = -1;
|
||||
int currentBridgeIndex = 0;
|
||||
|
||||
while (currentBridgeIndex < numbridges) {
|
||||
int currentIonId = ionlist[currentBridgeIndex];
|
||||
startSortIndex = currentBridgeIndex;
|
||||
while (ionlist[currentBridgeIndex] == currentIonId)
|
||||
currentBridgeIndex++;
|
||||
endSortIndex = currentBridgeIndex-1;
|
||||
for (x=startSortIndex;x<=endSortIndex;x++) {
|
||||
for (i=startSortIndex;i<endSortIndex;i++) {
|
||||
if (bridgelist[i] > bridgelist[i+1]) {
|
||||
int tmpbridgerank = bridgelist[i];
|
||||
bridgelist[i] = bridgelist[i+1];
|
||||
bridgelist[i+1] = tmpbridgerank;
|
||||
int tmpbridgeranknum = bridgelistnum[i];
|
||||
bridgelistnum[i] = bridgelistnum[i+1];
|
||||
bridgelistnum[i+1] = tmpbridgeranknum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* populate interleavedbridgeranklist - essentially the agg rank list
|
||||
* is now sorted by the ion minimum bridge rank and bridge node */
|
||||
int currentrankoffset = 0;
|
||||
for (i=0;i<numbridges;i++) {
|
||||
int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
int numAggsForThisBridge = 0;
|
||||
|
||||
int k;
|
||||
for (k=0;k<naggs;k++) {
|
||||
int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
|
||||
if (aggbridgerank == bridgelist[i]) {
|
||||
thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
|
||||
numAggsForThisBridge++;
|
||||
}
|
||||
}
|
||||
|
||||
// sort thisBridgeAggList
|
||||
for (x=0;x<numAggsForThisBridge;x++) {
|
||||
int n;
|
||||
for (n=0;n<(numAggsForThisBridge-1);n++) {
|
||||
if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
|
||||
int tmpthisBridgeAggList = thisBridgeAggList[n];
|
||||
thisBridgeAggList[n] = thisBridgeAggList[n+1];
|
||||
thisBridgeAggList[n+1] = tmpthisBridgeAggList;
|
||||
}
|
||||
}
|
||||
}
|
||||
int n;
|
||||
for (n=0;n<numAggsForThisBridge;n++) {
|
||||
interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
|
||||
currentrankoffset++;
|
||||
}
|
||||
ADIOI_Free(thisBridgeAggList);
|
||||
}
|
||||
|
||||
#ifdef balancecontigtrace
|
||||
fprintf(stderr,"Interleaved aggregator list:\n");
|
||||
for (i=0;i<naggs;i++) {
|
||||
fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank,all_procInfo[interleavedbridgeranklist[i]].ionID);
|
||||
}
|
||||
fprintf(stderr,"Bridges list:\n");
|
||||
for (i=0;i<numbridges;i++) {
|
||||
fprintf(stderr,"bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n",i,summarybridgeminionaggrank[i],bridgelist[i],bridgelistnum[i],ionlist[i]);
|
||||
}
|
||||
|
||||
#endif
|
||||
/* copy the ranklist of IO aggregators to fd->hints */
|
||||
if(fd->hints->ranklist != NULL)
|
||||
ADIOI_Free (fd->hints->ranklist);
|
||||
if(fd->hints->fs_hints.bg.bridgelist != NULL)
|
||||
ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
|
||||
if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
|
||||
ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);
|
||||
|
||||
fd->hints->cb_nodes = naggs;
|
||||
fd->hints->fs_hints.bg.numbridges = numbridges;
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );
|
||||
|
||||
fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );
|
||||
|
||||
fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );
|
||||
|
||||
ADIOI_Free(summarybridgeminionaggrank);
|
||||
ADIOI_Free( tmp_ranklist );
|
||||
ADIOI_Free( bridgelistnum );
|
||||
ADIOI_Free( bridgelist );
|
||||
ADIOI_Free( interleavedbridgeranklist );
|
||||
ADIOI_Free(ionlist);
|
||||
|
||||
} else {
|
||||
/* classic topology-agnostic copy of the ranklist of IO aggregators to
|
||||
* fd->hints */
|
||||
if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
|
||||
|
||||
fd->hints->cb_nodes = naggs;
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
|
||||
memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
|
||||
|
||||
ADIOI_Free( tmp_ranklist );
|
||||
}
|
||||
TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
|
||||
return;
|
||||
}
|
33
ompi/mca/io/romio/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
Обычный файл
33
ompi/mca/io/romio/romio/adio/ad_gpfs/bg/ad_bg_aggrs.h
Обычный файл
@ -0,0 +1,33 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_aggrs.h
|
||||
* \brief Declares ADIOI_BG_gen_agg_ranklist(), which generates the list of I/O aggregator ranks on BlueGene.
|
||||
*/
|
||||
|
||||
/*
|
||||
*
|
||||
* Declares functions specific for the BlueGene platform within the GPFS
|
||||
* parallel I/O solution. Implements aligned file-domain partitioning
|
||||
* (7/28/2005); persistent file domain work not implemented
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef AD_BG_AGGRS_H_
|
||||
#define AD_BG_AGGRS_H_
|
||||
|
||||
#include "adio.h"
|
||||
#include <sys/stat.h>
|
||||
|
||||
#ifdef HAVE_GPFS_H
|
||||
#include <gpfs.h>
|
||||
#endif
|
||||
#if !defined(GPFS_SUPER_MAGIC)
|
||||
#define GPFS_SUPER_MAGIC (0x47504653)
|
||||
#endif
|
||||
|
||||
/* generate a list of I/O aggregators that utilizes BG-PSET organization. */
|
||||
int ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset);
|
||||
|
||||
#endif /* AD_BG_AGGRS_H_ */
|
@ -3,28 +3,37 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_bg_pset.c
|
||||
* \brief Definition of functions associated to structs ADIOI_BG_ProcInfo_t and ADIOI_BG_ConfInfo_t
|
||||
* \brief Definition of functions associated to structs ADIOI_BG_ProcInfo_t and ADIOI_BG_ConfInfo_t
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
/*
|
||||
* Copyright (C) 1997 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
/* #define TRACE_ON */
|
||||
// #define bridgeringaggtrace 1
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "ad_bg.h"
|
||||
#include "../ad_gpfs.h"
|
||||
#include "ad_bg_pset.h"
|
||||
#include "mpidimpl.h"
|
||||
#include <spi/include/kernel/process.h>
|
||||
#include <firmware/include/personality.h>
|
||||
|
||||
#ifdef HAVE_MPIX_H
|
||||
#include <mpix.h>
|
||||
#endif
|
||||
|
||||
#ifndef TRACE_ERR
|
||||
# define TRACE_ERR(fmt...)
|
||||
#endif
|
||||
|
||||
ADIOI_BG_ProcInfo_t *
|
||||
ADIOI_BG_ProcInfo_new()
|
||||
{
|
||||
ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ProcInfo_t));
|
||||
ADIOI_BG_assert ((p != NULL));
|
||||
ADIOI_Assert ((p != NULL));
|
||||
return p;
|
||||
}
|
||||
|
||||
@ -32,7 +41,7 @@ ADIOI_BG_ProcInfo_t *
|
||||
ADIOI_BG_ProcInfo_new_n( int n )
|
||||
{
|
||||
ADIOI_BG_ProcInfo_t *p = (ADIOI_BG_ProcInfo_t *) ADIOI_Malloc (n * sizeof(ADIOI_BG_ProcInfo_t));
|
||||
ADIOI_BG_assert ((p != NULL));
|
||||
ADIOI_Assert ((p != NULL));
|
||||
return p;
|
||||
}
|
||||
|
||||
@ -46,7 +55,7 @@ ADIOI_BG_ConfInfo_t *
|
||||
ADIOI_BG_ConfInfo_new ()
|
||||
{
|
||||
ADIOI_BG_ConfInfo_t *p = (ADIOI_BG_ConfInfo_t *) ADIOI_Malloc (sizeof(ADIOI_BG_ConfInfo_t));
|
||||
ADIOI_BG_assert ((p != NULL));
|
||||
ADIOI_Assert ((p != NULL));
|
||||
return p;
|
||||
}
|
||||
|
||||
@ -72,10 +81,40 @@ static int intsort(const void *p1, const void *p2)
|
||||
return(i1->bridgeCoord - i2->bridgeCoord);
|
||||
}
|
||||
|
||||
unsigned torusSize[MPIX_TORUS_MAX_DIMS];
|
||||
unsigned dimTorus[MPIX_TORUS_MAX_DIMS];
|
||||
|
||||
void
|
||||
ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
ADIOI_BG_ProcInfo_t *proc,
|
||||
/* This function computes the number of hops between the torus coordinates of the
|
||||
* aggCoords and bridgeCoords parameters.
*
* Both arrays are read at indices 0 .. MPIX_TORUS_MAX_DIMS-1.  For each
* dimension the distance starts as the absolute difference of the two
* coordinates.  When that difference is non-zero and dimTorus[i] is set,
* a wrap-around hop count of (other coordinate + 1) is used instead, but
* only if one of the two coordinates equals torusSize[i] and the
* wrap-around count is smaller.  The return value is the sum of the
* per-dimension distances.
*
* dimTorus[] and torusSize[] are the file-scope arrays declared above; they
* are filled from hw.isTorus[] / hw.Size[] in ADIOI_BG_persInfo_init when
* gpfsmpio_bridgeringagg > 0, so this function is only meaningful after
* that initialization has run.
*
* NOTE(review): torusSize[i] is a copy of hw.Size[i].  If coordinates are
* zero-based (0 .. Size-1) the "== torusSize[i]" tests below can never be
* true and the wrap-around shortcut is never taken - confirm the
* coordinate range against the MPIX hardware interface.
|
||||
*/
|
||||
static unsigned procManhattanDistance(unsigned *aggCoords, unsigned *bridgeCoords) {
|
||||
|
||||
unsigned totalDistance = 0;
|
||||
int i;
|
||||
for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
|
||||
unsigned dimDistance = abs((int)aggCoords[i] - (int)bridgeCoords[i]);
|
||||
if (dimDistance > 0) { // could torus make it closer?
|
||||
if (dimTorus[i]) {
|
||||
if (aggCoords[i] == torusSize[i]) { // is wrap-around closer
|
||||
if ((bridgeCoords[i]+1) < dimDistance) // assume will use torus link
|
||||
dimDistance = bridgeCoords[i]+1;
|
||||
}
|
||||
else if (bridgeCoords[i] == torusSize[i]) { // is wrap-around closer
|
||||
if ((aggCoords[i]+1) < dimDistance) // assume will use torus link
|
||||
dimDistance = aggCoords[i]+1;
|
||||
}
|
||||
}
|
||||
} /* else: dimDistance == 0, meaning aggCoords[i] and bridgeCoords[i] are
|
||||
the same and there's no closer point to pick */
|
||||
totalDistance += dimDistance;
|
||||
}
|
||||
return totalDistance;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
ADIOI_BG_ProcInfo_t *proc,
|
||||
int size, int rank, int n_aggrs, MPI_Comm comm)
|
||||
{
|
||||
int i, iambridge=0, bridgerank = -1, bridgeIndex;
|
||||
@ -95,11 +134,43 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
|
||||
proc->rank = rank;
|
||||
proc->coreID = hw.coreID;
|
||||
|
||||
if (gpfsmpio_bridgeringagg > 0) {
|
||||
#ifdef bridgeringaggtrace
|
||||
if (rank == 0)
|
||||
fprintf(stderr,"Block dimensions:\n");
|
||||
#endif
|
||||
|
||||
/* Set the numNodesInPartition and nodeRank for this proc
|
||||
*/
|
||||
proc->numNodesInPartition = 1;
|
||||
proc->nodeRank = 0;
|
||||
for (i=0;i<MPIX_TORUS_MAX_DIMS;i++) {
|
||||
torusSize[i] = hw.Size[i];
|
||||
dimTorus[i] = hw.isTorus[i];
|
||||
proc->numNodesInPartition *= hw.Size[i];
|
||||
int baseNum = 1, j;
|
||||
for (j=0;j<i;j++)
|
||||
baseNum *= hw.Size[j];
|
||||
proc->nodeRank += (hw.Coords[i] * baseNum);
|
||||
#ifdef bridgeringaggtrace
|
||||
if (rank == 0)
|
||||
fprintf(stderr,"Dimension %d has %d elements wrap-around value is %d\n",i,torusSize[i],dimTorus[i]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Comm_size(comm, &commsize);
|
||||
|
||||
proc->ionID = MPIX_IO_node_id ();
|
||||
|
||||
if(size == 1)
|
||||
{
|
||||
proc->iamBridge = 1;
|
||||
proc->bridgeRank = rank;
|
||||
if (gpfsmpio_bridgeringagg > 0) {
|
||||
proc->manhattanDistanceToBridge = 0;
|
||||
}
|
||||
|
||||
/* Set up the other parameters */
|
||||
proc->myIOSize = size;
|
||||
@ -111,7 +182,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
conf->cpuIDsize = hw.ppn;
|
||||
/*conf->virtualPsetSize = conf->ioMaxSize * conf->cpuIDsize;*/
|
||||
conf->nAggrs = 1;
|
||||
conf->aggRatio = 1. * conf->nAggrs / conf->ioMaxSize /*virtualPsetSize*/;
|
||||
conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize*/;
|
||||
if(conf->aggRatio > 1) conf->aggRatio = 1.;
|
||||
TRACE_ERR("I am (single) Bridge rank\n");
|
||||
return;
|
||||
@ -120,21 +191,45 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
/* Find the nearest bridge node coords. We don't know the
|
||||
rank in our comm so we will collective find/pick a bridge
|
||||
rank later.
|
||||
*/
|
||||
*/
|
||||
int32_t bridgeCoords;
|
||||
bridgeCoords = pers.Network_Config.cnBridge_A << 24 |
|
||||
pers.Network_Config.cnBridge_B << 18 |
|
||||
pers.Network_Config.cnBridge_C << 12 |
|
||||
pers.Network_Config.cnBridge_D << 6 |
|
||||
bridgeCoords = pers.Network_Config.cnBridge_A << 24 |
|
||||
pers.Network_Config.cnBridge_B << 18 |
|
||||
pers.Network_Config.cnBridge_C << 12 |
|
||||
pers.Network_Config.cnBridge_D << 6 |
|
||||
pers.Network_Config.cnBridge_E << 2;
|
||||
ADIOI_BG_assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */
|
||||
ADIOI_Assert((bridgeCoords >= 0)); /* A dim is < 6 bits or sorting won't work */
|
||||
|
||||
if((hw.Coords[0] == pers.Network_Config.cnBridge_A) &&
|
||||
(hw.Coords[1] == pers.Network_Config.cnBridge_B) &&
|
||||
(hw.Coords[2] == pers.Network_Config.cnBridge_C) &&
|
||||
(hw.Coords[3] == pers.Network_Config.cnBridge_D) &&
|
||||
(hw.Coords[4] == pers.Network_Config.cnBridge_E))
|
||||
if((hw.Coords[0] == pers.Network_Config.cnBridge_A) &&
|
||||
(hw.Coords[1] == pers.Network_Config.cnBridge_B) &&
|
||||
(hw.Coords[2] == pers.Network_Config.cnBridge_C) &&
|
||||
(hw.Coords[3] == pers.Network_Config.cnBridge_D) &&
|
||||
(hw.Coords[4] == pers.Network_Config.cnBridge_E)) {
|
||||
iambridge = 1; /* I am bridge */
|
||||
if (gpfsmpio_bridgeringagg > 0) {
|
||||
proc->manhattanDistanceToBridge = 0;
|
||||
}
|
||||
}
|
||||
else { // calculate manhattan distance to bridge if gpfsmpio_bridgeringagg is set
|
||||
if (gpfsmpio_bridgeringagg > 0) {
|
||||
unsigned aggCoords[MPIX_TORUS_MAX_DIMS],manhattanBridgeCoords[MPIX_TORUS_MAX_DIMS];
|
||||
aggCoords[0] = hw.Coords[0];
|
||||
manhattanBridgeCoords[0] = pers.Network_Config.cnBridge_A;
|
||||
aggCoords[1] = hw.Coords[1];
|
||||
manhattanBridgeCoords[1] = pers.Network_Config.cnBridge_B;
|
||||
aggCoords[2] = hw.Coords[2];
|
||||
manhattanBridgeCoords[2] = pers.Network_Config.cnBridge_C;
|
||||
aggCoords[3] = hw.Coords[3];
|
||||
manhattanBridgeCoords[3] = pers.Network_Config.cnBridge_D;
|
||||
aggCoords[4] = hw.Coords[4];
|
||||
manhattanBridgeCoords[4] = pers.Network_Config.cnBridge_E;
|
||||
|
||||
proc->manhattanDistanceToBridge= procManhattanDistance(aggCoords, manhattanBridgeCoords);
|
||||
#ifdef bridgeringaggtrace
|
||||
fprintf(stderr,"agg coords are %u %u %u %u %u bridge coords are %u %u %u %u %u distance is %u\n",aggCoords[0],aggCoords[1],aggCoords[2],aggCoords[3],aggCoords[4],manhattanBridgeCoords[0],manhattanBridgeCoords[1],manhattanBridgeCoords[2],manhattanBridgeCoords[3],manhattanBridgeCoords[4], proc->manhattanDistanceToBridge);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
TRACE_ERR("Bridge coords(%8.8X): %d %d %d %d %d, %d. iambridge %d\n",bridgeCoords, pers.Network_Config.cnBridge_A,pers.Network_Config.cnBridge_B,pers.Network_Config.cnBridge_C,pers.Network_Config.cnBridge_D,pers.Network_Config.cnBridge_E,0, iambridge);
|
||||
|
||||
@ -143,16 +238,16 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
bridges = (sortstruct *) ADIOI_Malloc(sizeof(sortstruct) * size);
|
||||
|
||||
/* We're going to sort this structure by bridgeCoord:
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int rank;
|
||||
int bridgeCoord;
|
||||
} sortstruct;
|
||||
|
||||
and I want the rank that IS the bridge to sort first, so
|
||||
OR in '1' on non-bridge ranks that use a bridge coord.
|
||||
*/
|
||||
} sortstruct;
|
||||
|
||||
and I want the rank that IS the bridge to sort first, so
|
||||
OR in '1' on non-bridge ranks that use a bridge coord.
|
||||
*/
|
||||
|
||||
/* My input to the collective */
|
||||
bridges[rank].rank = rank;
|
||||
@ -173,18 +268,18 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
tempRank = bridges[0].rank;
|
||||
|
||||
countPset=1;
|
||||
bridgeIndex = 0;
|
||||
bridgeIndex = 0;
|
||||
mincompute = size+1;
|
||||
maxcompute = 1;
|
||||
|
||||
for(i=1; i<size; i++)
|
||||
{
|
||||
if((bridges[i].bridgeCoord & ~1) == tempCoords)
|
||||
if((bridges[i].bridgeCoord & ~1) == tempCoords)
|
||||
countPset++; /* same bridge (pset), count it */
|
||||
else /* new bridge found */
|
||||
{
|
||||
#ifdef TRACE_ON
|
||||
if(rank == 0)
|
||||
if(rank == 0)
|
||||
TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
|
||||
bridgeIndex, tempRank, tempCoords, countPset);
|
||||
#endif
|
||||
@ -193,13 +288,13 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
if(countPset < mincompute)
|
||||
mincompute = countPset;
|
||||
|
||||
/* Is this my bridge? */
|
||||
/* Was this my bridge we finished? */
|
||||
if(tempCoords == bridgeCoords)
|
||||
{
|
||||
/* Am I the bridge rank? */
|
||||
if(tempRank == rank)
|
||||
iambridge = 1;
|
||||
else
|
||||
else
|
||||
iambridge = 0; /* Another rank on my node may have taken over */
|
||||
TRACE_ERR("Rank %u, bridge set %u, bridge rank %d (%#8.8X) has %d ranks, iambridge %u\n",
|
||||
rank, bridgeIndex, tempRank, tempCoords, countPset,iambridge);
|
||||
@ -207,6 +302,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
proc->myIOSize = countPset;
|
||||
proc->ioNodeIndex = bridgeIndex;
|
||||
}
|
||||
/* Setup next bridge */
|
||||
tempCoords = bridges[i].bridgeCoord & ~1;
|
||||
tempRank = bridges[i].rank;
|
||||
bridgeIndex++;
|
||||
@ -216,7 +312,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
/* Process last bridge */
|
||||
|
||||
#ifdef TRACE_ON
|
||||
if(rank == 0)
|
||||
if(rank == 0)
|
||||
TRACE_ERR("Bridge set %u, bridge rank %d (%#8.8X) has %d ranks\n",
|
||||
bridgeIndex, tempRank, tempCoords, countPset);
|
||||
#endif
|
||||
@ -225,21 +321,21 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
if(countPset < mincompute)
|
||||
mincompute = countPset;
|
||||
|
||||
/* Is this my bridge? */
|
||||
/* Was this my bridge? */
|
||||
if(tempCoords == bridgeCoords)
|
||||
{
|
||||
/* Am I the bridge rank? */
|
||||
if(tempRank == rank)
|
||||
iambridge = 1;
|
||||
else
|
||||
else
|
||||
iambridge = 0; /* Another rank on my node may have taken over */
|
||||
bridgerank = tempRank;
|
||||
proc->myIOSize = countPset;
|
||||
proc->ioNodeIndex = bridgeIndex;
|
||||
}
|
||||
|
||||
|
||||
if(rank == 0)
|
||||
|
||||
|
||||
if(rank == 0)
|
||||
{
|
||||
/* Only rank 0 has a conf structure, fill in stuff as appropriate */
|
||||
conf->ioMinSize = mincompute;
|
||||
@ -248,21 +344,23 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
conf->nProcs = size;
|
||||
conf->cpuIDsize = hw.ppn;
|
||||
/*conf->virtualPsetSize = maxcompute * conf->cpuIDsize;*/
|
||||
|
||||
|
||||
conf->nAggrs = n_aggrs;
|
||||
/* First pass gets nAggrs = -1 */
|
||||
if(conf->nAggrs <=0 ||
|
||||
MIN(conf->nProcs, conf->ioMaxSize /*virtualPsetSize*/) < conf->nAggrs)
|
||||
conf->nAggrs = ADIOI_BG_NAGG_PSET_DFLT;
|
||||
if(conf->nAggrs > conf->numBridgeRanks) /* maybe? * conf->cpuIDsize) */
|
||||
conf->nAggrs = conf->numBridgeRanks; /* * conf->cpuIDsize; */
|
||||
|
||||
conf->aggRatio = 1. * conf->nAggrs / conf->ioMaxSize /*virtualPsetSize*/;
|
||||
if(conf->aggRatio > 1) conf->aggRatio = 1.;
|
||||
TRACE_ERR("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, vps: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n", maxcompute, mincompute, conf->nAggrs, conf->ioMaxSize /*virtualPsetSize*/, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT, conf->nAggrs, conf->aggRatio);
|
||||
if(conf->nAggrs <=0)
|
||||
conf->nAggrs = gpfsmpio_bg_nagg_pset;
|
||||
if(conf->ioMinSize <= conf->nAggrs)
|
||||
conf->nAggrs = ADIOI_MAX(1,conf->ioMinSize-1); /* not including bridge itself */
|
||||
/* if(conf->nAggrs > conf->numBridgeRanks)
|
||||
conf->nAggrs = conf->numBridgeRanks;
|
||||
*/
|
||||
conf->aggRatio = 1. * conf->nAggrs / conf->ioMinSize /*virtualPsetSize*/;
|
||||
/* if(conf->aggRatio > 1) conf->aggRatio = 1.; */
|
||||
TRACE_ERR("n_aggrs %zd, conf->nProcs %zu, conf->ioMaxSize %zu, ADIOI_BG_NAGG_PSET_DFLT %zu,conf->numBridgeRanks %zu,conf->nAggrs %zu\n",(size_t)n_aggrs, (size_t)conf->nProcs, (size_t)conf->ioMaxSize, (size_t)ADIOI_BG_NAGG_PSET_DFLT,(size_t)conf->numBridgeRanks,(size_t)conf->nAggrs);
|
||||
TRACE_ERR("Maximum ranks under a bridge rank: %d, minimum: %d, nAggrs: %d, numBridgeRanks: %d pset dflt: %d naggrs: %d ratio: %f\n", maxcompute, mincompute, conf->nAggrs, conf->numBridgeRanks, ADIOI_BG_NAGG_PSET_DFLT, conf->nAggrs, conf->aggRatio);
|
||||
}
|
||||
|
||||
ADIOI_BG_assert((bridgerank != -1));
|
||||
ADIOI_Assert((bridgerank != -1));
|
||||
proc->bridgeRank = bridgerank;
|
||||
proc->iamBridge = iambridge;
|
||||
TRACE_ERR("Rank %d has bridge set index %d (bridge rank: %d) with %d other ranks, ioNodeIndex: %d\n", rank, proc->ioNodeIndex, bridgerank, proc->myIOSize, proc->ioNodeIndex);
|
||||
@ -271,7 +369,7 @@ ADIOI_BG_persInfo_init(ADIOI_BG_ConfInfo_t *conf,
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
void
|
||||
ADIOI_BG_persInfo_free( ADIOI_BG_ConfInfo_t *conf, ADIOI_BG_ProcInfo_t *proc )
|
||||
{
|
||||
ADIOI_BG_ConfInfo_free( conf );
|
@ -8,7 +8,7 @@
|
||||
|
||||
/* File: ad_bg_pset.h
|
||||
*
|
||||
* Defines two structures that keep BG/L PSET specific information and their public interfaces:
|
||||
* Defines two structures that keep BlueGene PSET specific information and their public interfaces:
|
||||
* . ADIOI_BG_ProcInfo_t object keeps specific information to each process
|
||||
* . ADIOI_BG_ConfInfo_t object keeps general information for the whole communicator, only kept
|
||||
* on process 0.
|
||||
@ -17,10 +17,15 @@
|
||||
#ifndef AD_BG_PSET_H_
|
||||
#define AD_BG_PSET_H_
|
||||
|
||||
#ifdef HAVE_MPIX_H
|
||||
#include <mpix.h>
|
||||
#endif
|
||||
|
||||
/* Keeps specific information to each process, will be exchanged among processes */
|
||||
typedef struct {
|
||||
int ioNodeIndex; /* similar to psetNum on BGL/BGP */
|
||||
int rank; /* my rank */
|
||||
int ionID; /* ion id this cn is using */
|
||||
/* int myCoords[5]; */
|
||||
int bridgeRank; /* my bridge node (or proxy) rank */
|
||||
unsigned char coreID;
|
||||
@ -30,6 +35,9 @@ typedef struct {
|
||||
node, i.e. psetsize*/
|
||||
int iamBridge; /* am *I* the bridge rank? */
|
||||
int __ipad[2];
|
||||
unsigned nodeRank; /* torus coords converted to an integer for use with gpfsmpio_bridgeringagg */
|
||||
unsigned numNodesInPartition; /* number of physical nodes in the job partition */
|
||||
unsigned manhattanDistanceToBridge; /* number of hops between this rank and the bridge node */
|
||||
} ADIOI_BG_ProcInfo_t __attribute__((aligned(16)));
|
||||
|
||||
/* Keeps general information for the whole communicator, only on process 0 */
|
||||
@ -48,15 +56,9 @@ typedef struct {
|
||||
|
||||
|
||||
#undef MIN
|
||||
#define MIN(a,b) ((a<b ? a : b))
|
||||
#define MIN(a,b) (((a)<(b) ? (a) : (b)))
|
||||
|
||||
|
||||
/* Default is to choose 8 aggregator nodes in each 32 CN pset.
|
||||
Also defines default ratio of aggregator nodes in each a pset.
|
||||
For Virtual Node Mode, the ratio is 8/64 */
|
||||
#define ADIOI_BG_NAGG_PSET_MIN 1
|
||||
#define ADIOI_BG_NAGG_PSET_DFLT 8
|
||||
#define ADIOI_BG_PSET_SIZE_DFLT 32
|
||||
|
||||
|
||||
/* public funcs for ADIOI_BG_ProcInfo_t objects */
|
16
ompi/mca/io/romio/romio/adio/ad_gpfs/pe/Makefile.mk
Обычный файл
16
ompi/mca/io/romio/romio/adio/ad_gpfs/pe/Makefile.mk
Обычный файл
@ -0,0 +1,16 @@
|
||||
## -*- Mode: Makefile; -*-
|
||||
## vim: set ft=automake :
|
||||
##
|
||||
## (C) 2012 by Argonne National Laboratory.
|
||||
## See COPYRIGHT in top-level directory.
|
||||
##
|
||||
|
||||
if BUILD_AD_PE
|
||||
|
||||
noinst_HEADERS += \
|
||||
adio/ad_gpfs/pe/ad_pe_aggrs.h
|
||||
|
||||
romio_other_sources += \
|
||||
adio/ad_gpfs/pe/ad_pe_aggrs.c
|
||||
|
||||
endif BUILD_AD_PE
|
276
ompi/mca/io/romio/romio/adio/ad_gpfs/pe/ad_pe_aggrs.c
Обычный файл
276
ompi/mca/io/romio/romio/adio/ad_gpfs/pe/ad_pe_aggrs.c
Обычный файл
@ -0,0 +1,276 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_pe_aggrs.c
|
||||
* \brief The externally used function from this file is declared in ad_pe_aggrs.h
|
||||
*/
|
||||
|
||||
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
||||
/*
|
||||
* Copyright (C) 1997-2001 University of Chicago.
|
||||
* See COPYRIGHT notice in top-level directory.
|
||||
*/
|
||||
|
||||
/*#define TRACE_ON */
|
||||
|
||||
#include "adio.h"
|
||||
#include "adio_cb_config_list.h"
|
||||
#include "../ad_gpfs.h"
|
||||
#include "ad_pe_aggrs.h"
|
||||
#include "mpiimpl.h"
|
||||
|
||||
#ifdef AGGREGATION_PROFILE
|
||||
#include "mpe.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_DBG_LOGGING
|
||||
#define AGG_DEBUG 1
|
||||
#endif
|
||||
|
||||
#ifndef TRACE_ERR
|
||||
# define TRACE_ERR(format...)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Compute the aggregator-related parameters that are required in 2-phase
|
||||
* collective IO of ADIO.
|
||||
* The parameters are
|
||||
* . the number of aggregators (proxies) : fd->hints->cb_nodes
|
||||
* . the ranks of the aggregators : fd->hints->ranklist
|
||||
* If MP_IONODEFILE is defined, POE determines all tasks on every node listed
|
||||
* in the node file and defines MP_IOTASKLIST with them, making them all
|
||||
* aggregators. Alternatively, the user can explicitly set MP_IOTASKLIST
|
||||
* themselves. The format of the MP_IOTASKLIST is a colon-delimited list of
|
||||
* task ids, the first entry being the total number of aggregators, for example
|
||||
* to specify 4 aggregators on task ids 0,8,16,24 the value would be:
|
||||
* 4:0:8:16:24. If there is no MP_IONODEFILE, or MP_IOTASKLIST, then the
|
||||
* default aggregator selection is 1 task per node for every node of the job -
|
||||
* additionally, an environment variable MP_IOAGGR_CNT can be specified, which
|
||||
* defines the total number of aggregators, spread evenly across all the nodes.
|
||||
* The romio_cb_nodes and romio_cb_config_list hint user settings are ignored.
|
||||
*/
|
||||
int
|
||||
ADIOI_PE_gen_agg_ranklist(ADIO_File fd)
|
||||
{
|
||||
|
||||
int numAggs = 0;
|
||||
char *ioTaskList = getenv( "MP_IOTASKLIST" );
|
||||
char *ioAggrCount = getenv("MP_IOAGGR_CNT");
|
||||
int i,j;
|
||||
int inTERcommFlag = 0;
|
||||
|
||||
int myRank,commSize;
|
||||
MPI_Comm_rank(fd->comm, &myRank);
|
||||
MPI_Comm_size(fd->comm, &commSize);
|
||||
|
||||
MPI_Comm_test_inter(fd->comm, &inTERcommFlag);
|
||||
if (inTERcommFlag) {
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: inTERcomms are not supported in MPI-IO - aborting....\n");
|
||||
perror("ADIOI_PE_gen_agg_ranklist:");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
|
||||
if (ioTaskList) {
|
||||
int ioTaskListLen = strlen(ioTaskList);
|
||||
int ioTaskListPos = 0;
|
||||
char tmpBuf[8]; /* Big enough for 1M tasks (7 digits task ID). */
|
||||
tmpBuf[7] = '\0';
|
||||
for (i=0; i<7; i++) {
|
||||
tmpBuf[i] = *ioTaskList++; /* Maximum is 7 digits for 1 million. */
|
||||
ioTaskListPos++;
|
||||
if (*ioTaskList == ':') { /* If the next char is a ':' ends it. */
|
||||
tmpBuf[i+1] = '\0';
|
||||
break;
|
||||
}
|
||||
}
|
||||
numAggs = atoi(tmpBuf);
|
||||
if (numAggs == 0)
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: Number of aggregators specified in MP_IOTASKLIST set at 0 - default aggregator selection will be used.\n");
|
||||
else if (!((numAggs > 0 ) && (numAggs <= commSize))) {
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: The number of aggregators (%s) specified in MP_IOTASKLIST is outside the communicator task range of %d.\n",tmpBuf,commSize);
|
||||
numAggs = commSize;
|
||||
}
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));
|
||||
|
||||
int aggIndex = 0;
|
||||
while (aggIndex < numAggs) {
|
||||
ioTaskList++; /* Advance past the ':' */
|
||||
ioTaskListPos++;
|
||||
int allDigits=1;
|
||||
for (i=0; i<7; i++) {
|
||||
if (*ioTaskList < '0' || *ioTaskList > '9')
|
||||
allDigits=0;
|
||||
tmpBuf[i] = *ioTaskList++;
|
||||
ioTaskListPos++;
|
||||
if ( (*ioTaskList == ':') || (*ioTaskList == '\0') ) {
|
||||
tmpBuf[i+1] = '\0';
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allDigits) {
|
||||
int newAggRank = atoi(tmpBuf);
|
||||
if (!((newAggRank >= 0 ) && (newAggRank < commSize))) {
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not within the communicator task range of 0 to %d - it will be ignored.\n",tmpBuf,commSize-1);
|
||||
}
|
||||
else {
|
||||
int aggAlreadyAdded = 0;
|
||||
for (i=0;i<aggIndex;i++)
|
||||
if (fd->hints->ranklist[i] == newAggRank) {
|
||||
aggAlreadyAdded = 1;
|
||||
break;
|
||||
}
|
||||
if (!aggAlreadyAdded)
|
||||
fd->hints->ranklist[aggIndex++] = newAggRank;
|
||||
else
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%d' is specified multiple times in MP_IOTASKLIST - duplicates are ignored.\n",newAggRank);
|
||||
}
|
||||
}
|
||||
else {
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not a valid integer task id - it will be ignored.\n",tmpBuf);
|
||||
}
|
||||
|
||||
/* At the end check whether the list is shorter than specified. */
|
||||
if (ioTaskListPos == ioTaskListLen) {
|
||||
if (aggIndex == 0) {
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: No aggregators were correctly specified in MP_IOTASKLIST - default aggregator selection will be used.\n");
|
||||
ADIOI_Free(fd->hints->ranklist);
|
||||
}
|
||||
else if (aggIndex < numAggs)
|
||||
FPRINTF(stderr,"ERROR: ATTENTION: %d aggregators were specified in MP_IOTASKLIST but only %d were correctly specified - setting the number of aggregators to %d.\n",numAggs, aggIndex,aggIndex);
|
||||
numAggs = aggIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (numAggs == 0) {
|
||||
MPID_Comm *mpidCommData;
|
||||
|
||||
MPID_Comm_get_ptr(fd->comm,mpidCommData);
|
||||
int localSize = mpidCommData->local_size;
|
||||
|
||||
// get my node rank
|
||||
int myNodeRank = mpidCommData->intranode_table[mpidCommData->rank];
|
||||
|
||||
int *allNodeRanks = (int *) ADIOI_Malloc (localSize * sizeof(int));
|
||||
|
||||
allNodeRanks[myRank] = myNodeRank;
|
||||
MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, allNodeRanks, 1, MPI_INT, fd->comm);
|
||||
|
||||
#ifdef AGG_DEBUG
|
||||
printf("MPID_Comm data: local_size is %d\nintranode_table entries:\n",mpidCommData->local_size);
|
||||
for (i=0;i<localSize;i++) {
|
||||
printf("%d ",mpidCommData->intranode_table[i]);
|
||||
}
|
||||
printf("\ninternode_table entries:\n");
|
||||
for (i=0;i<localSize;i++) {
|
||||
printf("%d ",mpidCommData->internode_table[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("\nallNodeRanks entries:\n");
|
||||
for (i=0;i<localSize;i++) {
|
||||
printf("%d ",allNodeRanks[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
#endif
|
||||
|
||||
if (ioAggrCount) {
|
||||
int cntType = -1;
|
||||
|
||||
if ( strcasecmp(ioAggrCount, "ALL") ) {
|
||||
if ( (cntType = atoi(ioAggrCount)) <= 0 ) {
|
||||
/* Input is either non-digit or less than 1, so assume */
|
||||
/* 1 aggregator per node. Note: atoi("-1") returns -1. */
|
||||
/* No warning message given here -- done earlier. */
|
||||
cntType = -1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* ALL is specified set aggr count to localSize */
|
||||
cntType = -2;
|
||||
}
|
||||
switch(cntType) {
|
||||
case -1:
|
||||
/* 1 aggr/node case */
|
||||
{
|
||||
int rankListIndex = 0;
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
|
||||
for (i=0;i<localSize;i++) {
|
||||
if (allNodeRanks[i] == 0) {
|
||||
fd->hints->ranklist[rankListIndex++] = i;
|
||||
numAggs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case -2:
|
||||
/* ALL tasks case */
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
|
||||
for (i=0;i<localSize;i++) {
|
||||
fd->hints->ranklist[i] = i;
|
||||
numAggs++;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/* Specific aggr count case -- MUST NOT exceed localSize; a larger value is clamped to localSize */
|
||||
if (cntType > localSize)
|
||||
cntType = localSize;
|
||||
|
||||
numAggs = cntType;
|
||||
// Round-robin thru allNodeRanks - pick the 0's, then the 1's, etc
|
||||
int currentNodeRank = 0; // node rank currently being selected as aggregator
|
||||
int rankListIndex = 0;
|
||||
int currentAllNodeIndex = 0;
|
||||
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int));
|
||||
|
||||
while (rankListIndex < numAggs) {
|
||||
int foundEntry = 0;
|
||||
while (!foundEntry && (currentAllNodeIndex < localSize)) {
|
||||
if (allNodeRanks[currentAllNodeIndex] == currentNodeRank) {
|
||||
fd->hints->ranklist[rankListIndex++] = currentAllNodeIndex;
|
||||
foundEntry = 1;
|
||||
}
|
||||
currentAllNodeIndex++;
|
||||
}
|
||||
if (!foundEntry) {
|
||||
currentNodeRank++;
|
||||
currentAllNodeIndex = 0;
|
||||
}
|
||||
} // while
|
||||
break;
|
||||
} // switch(cntType)
|
||||
} // if (ioAggrCount)
|
||||
|
||||
else { // default is 1 aggregator per node
|
||||
// take the 0 entries from allNodeRanks
|
||||
int rankListIndex = 0;
|
||||
fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int));
|
||||
for (i=0;i<localSize;i++) {
|
||||
if (allNodeRanks[i] == 0) {
|
||||
fd->hints->ranklist[rankListIndex++] = i;
|
||||
numAggs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ADIOI_Free(allNodeRanks);
|
||||
|
||||
}
|
||||
|
||||
if ( getenv("MP_I_SHOW_AGGRS") ) {
|
||||
if (myRank == 0) {
|
||||
printf("Agg rank list of %d generated:\n", numAggs);
|
||||
for (i=0;i<numAggs;i++) {
|
||||
printf("%d ",fd->hints->ranklist[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
fd->hints->cb_nodes = numAggs;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
30
ompi/mca/io/romio/romio/adio/ad_gpfs/pe/ad_pe_aggrs.h
Обычный файл
30
ompi/mca/io/romio/romio/adio/ad_gpfs/pe/ad_pe_aggrs.h
Обычный файл
@ -0,0 +1,30 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* (C)Copyright IBM Corp. 2007, 2008 */
|
||||
/* ---------------------------------------------------------------- */
|
||||
/**
|
||||
* \file ad_pe_aggrs.h
|
||||
* \brief Declarations for PE-platform I/O aggregator rank-list generation (GPFS).
|
||||
*/
|
||||
|
||||
/*
|
||||
*
|
||||
* Declares functions specific for the PE platform within the GPFS
|
||||
* parallel I/O solution. For now simply processes the MP_IOTASKLIST
|
||||
* env var.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef AD_PE_AGGRS_H_
|
||||
#define AD_PE_AGGRS_H_
|
||||
|
||||
#include "adio.h"
|
||||
#include <sys/stat.h>
|
||||
|
||||
/* Fallback definition, used only when GPFS_SUPER_MAGIC is not already
 * defined. The value 0x47504653 is the ASCII byte sequence 'G' 'P' 'F' 'S'. */
#if !defined(GPFS_SUPER_MAGIC)
|
||||
#define GPFS_SUPER_MAGIC (0x47504653)
|
||||
#endif
|
||||
|
||||
/* generate a list of I/O aggregators following a methodology specific for PE */
/* \param fd  ADIO file handle for which the aggregator list is generated.
 * \return    int status.
 * NOTE(review): the implementation tail visible in this commit stores the
 * list in fd->hints->ranklist, sets fd->hints->cb_nodes to the number of
 * aggregators and returns 0 -- confirm against ad_pe_aggrs.c. */
|
||||
int ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
|
||||
|
||||
#endif /* AD_PE_AGGRS_H_ */
|
@ -34,4 +34,6 @@ struct ADIOI_Fns_struct ADIO_GRIDFTP_operations = {
|
||||
ADIOI_GRIDFTP_Resize, /* Resize */
|
||||
ADIOI_GRIDFTP_Delete, /* Delete */
|
||||
ADIOI_GRIDFTP_Feature, /* Features */
|
||||
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
|
||||
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
|
||||
};
|
||||
|
@ -1,3 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
|
||||
/*
|
||||
*
|
||||
* (C) 2008 by Argonne National Laboratory.
|
||||
* See COPYRIGHT in top-level directory.
|
||||
*/
|
||||
int ADIOI_GRIDFTP_Feature (ADIO_File fd, int flag)
|
||||
{
|
||||
switch(flag) {
|
||||
|
@ -106,7 +106,8 @@ void ADIOI_GRIDFTP_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
*error_code)
|
||||
{
|
||||
static char myname[]="ADIOI_GRIDFTP_ReadContig";
|
||||
int myrank, nprocs, datatype_size;
|
||||
int myrank, nprocs;
|
||||
MPI_Count datatype_size;
|
||||
globus_size_t len,bytes_read=0;
|
||||
globus_off_t goff;
|
||||
globus_result_t result;
|
||||
@ -121,7 +122,7 @@ void ADIOI_GRIDFTP_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
|
||||
MPI_Comm_size(fd->comm, &nprocs);
|
||||
MPI_Comm_rank(fd->comm, &myrank);
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
|
||||
if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
|
||||
{
|
||||
@ -219,11 +220,11 @@ void ADIOI_GRIDFTP_ReadDiscontig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Comm_size(fd->comm,&nprocs);
|
||||
|
||||
etype_size=fd->etype_size;
|
||||
MPI_Type_size(fd->filetype,&ftype_size);
|
||||
MPI_Type_size_x(fd->filetype,&ftype_size);
|
||||
MPI_Type_extent(fd->filetype,&ftype_extent);
|
||||
/* This is arguably unnecessary, as this routine assumes that the
|
||||
buffer in memory is contiguous */
|
||||
MPI_Type_size(datatype,&btype_size);
|
||||
MPI_Type_size_x(datatype,&btype_size);
|
||||
MPI_Type_extent(datatype,&btype_extent);
|
||||
ADIOI_Datatype_iscontig(datatype,&buf_contig);
|
||||
|
||||
@ -415,7 +416,7 @@ void ADIOI_GRIDFTP_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
MPI_Comm_size(fd->comm, &nprocs);
|
||||
MPI_Comm_rank(fd->comm, &myrank);
|
||||
|
||||
MPI_Type_size(datatype,&btype_size);
|
||||
MPI_Type_size_x(datatype,&btype_size);
|
||||
bufsize=count*btype_size;
|
||||
ADIOI_Datatype_iscontig(fd->filetype,&file_contig);
|
||||
ADIOI_Datatype_iscontig(datatype,&buf_contig);
|
||||
|
@ -112,7 +112,8 @@ void ADIOI_GRIDFTP_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
*error_code)
|
||||
{
|
||||
char myname[]="ADIOI_GRIDFTP_WriteContig";
|
||||
int myrank, nprocs, datatype_size;
|
||||
int myrank, nprocs;
|
||||
MPI_Count datatype_size;
|
||||
globus_size_t len,bytes_written=0;
|
||||
globus_off_t goff;
|
||||
globus_result_t result;
|
||||
@ -127,7 +128,7 @@ void ADIOI_GRIDFTP_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
|
||||
MPI_Comm_size(fd->comm, &nprocs);
|
||||
MPI_Comm_rank(fd->comm, &myrank);
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
|
||||
if (file_ptr_type != ADIO_EXPLICIT_OFFSET)
|
||||
{
|
||||
@ -219,11 +220,11 @@ void ADIOI_GRIDFTP_WriteDiscontig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Comm_rank(fd->comm,&myrank);
|
||||
MPI_Comm_size(fd->comm,&nprocs);
|
||||
etype_size=fd->etype_size;
|
||||
MPI_Type_size(fd->filetype,&ftype_size);
|
||||
MPI_Type_size_x(fd->filetype,&ftype_size);
|
||||
MPI_Type_extent(fd->filetype,&ftype_extent);
|
||||
/* This is arguably unnecessary, as this routine assumes that the
|
||||
buffer in memory is contiguous */
|
||||
MPI_Type_size(datatype,&btype_size);
|
||||
MPI_Type_size_x(datatype,&btype_size);
|
||||
MPI_Type_extent(datatype,&btype_extent);
|
||||
ADIOI_Datatype_iscontig(datatype,&buf_contig);
|
||||
|
||||
@ -406,7 +407,7 @@ void ADIOI_GRIDFTP_WriteStrided(ADIO_File fd, void *buf, int count,
|
||||
MPI_Comm_size(fd->comm, &nprocs);
|
||||
MPI_Comm_rank(fd->comm, &myrank);
|
||||
|
||||
MPI_Type_size(datatype,&btype_size);
|
||||
MPI_Type_size_x(datatype,&btype_size);
|
||||
bufsize=count*btype_size;
|
||||
ADIOI_Datatype_iscontig(fd->filetype,&file_contig);
|
||||
ADIOI_Datatype_iscontig(datatype,&buf_contig);
|
||||
|
@ -33,4 +33,6 @@ struct ADIOI_Fns_struct ADIO_HFS_operations = {
|
||||
ADIOI_GEN_Flush, /* Flush */
|
||||
ADIOI_HFS_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
|
||||
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
|
||||
};
|
||||
|
@ -15,12 +15,12 @@ void ADIOI_HFS_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size, len;
|
||||
MPI_Count err=-1, datatype_size, len;
|
||||
#ifndef PRINT_ERR_MSG
|
||||
static char myname[] = "ADIOI_HFS_READCONTIG";
|
||||
#endif
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
len = datatype_size * count;
|
||||
|
||||
#ifdef SPPUX
|
||||
|
@ -15,12 +15,12 @@ void ADIOI_HFS_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size, len;
|
||||
MPI_Count err=-1, datatype_size, len;
|
||||
#ifndef PRINT_ERR_MSG
|
||||
static char myname[] = "ADIOI_HFS_WRITECONTIG";
|
||||
#endif
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
len = datatype_size * count;
|
||||
|
||||
#ifdef SPPUX
|
||||
|
@ -40,4 +40,7 @@ struct ADIOI_Fns_struct ADIO_LUSTRE_operations = {
|
||||
ADIOI_GEN_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_GEN_Feature, /* Features */
|
||||
"LUSTRE:",
|
||||
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
|
||||
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
|
||||
};
|
||||
|
@ -48,15 +48,15 @@ void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int *error_code);
|
||||
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int *error_code);
|
||||
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int *error_code);
|
||||
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int *error_code);
|
||||
|
@ -215,8 +215,8 @@ void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
|
||||
my_req[i].offsets = (ADIO_Offset *)
|
||||
ADIOI_Malloc(count_my_req_per_proc[i] *
|
||||
sizeof(ADIO_Offset));
|
||||
my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
|
||||
sizeof(int));
|
||||
my_req[i].lens = ADIOI_Malloc(count_my_req_per_proc[i] *
|
||||
sizeof(ADIO_Offset));
|
||||
count_my_req_procs++;
|
||||
}
|
||||
my_req[i].count = 0; /* will be incremented where needed later */
|
||||
|
@ -10,14 +10,18 @@
|
||||
|
||||
#include "ad_lustre.h"
|
||||
#include "adio_extern.h"
|
||||
#include "hint_fns.h"
|
||||
#ifdef HAVE_LIMITS_H
|
||||
#include <limits.h>
|
||||
#endif
|
||||
|
||||
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
{
|
||||
char *value;
|
||||
int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
|
||||
int flag;
|
||||
ADIO_Offset stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
|
||||
struct lov_user_md lum = { 0 };
|
||||
int err, myrank, fd_sys, perm, amode, old_mask;
|
||||
int int_val, tmp_val;
|
||||
static char myname[] = "ADIOI_LUSTRE_SETINFO";
|
||||
|
||||
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
|
||||
@ -44,17 +48,17 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
ADIOI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag)
|
||||
str_unit=atoi(value);
|
||||
str_unit=atoll(value);
|
||||
|
||||
ADIOI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag)
|
||||
str_factor=atoi(value);
|
||||
str_factor=atoll(value);
|
||||
|
||||
ADIOI_Info_get(users_info, "romio_lustre_start_iodevice",
|
||||
MPI_MAX_INFO_VAL, value, &flag);
|
||||
if (flag)
|
||||
start_iodev=atoi(value);
|
||||
start_iodev=atoll(value);
|
||||
|
||||
/* direct read and write */
|
||||
ADIOI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
|
||||
@ -78,7 +82,7 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
stripe_val[1] = str_unit;
|
||||
stripe_val[2] = start_iodev;
|
||||
}
|
||||
MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);
|
||||
MPI_Bcast(stripe_val, 3, MPI_OFFSET, 0, fd->comm);
|
||||
|
||||
if (stripe_val[0] != str_factor
|
||||
|| stripe_val[1] != str_unit
|
||||
@ -121,8 +125,20 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
lum.lmm_magic = LOV_USER_MAGIC;
|
||||
lum.lmm_pattern = 0;
|
||||
lum.lmm_stripe_size = str_unit;
|
||||
/* crude check for overflow of lustre internal datatypes.
|
||||
* Silently cap to large value if user provides a value
|
||||
* larger than lustre supports */
|
||||
if (lum.lmm_stripe_size != str_unit) {
|
||||
lum.lmm_stripe_size = UINT_MAX;
|
||||
}
|
||||
lum.lmm_stripe_count = str_factor;
|
||||
if ( lum.lmm_stripe_count != str_factor) {
|
||||
lum.lmm_stripe_count = USHRT_MAX;
|
||||
}
|
||||
lum.lmm_stripe_offset = start_iodev;
|
||||
if (lum.lmm_stripe_offset != start_iodev) {
|
||||
lum.lmm_stripe_offset = USHRT_MAX;
|
||||
}
|
||||
|
||||
err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
|
||||
if (err == -1 && errno != EEXIST) {
|
||||
@ -138,56 +154,19 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
if (users_info != MPI_INFO_NULL) {
|
||||
/* CO: IO Clients/OST,
|
||||
* to keep the load balancing between clients and OSTs */
|
||||
ADIOI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag && (int_val = atoi(value)) > 0) {
|
||||
tmp_val = int_val;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != int_val) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_lustre_co_ratio",
|
||||
error_code);
|
||||
ADIOI_Free(value);
|
||||
return;
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "romio_lustre_co_ratio", value);
|
||||
fd->hints->fs_hints.lustre.co_ratio = atoi(value);
|
||||
}
|
||||
ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_co_ratio",
|
||||
&(fd->hints->fs_hints.lustre.co_ratio), myname, error_code );
|
||||
|
||||
/* coll_threshold:
|
||||
* if the req size is bigger than this, collective IO may not be performed.
|
||||
*/
|
||||
ADIOI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value,
|
||||
&flag);
|
||||
if (flag && (int_val = atoi(value)) > 0) {
|
||||
tmp_val = int_val;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != int_val) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_lustre_coll_threshold",
|
||||
error_code);
|
||||
ADIOI_Free(value);
|
||||
return;
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
|
||||
fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
|
||||
}
|
||||
ADIOI_Info_check_and_install_int(fd, users_info, "romio_lustre_coll_threshold",
|
||||
&(fd->hints->fs_hints.lustre.coll_threshold), myname, error_code );
|
||||
|
||||
/* ds_in_coll: disable data sieving in collective IO */
|
||||
ADIOI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && (!strcmp(value, "disable") ||
|
||||
!strcmp(value, "DISABLE"))) {
|
||||
tmp_val = int_val = 2;
|
||||
MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != int_val) {
|
||||
MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
|
||||
"romio_lustre_ds_in_coll",
|
||||
error_code);
|
||||
ADIOI_Free(value);
|
||||
return;
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
|
||||
fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
|
||||
}
|
||||
ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_lustre_ds_in_coll",
|
||||
&(fd->hints->fs_hints.lustre.ds_in_coll), myname, error_code );
|
||||
|
||||
}
|
||||
/* set the values for collective I/O and data sieving parameters */
|
||||
ADIOI_GEN_SetInfo(fd, users_info, error_code);
|
||||
|
@ -105,50 +105,7 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
|
||||
/* --BEGIN ERROR HANDLING-- */
|
||||
if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
|
||||
(fd->direct_write || fd->direct_read))) {
|
||||
if (errno == ENAMETOOLONG)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_BAD_FILE,
|
||||
"**filenamelong",
|
||||
"**filenamelong %s %d",
|
||||
fd->filename,
|
||||
strlen(fd->filename));
|
||||
else if (errno == ENOENT)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_NO_SUCH_FILE,
|
||||
"**filenoexist",
|
||||
"**filenoexist %s",
|
||||
fd->filename);
|
||||
else if (errno == ENOTDIR || errno == ELOOP)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__,
|
||||
MPI_ERR_BAD_FILE,
|
||||
"**filenamedir",
|
||||
"**filenamedir %s",
|
||||
fd->filename);
|
||||
else if (errno == EACCES) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_ACCESS,
|
||||
"**fileaccess",
|
||||
"**fileaccess %s",
|
||||
fd->filename );
|
||||
}
|
||||
else if (errno == EROFS) {
|
||||
/* Read only file or file system and write access requested */
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_READ_ONLY,
|
||||
"**ioneedrd", 0 );
|
||||
}
|
||||
else {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
|
||||
}
|
||||
/* --END ERROR HANDLING-- */
|
||||
else *error_code = MPI_SUCCESS;
|
||||
|
@ -8,15 +8,22 @@
|
||||
* Copyright (C) 2008 Sun Microsystems, Lustre group
|
||||
*/
|
||||
|
||||
#ifdef _STDC_C99
|
||||
#define _XOPEN_SOURCE 600
|
||||
#else
|
||||
#define _XOPEN_SOURCE 500
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <malloc.h>
|
||||
#include "ad_lustre.h"
|
||||
|
||||
#define LUSTRE_MEMALIGN (1<<12) /* to use page_shift */
|
||||
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len,
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, int len,
|
||||
ADIO_Offset offset, int *err);
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len,
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, const void *buf, int len,
|
||||
ADIO_Offset offset, int *err)
|
||||
{
|
||||
int rem, size, nbytes;
|
||||
@ -33,29 +40,29 @@ static void ADIOI_LUSTRE_Aligned_Mem_File_Write(ADIO_File fd, void *buf, int len
|
||||
}
|
||||
}
|
||||
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, int len,
|
||||
ADIO_Offset offset, int *err);
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, void *buf, int len,
|
||||
static void ADIOI_LUSTRE_Aligned_Mem_File_Read(ADIO_File fd, const void *buf, int len,
|
||||
ADIO_Offset offset, int *err)
|
||||
{
|
||||
int rem, size, nbytes;
|
||||
if (!(len % fd->d_miniosz) && (len >= fd->d_miniosz))
|
||||
*err = pread(fd->fd_direct, buf, len, offset);
|
||||
*err = pread(fd->fd_direct, (void *)buf, len, offset);
|
||||
else if (len < fd->d_miniosz)
|
||||
*err = pread(fd->fd_sys, buf, len, offset);
|
||||
*err = pread(fd->fd_sys, (void *)buf, len, offset);
|
||||
else {
|
||||
rem = len % fd->d_miniosz;
|
||||
size = len - rem;
|
||||
nbytes = pread(fd->fd_direct, buf, size, offset);
|
||||
nbytes = pread(fd->fd_direct, (void *)buf, size, offset);
|
||||
nbytes += pread(fd->fd_sys, ((char *)buf) + size, rem, offset+size);
|
||||
*err = nbytes;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
|
||||
static int ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, int len,
|
||||
off_t offset, int rw);
|
||||
static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
|
||||
static int ADIOI_LUSTRE_Directio(ADIO_File fd, const void *buf, int len,
|
||||
off_t offset, int rw)
|
||||
{
|
||||
int err=-1, diff, size=len, nbytes = 0;
|
||||
@ -65,9 +72,9 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
|
||||
diff = fd->d_miniosz - (offset % fd->d_miniosz);
|
||||
diff = ADIOI_MIN(diff, len);
|
||||
if (rw)
|
||||
nbytes = pwrite(fd->fd_sys, buf, diff, offset);
|
||||
nbytes = pwrite(fd->fd_sys, (void *)buf, diff, offset);
|
||||
else
|
||||
nbytes = pread(fd->fd_sys, buf, diff, offset);
|
||||
nbytes = pread(fd->fd_sys, (void *)buf, diff, offset);
|
||||
buf = ((char *) buf) + diff;
|
||||
offset += diff;
|
||||
size = len - diff;
|
||||
@ -100,30 +107,31 @@ static int ADIOI_LUSTRE_Directio(ADIO_File fd, void *buf, int len,
|
||||
newbuf = (void *) memalign(LUSTRE_MEMALIGN, size);
|
||||
if (newbuf) {
|
||||
ADIOI_LUSTRE_Aligned_Mem_File_Read(fd, newbuf, size, offset, &err);
|
||||
if (err > 0) memcpy(buf, newbuf, err);
|
||||
if (err > 0) memcpy((void *)buf, newbuf, err);
|
||||
nbytes += err;
|
||||
ADIOI_Free(newbuf);
|
||||
}
|
||||
else nbytes += pread(fd->fd_sys, buf, size, offset);
|
||||
else nbytes += pread(fd->fd_sys, (void *)buf, size, offset);
|
||||
}
|
||||
err = nbytes;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
|
||||
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int io_mode, int *error_code);
|
||||
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
|
||||
static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int io_mode, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size, len;
|
||||
int err=-1;
|
||||
MPI_Count datatype_size, len;
|
||||
static char myname[] = "ADIOI_LUSTRE_IOCONTIG";
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
len = datatype_size * count;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) {
|
||||
@ -148,7 +156,7 @@ static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count,
|
||||
#ifdef ADIOI_MPE_LOGGING
|
||||
MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
|
||||
#endif
|
||||
err = read(fd->fd_sys, buf, len);
|
||||
err = read(fd->fd_sys, (void *)buf, len);
|
||||
#ifdef ADIOI_MPE_LOGGING
|
||||
MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
|
||||
#endif
|
||||
@ -183,7 +191,7 @@ ioerr:
|
||||
/* --END ERROR HANDLING-- */
|
||||
}
|
||||
|
||||
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_LUSTRE_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
|
@ -12,7 +12,7 @@
|
||||
#include "adio_extern.h"
|
||||
|
||||
/* prototypes of functions used for collective writes only. */
|
||||
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
|
||||
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
|
||||
MPI_Datatype datatype, int nprocs,
|
||||
int myrank,
|
||||
ADIOI_Access *others_req,
|
||||
@ -22,7 +22,7 @@ static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
|
||||
int contig_access_count,
|
||||
int *striping_info,
|
||||
int **buf_idx, int *error_code);
|
||||
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
|
||||
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
|
||||
ADIOI_Flatlist_node *flat_buf,
|
||||
char **send_buf,
|
||||
ADIO_Offset *offset_list,
|
||||
@ -35,14 +35,14 @@ static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
|
||||
int *curr_to_proc,
|
||||
int *done_to_proc, int iter,
|
||||
MPI_Aint buftype_extent);
|
||||
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
|
||||
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
|
||||
char *write_buf,
|
||||
ADIOI_Flatlist_node *flat_buf,
|
||||
ADIO_Offset *offset_list,
|
||||
ADIO_Offset *len_list, int *send_size,
|
||||
int *recv_size, ADIO_Offset off,
|
||||
int size, int *count,
|
||||
int *start_pos,
|
||||
int *start_pos,
|
||||
int *sent_to_proc, int nprocs,
|
||||
int myrank, int buftype_is_contig,
|
||||
int contig_access_count,
|
||||
@ -59,7 +59,7 @@ void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
|
||||
ADIO_Offset *srt_off, int *srt_len, int *start_pos,
|
||||
int nprocs, int nprocs_recv, int total_elements);
|
||||
|
||||
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype,
|
||||
int file_ptr_type, ADIO_Offset offset,
|
||||
ADIO_Status *status, int *error_code)
|
||||
@ -266,9 +266,9 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
if (status) {
|
||||
int bufsize, size;
|
||||
MPI_Count bufsize, size;
|
||||
/* Don't set status if it isn't needed */
|
||||
MPI_Type_size(datatype, &size);
|
||||
MPI_Type_size_x(datatype, &size);
|
||||
bufsize = size * count;
|
||||
MPIR_Status_set_bytes(status, datatype, bufsize);
|
||||
}
|
||||
@ -283,7 +283,7 @@ void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
|
||||
/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
|
||||
* code is created and returned in error_code.
|
||||
*/
|
||||
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
|
||||
static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, const void *buf,
|
||||
MPI_Datatype datatype, int nprocs,
|
||||
int myrank, ADIOI_Access *others_req,
|
||||
ADIOI_Access *my_req,
|
||||
@ -613,14 +613,14 @@ over:
|
||||
/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
|
||||
* in the case of error.
|
||||
*/
|
||||
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
|
||||
static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, const void *buf,
|
||||
char *write_buf,
|
||||
ADIOI_Flatlist_node *flat_buf,
|
||||
ADIO_Offset *offset_list,
|
||||
ADIO_Offset *len_list, int *send_size,
|
||||
int *recv_size, ADIO_Offset off,
|
||||
int size, int *count,
|
||||
int *start_pos,
|
||||
int *start_pos,
|
||||
int *sent_to_proc, int nprocs,
|
||||
int myrank, int buftype_is_contig,
|
||||
int contig_access_count,
|
||||
@ -656,7 +656,7 @@ static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
|
||||
j = 0;
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
if (recv_size[i]) {
|
||||
MPI_Type_hindexed(count[i],
|
||||
ADIOI_Type_create_hindexed_x(count[i],
|
||||
&(others_req[i].lens[start_pos[i]]),
|
||||
&(others_req[i].mem_ptrs[start_pos[i]]),
|
||||
MPI_BYTE, recv_types + j);
|
||||
@ -885,7 +885,7 @@ static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
|
||||
ADIOI_BUF_INCR \
|
||||
}
|
||||
|
||||
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
|
||||
static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, const void *buf,
|
||||
ADIOI_Flatlist_node *flat_buf,
|
||||
char **send_buf,
|
||||
ADIO_Offset *offset_list,
|
||||
|
@ -144,7 +144,7 @@
|
||||
} \
|
||||
}
|
||||
|
||||
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status * status,
|
||||
int *error_code)
|
||||
@ -156,7 +156,7 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
|
||||
int n_etypes_in_filetype;
|
||||
ADIO_Offset num, size, n_filetypes, etype_in_filetype, st_n_filetypes;
|
||||
ADIO_Offset abs_off_in_filetype=0;
|
||||
int filetype_size, etype_size, buftype_size;
|
||||
MPI_Count filetype_size, etype_size, buftype_size;
|
||||
MPI_Aint filetype_extent, buftype_extent;
|
||||
int buf_count, buftype_is_contig, filetype_is_contig;
|
||||
ADIO_Offset userbuf_off;
|
||||
@ -186,7 +186,7 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
|
||||
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
|
||||
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
|
||||
|
||||
MPI_Type_size(fd->filetype, &filetype_size);
|
||||
MPI_Type_size_x(fd->filetype, &filetype_size);
|
||||
if (!filetype_size) {
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, 0);
|
||||
@ -196,7 +196,7 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
|
||||
}
|
||||
|
||||
MPI_Type_extent(fd->filetype, &filetype_extent);
|
||||
MPI_Type_size(datatype, &buftype_size);
|
||||
MPI_Type_size_x(datatype, &buftype_size);
|
||||
MPI_Type_extent(datatype, &buftype_extent);
|
||||
etype_size = fd->etype_size;
|
||||
|
||||
|
@ -37,4 +37,7 @@ struct ADIOI_Fns_struct ADIO_NFS_operations = {
|
||||
ADIOI_NFS_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_NFS_Feature, /* Features */
|
||||
"NFS:", /* fsname: just a string */
|
||||
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
|
||||
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
|
||||
};
|
||||
|
@ -74,7 +74,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
ADIO_Offset offset, ADIO_Status *status, int
|
||||
*error_code);
|
||||
void ADIOI_NFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
|
||||
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset *shared_fp,
|
||||
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp,
|
||||
int *error_code);
|
||||
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code);
|
||||
void ADIOI_NFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
|
||||
|
@ -1,3 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
|
||||
/*
|
||||
*
|
||||
* (C) 2008 by Argonne National Laboratory.
|
||||
* See COPYRIGHT in top-level directory.
|
||||
*/
|
||||
#include "adio.h"
|
||||
#include "ad_nfs.h"
|
||||
|
||||
@ -11,6 +17,7 @@ int ADIOI_NFS_Feature(ADIO_File fd, int flag)
|
||||
return 1;
|
||||
case ADIO_SCALABLE_OPEN:
|
||||
case ADIO_UNLINK_AFTER_CLOSE:
|
||||
case ADIO_SCALABLE_RESIZE:
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
@ -12,11 +12,11 @@
|
||||
shared_fp by the number of etypes to be accessed (incr) in the read
|
||||
or write following this function. */
|
||||
|
||||
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, int incr, ADIO_Offset *shared_fp,
|
||||
void ADIOI_NFS_Get_shared_fp(ADIO_File fd, ADIO_Offset incr, ADIO_Offset *shared_fp,
|
||||
int *error_code)
|
||||
{
|
||||
ADIO_Offset new_fp;
|
||||
int err;
|
||||
ssize_t err;
|
||||
MPI_Comm dupcommself;
|
||||
static char myname[] = "ADIOI_NFS_GET_SHARED_FP";
|
||||
|
||||
|
@ -13,11 +13,11 @@ void ADIOI_NFS_IreadContig(ADIO_File fd, void *buf, int count,
|
||||
ADIO_Offset offset, ADIO_Request *request,
|
||||
int *error_code)
|
||||
{
|
||||
int len, typesize;
|
||||
MPI_Count len, typesize;
|
||||
int aio_errno = 0;
|
||||
static char myname[] = "ADIOI_NFS_IREADCONTIG";
|
||||
|
||||
MPI_Type_size(datatype, &typesize);
|
||||
MPI_Type_size_x(datatype, &typesize);
|
||||
len = count * typesize;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;
|
||||
|
@ -20,11 +20,11 @@ void ADIOI_NFS_IwriteContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Request *request, int *error_code)
|
||||
{
|
||||
int len, typesize;
|
||||
MPI_Count len, typesize;
|
||||
int aio_errno = 0;
|
||||
static char myname[] = "ADIOI_NFS_IWRITECONTIG";
|
||||
|
||||
MPI_Type_size(datatype, &typesize);
|
||||
MPI_Type_size_x(datatype, &typesize);
|
||||
len = count * typesize;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) offset = fd->fp_ind;
|
||||
|
@ -52,64 +52,7 @@ void ADIOI_NFS_Open(ADIO_File fd, int *error_code)
|
||||
}
|
||||
|
||||
if (fd->fd_sys == -1) {
|
||||
/* Check for special error codes for those MPI error
|
||||
classes that relate to particular problems */
|
||||
if (errno == ENAMETOOLONG)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_BAD_FILE,
|
||||
"**filenamelong",
|
||||
"**filenamelong %s %d",
|
||||
fd->filename,
|
||||
strlen(fd->filename));
|
||||
else if (errno == ENOENT)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_NO_SUCH_FILE,
|
||||
"**filenoexist",
|
||||
"**filenoexist %s",
|
||||
fd->filename);
|
||||
else if (errno == ENOTDIR || errno == ELOOP)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_BAD_FILE,
|
||||
"**filenamedir",
|
||||
"**filenamedir %s",
|
||||
fd->filename);
|
||||
else if (errno == EACCES) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_ACCESS,
|
||||
"**fileaccess",
|
||||
"**fileaccess %s",
|
||||
fd->filename);
|
||||
}
|
||||
else if (errno == EROFS) {
|
||||
/* Read only file or file system and write access requested */
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_READ_ONLY,
|
||||
"**ioneedrd", 0);
|
||||
}
|
||||
else if(errno == EISDIR) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_BAD_FILE,
|
||||
"**filename", 0);
|
||||
}
|
||||
else if(errno == EEXIST) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_FILE_EXISTS,
|
||||
"**fileexist", 0);
|
||||
|
||||
}
|
||||
else {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
@ -12,10 +12,11 @@ void ADIOI_NFS_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size, len;
|
||||
int err=-1;
|
||||
MPI_Count datatype_size, len;
|
||||
static char myname[] = "ADIOI_NFS_READCONTIG";
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
len = datatype_size * count;
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
@ -171,7 +172,8 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
|
||||
int n_filetypes, etype_in_filetype;
|
||||
ADIO_Offset abs_off_in_filetype=0;
|
||||
int filetype_size, etype_size, buftype_size, req_len, partial_read;
|
||||
int req_len, partial_read;
|
||||
MPI_Count filetype_size, etype_size, buftype_size;
|
||||
MPI_Aint filetype_extent, buftype_extent;
|
||||
int buf_count, buftype_is_contig, filetype_is_contig;
|
||||
ADIO_Offset userbuf_off;
|
||||
@ -185,7 +187,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
|
||||
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
|
||||
|
||||
MPI_Type_size(fd->filetype, &filetype_size);
|
||||
MPI_Type_size_x(fd->filetype, &filetype_size);
|
||||
if ( ! filetype_size ) {
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, 0);
|
||||
@ -195,7 +197,7 @@ void ADIOI_NFS_ReadStrided(ADIO_File fd, void *buf, int count,
|
||||
}
|
||||
|
||||
MPI_Type_extent(fd->filetype, &filetype_extent);
|
||||
MPI_Type_size(datatype, &buftype_size);
|
||||
MPI_Type_size_x(datatype, &buftype_size);
|
||||
MPI_Type_extent(datatype, &buftype_extent);
|
||||
etype_size = fd->etype_size;
|
||||
|
||||
|
@ -30,7 +30,7 @@ Unlock
|
||||
|
||||
void ADIOI_NFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code)
|
||||
{
|
||||
int err;
|
||||
ssize_t err;
|
||||
MPI_Comm dupcommself;
|
||||
static char myname[] = "ADIOI_NFS_SET_SHARED_FP";
|
||||
|
||||
|
@ -12,10 +12,11 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status, int *error_code)
|
||||
{
|
||||
int err=-1, datatype_size, len;
|
||||
int err=-1;
|
||||
MPI_Count datatype_size, len;
|
||||
static char myname[] = "ADIOI_NFS_WRITECONTIG";
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
len = datatype_size * count;
|
||||
|
||||
if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
|
||||
@ -110,7 +111,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPIR_ERR_RECOVERABLE, myname, \
|
||||
__LINE__, MPI_ERR_IO, \
|
||||
"**ioRMWrdwr", 0); \
|
||||
return; \
|
||||
goto fn_exit; \
|
||||
} \
|
||||
} \
|
||||
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
|
||||
@ -140,7 +141,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPIR_ERR_RECOVERABLE, myname, \
|
||||
__LINE__, MPI_ERR_IO, \
|
||||
"**ioRMWrdwr", 0); \
|
||||
return; \
|
||||
goto fn_exit; \
|
||||
} \
|
||||
write_sz = ADIOI_MIN(req_len, writebuf_len); \
|
||||
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
|
||||
@ -164,7 +165,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPIR_ERR_RECOVERABLE, myname, \
|
||||
__LINE__, MPI_ERR_IO, \
|
||||
"**ioRMWrdwr", 0); \
|
||||
return; \
|
||||
goto fn_exit; \
|
||||
} \
|
||||
} \
|
||||
write_sz = (int) (ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off)); \
|
||||
@ -186,7 +187,7 @@ void ADIOI_NFS_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPIR_ERR_RECOVERABLE, myname, \
|
||||
__LINE__, MPI_ERR_IO, \
|
||||
"**ioRMWrdwr", 0); \
|
||||
return; \
|
||||
goto fn_exit; \
|
||||
} \
|
||||
write_sz = ADIOI_MIN(req_len, writebuf_len); \
|
||||
memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
|
||||
@ -275,12 +276,13 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
|
||||
int n_filetypes, etype_in_filetype;
|
||||
ADIO_Offset abs_off_in_filetype=0;
|
||||
int filetype_size, etype_size, buftype_size, req_len;
|
||||
int req_len;
|
||||
MPI_Count filetype_size, etype_size, buftype_size;
|
||||
MPI_Aint filetype_extent, buftype_extent;
|
||||
int buf_count, buftype_is_contig, filetype_is_contig;
|
||||
ADIO_Offset userbuf_off;
|
||||
ADIO_Offset off, req_off, disp, end_offset=0, writebuf_off, start_off;
|
||||
char *writebuf, *value;
|
||||
char *writebuf=NULL, *value;
|
||||
int st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
|
||||
int new_bwr_size, new_fwr_size, err_flag=0, info_flag, max_bufsize;
|
||||
static char myname[] = "ADIOI_NFS_WRITESTRIDED";
|
||||
@ -288,7 +290,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
|
||||
ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
|
||||
|
||||
MPI_Type_size(fd->filetype, &filetype_size);
|
||||
MPI_Type_size_x(fd->filetype, &filetype_size);
|
||||
if ( ! filetype_size ) {
|
||||
#ifdef HAVE_STATUS_SET_BYTES
|
||||
MPIR_Status_set_bytes(status, datatype, 0);
|
||||
@ -298,7 +300,7 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
}
|
||||
|
||||
MPI_Type_extent(fd->filetype, &filetype_extent);
|
||||
MPI_Type_size(datatype, &buftype_size);
|
||||
MPI_Type_size_x(datatype, &buftype_size);
|
||||
MPI_Type_extent(datatype, &buftype_extent);
|
||||
etype_size = fd->etype_size;
|
||||
|
||||
@ -364,8 +366,6 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
if (fd->atomicity)
|
||||
ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
|
||||
|
||||
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
@ -517,8 +517,8 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
myname, __LINE__,
|
||||
MPI_ERR_IO,
|
||||
"ADIOI_NFS_WriteStrided: ROMIO tries to optimize this access by doing a read-modify-write, but is unable to read the file. Please give the file read permission and open it with MPI_MODE_RDWR.", 0);
|
||||
return;
|
||||
}
|
||||
goto fn_exit;
|
||||
}
|
||||
|
||||
if (buftype_is_contig && !filetype_is_contig) {
|
||||
|
||||
@ -653,8 +653,6 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
|
||||
if (err == -1) err_flag = 1;
|
||||
|
||||
ADIOI_Free(writebuf); /* malloced in the buffered_write macro */
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;
|
||||
if (err_flag) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
@ -674,4 +672,8 @@ void ADIOI_NFS_WriteStrided(ADIO_File fd, const void *buf, int count,
|
||||
#endif
|
||||
|
||||
if (!buftype_is_contig) ADIOI_Delete_flattened(datatype);
|
||||
fn_exit:
|
||||
if (writebuf != NULL) ADIOI_Free(writebuf);
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
|
||||
struct ADIOI_Fns_struct ADIO_NTFS_operations = {
|
||||
ADIOI_NTFS_Open, /* Open */
|
||||
ADIOI_GEN_OpenColl, /* OpenColl */
|
||||
ADIOI_FAILSAFE_OpenColl, /* OpenColl */
|
||||
ADIOI_NTFS_ReadContig, /* ReadContig */
|
||||
ADIOI_NTFS_WriteContig, /* WriteContig */
|
||||
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
|
||||
@ -34,5 +34,7 @@ struct ADIOI_Fns_struct ADIO_NTFS_operations = {
|
||||
ADIOI_NTFS_Flush, /* Flush */
|
||||
ADIOI_NTFS_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_NTFS_Feature /* Features */
|
||||
ADIOI_NTFS_Feature, /* Features */
|
||||
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
|
||||
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
|
||||
};
|
||||
|
@ -1,3 +1,9 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
|
||||
/*
|
||||
*
|
||||
* (C) 2008 by Argonne National Laboratory.
|
||||
* See COPYRIGHT in top-level directory.
|
||||
*/
|
||||
#include "adio.h"
|
||||
|
||||
int ADIOI_NTFS_Feature(ADIO_File fd, int flag)
|
||||
|
@ -10,11 +10,11 @@ void ADIOI_NTFS_IreadContig(ADIO_File fd, void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Request *request, int *error_code)
|
||||
{
|
||||
int len, typesize;
|
||||
MPI_Count len, typesize;
|
||||
int err;
|
||||
static char myname[] = "ADIOI_NTFS_IreadContig";
|
||||
|
||||
MPI_Type_size(datatype, &typesize);
|
||||
MPI_Type_size_x(datatype, &typesize);
|
||||
len = count * typesize;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL)
|
||||
|
@ -171,11 +171,11 @@ void ADIOI_NTFS_IwriteContig(ADIO_File fd, void *buf, int count,
|
||||
ADIO_Offset offset, ADIO_Request *request,
|
||||
int *error_code)
|
||||
{
|
||||
int len, typesize;
|
||||
MPI_Count len, typesize;
|
||||
int err;
|
||||
static char myname[] = "ADIOI_NTFS_IwriteContig";
|
||||
|
||||
MPI_Type_size(datatype, &typesize);
|
||||
MPI_Type_size_x(datatype, &typesize);
|
||||
len = count * typesize;
|
||||
|
||||
if (file_ptr_type == ADIO_INDIVIDUAL)
|
||||
|
@ -13,7 +13,8 @@ void ADIOI_NTFS_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
{
|
||||
LONG dwTemp;
|
||||
DWORD dwNumRead = 0;
|
||||
int err=-1, datatype_size, len;
|
||||
int err=-1;
|
||||
MPI_Count datatype_size, len;
|
||||
static char myname[] = "ADIOI_NTFS_ReadContig";
|
||||
OVERLAPPED *pOvl;
|
||||
|
||||
@ -23,7 +24,7 @@ void ADIOI_NTFS_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
offset = fd->fp_ind;
|
||||
}
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
len = datatype_size * count;
|
||||
|
||||
pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
|
||||
|
@ -14,7 +14,7 @@ void ADIOI_NTFS_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
static char myname[] = "ADIOI_NTFS_WriteContig";
|
||||
LONG dwTemp;
|
||||
DWORD dwNumWritten = 0;
|
||||
int err=-1, datatype_size, len;
|
||||
MPI_Count err=-1, datatype_size, len;
|
||||
OVERLAPPED *pOvl;
|
||||
|
||||
/* If file pointer type in ADIO_INDIVIDUAL then offset should be
|
||||
@ -23,7 +23,7 @@ void ADIOI_NTFS_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
offset = fd->fp_ind;
|
||||
}
|
||||
|
||||
MPI_Type_size(datatype, &datatype_size);
|
||||
MPI_Type_size_x(datatype, &datatype_size);
|
||||
len = datatype_size * count;
|
||||
|
||||
pOvl = (OVERLAPPED *) ADIOI_Calloc(sizeof(OVERLAPPED), 1);
|
||||
|
@ -7,10 +7,6 @@
|
||||
|
||||
if BUILD_AD_PANFS
|
||||
|
||||
# I don't like this hard-coded path to the PANFS headers but I guess that's
|
||||
# where they always are?
|
||||
AM_CPPFLAGS += -I/opt/panfs/include
|
||||
|
||||
noinst_HEADERS += adio/ad_panfs/ad_panfs.h
|
||||
|
||||
romio_other_sources += \
|
||||
|
@ -41,4 +41,7 @@ struct ADIOI_Fns_struct ADIO_PANFS_operations = {
|
||||
ADIOI_PANFS_Resize, /* Resize */
|
||||
ADIOI_GEN_Delete, /* Delete */
|
||||
ADIOI_GEN_Feature,
|
||||
"PANFS: Panasas PanFS",
|
||||
ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */
|
||||
ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */
|
||||
};
|
||||
|
@ -32,11 +32,17 @@ void ADIOI_PANFS_ReadContig(ADIO_File fd, void *buf, int count,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int *error_code);
|
||||
void ADIOI_PANFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code);
|
||||
void ADIOI_PANFS_WriteContig(ADIO_File fd, void *buf, int count,
|
||||
void ADIOI_PANFS_WriteContig(ADIO_File fd, const void *buf, int count,
|
||||
MPI_Datatype datatype, int file_ptr_type,
|
||||
ADIO_Offset offset, ADIO_Status *status,
|
||||
int *error_code);
|
||||
|
||||
/* TODO: move this to common code and have all routines retry. */
|
||||
/* TODO: also check for EWOULDBLOCK */
|
||||
#if defined(NEEDS_USLEEP_DECL)
|
||||
int usleep(useconds_t usec);
|
||||
#endif
|
||||
|
||||
/* Delay 1 ms */
|
||||
#define AD_PANFS_RETRY_DELAY 1000
|
||||
|
||||
|
@ -8,21 +8,13 @@
|
||||
|
||||
#include "ad_panfs.h"
|
||||
#include <pan_fs_client_cw_mode.h>
|
||||
#include "hint_fns.h"
|
||||
|
||||
void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
{
|
||||
#if defined(MPICH) || !defined(PRINT_ERR_MSG)
|
||||
static char myname[] = "ADIOI_PANFS_SETINFO";
|
||||
#endif
|
||||
char* value;
|
||||
int flag, tmp_val = -1;
|
||||
unsigned long int concurrent_write = 0;
|
||||
pan_fs_client_layout_agg_type_t layout_type = PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT;
|
||||
unsigned long int layout_stripe_unit = 0;
|
||||
unsigned long int layout_parity_stripe_width = 0;
|
||||
unsigned long int layout_parity_stripe_depth = 0;
|
||||
unsigned long int layout_total_num_comps = 0;
|
||||
pan_fs_client_layout_visit_t layout_visit_policy = PAN_FS_CLIENT_LAYOUT_VISIT__ROUND_ROBIN;
|
||||
int gen_error_code;
|
||||
|
||||
*error_code = MPI_SUCCESS;
|
||||
@ -33,104 +25,39 @@ void ADIOI_PANFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
|
||||
*/
|
||||
MPI_Info_create(&(fd->info));
|
||||
|
||||
/* anticipate concurrent writes in an MPI-IO application */
|
||||
ADIOI_Info_set (fd->info, "panfs_concurrent_write", "1");
|
||||
|
||||
/* has user specified striping parameters
|
||||
and do they have the same value on all processes? */
|
||||
if (users_info != MPI_INFO_NULL) {
|
||||
value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
|
||||
|
||||
ADIOI_Info_get(users_info, "panfs_concurrent_write", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag) {
|
||||
concurrent_write = strtoul(value,NULL,10);
|
||||
tmp_val = concurrent_write;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != concurrent_write) {
|
||||
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_concurrent_write\" must be the same on all processes\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "panfs_concurrent_write", value);
|
||||
}
|
||||
ADIOI_Info_check_and_install_int(fd, users_info, "panfs_concurrent_write",
|
||||
NULL, myname, error_code);
|
||||
|
||||
ADIOI_Info_get(users_info, "panfs_layout_type", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag) {
|
||||
layout_type = strtoul(value,NULL,10);
|
||||
tmp_val = layout_type;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != layout_type) {
|
||||
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_type\" must be the same on all processes\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "panfs_layout_type", value);
|
||||
}
|
||||
ADIOI_Info_check_and_install_int(fd, users_info, "panfs_layout_type",
|
||||
NULL, myname, error_code);
|
||||
|
||||
ADIOI_Info_get(users_info, "panfs_layout_stripe_unit", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag) {
|
||||
layout_stripe_unit = strtoul(value,NULL,10);
|
||||
tmp_val = layout_stripe_unit;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != layout_stripe_unit) {
|
||||
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_stripe_unit\" must be the same on all processes\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "panfs_layout_stripe_unit", value);
|
||||
}
|
||||
ADIOI_Info_check_and_install_int(fd, users_info, "panfs_layout_stripe_unit",
|
||||
NULL, myname, error_code);
|
||||
|
||||
ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_width", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
|
||||
layout_parity_stripe_width = strtoul(value,NULL,10);
|
||||
tmp_val = layout_parity_stripe_width;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != layout_parity_stripe_width) {
|
||||
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_width\" must be the same on all processes\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_width", value);
|
||||
}
|
||||
/* strange: there was a check "layout_type ==
|
||||
* PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE, but
|
||||
* nothing ever touched layout_type */
|
||||
ADIOI_Info_check_and_install_int(fd, users_info,
|
||||
"panfs_layout_parity_stripe_width", NULL, myname, error_code);
|
||||
|
||||
ADIOI_Info_get(users_info, "panfs_layout_parity_stripe_depth", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE)) {
|
||||
layout_parity_stripe_depth = strtoul(value,NULL,10);
|
||||
tmp_val = layout_parity_stripe_depth;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != layout_parity_stripe_depth) {
|
||||
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_parity_stripe_depth\" must be the same on all processes\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "panfs_layout_parity_stripe_depth", value);
|
||||
}
|
||||
|
||||
ADIOI_Info_get(users_info, "panfs_layout_total_num_comps", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag) {
|
||||
layout_total_num_comps = strtoul(value,NULL,10);
|
||||
tmp_val = layout_total_num_comps;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != layout_total_num_comps) {
|
||||
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_total_num_comps\" must be the same on all processes\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "panfs_layout_total_num_comps", value);
|
||||
}
|
||||
|
||||
ADIOI_Info_get(users_info, "panfs_layout_visit_policy", MPI_MAX_INFO_VAL,
|
||||
value, &flag);
|
||||
if (flag && (layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE || layout_type == PAN_FS_CLIENT_LAYOUT_TYPE__RAID10)) {
|
||||
layout_visit_policy = strtoul(value,NULL,10);
|
||||
tmp_val = layout_visit_policy;
|
||||
MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
|
||||
if (tmp_val != layout_visit_policy) {
|
||||
FPRINTF(stderr, "ADIOI_PANFS_SetInfo: the value for key \"panfs_layout_visit_policy\" must be the same on all processes\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", value);
|
||||
}
|
||||
|
||||
ADIOI_Free(value);
|
||||
ADIOI_Info_check_and_install_int(fd, users_info,
|
||||
"panfs_layout_parity_stripe_depth", NULL, myname, error_code);
|
||||
|
||||
ADIOI_Info_check_and_install_int(fd, users_info,
|
||||
"panfs_layout_total_num_comps", NULL, myname, error_code);
|
||||
/* this hint used to check for
|
||||
* PAN_FS_CLIENT_LAYOUT_TYPE__RAID1_5_PARITY_STRIPE or
|
||||
* PAN_FS_CLIENT_LAYOUT_TYPE__RAID10, but again, layout_type never
|
||||
* gets updated */
|
||||
ADIOI_Info_check_and_install_int(fd, users_info,
|
||||
"panfs_layout_visit_policy", NULL, myname, error_code);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -191,7 +191,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
|
||||
}
|
||||
|
||||
/* create PanFS object */
|
||||
bzero(&file_create_args,sizeof(pan_fs_client_layout_create_args_t));
|
||||
memset(&file_create_args,0,sizeof(pan_fs_client_layout_create_args_t));
|
||||
/* open directory */
|
||||
fd_dir = open(path, O_RDONLY);
|
||||
if (fd_dir < 0) {
|
||||
@ -285,7 +285,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
|
||||
int rc;
|
||||
char temp_buffer[TEMP_BUFFER_SIZE];
|
||||
pan_fs_client_layout_query_args_t file_query_args;
|
||||
bzero(&file_query_args,sizeof(pan_fs_client_layout_query_args_t));
|
||||
memset(&file_query_args,0,sizeof(pan_fs_client_layout_query_args_t));
|
||||
file_query_args.version = PAN_FS_CLIENT_LAYOUT_VERSION;
|
||||
rc = ioctl(fd->fd_sys, PAN_FS_CLIENT_LAYOUT_QUERY_FILE, &file_query_args);
|
||||
if (rc < 0)
|
||||
@ -327,6 +327,10 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
|
||||
ADIOI_Snprintf(temp_buffer,TEMP_BUFFER_SIZE,"%u",file_query_args.layout.u.raid10.layout_visit_policy);
|
||||
ADIOI_Info_set(fd->info, "panfs_layout_visit_policy", temp_buffer);
|
||||
break;
|
||||
case PAN_FS_CLIENT_LAYOUT_TYPE__INVALID:
|
||||
case PAN_FS_CLIENT_LAYOUT_TYPE__DEFAULT:
|
||||
MPI_Info_set(fd->info, "panfs_layout_type",
|
||||
"PAN_FS_CLIENT_LAYOUT_TYPE__INVALID");
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -338,50 +342,7 @@ void ADIOI_PANFS_Open(ADIO_File fd, int *error_code)
|
||||
fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
|
||||
|
||||
if (fd->fd_sys == -1) {
|
||||
if (errno == ENAMETOOLONG)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_BAD_FILE,
|
||||
"**filenamelong",
|
||||
"**filenamelong %s %d",
|
||||
fd->filename,
|
||||
strlen(fd->filename));
|
||||
else if (errno == ENOENT)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_NO_SUCH_FILE,
|
||||
"**filenoexist",
|
||||
"**filenoexist %s",
|
||||
fd->filename);
|
||||
else if (errno == ENOTDIR || errno == ELOOP)
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE,
|
||||
myname, __LINE__,
|
||||
MPI_ERR_BAD_FILE,
|
||||
"**filenamedir",
|
||||
"**filenamedir %s",
|
||||
fd->filename);
|
||||
else if (errno == EACCES) {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_ACCESS,
|
||||
"**fileaccess",
|
||||
"**fileaccess %s",
|
||||
fd->filename );
|
||||
}
|
||||
else if (errno == EROFS) {
|
||||
/* Read only file or file system and write access requested */
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_READ_ONLY,
|
||||
"**ioneedrd", 0 );
|
||||
}
|
||||
else {
|
||||
*error_code = MPIO_Err_create_code(MPI_SUCCESS,
|
||||
MPIR_ERR_RECOVERABLE, myname,
|
||||
__LINE__, MPI_ERR_IO, "**io",
|
||||
"**io %s", strerror(errno));
|
||||
}
|
||||
*error_code = ADIOI_Err_create_code(myname, fd->filename, errno);
|
||||
}
|
||||
else *error_code = MPI_SUCCESS;
|
||||
}
|
||||
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
x
Ссылка в новой задаче
Block a user