318 строки
10 KiB
C
318 строки
10 KiB
C
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2008-2014 University of Houston. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
/* This code is based on the PVFS2 ADIO module in ROMIO
|
|
* Copyright (C) 1997 University of Chicago.
|
|
* See COPYRIGHT notice in top-level directory.
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
#include "fs_pvfs2.h"
|
|
|
|
#include "mpi.h"
|
|
#include "ompi/constants.h"
|
|
#include "ompi/mca/fs/fs.h"
|
|
#include "ompi/communicator/communicator.h"
|
|
#include "ompi/info/info.h"
|
|
#include "opal/datatype/opal_convertor.h"
|
|
#include "opal/datatype/opal_datatype.h"
|
|
#include "ompi/datatype/ompi_datatype.h"
|
|
|
|
struct open_status_s {
|
|
int error;
|
|
PVFS_object_ref object_ref;
|
|
};
|
|
typedef struct open_status_s open_status;
|
|
|
|
static void fake_an_open(PVFS_fs_id id,
|
|
char *pvfs2_name,
|
|
int access_mode,
|
|
int stripe_width,
|
|
PVFS_size stripe_size,
|
|
mca_fs_pvfs2 *pvfs2_fs,
|
|
open_status *o_status);
|
|
/*
|
|
* file_open_pvfs2: This is the same strategy as ROMIO's pvfs2 open
|
|
*
|
|
* Function: - opens a new file
|
|
* Accepts: - same arguments as MPI_File_open()
|
|
* Returns: - Success if new file handle
|
|
*/
|
|
int
|
|
mca_fs_pvfs2_file_open (struct ompi_communicator_t *comm,
|
|
char* filename,
|
|
int access_mode,
|
|
struct ompi_info_t *info,
|
|
mca_io_ompio_file_t *fh)
|
|
{
|
|
int ret;
|
|
mca_fs_pvfs2 *pvfs2_fs;
|
|
PVFS_fs_id pvfs2_id;
|
|
char pvfs2_path[OMPIO_MAX_NAME] = {0};
|
|
char * ncache_timeout;
|
|
open_status o_status = {0, {0, 0}};
|
|
struct ompi_datatype_t *open_status_type;
|
|
struct ompi_datatype_t *types[2] = {&ompi_mpi_int.dt, &ompi_mpi_byte.dt};
|
|
int lens[2] = {1, sizeof(PVFS_object_ref)};
|
|
OPAL_PTRDIFF_TYPE offsets[2];
|
|
char char_stripe[MPI_MAX_INFO_KEY];
|
|
int flag;
|
|
int fs_pvfs2_stripe_size = -1;
|
|
int fs_pvfs2_stripe_width = -1;
|
|
|
|
/* We are going to do what ROMIO does with one process resolving
|
|
* the name and broadcasting to others */
|
|
|
|
pvfs2_fs = (mca_fs_pvfs2 *) malloc(sizeof(mca_fs_pvfs2));
|
|
if (NULL == pvfs2_fs) {
|
|
opal_output (1, "OUT OF MEMORY\n");
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
if (!mca_fs_pvfs2_IS_INITIALIZED) {
|
|
/* disable the pvfs2 ncache */
|
|
ncache_timeout = getenv("PVFS2_NCACHE_TIMEOUT");
|
|
if (ncache_timeout == NULL ) {
|
|
setenv("PVFS2_NCACHE_TIMEOUT", "0", 1);
|
|
}
|
|
ret = PVFS_util_init_defaults();
|
|
if (ret < 0) {
|
|
PVFS_perror("PVFS_util_init_defaults", ret);
|
|
return OMPI_ERROR;
|
|
}
|
|
mca_fs_pvfs2_IS_INITIALIZED = 1;
|
|
}
|
|
|
|
memset(&(pvfs2_fs->credentials), 0, sizeof(PVFS_credentials));
|
|
PVFS_util_gen_credentials(&(pvfs2_fs->credentials));
|
|
|
|
/* check for stripe size and stripe depth in the info object and
|
|
update mca_fs_pvfs2_stripe_width and mca_fs_pvfs2_stripe_size
|
|
before calling fake_an_open() */
|
|
|
|
ompi_info_get (info, "stripe_size", MPI_MAX_INFO_VAL, char_stripe, &flag);
|
|
if ( flag ) {
|
|
sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_size );
|
|
}
|
|
|
|
ompi_info_get (info, "stripe_width", MPI_MAX_INFO_VAL, char_stripe, &flag);
|
|
if ( flag ) {
|
|
sscanf ( char_stripe, "%d", &fs_pvfs2_stripe_width );
|
|
}
|
|
|
|
if (fs_pvfs2_stripe_size < 0) {
|
|
fs_pvfs2_stripe_size = mca_fs_pvfs2_stripe_size;
|
|
}
|
|
|
|
if (fs_pvfs2_stripe_width < 0) {
|
|
fs_pvfs2_stripe_width = mca_fs_pvfs2_stripe_width;
|
|
}
|
|
|
|
|
|
if (OMPIO_ROOT == fh->f_rank) {
|
|
ret = PVFS_util_resolve(filename, &pvfs2_id, pvfs2_path, OMPIO_MAX_NAME);
|
|
if (ret < 0 ) {
|
|
PVFS_perror("PVFS_util_resolve", ret);
|
|
o_status.error = -1;
|
|
}
|
|
else {
|
|
fake_an_open (pvfs2_id,
|
|
pvfs2_path,
|
|
access_mode,
|
|
fs_pvfs2_stripe_width,
|
|
(PVFS_size)fs_pvfs2_stripe_size,
|
|
pvfs2_fs,
|
|
&o_status);
|
|
}
|
|
pvfs2_fs->object_ref = o_status.object_ref;
|
|
fh->f_fs_ptr = pvfs2_fs;
|
|
}
|
|
|
|
/* broadcast status and (possibly valid) object reference */
|
|
offsets[0] = (MPI_Aint)(&o_status.error);
|
|
offsets[1] = (MPI_Aint)(&o_status.object_ref);
|
|
|
|
ompi_datatype_create_struct (2, lens, offsets, types, &open_status_type);
|
|
ompi_datatype_commit (&open_status_type);
|
|
|
|
fh->f_comm->c_coll.coll_bcast (MPI_BOTTOM,
|
|
1,
|
|
open_status_type,
|
|
OMPIO_ROOT,
|
|
fh->f_comm,
|
|
fh->f_comm->c_coll.coll_bcast_module);
|
|
|
|
ompi_datatype_destroy (&open_status_type);
|
|
|
|
if (o_status.error != 0) {
|
|
/* No need to free the pvfs2_fs structure, since it will
|
|
be deallocated in file_close in case of an error */
|
|
fh->f_fs_ptr = NULL;
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
pvfs2_fs->object_ref = o_status.object_ref;
|
|
fh->f_fs_ptr = pvfs2_fs;
|
|
|
|
/* update the internal ompio structure to store stripe
|
|
size and stripe depth correctly.
|
|
Hadi(to be done): For this read the stripe size and stripe depth from
|
|
the file itself
|
|
*/
|
|
|
|
if (fs_pvfs2_stripe_size > 0 && fs_pvfs2_stripe_width > 0) {
|
|
fh->f_stripe_size = fs_pvfs2_stripe_size;
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static void fake_an_open(PVFS_fs_id id,
|
|
char *pvfs2_name,
|
|
int access_mode,
|
|
int stripe_width,
|
|
PVFS_size stripe_size,
|
|
mca_fs_pvfs2 *pvfs2_fs,
|
|
open_status *o_status)
|
|
{
|
|
int ret;
|
|
PVFS_sysresp_lookup resp_lookup;
|
|
PVFS_sysresp_getparent resp_getparent;
|
|
PVFS_sysresp_create resp_create;
|
|
PVFS_sys_attr attribs;
|
|
PVFS_sys_dist *dist;
|
|
|
|
memset(&attribs, 0, sizeof(PVFS_sys_attr));
|
|
|
|
attribs.owner = geteuid();
|
|
attribs.group = getegid();
|
|
attribs.perms = 0644;
|
|
attribs.mask = PVFS_ATTR_SYS_ALL_SETABLE;
|
|
attribs.atime = time(NULL);
|
|
attribs.mtime = attribs.atime;
|
|
attribs.ctime = attribs.atime;
|
|
|
|
if (stripe_width > 0 ) {
|
|
attribs.dfile_count = stripe_width;
|
|
attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
|
|
}
|
|
|
|
dist = NULL;
|
|
|
|
memset(&resp_lookup, 0, sizeof(resp_lookup));
|
|
memset(&resp_getparent, 0, sizeof(resp_getparent));
|
|
memset(&resp_create, 0, sizeof(resp_create));
|
|
|
|
ret = PVFS_sys_lookup(id,
|
|
pvfs2_name,
|
|
&(pvfs2_fs->credentials),
|
|
&resp_lookup,
|
|
PVFS2_LOOKUP_LINK_FOLLOW);
|
|
|
|
if ( ret == (-PVFS_ENOENT)) {
|
|
if (access_mode & MPI_MODE_CREATE) {
|
|
ret = PVFS_sys_getparent(id,
|
|
pvfs2_name,
|
|
&(pvfs2_fs->credentials),
|
|
&resp_getparent);
|
|
if (ret < 0) {
|
|
opal_output (1, "pvfs_sys_getparent returns with %d\n", ret);
|
|
o_status->error = ret;
|
|
return;
|
|
}
|
|
|
|
/* Set the distribution stripe size if specified */
|
|
if (0 < stripe_size) {
|
|
/* Note that the distribution is hardcoded here */
|
|
dist = PVFS_sys_dist_lookup ("simple_stripe");
|
|
ret = PVFS_sys_dist_setparam (dist,
|
|
"strip_size",
|
|
&stripe_size);
|
|
if (ret < 0) {
|
|
opal_output (1,
|
|
"pvfs_sys_dist_setparam returns with %d\n", ret);
|
|
o_status->error = ret;
|
|
}
|
|
}
|
|
|
|
/* Perform file creation */
|
|
ret = PVFS_sys_create(resp_getparent.basename,
|
|
resp_getparent.parent_ref,
|
|
attribs,
|
|
&(pvfs2_fs->credentials),
|
|
dist,
|
|
&resp_create);
|
|
/*
|
|
#ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
|
|
ret = PVFS_sys_create(resp_getparent.basename,
|
|
resp_getparent.parent_ref,
|
|
attribs,
|
|
&(pvfs2_fs->credentials),
|
|
dist,
|
|
&resp_create);
|
|
#else
|
|
ret = PVFS_sys_create(resp_getparent.basename,
|
|
resp_getparent.parent_ref,
|
|
attribs,
|
|
&(pvfs2_fs->credentials),
|
|
dist,
|
|
NULL,
|
|
&resp_create);
|
|
#endif
|
|
*/
|
|
|
|
/* if many creates are happening in this directory, the earlier
|
|
* sys_lookup may have returned ENOENT, but the sys_create could
|
|
* return EEXISTS. That means the file has been created anyway, so
|
|
* less work for us and we can just open it up and return the
|
|
* handle */
|
|
if (ret == (-PVFS_EEXIST)) {
|
|
ret = PVFS_sys_lookup(id,
|
|
pvfs2_name,
|
|
&(pvfs2_fs->credentials),
|
|
&resp_lookup,
|
|
PVFS2_LOOKUP_LINK_FOLLOW);
|
|
if ( ret < 0 ) {
|
|
o_status->error = ret;
|
|
return;
|
|
}
|
|
o_status->error = ret;
|
|
o_status->object_ref = resp_lookup.ref;
|
|
return;
|
|
}
|
|
o_status->object_ref = resp_create.ref;
|
|
}
|
|
else {
|
|
opal_output (10, "cannot create file without MPI_MODE_CREATE\n");
|
|
o_status->error = ret;
|
|
return;
|
|
}
|
|
}
|
|
else if (access_mode & MPI_MODE_EXCL) {
|
|
/* lookup should not succeed if opened with EXCL */
|
|
o_status->error = -PVFS_EEXIST;
|
|
return;
|
|
}
|
|
else {
|
|
o_status->object_ref = resp_lookup.ref;
|
|
}
|
|
o_status->error = ret;
|
|
return;
|
|
}
|