1
1

fs/ime & fbtl/ime: Support of IME file system

Signed-off-by: raafatfeki <fekiraafat@gmail.com>
This commit is contained in:
raafatfeki 2020-06-15 11:28:34 -04:00
parent 465414953d
commit 6e145188d9
23 changed files with 1467 additions and 0 deletions

62
config/ompi_check_ime.m4 Normal file
View File

@ -0,0 +1,62 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2018 DataDirect Networks. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl
# OMPI_CHECK_IME(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# check if IME support can be found. sets prefix_{CPPFLAGS,
# LDFLAGS, LIBS} as needed and runs action-if-found if there is
# support, otherwise executes action-if-not-found
AC_DEFUN([OMPI_CHECK_IME],[
    dnl Scratch variables local to this check.  ompi_check_ime_happy starts
    dnl as "yes" and is set to "no" when IME is disabled or cannot be found.
    check_ime_CPPFLAGS=
    check_ime_LDFLAGS=
    check_ime_LIBS=

    check_ime_configuration="none"
    ompi_check_ime_happy="yes"

    # Get some configuration information
    dnl AS_HELP_STRING replaces the obsolete AC_HELP_STRING spelling.
    AC_ARG_WITH([ime],
        [AS_HELP_STRING([--with-ime(=DIR)],
            [Build IME support, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])])
    OPAL_CHECK_WITHDIR([ime], [$with_ime], [include/ime_native.h])

    AS_IF([test "$with_ime" = "no"],
        [ompi_check_ime_happy="no"],
        [AS_IF([test -z "$with_ime"],
               [ompi_check_ime_dir="/usr/local"],
               [ompi_check_ime_dir=$with_ime])

         # Use lib64 when the installation provides it and lib otherwise
         if test -e "$ompi_check_ime_dir/lib64" ; then
             ompi_check_ime_libdir="$ompi_check_ime_dir/lib64"
         else
             ompi_check_ime_libdir="$ompi_check_ime_dir/lib"
         fi

         # Add correct -I and -L flags
         # Probe for ime_client_native2_init first and fall back to
         # ime_native_init when the first probe fails
         OPAL_CHECK_PACKAGE([$1], [ime_native.h], [im_client], [ime_client_native2_init], [],
                            [$ompi_check_ime_dir], [$ompi_check_ime_libdir],
                            [ompi_check_ime_happy="yes"],
                            [OPAL_CHECK_PACKAGE([$1], [ime_native.h], [im_client], [ime_native_init], [],
                                                [$ompi_check_ime_dir], [$ompi_check_ime_libdir],
                                                [ompi_check_ime_happy="yes"],
                                                [ompi_check_ime_happy="no"])
                            ])
        ])

    dnl Run the caller supplied action.  A note is printed when IME was
    dnl explicitly requested but could not be found.
    AS_IF([test "$ompi_check_ime_happy" = "yes"],
          [$2],
          [AS_IF([test ! -z "$with_ime" && test "$with_ime" != "no"],
                 [echo IME support not found])
           $3])
])

View File

@ -107,6 +107,7 @@ enum ompio_fs_type
PVFS2 = 2,
LUSTRE = 3,
PLFS = 4,
IME = 5,
GPFS = 6
};

View File

@ -0,0 +1,40 @@
#
# Copyright (c) 2018 DataDirect Networks. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Build the component either as an installable DSO (mca_fbtl_ime.la) or as
# a non-installed library (libmca_fbtl_ime.la), depending on the
# MCA_BUILD_ompi_fbtl_ime_DSO conditional.  Exactly one of component_noinst
# and component_install is non-empty.
if MCA_BUILD_ompi_fbtl_ime_DSO
component_noinst =
component_install = mca_fbtl_ime.la
else
component_noinst = libmca_fbtl_ime.la
component_install =
endif
# Source files
fbtl_ime_sources = \
fbtl_ime.h \
fbtl_ime.c \
fbtl_ime_component.c \
fbtl_ime_blocking_op.c \
fbtl_ime_nonblocking_op.c
# Preprocessor flags for the IME headers
AM_CPPFLAGS = $(fbtl_ime_CPPFLAGS)
# DSO flavour: installed into $(ompilibdir) and linked against the MPI
# library plus the IME libraries
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_fbtl_ime_la_SOURCES = $(fbtl_ime_sources)
mca_fbtl_ime_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
$(fbtl_ime_LIBS)
mca_fbtl_ime_la_LDFLAGS = -module -avoid-version $(fbtl_ime_LDFLAGS)
# Non-installed flavour: same sources, only the IME libraries are added
noinst_LTLIBRARIES = $(component_noinst)
libmca_fbtl_ime_la_SOURCES = $(fbtl_ime_sources)
libmca_fbtl_ime_la_LIBADD = $(fbtl_ime_LIBS)
libmca_fbtl_ime_la_LDFLAGS = -module -avoid-version $(fbtl_ime_LDFLAGS)

View File

@ -0,0 +1,29 @@
# -*- shell-script -*-
#
# Copyright (c) 2018 DataDirect Networks. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_fbtl_ime_CONFIG(action-if-can-compile,
# [action-if-cant-compile])
# ------------------------------------------------
dnl Configure hook of the fbtl ime component: registers the component
dnl Makefile, probes for IME through OMPI_CHECK_IME and runs the first
dnl argument when IME is usable, the second one otherwise.
AC_DEFUN([MCA_ompi_fbtl_ime_CONFIG],[
AC_CONFIG_FILES([ompi/mca/fbtl/ime/Makefile])
dnl The fbtl_ime prefix selects the variable names that receive the flags.
OMPI_CHECK_IME([fbtl_ime],
[fbtl_ime_happy="yes"],
[fbtl_ime_happy="no"])
AS_IF([test "$fbtl_ime_happy" = "yes"],
[$1],
[$2])
# substitute in the things needed to build ime
AC_SUBST([fbtl_ime_CPPFLAGS])
AC_SUBST([fbtl_ime_LDFLAGS])
AC_SUBST([fbtl_ime_LIBS])
])dnl

View File

@ -0,0 +1,182 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/fbtl/fbtl.h"
#include "ompi/mca/fbtl/ime/fbtl_ime.h"
/*
* *******************************************************************
* ************************ actions structure ************************
* *******************************************************************
*/
/* Function table of the IME fbtl module. A pointer to it is returned by
 * mca_fbtl_ime_component_file_query() when the file lives on IME.
 * The initializers are positional. */
static mca_fbtl_base_module_1_0_0_t ime = {
mca_fbtl_ime_module_init, /* initialise after being selected */
mca_fbtl_ime_module_finalize, /* close a module on a communicator */
mca_fbtl_ime_preadv, /* blocking read */
mca_fbtl_ime_ipreadv, /* non-blocking read */
mca_fbtl_ime_pwritev, /* blocking write */
mca_fbtl_ime_ipwritev, /* non-blocking write */
mca_fbtl_ime_progress, /* module specific progress */
mca_fbtl_ime_request_free /* free module specific data items on the request */
};
/*
* *******************************************************************
* ************************* structure ends **************************
* *******************************************************************
*/
/*
 * Component-wide query hook.
 *
 * Both thread-support flags are ignored: this component has no
 * thread-level dependent setup and always reports success.
 */
int mca_fbtl_ime_component_init_query(bool enable_progress_threads,
                                      bool enable_mpi_threads)
{
    (void) enable_progress_threads;
    (void) enable_mpi_threads;

    return OMPI_SUCCESS;
}
/*
 * Per-file query hook.
 *
 * Stores the component priority in *priority and, like the FS component,
 * only hands out the module when the file system type recorded in the
 * file handle is IME. In that case the priority is raised to at least
 * FBTL_IME_INCREASED_PRIORITY. Returns NULL for every other file system.
 */
struct mca_fbtl_base_module_1_0_0_t *
mca_fbtl_ime_component_file_query (ompio_file_t *fh, int *priority)
{
    *priority = mca_fbtl_ime_priority;

    if (IME != fh->f_fstype) {
        /* IME cannot handle the IO operations of this file. */
        return NULL;
    }

    if (FBTL_IME_INCREASED_PRIORITY > *priority) {
        *priority = FBTL_IME_INCREASED_PRIORITY;
    }

    return &ime;
}
/*
 * Counterpart of mca_fbtl_ime_component_file_query().
 *
 * The query step leaves nothing behind that would have to be undone when
 * this module is not selected, so the function only reports success.
 */
int mca_fbtl_ime_component_file_unquery (ompio_file_t *file)
{
    (void) file;

    return OMPI_SUCCESS;
}
/*
 * Module initialisation hook; the IME fbtl module keeps no per-file
 * state, so there is nothing to set up.
 */
int mca_fbtl_ime_module_init (ompio_file_t *file)
{
    (void) file;

    return OMPI_SUCCESS;
}
/*
 * Module finalisation hook; nothing was allocated in
 * mca_fbtl_ime_module_init(), so nothing has to be released.
 */
int mca_fbtl_ime_module_finalize (ompio_file_t *file)
{
    (void) file;

    return OMPI_SUCCESS;
}
/*
 * Progress function of a non-blocking IME request.
 *
 * Scans the status slots of the currently active batch
 * [aio_first_active_req, aio_last_active_req), accounts for every slot that
 * has completed (successfully or not) and, once a full batch is done while
 * requests are still open, submits the next batch.
 *
 * Returns true when the OMPIO request is complete, either because all IME
 * requests finished (MPI_ERROR = OMPI_SUCCESS) or because the current batch
 * terminated with at least one failure (MPI_ERROR = OMPI_ERROR). In both
 * cases _ucount is set to the number of bytes accumulated so far.
 * Returns false while requests are still pending.
 */
bool mca_fbtl_ime_progress ( mca_ompio_request_t *req)
{
    int i=0, lcount=0;
    mca_fbtl_ime_request_data_t *data=(mca_fbtl_ime_request_data_t *)req->req_data;

    /* Go through all the requests in the current batch to check
     * if they have finished. */
    for (i=data->aio_first_active_req; i < data->aio_last_active_req; i++ ) {
        if ( data->aio_req_status[i] == FBTL_IME_REQ_CLOSED ) {
            /* already accounted for in a previous call */
            lcount++;
        }
        else if ( data->aio_req_status[i] >= 0 ) {
            /* request has finished; the status holds the byte count */
            data->aio_open_reqs--;
            lcount++;
            data->aio_total_len += data->aio_req_status[i];
            data->aio_req_status[i] = FBTL_IME_REQ_CLOSED;
        }
        else if ( data->aio_req_status[i] == FBTL_IME_REQ_ERROR ) {
            /* an error occurred. */
            data->aio_open_reqs--;
            lcount++;
            data->aio_req_fail_count++;
            data->aio_req_status[i] = FBTL_IME_REQ_CLOSED;
        }
        else {
            /* not yet done */
        }
    }

    /* In case the current batch of requests terminated, exit if an error
     * happened for any request.
     */
    if ( data->aio_req_fail_count > 0 &&
         lcount == data->aio_last_active_req - data->aio_first_active_req ) {
        goto error_exit;
    }

    /* In case some requests are pending, and no error happened in any of the
     * previous requests, then the next batch of operations should be prepared.
     */
    if ( (lcount == data->aio_req_chunks) && (0 != data->aio_open_reqs) ) {
        /* prepare the next batch of operations */
        data->aio_first_active_req = data->aio_last_active_req;
        if ( (data->aio_req_count-data->aio_last_active_req) > data->aio_req_chunks ) {
            data->aio_last_active_req += data->aio_req_chunks;
        }
        else {
            data->aio_last_active_req = data->aio_req_count;
        }

        /* Send the requests. On a submission failure the failing request
         * becomes the last one of the batch, so the remaining ones are
         * never started. */
        for ( i=data->aio_first_active_req; i< data->aio_last_active_req; i++ ) {
            if ( FBTL_IME_READ == data->aio_req_type &&
                 ime_native_aio_read(&data->aio_reqs[i]) < 0 ) {
                opal_output(1, "mca_fbtl_ime_progress: error in aio_read()");
                data->aio_req_status[i] = FBTL_IME_REQ_ERROR;
                data->aio_last_active_req = i + 1;
                break;
            }
            else if ( FBTL_IME_WRITE == data->aio_req_type &&
                      ime_native_aio_write(&data->aio_reqs[i]) < 0 ) {
                opal_output(1, "mca_fbtl_ime_progress: error in aio_write()");
                data->aio_req_status[i] = FBTL_IME_REQ_ERROR;
                data->aio_last_active_req = i + 1;
                break;
            }
        }
    }

    if ( 0 == data->aio_open_reqs ) {
        /* all pending operations are finished for this request */
        req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;
        req->req_ompi.req_status._ucount = data->aio_total_len;
        return true;
    }
    return false;

error_exit:
    req->req_ompi.req_status.MPI_ERROR = OMPI_ERROR;
    req->req_ompi.req_status._ucount = data->aio_total_len;
    return true;
}
/*
 * Release the fbtl specific data attached to an OMPIO request.
 *
 * Frees the combined iovec/request/status buffer and the bookkeeping
 * structure itself, then clears req->req_data. A request without attached
 * data is left untouched.
 */
void mca_fbtl_ime_request_free ( mca_ompio_request_t *req)
{
    mca_fbtl_ime_request_data_t *ime_data =
        (mca_fbtl_ime_request_data_t *) req->req_data;

    if (NULL == ime_data) {
        return;
    }

    free (ime_data->allocated_data);
    free (ime_data);
    req->req_data = NULL;
}
/*
 * Completion callback installed on every IME aio request.
 *
 * The request's user_context carries the address of its status slot.
 * The slot receives the number of transferred bytes when err is 0 and
 * FBTL_IME_REQ_ERROR otherwise.
 */
void mca_fbtl_ime_complete_cb (struct ime_aiocb *aiocb, int err, ssize_t bytes)
{
    ssize_t *status_slot = (ssize_t *) aiocb->user_context;

    if (0 == err) {
        *status_slot = bytes;
    }
    else {
        *status_slot = FBTL_IME_REQ_ERROR;
    }
}

View File

@ -0,0 +1,99 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_FBTL_IME_H
#define MCA_FBTL_IME_H
#include "ime_native.h"
#include "ompi_config.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/fbtl/fbtl.h"
#include "ompi/mca/common/ompio/common_ompio.h"
#include "ompi/mca/common/ompio/common_ompio_request.h"
extern int mca_fbtl_ime_priority;
extern int mca_fbtl_ime_iov_max;
extern int mca_fbtl_ime_aio_reqs_max;
#define FBTL_IME_BASE_PRIORITY 0
#define FBTL_IME_INCREASED_PRIORITY 50
#define FBTL_IME_AIO_REQS_MAX 128
#ifdef IME_IOV_MAX
#define FBTL_IME_IOV_MAX IME_IOV_MAX
#else
#define FBTL_IME_IOV_MAX 1024
#endif
BEGIN_C_DECLS
int mca_fbtl_ime_component_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
struct mca_fbtl_base_module_1_0_0_t *
mca_fbtl_ime_component_file_query (ompio_file_t *file, int *priority);
int mca_fbtl_ime_component_file_unquery (ompio_file_t *file);
int mca_fbtl_ime_module_init (ompio_file_t *file);
int mca_fbtl_ime_module_finalize (ompio_file_t *file);
OMPI_MODULE_DECLSPEC extern mca_fbtl_base_component_2_0_0_t mca_fbtl_ime_component;
/*
* ******************************************************************
* ********* functions which are implemented in this module *********
* ******************************************************************
*/
ssize_t mca_fbtl_ime_preadv (ompio_file_t *file );
ssize_t mca_fbtl_ime_pwritev (ompio_file_t *file );
ssize_t mca_fbtl_ime_ipreadv (ompio_file_t *file,
ompi_request_t *request);
ssize_t mca_fbtl_ime_ipwritev (ompio_file_t *file,
ompi_request_t *request);
bool mca_fbtl_ime_progress (mca_ompio_request_t *req);
void mca_fbtl_ime_request_free (mca_ompio_request_t *req);
void mca_fbtl_ime_complete_cb (struct ime_aiocb *aiocb, int err, ssize_t bytes);
/* Bookkeeping of one non-blocking IME operation. It is filled in by the
 * non-blocking read/write path, stored in the OMPIO request's req_data and
 * consumed by mca_fbtl_ime_progress() / mca_fbtl_ime_request_free(). */
struct mca_fbtl_ime_request_data_t {
int aio_req_count; /* total number of aio reqs */
int aio_open_reqs; /* number of unfinished reqs */
int aio_req_type; /* read or write (FBTL_IME_READ / FBTL_IME_WRITE) */
int aio_req_chunks; /* max. no. of aio reqs that can be posted at once */
int aio_first_active_req; /* index of the first req of the active batch */
int aio_last_active_req; /* index one past the last req of the active batch */
int aio_req_fail_count; /* number of requests that failed */
struct iovec *aio_iovecs; /* array of iovecs copied from the file handle */
struct ime_aiocb *aio_reqs; /* array of aio requests that will be sent to IME */
ssize_t *aio_req_status; /* per-request status: byte count when done,
otherwise one of the negative FBTL_IME_* codes */
ssize_t aio_total_len; /* total number of bytes reported by completed reqs */
ompio_file_t *aio_fh; /* pointer back to the mca_io_ompio_fh structure */
void *allocated_data; /* pointer to the allocated space
that will contain all the necessary iovecs,
IME requests and their statuses */
};
typedef struct mca_fbtl_ime_request_data_t mca_fbtl_ime_request_data_t;
/* define constants for read/write operations */
#define FBTL_IME_READ 1
#define FBTL_IME_WRITE 2
#define FBTL_IME_IN_PROGRESS -1
#define FBTL_IME_REQ_ERROR -2
#define FBTL_IME_REQ_CLOSED -3
/*
* ******************************************************************
* ************ functions implemented in this module end ************
* ******************************************************************
*/
END_C_DECLS
#endif /* MCA_FBTL_IME_H */

View File

@ -0,0 +1,122 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "fbtl_ime.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/mca/fbtl/fbtl.h"
static ssize_t mca_fbtl_ime_blocking_op(ompio_file_t *fh, int io_op);
/* Blocking read: forwards to the shared blocking helper in read mode. */
ssize_t mca_fbtl_ime_preadv(ompio_file_t *fh)
{
    ssize_t bytes_read = mca_fbtl_ime_blocking_op(fh, FBTL_IME_READ);

    return bytes_read;
}
/* Blocking write: forwards to the shared blocking helper in write mode. */
ssize_t mca_fbtl_ime_pwritev(ompio_file_t *fh)
{
    ssize_t bytes_written = mca_fbtl_ime_blocking_op(fh, FBTL_IME_WRITE);

    return bytes_written;
}
/*
 * Shared implementation of the blocking read and write entry points.
 *
 * Walks fh->f_io_array, gathers entries that are contiguous in the file
 * into one iovec array and issues one ime_native_preadv()/pwritev() call
 * per gathered run (a run also ends when mca_fbtl_ime_iov_max iovecs have
 * been collected).
 *
 * fh     file handle providing f_io_array, f_num_of_io_entries and fd
 * io_op  FBTL_IME_READ or FBTL_IME_WRITE
 *
 * Returns the sum of the byte counts reported by the IME calls,
 * OMPI_ERR_OUT_OF_RESOURCE when the iovec array cannot be (re)allocated and
 * OMPI_ERROR when f_io_array is NULL, an IME call fails or io_op is not
 * supported.
 */
static ssize_t mca_fbtl_ime_blocking_op(ompio_file_t *fh, int io_op)
{
    int i, block = 1;
    struct iovec *iov = NULL;
    int iov_count = 0;
    OMPI_MPI_OFFSET_TYPE iov_offset = 0;
    ssize_t bytes_processed = 0, ret_code = 0;

    if (NULL == fh->f_io_array) {
        return OMPI_ERROR;
    }

    iov = (struct iovec *) malloc
        (OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec));
    if (NULL == iov) {
        opal_output(1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Go through all IO entries and try to aggregate them. */
    for (i = 0 ; i < fh->f_num_of_io_entries; i++) {
        iov[iov_count].iov_base = fh->f_io_array[i].memory_address;
        iov[iov_count].iov_len = fh->f_io_array[i].length;
        iov_count++;

        /* Save the file offset if the current iovec is
           the first one in the iovec array. */
        if (iov_count == 1) {
            iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset;
        }

        /* Allocate more memory for the iovecs if necessary */
        if (iov_count == OMPIO_IOVEC_INITIAL_SIZE * block) {
            block++;
            struct iovec *new_iov = (struct iovec *) realloc(iov,
                OMPIO_IOVEC_INITIAL_SIZE * block * sizeof(struct iovec));
            if (new_iov == NULL) {
                /* realloc left the old buffer untouched; release it */
                free(iov);
                opal_output(1, "OUT OF MEMORY\n");
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            /* Adopt the grown buffer: realloc may have moved the data and
               released the old block, so the old pointer must not be used
               any more. */
            iov = new_iov;
        }

        /* If:
           - There is no next iovec
           - OR the next iovec is not "contiguous"
           - OR we exceeded the advised number of iovecs for IME
           Then: pwritev/preadv shall be called,
                 and the iovec array reset */
        if (i+1 == fh->f_num_of_io_entries ||
            ((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset +
             (ptrdiff_t)fh->f_io_array[i].length) !=
              (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset ||
            iov_count >= mca_fbtl_ime_iov_max ) {

            switch (io_op) {
            case FBTL_IME_READ:
                ret_code = ime_native_preadv(fh->fd, iov, iov_count, iov_offset);
                if (ret_code < 0) {
                    opal_output(1, "mca_fbtl_ime_blocking_op: error in "
                                   "ime_native_preadv error ret=%zd  %s",
                                   ret_code, strerror(errno));
                    goto error_exit;
                }
                break;

            case FBTL_IME_WRITE:
                ret_code = ime_native_pwritev(fh->fd, iov, iov_count, iov_offset);
                if (ret_code < 0) {
                    opal_output(1, "mca_fbtl_ime_blocking_op: error in "
                                   "ime_native_pwritev error ret=%zd  %s",
                                   ret_code, strerror(errno));
                    goto error_exit;
                }
                break;

            default:
                opal_output(1, "mca_fbtl_ime_blocking_op: an unsupported "
                               "IO operation was requested. io_op=%d", io_op);
                goto error_exit;
            }

            bytes_processed += ret_code;
            /* start gathering the next run from the beginning of iov */
            iov_count = 0;
        }
    }

    free (iov);
    return bytes_processed;

error_exit:
    free(iov);
    return OMPI_ERROR;
}

View File

@ -0,0 +1,79 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "fbtl_ime.h"
#include "mpi.h"
int mca_fbtl_ime_priority = FBTL_IME_BASE_PRIORITY;
int mca_fbtl_ime_iov_max = FBTL_IME_IOV_MAX;
int mca_fbtl_ime_aio_reqs_max = FBTL_IME_AIO_REQS_MAX;
/*
* Private functions
*/
static int register_component(void);
/*
* Public string showing the fbtl ime component version number
*/
const char *mca_fbtl_ime_component_version_string =
"OMPI/MPI IME FBTL MCA component version " OMPI_VERSION;
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
/* Public component descriptor of the IME fbtl component: component name
 * and version, the parameter registration hook and the three query
 * callbacks. */
mca_fbtl_base_component_2_0_0_t mca_fbtl_ime_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
.fbtlm_version = {
MCA_FBTL_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "ime",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* registers the iov_max and aio_reqs_max parameters */
.mca_register_component_params = register_component,
},
.fbtlm_data = {
/* This component is checkpointable */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
.fbtlm_init_query = mca_fbtl_ime_component_init_query, /* get thread level */
.fbtlm_file_query = mca_fbtl_ime_component_file_query, /* get priority and actions */
.fbtlm_file_unquery = mca_fbtl_ime_component_file_unquery, /* undo what was done by previous function */
};
/*
 * Register the MCA parameters of the fbtl ime component.
 *
 * Each backing variable is reset to its compile-time default right before
 * it is registered. The return values of the registration calls are
 * deliberately ignored and the function always reports success.
 */
static int register_component(void)
{
    /* iov_max: upper bound on the iovec count of one IME native call */
    mca_fbtl_ime_iov_max = FBTL_IME_IOV_MAX;
    (void) mca_base_component_var_register(
        &mca_fbtl_ime_component.fbtlm_version,
        "iov_max",
        "Maximum iov count that should be used when "
        "calling an IME native function",
        MCA_BASE_VAR_TYPE_INT,
        NULL,
        0,
        0,
        OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY,
        &mca_fbtl_ime_iov_max);

    /* aio_reqs_max: number of aiocb requests posted at once */
    mca_fbtl_ime_aio_reqs_max = FBTL_IME_AIO_REQS_MAX;
    (void) mca_base_component_var_register(
        &mca_fbtl_ime_component.fbtlm_version,
        "aio_reqs_max",
        "Maximum number of aiocb requests that should "
        "be sent simultaneously when calling an IME native function",
        MCA_BASE_VAR_TYPE_INT,
        NULL,
        0,
        0,
        OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY,
        &mca_fbtl_ime_aio_reqs_max );

    return OMPI_SUCCESS;
}

View File

@ -0,0 +1,175 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "fbtl_ime.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/mca/fbtl/fbtl.h"
static ssize_t mca_fbtl_ime_nonblocking_op (ompio_file_t *fh,
ompi_request_t *request, int io_op);
/* Non-blocking read: forwards to the shared non-blocking helper in read mode. */
ssize_t mca_fbtl_ime_ipreadv (ompio_file_t *fh, ompi_request_t *request)
{
    ssize_t rc = mca_fbtl_ime_nonblocking_op(fh, request, FBTL_IME_READ);

    return rc;
}
/* Non-blocking write: forwards to the shared non-blocking helper in write mode. */
ssize_t mca_fbtl_ime_ipwritev (ompio_file_t *fh, ompi_request_t *request)
{
    ssize_t rc = mca_fbtl_ime_nonblocking_op(fh, request, FBTL_IME_WRITE);

    return rc;
}
/*
 * Shared implementation of the non-blocking read and write entry points.
 *
 * Builds the per-request bookkeeping structure, turns fh->f_io_array into
 * IME aio requests (file-contiguous entries are merged into one request up
 * to mca_fbtl_ime_iov_max iovecs), starts the first batch of at most
 * mca_fbtl_ime_aio_reqs_max requests and attaches the bookkeeping, the
 * progress function and the free function to the OMPIO request.
 *
 * Returns OMPI_SUCCESS once the request has been set up, even when a
 * submission failed (the failure is then reported through the progress
 * function), OMPI_ERR_OUT_OF_RESOURCE when an allocation fails and
 * OMPI_ERROR when io_op is neither FBTL_IME_READ nor FBTL_IME_WRITE.
 */
static ssize_t mca_fbtl_ime_nonblocking_op (ompio_file_t *fh,
ompi_request_t *request, int io_op)
{
mca_fbtl_ime_request_data_t *data;
mca_ompio_request_t *req = (mca_ompio_request_t *) request;
int i=0, req_index = 0, ret;
data = (mca_fbtl_ime_request_data_t *) malloc ( sizeof (mca_fbtl_ime_request_data_t));
if ( NULL == data ) {
opal_output (1,"could not allocate memory\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* We might allocate too much memory here because we don't know
how many IME requests will be necessary.
We will use all the iovec "slots" in the array,
but maybe not all the request and request status slots.
That is, because an IME request can handle several iovecs,
not just one. */
data->allocated_data = (void*) malloc( fh->f_num_of_io_entries *
(sizeof(struct iovec) +
sizeof(struct ime_aiocb) +
sizeof(ssize_t)) );
if (NULL == data->allocated_data) {
opal_output(1, "OUT OF MEMORY\n");
free(data);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Carve the single allocation into three consecutive arrays of
f_num_of_io_entries elements each: iovecs, aio requests, statuses. */
data->aio_iovecs = (struct iovec *) data->allocated_data;
data->aio_reqs = (struct ime_aiocb *) (data->aio_iovecs +
fh->f_num_of_io_entries);
data->aio_req_status = (ssize_t *) (data->aio_reqs +
fh->f_num_of_io_entries);
/* Fill some attributes of the OMPIO request data */
data->aio_req_type = io_op; /* The correctness of io_op will be checked later */
data->aio_req_chunks = mca_fbtl_ime_aio_reqs_max;
data->aio_req_fail_count = 0;
data->aio_total_len = 0;
data->aio_fh = fh;
/* An iovcnt of 0 marks a request slot as not yet initialized. */
/* NOTE(review): this writes to slot 0 even when f_num_of_io_entries is 0;
presumably callers never pass an empty io array - confirm. */
data->aio_reqs[0].iovcnt = 0;
/* Go through all IO entries and try to aggregate them. */
for ( i=0; i<fh->f_num_of_io_entries; i++ ) {
data->aio_iovecs[i].iov_base = fh->f_io_array[i].memory_address;
data->aio_iovecs[i].iov_len = fh->f_io_array[i].length;
/* If the processed iovec will be the first in our ime_aiocb request,
then we initialize this aio request for IME. */
if (data->aio_reqs[req_index].iovcnt == 0) {
data->aio_reqs[req_index].iov = &data->aio_iovecs[i];
data->aio_reqs[req_index].iovcnt = 1;
data->aio_reqs[req_index].file_offset = (off_t)
fh->f_io_array[i].offset;
data->aio_reqs[req_index].fd = fh->fd;
data->aio_reqs[req_index].complete_cb = &mca_fbtl_ime_complete_cb;
/* the completion callback writes the result into this status slot */
data->aio_reqs[req_index].user_context = (intptr_t)
&data->aio_req_status[req_index];
data->aio_req_status[req_index] = FBTL_IME_IN_PROGRESS;
}
/* Here we check if the next iovec will be appended to
the current ime_aiocb request.
ie: if data is contiguous
AND we don't exceed the advised number of iovecs for IME
In that case, the next iovec will be appended to the IME req. */
if (i+1 != fh->f_num_of_io_entries &&
((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset +
(ptrdiff_t)fh->f_io_array[i].length) ==
(OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset &&
data->aio_reqs[req_index].iovcnt < mca_fbtl_ime_iov_max ) {
data->aio_reqs[req_index].iovcnt++;
}
/* Otherwise, we need to create a new request
(except if there is no next iovec to process) */
else if ( i+1 != fh->f_num_of_io_entries ) {
req_index++;
data->aio_reqs[req_index].iovcnt = 0;
}
}
/* Fill the missing attributes of the OMPI request */
data->aio_req_count = req_index + 1;
data->aio_open_reqs = req_index + 1;
data->aio_first_active_req = 0;
/* The first batch holds at most aio_req_chunks requests. */
if ( data->aio_req_count > data->aio_req_chunks ) {
data->aio_last_active_req = data->aio_req_chunks;
}
else {
data->aio_last_active_req = data->aio_req_count;
}
/* Actually start the requests (or at least the first batch).
In case an error happened when one request is started, we
don't send the next ones and mark the failing request as
the last active one. Finally we exit as if no error happened,
because some other requests might have already been started
and they need to be finalized properly (via the progress function).
*/
for (i=0; i < data->aio_last_active_req; i++) {
switch(io_op) {
case FBTL_IME_READ:
ret = ime_native_aio_read(&data->aio_reqs[i]);
if (ret < 0) {
opal_output(1, "mca_fbtl_ime_nonblocking_op: error in "
"ime_native_aio_read() error ret=%d %s",
ret, strerror(errno));
data->aio_req_status[i] = FBTL_IME_REQ_ERROR;
data->aio_last_active_req = i + 1;
goto standard_exit;
}
break;
case FBTL_IME_WRITE:
ret = ime_native_aio_write(&data->aio_reqs[i]);
if (ret < 0) {
opal_output(1, "mca_fbtl_ime_nonblocking_op: error in "
"ime_native_aio_write() error ret=%d %s",
ret, strerror(errno));
data->aio_req_status[i] = FBTL_IME_REQ_ERROR;
data->aio_last_active_req = i + 1;
goto standard_exit;
}
break;
default:
/* Reached on the first iteration, before any request was started,
so the bookkeeping can simply be released. */
opal_output(1, "mca_fbtl_ime_nonblocking_op: an unsupported "
"IO operation was requested. io_op=%d", io_op);
goto error_exit;
}
}
standard_exit:
/* Hand the bookkeeping over to the OMPIO request. */
req->req_data = data;
req->req_progress_fn = mca_fbtl_ime_progress;
req->req_free_fn = mca_fbtl_ime_request_free;
return OMPI_SUCCESS;
error_exit:
free(data->allocated_data);
free(data);
return OMPI_ERROR;
}

View File

@ -0,0 +1,7 @@
#
# owner/status file
# owner: institution that is responsible for this package
# status: e.g. active, maintenance, unmaintained
#
owner: DataDirect Networks
status: active

View File

@ -117,6 +117,9 @@ int mca_fs_base_get_fstype(char *fname )
else if ( 0 == strncasecmp(fstype, "pvfs2", sizeof("pvfs2"))) {
ompio_type = PVFS2;
}
else if ( 0 == strncasecmp(fstype, "ime", sizeof("ime"))) {
ompio_type = IME;
}
else if ( 0 == strncasecmp(fstype, "gpfs", sizeof("gpfs"))) {
ompio_type = GPFS;
}

View File

@ -0,0 +1,47 @@
#
# Copyright (c) 2018 DataDirect Networks. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_noinst and component_install is non-empty,
# depending on the MCA_BUILD_ompi_fs_ime_DSO conditional.
if MCA_BUILD_ompi_fs_ime_DSO
component_noinst =
component_install = mca_fs_ime.la
else
component_noinst = libmca_fs_ime.la
component_install =
endif
# Source files
fs_ime_sources = \
fs_ime.h \
fs_ime.c \
fs_ime_component.c \
fs_ime_file_open.c \
fs_ime_file_close.c \
fs_ime_file_delete.c \
fs_ime_file_sync.c \
fs_ime_file_set_size.c \
fs_ime_file_get_size.c
# Preprocessor flags for the IME headers
AM_CPPFLAGS = $(fs_ime_CPPFLAGS)
# DSO flavour: installed into $(ompilibdir) and linked against the MPI
# library plus the IME libraries
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_fs_ime_la_SOURCES = $(fs_ime_sources)
mca_fs_ime_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
$(fs_ime_LIBS)
mca_fs_ime_la_LDFLAGS = -module -avoid-version $(fs_ime_LDFLAGS)
# Non-installed flavour: same sources, only the IME libraries are added
noinst_LTLIBRARIES = $(component_noinst)
libmca_fs_ime_la_SOURCES = $(fs_ime_sources)
libmca_fs_ime_la_LIBADD = $(fs_ime_LIBS)
libmca_fs_ime_la_LDFLAGS = -module -avoid-version $(fs_ime_LDFLAGS)

View File

@ -0,0 +1,30 @@
# -*- shell-script -*-
#
# Copyright (c) 2018 DataDirect Networks. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_fs_ime_CONFIG(action-if-can-compile,
# [action-if-cant-compile])
# ------------------------------------------------
dnl Configure hook of the fs ime component: registers the component
dnl Makefile, probes for IME through OMPI_CHECK_IME, records the outcome in
dnl the configure summary and runs the first argument when IME is usable,
dnl the second one otherwise.
AC_DEFUN([MCA_ompi_fs_ime_CONFIG],[
AC_CONFIG_FILES([ompi/mca/fs/ime/Makefile])
dnl The fs_ime prefix selects the variable names that receive the flags.
OMPI_CHECK_IME([fs_ime],
[fs_ime_happy="yes"],
[fs_ime_happy="no"])
OPAL_SUMMARY_ADD([[OMPIO File Systems]],[[DDN Infinite Memory Engine]],[$1],[$fs_ime_happy])
AS_IF([test "$fs_ime_happy" = "yes"],
[$1],
[$2])
# substitute in the things needed to build ime
AC_SUBST([fs_ime_CPPFLAGS])
AC_SUBST([fs_ime_LDFLAGS])
AC_SUBST([fs_ime_LIBS])
])dnl

155
ompi/mca/fs/ime/fs_ime.c Normal file
View File

@ -0,0 +1,155 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ime_native.h"
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/fs/fs.h"
#include "ompi/mca/fs/base/base.h"
#include "ompi/mca/fs/ime/fs_ime.h"
/*
* *******************************************************************
* ************************ actions structure ************************
* *******************************************************************
*/
/* Function table of the IME fs module. A pointer to it is returned by
 * mca_fs_ime_component_file_query() when the file is detected to live on
 * IME. The initializers are positional. */
static mca_fs_base_module_1_0_0_t ime = {
mca_fs_ime_module_init, /* initialise after being selected */
mca_fs_ime_module_finalize, /* close a module on a communicator */
mca_fs_ime_file_open, /* open a file */
mca_fs_ime_file_close, /* close a file */
mca_fs_ime_file_delete, /* delete a file */
mca_fs_ime_file_set_size, /* set the file size */
mca_fs_ime_file_get_size, /* query the file size */
mca_fs_ime_file_sync /* sync a file */
};
/*
* *******************************************************************
* ************************* structure ends **************************
* *******************************************************************
*/
/*
* Private variables
*/
/* 1 once mca_fs_ime_module_init() has called ime_native_init(); reset to 0
 * by mca_fs_ime_native_fini(). */
static int mca_fs_ime_IS_INITIALIZED = 0;
/*
* Function decls
*/
/*
 * Component-wide query hook.
 *
 * Both thread-support flags are ignored: this component has no
 * thread-level dependent setup and always reports success.
 */
int mca_fs_ime_component_init_query(bool enable_progress_threads,
                                    bool enable_mpi_threads)
{
    (void) enable_progress_threads;
    (void) enable_mpi_threads;

    return OMPI_SUCCESS;
}
/*
 * Per-file query hook.
 *
 * Stores the component priority in *priority and determines whether the
 * file lives on IME:
 *   - a file name containing ':' is treated as IME when it starts with
 *     DEFAULT_IME_PREFIX_NO_FWD_SLASH;
 *   - otherwise the file system type is detected with
 *     mca_fs_base_get_fstype() and, when a communicator is available,
 *     broadcast from OMPIO_ROOT.
 * The module is only returned for IME files; in that case the priority is
 * raised to at least FS_IME_INCREASED_PRIORITY. Returns NULL otherwise.
 */
struct mca_fs_base_module_1_0_0_t *
mca_fs_ime_component_file_query (ompio_file_t *fh, int *priority)
{
    *priority = mca_fs_ime_priority;

    if (NULL != strchr (fh->f_filename, ':')) {
        /* Prefixed path: only the IME prefix selects this component. */
        if (0 == strncmp(fh->f_filename, DEFAULT_IME_PREFIX_NO_FWD_SLASH,
                         IME_FILE_PREFIX_LEN_NO_FWD_SLASH)) {
            fh->f_fstype = IME;
        }
    }
    else {
        /* The communicator might be NULL if we only want to delete the file */
        if (OMPIO_ROOT == fh->f_rank || MPI_COMM_NULL == fh->f_comm) {
            fh->f_fstype = mca_fs_base_get_fstype ( fh->f_filename );
        }
        if (MPI_COMM_NULL != fh->f_comm) {
            fh->f_comm->c_coll->coll_bcast (&(fh->f_fstype),
                                            1,
                                            MPI_INT,
                                            OMPIO_ROOT,
                                            fh->f_comm,
                                            fh->f_comm->c_coll->coll_bcast_module);
        }
    }

    /* A valid module is returned as long as the IME file system type was
       detected (this differs from what is done for LUSTRE or PVFS2). */
    if (IME != fh->f_fstype) {
        return NULL;
    }

    if (FS_IME_INCREASED_PRIORITY > *priority) {
        *priority = FS_IME_INCREASED_PRIORITY;
    }

    return &ime;
}
/*
 * Counterpart of mca_fs_ime_component_file_query().
 *
 * There are no steps that need to be undone when this module is not
 * selected, so the function only reports success.
 */
int mca_fs_ime_component_file_unquery (ompio_file_t *file)
{
    (void) file;

    return OMPI_SUCCESS;
}
/*
 * Module initialisation hook.
 *
 * Re-asserts the IME file system type on the file handle, so that it is
 * not overwritten by the last queried component, and initialises the IME
 * native library the first time a file selects this module.
 */
int mca_fs_ime_module_init (ompio_file_t *file)
{
    file->f_fstype = IME;

    if (0 != mca_fs_ime_IS_INITIALIZED) {
        /* IME native library already initialised by an earlier file */
        return OMPI_SUCCESS;
    }

    mca_fs_ime_IS_INITIALIZED = 1;
    ime_native_init();

    return OMPI_SUCCESS;
}
/*
 * Module finalisation hook.
 *
 * Intentionally does nothing: IME cannot be finalized here because other
 * files might still be using it. IME is finalized when the OMPIO
 * component is closed instead.
 */
int mca_fs_ime_module_finalize (ompio_file_t *file)
{
    (void) file;

    return OMPI_SUCCESS;
}
/*
 * Finalize the IME native library if this component initialised it.
 *
 * Returns OMPI_SUCCESS when IME was never initialised or when
 * ime_native_finalize() reports 0, OMPI_ERROR otherwise.
 */
int mca_fs_ime_native_fini()
{
    if (0 == mca_fs_ime_IS_INITIALIZED) {
        return OMPI_SUCCESS;
    }

    /* Resetting the flag is not strictly needed since this function is
       only called once, when OMPIO is closed. */
    mca_fs_ime_IS_INITIALIZED = 0;

    return (0 == ime_native_finalize()) ? OMPI_SUCCESS : OMPI_ERROR;
}

72
ompi/mca/fs/ime/fs_ime.h Normal file
View File

@ -0,0 +1,72 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_FS_IME_H
#define MCA_FS_IME_H
#include "ompi_config.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/fs/fs.h"
#include "ompi/mca/common/ompio/common_ompio.h"
extern int mca_fs_ime_priority;
extern int mca_fs_ime_lock_algorithm;
#define FS_IME_LOCK_AUTO 0
#define FS_IME_BASE_PRIORITY 20
#define FS_IME_INCREASED_PRIORITY 50
BEGIN_C_DECLS
int mca_fs_ime_component_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
struct mca_fs_base_module_1_0_0_t *
mca_fs_ime_component_file_query (ompio_file_t *fh, int *priority);
int mca_fs_ime_component_file_unquery (ompio_file_t *file);
int mca_fs_ime_module_init (ompio_file_t *file);
int mca_fs_ime_module_finalize (ompio_file_t *file);
int mca_fs_ime_native_fini();
OMPI_MODULE_DECLSPEC extern mca_fs_base_component_2_0_0_t mca_fs_ime_component;
/*
* ******************************************************************
* ********* functions which are implemented in this module *********
* ******************************************************************
*/
int mca_fs_ime_file_open (struct ompi_communicator_t *comm,
const char *filename,
int amode,
struct opal_info_t *info,
ompio_file_t *fh);
int mca_fs_ime_file_close (ompio_file_t *fh);
int mca_fs_ime_file_delete (char *filename,
struct opal_info_t *info);
int mca_fs_ime_file_set_size (ompio_file_t *fh,
OMPI_MPI_OFFSET_TYPE size);
int mca_fs_ime_file_get_size (ompio_file_t *fh,
OMPI_MPI_OFFSET_TYPE *size);
int mca_fs_ime_file_sync (ompio_file_t *fh);
/*
* ******************************************************************
* ************ functions implemented in this module end ************
* ******************************************************************
*/
END_C_DECLS
#endif /* MCA_FS_IME_H */

View File

@ -0,0 +1,77 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "fs_ime.h"
#include "mpi.h"
int mca_fs_ime_priority = FS_IME_BASE_PRIORITY;
int mca_fs_ime_lock_algorithm = FS_IME_LOCK_AUTO;
/*
* Private functions
*/
static int register_component(void);
/*
* Public string showing the fs ime component version number
*/
const char *mca_fs_ime_component_version_string =
"OMPI/MPI IME FS MCA component version " OMPI_VERSION;
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
/* Public component descriptor of the IME fs component: component name and
 * version, the parameter registration hook and the three query
 * callbacks. */
mca_fs_base_component_2_0_0_t mca_fs_ime_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
.fsm_version = {
MCA_FS_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "ime",
MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION),
/* registers the priority and lock_algorithm parameters */
.mca_register_component_params = register_component,
},
.fsm_data = {
/* This component is checkpointable */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
.fsm_init_query = mca_fs_ime_component_init_query, /* get thread level */
.fsm_file_query = mca_fs_ime_component_file_query, /* get priority and actions */
.fsm_file_unquery = mca_fs_ime_component_file_unquery, /* undo what was done by previous function */
};
/*
 * register_component
 *
 * Function:    - resets the component tunables to their built-in defaults
 *                and registers them as the "priority" and
 *                "lock_algorithm" MCA variables
 * Accepts:     - nothing
 * Returns:     - OMPI_SUCCESS, always (the results of the individual
 *                registrations are deliberately discarded)
 */
static int register_component(void)
{
    /* Start both tunables from their built-in defaults. */
    mca_fs_ime_priority       = FS_IME_BASE_PRIORITY;
    mca_fs_ime_lock_algorithm = FS_IME_LOCK_AUTO;

    /* "priority" variable, backed by mca_fs_ime_priority. */
    (void) mca_base_component_var_register(
        &mca_fs_ime_component.fsm_version,
        "priority",
        "Priority of the fs ime component",
        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
        OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY,
        &mca_fs_ime_priority);

    /* "lock_algorithm" variable, backed by mca_fs_ime_lock_algorithm. */
    (void) mca_base_component_var_register(
        &mca_fs_ime_component.fsm_version,
        "lock_algorithm",
        "Locking algorithm used by the fs ime component.  0: auto (default)",
        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
        OPAL_INFO_LVL_9,
        MCA_BASE_VAR_SCOPE_READONLY,
        &mca_fs_ime_lock_algorithm);

    return OMPI_SUCCESS;
}

View File

@ -0,0 +1,42 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ime_native.h"
#include "ompi_config.h"
#include "fs_ime.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/mca/fs/base/base.h"
#include "ompi/mca/fs/fs.h"
/*
 * file_close_ime
 *
 * Function:    - closes an open file
 * Accepts:     - file handle
 * Returns:     - OMPI_SUCCESS if the file was closed, otherwise the MPI
 *                error code derived from errno
 */
int mca_fs_ime_file_close (ompio_file_t *fh)
{
    /* Barrier on the file's communicator before the descriptor goes away;
     * the barrier's own return value is not inspected. */
    fh->f_comm->c_coll->coll_barrier (fh->f_comm,
                                      fh->f_comm->c_coll->coll_barrier_module);

    /* Clear errno so that a failure below reports the close error only. */
    errno = 0;
    if (0 == ime_native_close(fh->fd)) {
        return OMPI_SUCCESS;
    }

    return mca_fs_base_get_mpi_err(errno);
}

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ime_native.h"
#include "ompi_config.h"
#include "fs_ime.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/mca/fs/base/base.h"
#include "ompi/mca/fs/fs.h"
/*
 * file_delete_ime
 *
 * Function:    - deletes a file
 * Accepts:     - file name & info (info is not used by this implementation)
 * Returns:     - OMPI_SUCCESS if the file was deleted, otherwise the MPI
 *                error code derived from errno
 */
int mca_fs_ime_file_delete (char* file_name,
                            struct opal_info_t *info)
{
    /* Clear errno so that a failure reports the unlink error only. */
    errno = 0;

    return (0 == ime_native_unlink(file_name))
               ? OMPI_SUCCESS
               : mca_fs_base_get_mpi_err(errno);
}

View File

@ -0,0 +1,44 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ime_native.h"
#include "ompi_config.h"
#include "fs_ime.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/mca/fs/base/base.h"
#include "ompi/mca/fs/fs.h"
/*
 * file_get_size_ime
 *
 * Function:    - get_size of a file
 * Accepts:     - same arguments as MPI_File_get_size()
 * Returns:     - OMPI_SUCCESS if the size was obtained, otherwise the MPI
 *                error code derived from errno
 *
 * The size is found by seeking to the end of the file; afterwards the
 * file position is set to fh->f_offset.  *size is always overwritten,
 * including with the negative seek result when the first seek fails.
 */
int mca_fs_ime_file_get_size (ompio_file_t *fh,
                              OMPI_MPI_OFFSET_TYPE *size)
{
    OMPI_MPI_OFFSET_TYPE end_pos;

    /* Seek to the end of the file: the resulting offset is the size. */
    errno = 0;
    end_pos = ime_native_lseek(fh->fd, 0, SEEK_END);
    *size = end_pos;
    if (end_pos < 0) {
        return mca_fs_base_get_mpi_err(errno);
    }

    /* Move the file position to the offset recorded in the handle. */
    errno = 0;
    if (ime_native_lseek(fh->fd, fh->f_offset, SEEK_SET) >= 0) {
        return OMPI_SUCCESS;
    }

    return mca_fs_base_get_mpi_err(errno);
}

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ime_native.h"
#include "ompi_config.h"
#include "fs_ime.h"
#include <sys/stat.h>
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/mca/fs/base/base.h"
#include "ompi/mca/fs/fs.h"
#include "ompi/communicator/communicator.h"
#include "ompi/info/info.h"
#include "opal/util/path.h"
/*
 * file_open_ime
 *
 * Function:    - opens a file on every rank of the communicator
 * Accepts:     - same arguments as MPI_File_open()
 * Returns:     - OMPI_SUCCESS if the file was opened, otherwise an MPI
 *                error code
 *
 * The root rank opens the file first and broadcasts its result.  When the
 * root fails, every rank sets fh->fd to -1 and returns the root's error
 * code.  Otherwise the remaining ranks open the file themselves; a failure
 * at that stage is reported only on the rank where it happens.
 */
int mca_fs_ime_file_open (struct ompi_communicator_t *comm,
                          const char* filename,
                          int access_mode,
                          struct opal_info_t *info,
                          ompio_file_t *fh)
{
    int rc = OMPI_SUCCESS;
    const int perm = mca_fs_base_get_file_perm(fh);
    const int amode = mca_fs_base_get_file_amode(fh->f_rank, access_mode);

    /* Reset errno */
    errno = 0;

    /* Step 1: the root opens the file and records its outcome in rc.
     * NOTE(review): presumably done first so that a file the root creates
     * exists before the other ranks open it -- confirm. */
    if (OMPIO_ROOT == fh->f_rank) {
        fh->fd = ime_native_open(filename, amode, perm);
        if (fh->fd < 0) {
            rc = mca_fs_base_get_mpi_err(errno);
        }
    }

    /* Step 2: share the root's outcome with every rank. */
    comm->c_coll->coll_bcast (&rc, 1, MPI_INT, OMPIO_ROOT, comm,
                              comm->c_coll->coll_bcast_module);
    if (OMPI_SUCCESS != rc) {
        fh->fd = -1;
        return rc;
    }

    /* Step 3: the root is done; all other ranks open the file now. */
    if (OMPIO_ROOT == fh->f_rank) {
        return OMPI_SUCCESS;
    }

    errno = 0;
    fh->fd = ime_native_open(filename, amode, perm);

    return (fh->fd < 0) ? mca_fs_base_get_mpi_err(errno) : OMPI_SUCCESS;
}

View File

@ -0,0 +1,50 @@
/*
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ime_native.h"
#include "ompi_config.h"
#include "fs_ime.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/mca/fs/base/base.h"
#include "ompi/mca/fs/fs.h"
/*
 * file_set_size_ime
 *
 * Function:    - set_size of a file
 * Accepts:     - same arguments as MPI_File_set_size()
 * Returns:     - OMPI_SUCCESS if the size was set, otherwise the MPI error
 *                code derived from the root's errno; every rank of the
 *                file's communicator returns the same value
 *
 * Only the root rank truncates the file.  Its errno is translated into an
 * MPI error code immediately after the failing call, and that code (not
 * the raw return value) is what gets broadcast.  errno is meaningful only
 * on the root and only until the next library call, so it must not be
 * read on other ranks or after the broadcast.
 */
int mca_fs_ime_file_set_size (ompio_file_t *fh,
                              OMPI_MPI_OFFSET_TYPE size)
{
    int ret = OMPI_SUCCESS;

    /* reset errno */
    errno = 0;

    if (OMPIO_ROOT == fh->f_rank) {
        if (ime_native_ftruncate(fh->fd, size) < 0) {
            /* Capture the error while errno still belongs to ftruncate. */
            ret = mca_fs_base_get_mpi_err(errno);
        }
    }

    /* Distribute the root's final status to all ranks. */
    fh->f_comm->c_coll->coll_bcast(&ret,
                                   1,
                                   MPI_INT,
                                   OMPIO_ROOT,
                                   fh->f_comm,
                                   fh->f_comm->c_coll->coll_bcast_module);

    return ret;
}

View File
<