1
1

Merge pull request #5196 from edgargabriel/topic/cuda

io/ompio: introduce initial support for cuda buffers in ompio
Этот коммит содержится в:
Edgar Gabriel 2018-06-21 10:14:43 -05:00 коммит произвёл GitHub
родитель c54db3bd57 7808379a47
Коммит fb16d40775
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
11 изменённых файлов: 490 добавлений и 84 удалений

Просмотреть файл

@ -9,7 +9,7 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California. # Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2008-2016 University of Houston. All rights reserved. # Copyright (c) 2008-2018 University of Houston. All rights reserved.
# Copyright (c) 2016 IBM Corporation. All rights reserved. # Copyright (c) 2016 IBM Corporation. All rights reserved.
# Copyright (c) 2017-2018 Research Organization for Information Science # Copyright (c) 2017-2018 Research Organization for Information Science
# and Technology (RIST). All rights reserved. # and Technology (RIST). All rights reserved.
@ -74,6 +74,11 @@ else
ompidir = $(includedir) ompidir = $(includedir)
endif endif
if OPAL_cuda_support
headers += common_ompio_cuda.h
sources += common_ompio_cuda.c
endif
# These two rules will sym link the "noinst" libtool library filename # These two rules will sym link the "noinst" libtool library filename
# to the installable libtool library filename in the case where we are # to the installable libtool library filename in the case where we are
# compiling this component statically (case 2), described above). # compiling this component statically (case 2), described above).

165
ompi/mca/common/ompio/common_ompio_cuda.c Обычный файл
Просмотреть файл

@ -0,0 +1,165 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2018 University of Houston. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"
#include "opal/util/sys_limits.h"
#include "opal/mca/allocator/allocator.h"
#include "opal/mca/allocator/base/base.h"
#include "common_ompio.h"
#include "common_ompio_cuda.h"
static opal_mutex_t mca_common_ompio_cuda_mutex; /* lock for thread safety */
static mca_allocator_base_component_t* mca_common_ompio_allocator_component=NULL;
static mca_allocator_base_module_t* mca_common_ompio_allocator=NULL;
static int32_t mca_common_ompio_cuda_init = 0;
static int32_t mca_common_ompio_pagesize=4096;
static void* mca_common_ompio_cuda_alloc_seg ( void *ctx, size_t *size );
static void mca_common_ompio_cuda_free_seg ( void *ctx, void *buf );
/* Classify a user buffer for I/O purposes.
**
** Sets *is_gpu to 1 if buf resides in GPU device memory, and
** *is_managed to 1 if it is additionally CUDA managed/unified memory
** (host-accessible). Both flags are 0 for plain host buffers.
** The fh argument is currently unused but kept for interface symmetry.
*/
void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf, int *is_gpu,
                                      int *is_managed)
{
    opal_convertor_t conv;

    *is_gpu     = 0;
    *is_managed = 0;

    /* Only the flags field is consulted/updated by the check; the rest of
    ** the convertor is deliberately left uninitialized. */
    conv.flags = 0;
    if ( !opal_cuda_check_one_buf ( (char *)buf, &conv ) ) {
        return;
    }

    *is_gpu = 1;
    if ( conv.flags & CONVERTOR_CUDA_UNIFIED ) {
        *is_managed = 1;
    }
}
/* Segment-allocation callback handed to the allocator component.
**
** Rounds the request up to a whole number of pages, allocates host
** memory and registers (pins) it with CUDA so device transfers into
** the segment are efficient. On success *size is updated to the real
** (rounded) segment size; returns NULL if the allocation fails.
*/
static void* mca_common_ompio_cuda_alloc_seg ( void*ctx, size_t *size )
{
    size_t pagesize = mca_common_ompio_pagesize;
    size_t nbytes   = ((*size + pagesize - 1) / pagesize) * pagesize;
    char  *seg      = malloc ( nbytes );

    if ( NULL != seg ) {
        /* pin the segment for CUDA DMA */
        mca_common_cuda_register ( seg, nbytes, NULL );
    }
    *size = nbytes;
    return seg;
}
/* Segment-release callback handed to the allocator component.
** Unregisters the pinned memory from CUDA before freeing it.
** A NULL buf is silently ignored.
*/
static void mca_common_ompio_cuda_free_seg ( void *ctx, void *buf )
{
    if ( NULL == buf ) {
        return;
    }
    mca_common_cuda_unregister ( (char *) buf, NULL );
    free ( buf );
}
/* One-time initialization of the pinned-memory allocator used for
** CUDA bounce buffers.
**
** The first caller constructs the mutex, looks up the "basic"
** allocator component and instantiates it with the CUDA-aware
** segment alloc/free callbacks above; subsequent callers only bump
** the reference counter and return.
**
** Returns OMPI_SUCCESS on success, OMPI_ERR_BUFFER if the allocator
** component cannot be found or instantiated.
**
** NOTE(review): a concurrent second caller can observe the counter
** already > 1 and return OMPI_SUCCESS before the first caller has
** finished setting up the allocator — callers are presumably
** serialized higher up; verify against callers.
*/
int mca_common_ompio_cuda_alloc_init ( void )
{
    bool thread_safe=true;

    if(OPAL_THREAD_ADD_FETCH32(&mca_common_ompio_cuda_init, 1) > 1)
        return OMPI_SUCCESS;

    /* initialize static objects */
    OBJ_CONSTRUCT(&mca_common_ompio_cuda_mutex, opal_mutex_t);
    OPAL_THREAD_LOCK (&mca_common_ompio_cuda_mutex );

    /* lookup name of the allocator to use */
    if(NULL == (mca_common_ompio_allocator_component = mca_allocator_component_lookup("basic"))) {
        OPAL_THREAD_UNLOCK(&mca_common_ompio_cuda_mutex);
        /* undo the refcount bump and tear the mutex down again, so a
        ** later call retries the initialization instead of silently
        ** succeeding with a NULL allocator. */
        OBJ_DESTRUCT(&mca_common_ompio_cuda_mutex);
        OPAL_THREAD_ADD_FETCH32(&mca_common_ompio_cuda_init, -1);
        return OMPI_ERR_BUFFER;
    }

    /* create an instance of the allocator */
    mca_common_ompio_allocator = mca_common_ompio_allocator_component->allocator_init(thread_safe,
                                                                                      mca_common_ompio_cuda_alloc_seg,
                                                                                      mca_common_ompio_cuda_free_seg,
                                                                                      NULL);
    if(NULL == mca_common_ompio_allocator) {
        OPAL_THREAD_UNLOCK(&mca_common_ompio_cuda_mutex);
        /* same cleanup as above: leave the module re-initializable */
        OBJ_DESTRUCT(&mca_common_ompio_cuda_mutex);
        OPAL_THREAD_ADD_FETCH32(&mca_common_ompio_cuda_init, -1);
        return OMPI_ERR_BUFFER;
    }

    mca_common_ompio_pagesize = opal_getpagesize();

    OPAL_THREAD_UNLOCK(&mca_common_ompio_cuda_mutex);
    return OMPI_SUCCESS;
}
int mca_common_ompio_cuda_alloc_fini ( void )
{
if ( NULL != mca_common_ompio_allocator ) {
OPAL_THREAD_LOCK (&mca_common_ompio_cuda_mutex);
mca_common_ompio_allocator->alc_finalize(mca_common_ompio_allocator);
mca_common_ompio_allocator=NULL;
OPAL_THREAD_UNLOCK (&mca_common_ompio_cuda_mutex);
OBJ_DESTRUCT (&mca_common_ompio_cuda_mutex);
}
return OMPI_SUCCESS;
}
/* Allocate a CUDA-registered (pinned) host buffer of at least bufsize
** bytes from the common allocator, lazily initializing the allocator
** on first use. Returns NULL if the allocator cannot be initialized
** or the allocation fails. Release with mca_common_ompio_release_buf().
** The fh argument is currently unused but kept for interface symmetry.
*/
void *mca_common_ompio_alloc_buf ( ompio_file_t *fh, size_t bufsize )
{
    char *tmp=NULL;

    if ( !mca_common_ompio_cuda_init ){
        /* Previously the return code was ignored; a failed init left a
        ** NULL allocator that was dereferenced below. Fail gracefully
        ** instead. */
        if ( OMPI_SUCCESS != mca_common_ompio_cuda_alloc_init () ) {
            opal_output (1, "error in mca_common_ompio_alloc_buf: could not initialize allocator\n");
            return NULL;
        }
    }

    OPAL_THREAD_LOCK (&mca_common_ompio_cuda_mutex);
    tmp = mca_common_ompio_allocator->alc_alloc (mca_common_ompio_allocator,
                                                 bufsize, 0 );
    OPAL_THREAD_UNLOCK (&mca_common_ompio_cuda_mutex);

    return tmp;
}
/* Return a buffer obtained from mca_common_ompio_alloc_buf() to the
** allocator. Logs an error and returns (instead of crashing on a NULL
** allocator) if the allocator was never initialized — which should not
** happen, since a buffer cannot exist without a prior allocation.
** The fh argument is currently unused but kept for interface symmetry.
*/
void mca_common_ompio_release_buf ( ompio_file_t *fh, void *buf )
{
    if ( !mca_common_ompio_cuda_init || NULL == mca_common_ompio_allocator ){
        /* Should not happen. You can not release a buf without
        ** having it allocated first.
        */
        opal_output (1, "error in mca_common_ompio_release_buf: allocator not initialized\n");
        /* bail out: falling through would dereference a NULL allocator */
        return;
    }

    OPAL_THREAD_LOCK (&mca_common_ompio_cuda_mutex);
    mca_common_ompio_allocator->alc_free (mca_common_ompio_allocator,
                                          buf);
    OPAL_THREAD_UNLOCK (&mca_common_ompio_cuda_mutex);

    return;
}

53
ompi/mca/common/ompio/common_ompio_cuda.h Обычный файл
Просмотреть файл

@ -0,0 +1,53 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2018 University of Houston. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COMMON_OMPIO_CUDA_H
#define MCA_COMMON_OMPIO_CUDA_H

/* Set up a pinned host bounce buffer for I/O on an unmanaged GPU
** buffer (_buf, _count elements of _datatype):
**   - clones the file's convertor and prepares it for packing _buf,
**   - determines the packed size (_max_data),
**   - allocates a pinned staging buffer (_tbuf) of that size,
**   - builds a single-entry iovec (_decoded_iov/_iov_count) over it.
**
** NOTE: on allocation failure the macro makes the ENCLOSING function
** return OMPI_ERR_OUT_OF_RESOURCE — only use it in functions with that
** return convention. The fix over the original: the staging buffer is
** released if the iovec allocation fails, instead of being leaked.
*/
#define OMPIO_CUDA_PREPARE_BUF(_fh,_buf,_count,_datatype,_tbuf,_convertor,_max_data,_decoded_iov,_iov_count){ \
        opal_convertor_clone ( _fh->f_convertor, _convertor, 0);        \
        opal_convertor_prepare_for_send ( _convertor, &(_datatype->super), _count, _buf );\
        opal_convertor_get_packed_size( _convertor, &_max_data );       \
        _tbuf = mca_common_ompio_alloc_buf (_fh, _max_data);            \
        if ( NULL == _tbuf ) {                                          \
            opal_output(1, "common_ompio: error allocating memory\n");  \
            return OMPI_ERR_OUT_OF_RESOURCE;                            \
        }                                                               \
        _decoded_iov = (struct iovec *) malloc ( sizeof ( struct iovec )); \
        if ( NULL == _decoded_iov ) {                                   \
            opal_output(1, "common_ompio: could not allocate memory.\n"); \
            mca_common_ompio_release_buf (_fh, _tbuf);                  \
            return OMPI_ERR_OUT_OF_RESOURCE;                            \
        }                                                               \
        _decoded_iov->iov_base = _tbuf;                                 \
        _decoded_iov->iov_len  = _max_data;                             \
        _iov_count=1;}

/* Classify buf: *is_gpu set for device memory, *is_managed for
** CUDA managed/unified memory. */
void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf,
                                      int *is_gpu, int *is_managed);
/* Reference-counted setup/teardown of the pinned-buffer allocator. */
int mca_common_ompio_cuda_alloc_init ( void );
int mca_common_ompio_cuda_alloc_fini ( void );
/* Allocate/release a CUDA-registered host staging buffer. */
void* mca_common_ompio_alloc_buf ( ompio_file_t *fh, size_t bufsize);
void mca_common_ompio_release_buf ( ompio_file_t *fh, void *buf );

#endif

Просмотреть файл

@ -36,6 +36,10 @@
#include "math.h" #include "math.h"
#include <unistd.h> #include <unistd.h>
#if OPAL_CUDA_SUPPORT
#include "common_ompio_cuda.h"
#endif
/* Read and write routines are split into two interfaces. /* Read and write routines are split into two interfaces.
** The ** The
** mca_io_ompio_file_read/write[_at] ** mca_io_ompio_file_read/write[_at]
@ -74,10 +78,10 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
int j = 0; /* index into the file vie iovec */ int j = 0; /* index into the file vie iovec */
if ( 0 == count ) { if ( 0 == count ) {
if ( MPI_STATUS_IGNORE != status ) { if ( MPI_STATUS_IGNORE != status ) {
status->_ucount = 0; status->_ucount = 0;
} }
return ret; return ret;
} }
if (fh->f_amode & MPI_MODE_WRONLY){ if (fh->f_amode & MPI_MODE_WRONLY){
@ -86,6 +90,26 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
return ret; return ret;
} }
#if OPAL_CUDA_SUPPORT
int is_gpu, is_managed;
opal_convertor_t convertor;
mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
if ( is_gpu && !is_managed ) {
char *tbuf=NULL;
OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count);
}
else {
mca_common_ompio_decode_datatype (fh,
datatype,
count,
buf,
&max_data,
&decoded_iov,
&iov_count);
}
#else
mca_common_ompio_decode_datatype (fh, mca_common_ompio_decode_datatype (fh,
datatype, datatype,
count, count,
@ -93,9 +117,10 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
&max_data, &max_data,
&decoded_iov, &decoded_iov,
&iov_count); &iov_count);
#endif
if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size) ) { if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size )) {
bytes_per_cycle = max_data; bytes_per_cycle = max_data;
} }
else { else {
bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size); bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size);
@ -124,9 +149,9 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
if (fh->f_num_of_io_entries) { if (fh->f_num_of_io_entries) {
ret_code = fh->f_fbtl->fbtl_preadv (fh); ret_code = fh->f_fbtl->fbtl_preadv (fh);
if ( 0<= ret_code ) { if ( 0<= ret_code ) {
real_bytes_read+=(size_t)ret_code; real_bytes_read+=(size_t)ret_code;
} }
} }
fh->f_num_of_io_entries = 0; fh->f_num_of_io_entries = 0;
@ -136,13 +161,22 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
} }
} }
#if OPAL_CUDA_SUPPORT
if ( is_gpu && !is_managed ) {
size_t pos=0;
opal_convertor_unpack (&convertor, decoded_iov, &iov_count, &pos );
opal_convertor_cleanup (&convertor);
mca_common_ompio_release_buf (fh, decoded_iov->iov_base);
}
#endif
if (NULL != decoded_iov) { if (NULL != decoded_iov) {
free (decoded_iov); free (decoded_iov);
decoded_iov = NULL; decoded_iov = NULL;
} }
if ( MPI_STATUS_IGNORE != status ) { if ( MPI_STATUS_IGNORE != status ) {
status->_ucount = real_bytes_read; status->_ucount = real_bytes_read;
} }
return ret; return ret;
@ -189,37 +223,58 @@ int mca_common_ompio_file_iread (ompio_file_t *fh,
mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_READ); mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_READ);
if ( 0 == count ) { if ( 0 == count ) {
ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;
ompio_req->req_ompi.req_status._ucount = 0; ompio_req->req_ompi.req_status._ucount = 0;
ompi_request_complete (&ompio_req->req_ompi, false); ompi_request_complete (&ompio_req->req_ompi, false);
*request = (ompi_request_t *) ompio_req; *request = (ompi_request_t *) ompio_req;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
if ( NULL != fh->f_fbtl->fbtl_ipreadv ) { if ( NULL != fh->f_fbtl->fbtl_ipreadv ) {
// This fbtl has support for non-blocking operations // This fbtl has support for non-blocking operations
size_t total_bytes_read = 0; /* total bytes that have been read*/ size_t total_bytes_read = 0; /* total bytes that have been read*/
uint32_t iov_count = 0; uint32_t iov_count = 0;
struct iovec *decoded_iov = NULL; struct iovec *decoded_iov = NULL;
size_t max_data = 0; size_t max_data = 0;
int i = 0; /* index into the decoded iovec of the buffer */ int i = 0; /* index into the decoded iovec of the buffer */
int j = 0; /* index into the file vie iovec */ int j = 0; /* index into the file vie iovec */
mca_common_ompio_decode_datatype (fh, #if OPAL_CUDA_SUPPORT
datatype, int is_gpu, is_managed;
count, mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
buf, if ( is_gpu && !is_managed ) {
&max_data, char *tbuf=NULL;
&decoded_iov,
&iov_count); OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&ompio_req->req_convertor,max_data,decoded_iov,iov_count);
// Non-blocking operations have to occur in a single cycle ompio_req->req_tbuf = tbuf;
j = fh->f_index_in_file_view; ompio_req->req_size = max_data;
}
mca_common_ompio_build_io_array ( fh, else {
mca_common_ompio_decode_datatype (fh,
datatype,
count,
buf,
&max_data,
&decoded_iov,
&iov_count);
}
#else
mca_common_ompio_decode_datatype (fh,
datatype,
count,
buf,
&max_data,
&decoded_iov,
&iov_count);
#endif
// Non-blocking operations have to occur in a single cycle
j = fh->f_index_in_file_view;
mca_common_ompio_build_io_array ( fh,
0, // index 0, // index
1, // no. of cyces 1, // no. of cyces
max_data, // setting bytes per cycle to match data max_data, // setting bytes per cycle to match data

Просмотреть файл

@ -34,6 +34,10 @@
#include "math.h" #include "math.h"
#include <unistd.h> #include <unistd.h>
#if OPAL_CUDA_SUPPORT
#include "common_ompio_cuda.h"
#endif
int mca_common_ompio_file_write (ompio_file_t *fh, int mca_common_ompio_file_write (ompio_file_t *fh,
const void *buf, const void *buf,
int count, int count,
@ -55,12 +59,35 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
int j = 0; /* index into the file view iovec */ int j = 0; /* index into the file view iovec */
if ( 0 == count ) { if ( 0 == count ) {
if ( MPI_STATUS_IGNORE != status ) { if ( MPI_STATUS_IGNORE != status ) {
status->_ucount = 0; status->_ucount = 0;
} }
return ret; return ret;
} }
#if OPAL_CUDA_SUPPORT
int is_gpu, is_managed;
mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
if ( is_gpu && !is_managed ) {
size_t pos=0;
char *tbuf=NULL;
opal_convertor_t convertor;
OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count);
opal_convertor_pack (&convertor, decoded_iov, &iov_count, &pos );
opal_convertor_cleanup ( &convertor);
}
else {
mca_common_ompio_decode_datatype (fh,
datatype,
count,
buf,
&max_data,
&decoded_iov,
&iov_count);
}
#else
mca_common_ompio_decode_datatype (fh, mca_common_ompio_decode_datatype (fh,
datatype, datatype,
count, count,
@ -68,9 +95,9 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
&max_data, &max_data,
&decoded_iov, &decoded_iov,
&iov_count); &iov_count);
#endif
if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size) ) { if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size )) {
bytes_per_cycle = max_data; bytes_per_cycle = max_data;
} }
else { else {
bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size); bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size);
@ -83,7 +110,7 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
j = fh->f_index_in_file_view; j = fh->f_index_in_file_view;
for (index = 0; index < cycles; index++) { for (index = 0; index < cycles; index++) {
mca_common_ompio_build_io_array ( fh, mca_common_ompio_build_io_array ( fh,
index, index,
cycles, cycles,
bytes_per_cycle, bytes_per_cycle,
@ -97,9 +124,9 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
if (fh->f_num_of_io_entries) { if (fh->f_num_of_io_entries) {
ret_code =fh->f_fbtl->fbtl_pwritev (fh); ret_code =fh->f_fbtl->fbtl_pwritev (fh);
if ( 0<= ret_code ) { if ( 0<= ret_code ) {
real_bytes_written+= (size_t)ret_code; real_bytes_written+= (size_t)ret_code;
} }
} }
fh->f_num_of_io_entries = 0; fh->f_num_of_io_entries = 0;
@ -108,6 +135,11 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
fh->f_io_array = NULL; fh->f_io_array = NULL;
} }
} }
#if OPAL_CUDA_SUPPORT
if ( is_gpu && !is_managed ) {
mca_common_ompio_release_buf (fh, decoded_iov->iov_base);
}
#endif
if (NULL != decoded_iov) { if (NULL != decoded_iov) {
free (decoded_iov); free (decoded_iov);
@ -115,7 +147,7 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
} }
if ( MPI_STATUS_IGNORE != status ) { if ( MPI_STATUS_IGNORE != status ) {
status->_ucount = real_bytes_written; status->_ucount = real_bytes_written;
} }
return ret; return ret;
@ -158,35 +190,62 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_WRITE); mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_WRITE);
if ( 0 == count ) { if ( 0 == count ) {
ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;
ompio_req->req_ompi.req_status._ucount = 0; ompio_req->req_ompi.req_status._ucount = 0;
ompi_request_complete (&ompio_req->req_ompi, false); ompi_request_complete (&ompio_req->req_ompi, false);
*request = (ompi_request_t *) ompio_req; *request = (ompi_request_t *) ompio_req;
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
if ( NULL != fh->f_fbtl->fbtl_ipwritev ) { if ( NULL != fh->f_fbtl->fbtl_ipwritev ) {
/* This fbtl has support for non-blocking operations */ /* This fbtl has support for non-blocking operations */
uint32_t iov_count = 0;
struct iovec *decoded_iov = NULL;
size_t max_data = 0;
size_t total_bytes_written =0;
int i = 0; /* index into the decoded iovec of the buffer */
int j = 0; /* index into the file vie iovec */
uint32_t iov_count = 0; #if OPAL_CUDA_SUPPORT
struct iovec *decoded_iov = NULL; int is_gpu, is_managed;
size_t max_data = 0; mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
size_t total_bytes_written =0; if ( is_gpu && !is_managed ) {
int i = 0; /* index into the decoded iovec of the buffer */ size_t pos=0;
int j = 0; /* index into the file vie iovec */ char *tbuf=NULL;
opal_convertor_t convertor;
mca_common_ompio_decode_datatype (fh, OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count);
datatype,
count, opal_convertor_pack (&convertor, decoded_iov, &iov_count, &pos );
buf, opal_convertor_cleanup (&convertor);
&max_data,
&decoded_iov,
&iov_count);
j = fh->f_index_in_file_view;
/* Non blocking operations have to occur in a single cycle */ ompio_req->req_tbuf = tbuf;
mca_common_ompio_build_io_array ( fh, ompio_req->req_size = max_data;
}
else {
mca_common_ompio_decode_datatype (fh,
datatype,
count,
buf,
&max_data,
&decoded_iov,
&iov_count);
}
#else
mca_common_ompio_decode_datatype (fh,
datatype,
count,
buf,
&max_data,
&decoded_iov,
&iov_count);
#endif
j = fh->f_index_in_file_view;
/* Non blocking operations have to occur in a single cycle */
mca_common_ompio_build_io_array ( fh,
0, // index of current cycle iteration 0, // index of current cycle iteration
1, // number of cycles 1, // number of cycles
max_data, // setting bytes_per_cycle to max_data max_data, // setting bytes_per_cycle to max_data
@ -199,9 +258,9 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
&spc); &spc);
if (fh->f_num_of_io_entries) { if (fh->f_num_of_io_entries) {
fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req); fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req);
} }
mca_common_ompio_register_progress (); mca_common_ompio_register_progress ();
fh->f_num_of_io_entries = 0; fh->f_num_of_io_entries = 0;
@ -209,19 +268,19 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
free (fh->f_io_array); free (fh->f_io_array);
fh->f_io_array = NULL; fh->f_io_array = NULL;
} }
if (NULL != decoded_iov) { if (NULL != decoded_iov) {
free (decoded_iov); free (decoded_iov);
decoded_iov = NULL; decoded_iov = NULL;
} }
} }
else { else {
// This fbtl does not support non-blocking write operations // This fbtl does not support non-blocking write operations
ompi_status_public_t status; ompi_status_public_t status;
ret = mca_common_ompio_file_write(fh,buf,count,datatype, &status); ret = mca_common_ompio_file_write(fh,buf,count,datatype, &status);
ompio_req->req_ompi.req_status.MPI_ERROR = ret; ompio_req->req_ompi.req_status.MPI_ERROR = ret;
ompio_req->req_ompi.req_status._ucount = status._ucount; ompio_req->req_ompi.req_status._ucount = status._ucount;
ompi_request_complete (&ompio_req->req_ompi, false); ompi_request_complete (&ompio_req->req_ompi, false);
} }
*request = (ompi_request_t *) ompio_req; *request = (ompi_request_t *) ompio_req;

Просмотреть файл

@ -19,6 +19,9 @@
*/ */
#include "common_ompio_request.h" #include "common_ompio_request.h"
#if OPAL_CUDA_SUPPORT
#include "common_ompio_cuda.h"
#endif
static void mca_common_ompio_request_construct(mca_ompio_request_t* req); static void mca_common_ompio_request_construct(mca_ompio_request_t* req);
static void mca_common_ompio_request_destruct(mca_ompio_request_t *req); static void mca_common_ompio_request_destruct(mca_ompio_request_t *req);
@ -34,6 +37,20 @@ opal_list_t mca_common_ompio_pending_requests = {{0}};
static int mca_common_ompio_request_free ( struct ompi_request_t **req) static int mca_common_ompio_request_free ( struct ompi_request_t **req)
{ {
mca_ompio_request_t *ompio_req = ( mca_ompio_request_t *)*req; mca_ompio_request_t *ompio_req = ( mca_ompio_request_t *)*req;
#if OPAL_CUDA_SUPPORT
if ( NULL != ompio_req->req_tbuf ) {
if ( MCA_OMPIO_REQUEST_READ == ompio_req->req_type ){
struct iovec decoded_iov;
uint32_t iov_count=1;
size_t pos=0;
decoded_iov.iov_base = ompio_req->req_tbuf;
decoded_iov.iov_len = ompio_req->req_size;
opal_convertor_unpack (&ompio_req->req_convertor, &decoded_iov, &iov_count, &pos );
}
mca_common_ompio_release_buf ( NULL, ompio_req->req_tbuf );
}
#endif
if ( NULL != ompio_req->req_free_fn ) { if ( NULL != ompio_req->req_free_fn ) {
ompio_req->req_free_fn (ompio_req ); ompio_req->req_free_fn (ompio_req );
} }
@ -60,6 +77,10 @@ void mca_common_ompio_request_construct(mca_ompio_request_t* req)
req->req_ompi.req_cancel = mca_common_ompio_request_cancel; req->req_ompi.req_cancel = mca_common_ompio_request_cancel;
req->req_ompi.req_type = OMPI_REQUEST_IO; req->req_ompi.req_type = OMPI_REQUEST_IO;
req->req_data = NULL; req->req_data = NULL;
#if OPAL_CUDA_SUPPORT
req->req_tbuf = NULL;
req->req_size = 0;
#endif
req->req_progress_fn = NULL; req->req_progress_fn = NULL;
req->req_free_fn = NULL; req->req_free_fn = NULL;

Просмотреть файл

@ -52,6 +52,11 @@ struct mca_ompio_request_t {
mca_ompio_request_type_t req_type; mca_ompio_request_type_t req_type;
void *req_data; void *req_data;
opal_list_item_t req_item; opal_list_item_t req_item;
#if OPAL_CUDA_SUPPORT
void *req_tbuf;
size_t req_size;
opal_convertor_t req_convertor;
#endif
mca_fbtl_base_module_progress_fn_t req_progress_fn; mca_fbtl_base_module_progress_fn_t req_progress_fn;
mca_fbtl_base_module_request_free_fn_t req_free_fn; mca_fbtl_base_module_request_free_fn_t req_free_fn;
}; };

Просмотреть файл

@ -280,11 +280,18 @@ int mca_fcoll_base_query_table (struct ompio_file_t *file, char *name)
} }
} }
if (!strcmp (name, "two_phase")) { if (!strcmp (name, "two_phase")) {
#if OPAL_CUDA_SUPPORT
/* do not use the two_phase component with CUDA
buffers, since the data sieving causes trouble
on unmanaged GPU buffers.
*/
#else
if ((int)file->f_cc_size < file->f_bytes_per_agg && if ((int)file->f_cc_size < file->f_bytes_per_agg &&
(0 == file->f_stripe_size || file->f_cc_size < file->f_stripe_size) && (0 == file->f_stripe_size || file->f_cc_size < file->f_stripe_size) &&
(LUSTRE != file->f_fstype) ) { (LUSTRE != file->f_fstype) ) {
return 1; return 1;
} }
#endif
} }
return 0; return 0;
} }

Просмотреть файл

@ -34,6 +34,10 @@
#include "io_ompio.h" #include "io_ompio.h"
#include "ompi/mca/common/ompio/common_ompio_request.h" #include "ompi/mca/common/ompio/common_ompio_request.h"
#if OPAL_CUDA_SUPPORT
#include "ompi/mca/common/ompio/common_ompio_cuda.h"
#endif
int mca_io_ompio_cycle_buffer_size = OMPIO_DEFAULT_CYCLE_BUF_SIZE; int mca_io_ompio_cycle_buffer_size = OMPIO_DEFAULT_CYCLE_BUF_SIZE;
int mca_io_ompio_bytes_per_agg = OMPIO_PREALLOC_MAX_BUF_SIZE; int mca_io_ompio_bytes_per_agg = OMPIO_PREALLOC_MAX_BUF_SIZE;
int mca_io_ompio_num_aggregators = -1; int mca_io_ompio_num_aggregators = -1;
@ -272,6 +276,10 @@ static int close_component(void)
{ {
mca_common_ompio_request_fini (); mca_common_ompio_request_fini ();
#if OPAL_CUDA_SUPPORT
mca_common_ompio_cuda_alloc_fini();
#endif
OBJ_DESTRUCT(&mca_io_ompio_mutex); OBJ_DESTRUCT(&mca_io_ompio_mutex);
return OMPI_SUCCESS; return OMPI_SUCCESS;

Просмотреть файл

@ -91,6 +91,33 @@ bool opal_cuda_check_bufs(char *dest, char *src)
* Note that if there is an error with any of the CUDA calls, the program * Note that if there is an error with any of the CUDA calls, the program
* aborts as there is no recovering. * aborts as there is no recovering.
*/ */
/* Check whether a single pointer refers to GPU memory.
 *
 * @param buf        pointer to classify
 * @param convertor  convertor whose flags receive additional
 *                   information, e.g. managed vs. unmanaged GPU buffer
 * @return true if buf is a GPU buffer, false otherwise (including
 *         when CUDA support is not enabled)
 */
bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor )
{
    /* Only do the initialization on the first GPU access */
    if (!initialized) {
        opal_cuda_support_init();
    }

    return opal_cuda_enabled ? ftable.gpu_is_gpu_buffer(buf, convertor)
                             : false;
}
/*
* With CUDA enabled, all contiguous copies will pass through this function.
* Therefore, the first check is to see if the convertor is a GPU buffer.
* Note that if there is an error with any of the CUDA calls, the program
* aborts as there is no recovering.
*/
void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor) void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
{ {
int res; int res;

Просмотреть файл

@ -23,6 +23,7 @@ typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t
void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
bool opal_cuda_check_bufs(char *dest, char *src); bool opal_cuda_check_bufs(char *dest, char *src);
bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor );
void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size); void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size);
void* opal_cuda_memmove(void * dest, void * src, size_t size); void* opal_cuda_memmove(void * dest, void * src, size_t size);