From 6a532101aabd098bececbbb4eb29b950e1f95ad8 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <egabriel@central.uh.edu>
Date: Sun, 10 Jun 2018 16:27:08 -0500
Subject: [PATCH] io/ompio and common/ompio: add initial support for cuda
 buffers in ompio

This commit adds initial support for CUDA buffers in ompio, covering blocking
and non-blocking individual read and write operations.

Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
---
 ompi/mca/common/ompio/Makefile.am             |   7 +-
 ompi/mca/common/ompio/common_ompio_cuda.c     |  69 ++++++++
 ompi/mca/common/ompio/common_ompio_cuda.h     |  51 ++++++
 .../mca/common/ompio/common_ompio_file_read.c | 127 ++++++++++-----
 .../common/ompio/common_ompio_file_write.c    | 153 ++++++++++++------
 ompi/mca/common/ompio/common_ompio_request.c  |  21 +++
 ompi/mca/common/ompio/common_ompio_request.h  |   5 +
 7 files changed, 349 insertions(+), 84 deletions(-)
 create mode 100644 ompi/mca/common/ompio/common_ompio_cuda.c
 create mode 100644 ompi/mca/common/ompio/common_ompio_cuda.h

diff --git a/ompi/mca/common/ompio/Makefile.am b/ompi/mca/common/ompio/Makefile.am
index 6eda4644b8..d8ed32b9f6 100644
--- a/ompi/mca/common/ompio/Makefile.am
+++ b/ompi/mca/common/ompio/Makefile.am
@@ -9,7 +9,7 @@
 #                         University of Stuttgart.  All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 #                         All rights reserved.
-# Copyright (c) 2008-2016 University of Houston. All rights reserved.
+# Copyright (c) 2008-2018 University of Houston. All rights reserved.
 # Copyright (c) 2016      IBM Corporation.  All rights reserved.
 # Copyright (c) 2017-2018 Research Organization for Information Science
 #                         and Technology (RIST). All rights reserved.
@@ -74,6 +74,11 @@ else
 ompidir = $(includedir)
 endif
 
+if OPAL_cuda_support
+headers += common_ompio_cuda.h
+sources += common_ompio_cuda.c
+endif
+
 # These two rules will sym link the "noinst" libtool library filename
 # to the installable libtool library filename in the case where we are
 # compiling this component statically (case 2), described above).
diff --git a/ompi/mca/common/ompio/common_ompio_cuda.c b/ompi/mca/common/ompio/common_ompio_cuda.c
new file mode 100644
index 0000000000..e21b93400a
--- /dev/null
+++ b/ompi/mca/common/ompio/common_ompio_cuda.c
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                          University Research and Technology
+ *                          Corporation.  All rights reserved.
+ *  Copyright (c) 2004-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.  All rights
+ *                          reserved.
+ *  Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                          University of Stuttgart.  All rights reserved.
+ *  Copyright (c) 2004-2005 The Regents of the University of California.
+ *                          All rights reserved.
+ *  Copyright (c) 2008-2018 University of Houston. All rights reserved.
+ *  $COPYRIGHT$
+ *
+ *  Additional copyrights may follow
+ *
+ *  $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "opal/datatype/opal_convertor.h"
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
+
+#include "ompi/mca/io/ompio/io_ompio.h"
+#include "common_ompio.h"
+#include "common_ompio_cuda.h"
+
+/* Classify buf: *is_gpu is set to 1 when opal_cuda_check_one_buf() accepts
+ * it, and *is_managed to 1 when the probe convertor then carries
+ * CONVERTOR_CUDA_UNIFIED; each stays 0 otherwise.  fh is not used here. */
+void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf, int *is_gpu,
+                                      int *is_managed)
+{
+    opal_convertor_t probe;
+    *is_gpu     = 0;
+    *is_managed = 0;
+    probe.flags = 0;   /* the only field initialised before the query */
+    if ( !opal_cuda_check_one_buf ( (char *)buf, &probe ) ) {
+        return;
+    }
+    *is_gpu = 1;
+    if ( probe.flags & CONVERTOR_CUDA_UNIFIED ) {
+        *is_managed = 1;
+    }
+}
+
+
+/* Hand buf (bufsize bytes) to mca_common_cuda_register(), tagged with fh->f_filename. */
+void mca_common_ompio_register_buf ( ompio_file_t *fh, const void *buf, size_t bufsize )
+{
+    char *tag = (char *) fh->f_filename;   /* fh is dereferenced: must be non-NULL */
+    mca_common_cuda_register ( (char *) buf, bufsize, tag );
+}
+
+/* Pass buf to mca_common_cuda_unregister() and then free() it.
+ * fh may be NULL; in that case a placeholder string is passed in place of
+ * fh->f_filename as the second argument.
+ */
+void mca_common_ompio_unregister_buf ( ompio_file_t *fh, void *buf )
+{
+    char  placeholder[] = "dummy_ompio_filename";
+    char *tag = ( NULL != fh ) ? (char *) fh->f_filename : placeholder;
+
+    mca_common_cuda_unregister ( (char *) buf, tag );
+    free ( buf );
+}
+
diff --git a/ompi/mca/common/ompio/common_ompio_cuda.h b/ompi/mca/common/ompio/common_ompio_cuda.h
new file mode 100644
index 0000000000..3d567f8529
--- /dev/null
+++ b/ompi/mca/common/ompio/common_ompio_cuda.h
@@ -0,0 +1,51 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008-2018 University of Houston. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_COMMON_OMPIO_CUDA_H
+#define MCA_COMMON_OMPIO_CUDA_H
+/* Prepare _convertor for (_buf,_count,_datatype), malloc a packed-size buffer
+ * _tbuf (_max_data bytes), register it, and point a one-entry _decoded_iov at
+ * it.  On allocation failure this macro returns from the CALLING function. */
+#define OMPIO_CUDA_PREPARE_BUF(_fh,_buf,_count,_datatype,_tbuf,_convertor,_max_data,_decoded_iov,_iov_count) do { \
+    opal_convertor_clone ( (_fh)->f_convertor, (_convertor), 0);                     \
+    opal_convertor_prepare_for_send ( (_convertor), &((_datatype)->super), (_count), (_buf) ); \
+    opal_convertor_get_packed_size ( (_convertor), &(_max_data) );      \
+    (_tbuf)        = (char *) malloc ( (_max_data) );                   \
+    (_decoded_iov) = (struct iovec *) malloc ( sizeof ( struct iovec ));\
+    if ( NULL == (_tbuf) || NULL == (_decoded_iov) ) {                  \
+        opal_output(1, "common_ompio: could not allocate memory.\n");   \
+        free ( (_tbuf) );        /* nothing is registered yet, so */    \
+        free ( (_decoded_iov) ); /* plain free() releases both    */    \
+        return OMPI_ERR_OUT_OF_RESOURCE;                                \
+    }                                                                   \
+    mca_common_ompio_register_buf ((_fh), (_tbuf), (_max_data));        \
+    (_decoded_iov)->iov_base = (_tbuf);                                 \
+    (_decoded_iov)->iov_len  = (_max_data);                             \
+    (_iov_count) = 1;                                                   \
+} while (0)
+
+
+void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf, 
+				      int *is_gpu, int *is_managed);
+void mca_common_ompio_register_buf ( ompio_file_t *fh,  const void *buf, 
+				    size_t bufsize);
+void mca_common_ompio_unregister_buf ( ompio_file_t *fh,  void *buf );
+
+#endif
diff --git a/ompi/mca/common/ompio/common_ompio_file_read.c b/ompi/mca/common/ompio/common_ompio_file_read.c
index 9b9c19a6b4..95c93e3a51 100644
--- a/ompi/mca/common/ompio/common_ompio_file_read.c
+++ b/ompi/mca/common/ompio/common_ompio_file_read.c
@@ -36,6 +36,10 @@
 #include "math.h"
 #include <unistd.h>
 
+#if OPAL_CUDA_SUPPORT
+#include "common_ompio_cuda.h"
+#endif
+
 /* Read and write routines are split into two interfaces.
 **   The
 **   mca_io_ompio_file_read/write[_at]
@@ -74,10 +78,10 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
     int j = 0; /* index into the file vie iovec */
 
     if ( 0 == count ) {
-	if ( MPI_STATUS_IGNORE != status ) {
-	    status->_ucount = 0;
-	}
-	return ret;
+        if ( MPI_STATUS_IGNORE != status ) {
+            status->_ucount = 0;
+        }
+        return ret;
     }
 
     if (fh->f_amode & MPI_MODE_WRONLY){
@@ -86,6 +90,26 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
       return ret;
     }
 
+#if OPAL_CUDA_SUPPORT
+    int is_gpu=0, is_managed=0;
+    opal_convertor_t convertor;
+    mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+    if ( is_gpu && !is_managed ) {
+        char *tbuf=NULL;
+
+        OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count);        
+        
+    }
+    else {
+        mca_common_ompio_decode_datatype (fh,
+                                          datatype,
+                                          count,
+                                          buf,
+                                          &max_data,
+                                          &decoded_iov,
+                                          &iov_count);
+    }
+#else
     mca_common_ompio_decode_datatype (fh,
                                       datatype,
                                       count,
@@ -93,9 +117,10 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
                                       &max_data,
                                       &decoded_iov,
                                       &iov_count);
+#endif
 
-    if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size) ) {
-	bytes_per_cycle = max_data;
+    if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size )) {
+        bytes_per_cycle = max_data;
     }
     else {
 	bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size);
@@ -124,9 +149,9 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
 
         if (fh->f_num_of_io_entries) {
             ret_code = fh->f_fbtl->fbtl_preadv (fh);
-	    if ( 0<= ret_code ) {
-		real_bytes_read+=(size_t)ret_code;
-	    }
+            if ( 0<= ret_code ) {
+                real_bytes_read+=(size_t)ret_code;
+            }
         }
 
         fh->f_num_of_io_entries = 0;
@@ -136,13 +161,22 @@ int mca_common_ompio_file_read (ompio_file_t *fh,
         }
     }
 
+#if OPAL_CUDA_SUPPORT
+    if ( is_gpu && !is_managed ) {
+        size_t pos=0;
+
+        opal_convertor_unpack (&convertor, decoded_iov, &iov_count, &pos );
+        opal_convertor_cleanup (&convertor);
+        mca_common_ompio_unregister_buf (fh, decoded_iov->iov_base);
+    }
+#endif
     if (NULL != decoded_iov) {
         free (decoded_iov);
         decoded_iov = NULL;
     }
 
     if ( MPI_STATUS_IGNORE != status ) {
-	status->_ucount = real_bytes_read;
+        status->_ucount = real_bytes_read;
     }
 
     return ret;
@@ -189,37 +223,58 @@ int mca_common_ompio_file_iread (ompio_file_t *fh,
     mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_READ);
 
     if ( 0 == count ) {
-	ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;
-	ompio_req->req_ompi.req_status._ucount = 0;
-	ompi_request_complete (&ompio_req->req_ompi, false);
+        ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;
+        ompio_req->req_ompi.req_status._ucount = 0;
+        ompi_request_complete (&ompio_req->req_ompi, false);
         *request = (ompi_request_t *) ompio_req;
-
-	return OMPI_SUCCESS;
+        
+        return OMPI_SUCCESS;
     }
 
     if ( NULL != fh->f_fbtl->fbtl_ipreadv ) {
-	// This fbtl has support for non-blocking operations
+        // This fbtl has support for non-blocking operations
 
-	size_t total_bytes_read = 0;       /* total bytes that have been read*/
-	uint32_t iov_count = 0;
-	struct iovec *decoded_iov = NULL;
-
-	size_t max_data = 0;
-	int i = 0; /* index into the decoded iovec of the buffer */
-	int j = 0; /* index into the file vie iovec */
-
-	mca_common_ompio_decode_datatype (fh,
-				          datatype,
-				          count,
-				          buf,
-				          &max_data,
-				          &decoded_iov,
-				          &iov_count);
-
-	// Non-blocking operations have to occur in a single cycle
-	j = fh->f_index_in_file_view;
-
-	mca_common_ompio_build_io_array ( fh,
+        size_t total_bytes_read = 0;       /* total bytes that have been read*/
+        uint32_t iov_count = 0;
+        struct iovec *decoded_iov = NULL;
+        
+        size_t max_data = 0;
+        int i = 0; /* index into the decoded iovec of the buffer */
+        int j = 0; /* index into the file vie iovec */
+        
+#if OPAL_CUDA_SUPPORT
+        int is_gpu=0, is_managed=0;
+        mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+        if ( is_gpu && !is_managed ) {
+            char *tbuf=NULL;
+            
+            OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&ompio_req->req_convertor,max_data,decoded_iov,iov_count);        
+            
+            ompio_req->req_tbuf = tbuf;
+            ompio_req->req_size = max_data;
+        }
+        else {
+            mca_common_ompio_decode_datatype (fh,
+                                              datatype,
+                                              count,
+                                              buf,
+                                              &max_data,
+                                              &decoded_iov,
+                                              &iov_count);
+        }
+#else
+        mca_common_ompio_decode_datatype (fh,
+                                          datatype,
+                                          count,
+                                          buf,
+                                          &max_data,
+                                          &decoded_iov,
+                                          &iov_count);
+#endif
+        // Non-blocking operations have to occur in a single cycle
+        j = fh->f_index_in_file_view;
+        
+        mca_common_ompio_build_io_array ( fh,
                                           0,         // index
                                           1,         // no. of cyces
                                           max_data,  // setting bytes per cycle to match data
diff --git a/ompi/mca/common/ompio/common_ompio_file_write.c b/ompi/mca/common/ompio/common_ompio_file_write.c
index c7d0c32e3d..3bb96d972d 100644
--- a/ompi/mca/common/ompio/common_ompio_file_write.c
+++ b/ompi/mca/common/ompio/common_ompio_file_write.c
@@ -34,6 +34,10 @@
 #include "math.h"
 #include <unistd.h>
 
+#if OPAL_CUDA_SUPPORT
+#include "common_ompio_cuda.h"
+#endif
+
 int mca_common_ompio_file_write (ompio_file_t *fh,
 			       const void *buf,
 			       int count,
@@ -55,12 +59,35 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
     int j = 0; /* index into the file view iovec */
 
     if ( 0 == count ) {
-	if ( MPI_STATUS_IGNORE != status ) {
-	    status->_ucount = 0;
-	}
-	return ret;
+        if ( MPI_STATUS_IGNORE != status ) {
+            status->_ucount = 0;
+        }
+        return ret;
     }
 
+#if OPAL_CUDA_SUPPORT
+    int is_gpu=0, is_managed=0;
+    mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+    if ( is_gpu && !is_managed ) {
+        size_t pos=0;
+        char *tbuf=NULL;
+        opal_convertor_t convertor;
+        
+        OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count);        
+        
+        opal_convertor_pack (&convertor, decoded_iov, &iov_count, &pos );
+        opal_convertor_cleanup ( &convertor);
+    }
+    else {
+        mca_common_ompio_decode_datatype (fh,
+                                          datatype,
+                                          count,
+                                          buf,
+                                          &max_data,
+                                          &decoded_iov,
+                                          &iov_count);
+    }
+#else
     mca_common_ompio_decode_datatype (fh,
                                       datatype,
                                       count,
@@ -68,9 +95,9 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
                                       &max_data,
                                       &decoded_iov,
                                       &iov_count);
-
-    if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size) ) {
-	bytes_per_cycle = max_data;
+#endif
+    if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size )) {
+        bytes_per_cycle = max_data;
     }
     else {
 	bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size);
@@ -83,7 +110,7 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
 
     j = fh->f_index_in_file_view;
     for (index = 0; index < cycles; index++) {
-	mca_common_ompio_build_io_array ( fh,
+        mca_common_ompio_build_io_array ( fh,
                                           index,
                                           cycles,
                                           bytes_per_cycle,
@@ -97,9 +124,9 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
 
         if (fh->f_num_of_io_entries) {
             ret_code =fh->f_fbtl->fbtl_pwritev (fh);
-	    if ( 0<= ret_code ) {
-		real_bytes_written+= (size_t)ret_code;
-	    }
+            if ( 0<= ret_code ) {
+                real_bytes_written+= (size_t)ret_code;
+            }
         }
 
         fh->f_num_of_io_entries = 0;
@@ -108,6 +135,11 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
             fh->f_io_array = NULL;
         }
     }
+#if OPAL_CUDA_SUPPORT
+    if ( is_gpu && !is_managed ) {
+        mca_common_ompio_unregister_buf (fh, decoded_iov->iov_base);
+    }
+#endif
 
     if (NULL != decoded_iov) {
         free (decoded_iov);
@@ -115,7 +147,7 @@ int mca_common_ompio_file_write (ompio_file_t *fh,
     }
 
     if ( MPI_STATUS_IGNORE != status ) {
-	status->_ucount = real_bytes_written;
+        status->_ucount = real_bytes_written;
     }
 
     return ret;
@@ -158,35 +190,62 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
     mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_WRITE);
 
     if ( 0 == count ) {
-	ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;
-	ompio_req->req_ompi.req_status._ucount = 0;
-	ompi_request_complete (&ompio_req->req_ompi, false);
+        ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS;
+        ompio_req->req_ompi.req_status._ucount = 0;
+        ompi_request_complete (&ompio_req->req_ompi, false);
         *request = (ompi_request_t *) ompio_req;
-
-	return OMPI_SUCCESS;
+        
+        return OMPI_SUCCESS;
     }
 
     if ( NULL != fh->f_fbtl->fbtl_ipwritev ) {
-	/* This fbtl has support for non-blocking operations */
+        /* This fbtl has support for non-blocking operations */
+        
+        uint32_t iov_count = 0;
+        struct iovec *decoded_iov = NULL;
+        size_t max_data = 0;
+        size_t total_bytes_written =0;
+        int i = 0; /* index into the decoded iovec of the buffer */
+        int j = 0; /* index into the file vie iovec */
 
-	uint32_t iov_count = 0;
-	struct iovec *decoded_iov = NULL;
-	size_t max_data = 0;
-	size_t total_bytes_written =0;
-	int i = 0; /* index into the decoded iovec of the buffer */
-	int j = 0; /* index into the file vie iovec */
+#if OPAL_CUDA_SUPPORT
+        int is_gpu=0, is_managed=0;
+        mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed);
+        if ( is_gpu && !is_managed ) {
+            size_t pos=0;
+            char *tbuf=NULL;
+            opal_convertor_t convertor;
 
-	mca_common_ompio_decode_datatype (fh,
-				          datatype,
-				          count,
-				          buf,
-				          &max_data,
-				          &decoded_iov,
-				          &iov_count);
-	j = fh->f_index_in_file_view;
+            OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count);        
+            
+            opal_convertor_pack (&convertor, decoded_iov, &iov_count, &pos );
+            opal_convertor_cleanup (&convertor);
 
-	/* Non blocking operations have to occur in a single cycle */
-	mca_common_ompio_build_io_array ( fh,
+            ompio_req->req_tbuf = tbuf;
+            ompio_req->req_size = max_data;
+        }
+        else {
+            mca_common_ompio_decode_datatype (fh,
+                                              datatype,
+                                              count,
+                                              buf,
+                                              &max_data,
+                                              &decoded_iov,
+                                              &iov_count);
+        }
+#else
+        mca_common_ompio_decode_datatype (fh,
+                                          datatype,
+                                          count,
+                                          buf,
+                                          &max_data,
+                                          &decoded_iov,
+                                          &iov_count);
+#endif
+        j = fh->f_index_in_file_view;
+
+        /* Non blocking operations have to occur in a single cycle */
+        mca_common_ompio_build_io_array ( fh,
                                           0,         // index of current cycle iteration
                                           1,         // number of cycles
                                           max_data,  // setting bytes_per_cycle to max_data
@@ -199,9 +258,9 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
                                           &spc);
         
         if (fh->f_num_of_io_entries) {
-	  fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req);
+            fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req);
         }
-
+        
         mca_common_ompio_register_progress ();
 
         fh->f_num_of_io_entries = 0;
@@ -209,19 +268,19 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
             free (fh->f_io_array);
             fh->f_io_array = NULL;
         }
-	if (NULL != decoded_iov) {
-	    free (decoded_iov);
-	    decoded_iov = NULL;
-	}
+        if (NULL != decoded_iov) {
+            free (decoded_iov);
+            decoded_iov = NULL;
+        }
     }
     else {
-	// This fbtl does not support non-blocking write operations
-	ompi_status_public_t status;
-	ret = mca_common_ompio_file_write(fh,buf,count,datatype, &status);
-
-	ompio_req->req_ompi.req_status.MPI_ERROR = ret;
-	ompio_req->req_ompi.req_status._ucount = status._ucount;
-	ompi_request_complete (&ompio_req->req_ompi, false);
+        // This fbtl does not support non-blocking write operations
+        ompi_status_public_t status;
+        ret = mca_common_ompio_file_write(fh,buf,count,datatype, &status);
+        
+        ompio_req->req_ompi.req_status.MPI_ERROR = ret;
+        ompio_req->req_ompi.req_status._ucount = status._ucount;
+        ompi_request_complete (&ompio_req->req_ompi, false);
     }
 
     *request = (ompi_request_t *) ompio_req;
diff --git a/ompi/mca/common/ompio/common_ompio_request.c b/ompi/mca/common/ompio/common_ompio_request.c
index 821e9fc9a4..e385f5bfa2 100644
--- a/ompi/mca/common/ompio/common_ompio_request.c
+++ b/ompi/mca/common/ompio/common_ompio_request.c
@@ -19,6 +19,9 @@
  */
 
 #include "common_ompio_request.h"
+#if OPAL_CUDA_SUPPORT
+#include "common_ompio_cuda.h"
+#endif
 
 static void mca_common_ompio_request_construct(mca_ompio_request_t* req);
 static void mca_common_ompio_request_destruct(mca_ompio_request_t *req);
@@ -34,6 +37,20 @@ opal_list_t mca_common_ompio_pending_requests = {{0}};
 static int mca_common_ompio_request_free ( struct ompi_request_t **req)
 {
     mca_ompio_request_t *ompio_req = ( mca_ompio_request_t *)*req;
+#if OPAL_CUDA_SUPPORT
+    if ( NULL != ompio_req->req_tbuf ) {
+        if ( MCA_OMPIO_REQUEST_READ == ompio_req->req_type ){
+            struct iovec decoded_iov;
+            uint32_t iov_count=1;
+            size_t pos=0;
+
+            decoded_iov.iov_base = ompio_req->req_tbuf;
+            decoded_iov.iov_len  = ompio_req->req_size;
+            opal_convertor_unpack (&ompio_req->req_convertor, &decoded_iov, &iov_count, &pos );
+        }
+        mca_common_ompio_unregister_buf ( NULL, ompio_req->req_tbuf );
+    }
+#endif
     if ( NULL != ompio_req->req_free_fn ) {
         ompio_req->req_free_fn (ompio_req );
     }
@@ -60,6 +77,10 @@ void mca_common_ompio_request_construct(mca_ompio_request_t* req)
     req->req_ompi.req_cancel = mca_common_ompio_request_cancel;
     req->req_ompi.req_type   = OMPI_REQUEST_IO;
     req->req_data            = NULL;
+#if OPAL_CUDA_SUPPORT
+    req->req_tbuf            = NULL;
+    req->req_size            = 0;
+#endif
     req->req_progress_fn     = NULL;
     req->req_free_fn         = NULL;
 
diff --git a/ompi/mca/common/ompio/common_ompio_request.h b/ompi/mca/common/ompio/common_ompio_request.h
index 50508e99e1..d019ca68a8 100644
--- a/ompi/mca/common/ompio/common_ompio_request.h
+++ b/ompi/mca/common/ompio/common_ompio_request.h
@@ -52,6 +52,11 @@ struct mca_ompio_request_t {
     mca_ompio_request_type_t                       req_type;
     void                                          *req_data;
     opal_list_item_t                               req_item;
+#if OPAL_CUDA_SUPPORT
+    void                                          *req_tbuf;
+    size_t                                         req_size;
+    opal_convertor_t                          req_convertor;
+#endif
     mca_fbtl_base_module_progress_fn_t      req_progress_fn;
     mca_fbtl_base_module_request_free_fn_t      req_free_fn;
 };