From 6a532101aabd098bececbbb4eb29b950e1f95ad8 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Sun, 10 Jun 2018 16:27:08 -0500 Subject: [PATCH] io/ompio and common/ompio: add initial support for cuda buffers in ompio this commit adds the initial support for cuda buffers in ompio, for blocking and non-blocking individual read and write operations. Signed-off-by: Edgar Gabriel --- ompi/mca/common/ompio/Makefile.am | 7 +- ompi/mca/common/ompio/common_ompio_cuda.c | 69 ++++++++ ompi/mca/common/ompio/common_ompio_cuda.h | 51 ++++++ .../mca/common/ompio/common_ompio_file_read.c | 127 ++++++++++----- .../common/ompio/common_ompio_file_write.c | 153 ++++++++++++------ ompi/mca/common/ompio/common_ompio_request.c | 21 +++ ompi/mca/common/ompio/common_ompio_request.h | 5 + 7 files changed, 349 insertions(+), 84 deletions(-) create mode 100644 ompi/mca/common/ompio/common_ompio_cuda.c create mode 100644 ompi/mca/common/ompio/common_ompio_cuda.h diff --git a/ompi/mca/common/ompio/Makefile.am b/ompi/mca/common/ompio/Makefile.am index 6eda4644b8..d8ed32b9f6 100644 --- a/ompi/mca/common/ompio/Makefile.am +++ b/ompi/mca/common/ompio/Makefile.am @@ -9,7 +9,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2008-2016 University of Houston. All rights reserved. +# Copyright (c) 2008-2018 University of Houston. All rights reserved. # Copyright (c) 2016 IBM Corporation. All rights reserved. # Copyright (c) 2017-2018 Research Organization for Information Science # and Technology (RIST). All rights reserved. @@ -74,6 +74,11 @@ else ompidir = $(includedir) endif +if OPAL_cuda_support +headers += common_ompio_cuda.h +sources += common_ompio_cuda.c +endif + # These two rules will sym link the "noinst" libtool library filename # to the installable libtool library filename in the case where we are # compiling this component statically (case 2), described above). 
diff --git a/ompi/mca/common/ompio/common_ompio_cuda.c b/ompi/mca/common/ompio/common_ompio_cuda.c
new file mode 100644
index 0000000000..e21b93400a
--- /dev/null
+++ b/ompi/mca/common/ompio/common_ompio_cuda.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2016 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008-2018 University of Houston. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "opal/datatype/opal_convertor.h"
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
+
+#include "ompi/mca/io/ompio/io_ompio.h"
+#include "common_ompio.h"
+#include "common_ompio_cuda.h"
+
+/* Label handed to the CUDA (un)register calls when no file handle is given. */
+static char ompio_cuda_dummy_filename[] = "dummy_ompio_filename";
+
+/*
+ * Select the label passed to mca_common_cuda_register/unregister: the file
+ * name of fh when a handle is available, the dummy label otherwise.
+ */
+static char *ompio_cuda_buf_label ( ompio_file_t *fh )
+{
+    if ( NULL != fh ) {
+        return (char *) fh->f_filename;
+    }
+    return ompio_cuda_dummy_filename;
+}
+
+/*
+ * Classify a user buffer.
+ *
+ * fh          file handle (not used by the check itself)
+ * buf         user buffer to classify
+ * is_gpu      out: 1 if opal_cuda_check_one_buf() reports a GPU buffer, else 0
+ * is_managed  out: 1 if the check also set CONVERTOR_CUDA_UNIFIED, else 0
+ */
+void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf, int *is_gpu,
+                                      int *is_managed)
+{
+    opal_convertor_t convertor;
+
+    *is_gpu = 0;
+    *is_managed = 0;
+
+    /* Only the flags field is set up; the result is read back from it. */
+    convertor.flags = 0;
+    if ( opal_cuda_check_one_buf ( (char *)buf, &convertor ) ) {
+        *is_gpu = 1;
+        if ( convertor.flags & CONVERTOR_CUDA_UNIFIED ) {
+            *is_managed = 1;
+        }
+    }
+}
+
+/*
+ * Register buf (bufsize bytes) through mca_common_cuda_register(). The file
+ * name of fh, or the dummy label when fh is NULL, is passed along with it.
+ */
+void mca_common_ompio_register_buf ( ompio_file_t *fh, const void *buf,
+                                     size_t bufsize )
+{
+    mca_common_cuda_register ( (char *)buf, bufsize, ompio_cuda_buf_label (fh) );
+}
+
+/*
+ * Counterpart of mca_common_ompio_register_buf() that also releases the
+ * memory: buf is unregistered through mca_common_cuda_unregister() and then
+ * free()d, so it must not be used afterwards. fh may be NULL; a NULL buf is
+ * ignored.
+ */
+void mca_common_ompio_unregister_buf ( ompio_file_t *fh, void *buf )
+{
+    if ( NULL == buf ) {
+        return;
+    }
+    mca_common_cuda_unregister ( (char *)buf, ompio_cuda_buf_label (fh) );
+    free (buf);
+}
diff --git a/ompi/mca/common/ompio/common_ompio_cuda.h b/ompi/mca/common/ompio/common_ompio_cuda.h
new file mode 100644
index 0000000000..3d567f8529
--- /dev/null
+++ b/ompi/mca/common/ompio/common_ompio_cuda.h
@@ -0,0 +1,81 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008-2018 University of Houston. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_COMMON_OMPIO_CUDA_H
+#define MCA_COMMON_OMPIO_CUDA_H
+
+/*
+ * Set up a malloc'ed staging buffer for a GPU user buffer.
+ *
+ * Clones _fh->f_convertor into _convertor and prepares it for _count elements
+ * of _datatype at _buf, stores the packed size in _max_data, allocates and
+ * registers a staging buffer of that size in _tbuf, and allocates a single
+ * iovec in _decoded_iov that describes it (_iov_count is set to 1).
+ *
+ * On allocation failure the macro logs a message, releases what it set up so
+ * far and executes "return OMPI_ERR_OUT_OF_RESOURCE" in the calling function.
+ * It can therefore only be used in functions returning an int error code, and
+ * anything the caller acquired beforehand is not released by that return.
+ *
+ * On success the caller releases _tbuf with mca_common_ompio_unregister_buf(),
+ * _decoded_iov with free() and _convertor with opal_convertor_cleanup().
+ *
+ * NOTE(review): the convertor is always prepared for send, yet the read paths
+ * later hand it to opal_convertor_unpack(); confirm whether a
+ * prepare-for-recv variant is needed, e.g. for non-contiguous datatypes.
+ */
+#define OMPIO_CUDA_PREPARE_BUF(_fh,_buf,_count,_datatype,_tbuf,_convertor,_max_data,_decoded_iov,_iov_count) \
+    do {                                                                                           \
+        opal_convertor_clone ( (_fh)->f_convertor, (_convertor), 0);                               \
+        opal_convertor_prepare_for_send ( (_convertor), &((_datatype)->super), (_count), (_buf) ); \
+        opal_convertor_get_packed_size ( (_convertor), &(_max_data) );                             \
+        (_tbuf) = (char *) malloc ( (_max_data) );                                                 \
+        if ( NULL == (_tbuf) ) {                                                                   \
+            opal_output(1, "common_ompio: could not allocate memory.\n");                          \
+            opal_convertor_cleanup ( (_convertor) );                                               \
+            return OMPI_ERR_OUT_OF_RESOURCE;                                                       \
+        }                                                                                          \
+        mca_common_ompio_register_buf ( (_fh), (_tbuf), (_max_data) );                             \
+        (_decoded_iov) = (struct iovec *) malloc ( sizeof ( struct iovec ) );                      \
+        if ( NULL == (_decoded_iov) ) {                                                            \
+            opal_output(1, "common_ompio: could not allocate memory.\n");                          \
+            /* do not leak the registered staging buffer */                                        \
+            mca_common_ompio_unregister_buf ( (_fh), (_tbuf) );                                    \
+            (_tbuf) = NULL;                                                                        \
+            opal_convertor_cleanup ( (_convertor) );                                               \
+            return OMPI_ERR_OUT_OF_RESOURCE;                                                       \
+        }                                                                                          \
+        (_decoded_iov)->iov_base = (_tbuf);                                                        \
+        (_decoded_iov)->iov_len = (_max_data);                                                     \
+        (_iov_count) = 1;                                                                          \
+    } while (0)
+
+/* Report whether buf is a GPU buffer (is_gpu) flagged as unified (is_managed). */
+void mca_common_ompio_check_gpu_buf ( ompio_file_t *fh, const void *buf,
+                                      int *is_gpu, int *is_managed);
+
+/* Register buf (bufsize bytes) through mca_common_cuda_register(). */
+void mca_common_ompio_register_buf ( ompio_file_t *fh, const void *buf,
+                                     size_t bufsize);
+
+/* Unregister buf through mca_common_cuda_unregister() and free() it. */
+void mca_common_ompio_unregister_buf ( ompio_file_t *fh, void *buf );
+
+#endif
diff --git a/ompi/mca/common/ompio/common_ompio_file_read.c b/ompi/mca/common/ompio/common_ompio_file_read.c index 9b9c19a6b4..95c93e3a51 100644 --- a/ompi/mca/common/ompio/common_ompio_file_read.c +++ b/ompi/mca/common/ompio/common_ompio_file_read.c @@ -36,6 +36,10 @@ #include "math.h" #include +#if OPAL_CUDA_SUPPORT +#include "common_ompio_cuda.h" +#endif + /* Read and write routines are split into two interfaces. ** The ** mca_io_ompio_file_read/write[_at] @@ -74,10 +78,10 @@ int mca_common_ompio_file_read (ompio_file_t *fh, int j = 0; /* index into the file vie iovec */ if ( 0 == count ) { - if ( MPI_STATUS_IGNORE != status ) { - status->_ucount = 0; - } - return ret; + if ( MPI_STATUS_IGNORE != status ) { + status->_ucount = 0; + } + return ret; } if (fh->f_amode & MPI_MODE_WRONLY){ @@ -86,6 +90,26 @@ int mca_common_ompio_file_read (ompio_file_t *fh, return ret; } +#if OPAL_CUDA_SUPPORT + int is_gpu=0, is_managed=0; + opal_convertor_t convertor; + mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed); + if ( is_gpu && !is_managed ) { + char *tbuf=NULL; + + OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count); + + } + else { + mca_common_ompio_decode_datatype (fh, + datatype, + count, + buf, + &max_data, + &decoded_iov, + &iov_count); + } +#else mca_common_ompio_decode_datatype (fh, datatype, count, @@ -93,9 +117,10 @@ int mca_common_ompio_file_read (ompio_file_t *fh, &max_data, &decoded_iov, &iov_count); +#endif - if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size) ) { - bytes_per_cycle = max_data; + if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size )) { + bytes_per_cycle = max_data; } else { bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size); @@ -124,9 +149,9 @@ int mca_common_ompio_file_read
(ompio_file_t *fh, if (fh->f_num_of_io_entries) { ret_code = fh->f_fbtl->fbtl_preadv (fh); - if ( 0<= ret_code ) { - real_bytes_read+=(size_t)ret_code; - } + if ( 0<= ret_code ) { + real_bytes_read+=(size_t)ret_code; + } } fh->f_num_of_io_entries = 0; @@ -136,13 +161,22 @@ int mca_common_ompio_file_read (ompio_file_t *fh, } } +#if OPAL_CUDA_SUPPORT + if ( is_gpu && !is_managed ) { + size_t pos=0; + + opal_convertor_unpack (&convertor, decoded_iov, &iov_count, &pos ); + opal_convertor_cleanup (&convertor); + mca_common_ompio_unregister_buf (fh, decoded_iov->iov_base); + } +#endif if (NULL != decoded_iov) { free (decoded_iov); decoded_iov = NULL; } if ( MPI_STATUS_IGNORE != status ) { - status->_ucount = real_bytes_read; + status->_ucount = real_bytes_read; } return ret; @@ -189,37 +223,58 @@ int mca_common_ompio_file_iread (ompio_file_t *fh, mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_READ); if ( 0 == count ) { - ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; - ompio_req->req_ompi.req_status._ucount = 0; - ompi_request_complete (&ompio_req->req_ompi, false); + ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; + ompio_req->req_ompi.req_status._ucount = 0; + ompi_request_complete (&ompio_req->req_ompi, false); *request = (ompi_request_t *) ompio_req; - - return OMPI_SUCCESS; + + return OMPI_SUCCESS; } if ( NULL != fh->f_fbtl->fbtl_ipreadv ) { - // This fbtl has support for non-blocking operations + // This fbtl has support for non-blocking operations - size_t total_bytes_read = 0; /* total bytes that have been read*/ - uint32_t iov_count = 0; - struct iovec *decoded_iov = NULL; - - size_t max_data = 0; - int i = 0; /* index into the decoded iovec of the buffer */ - int j = 0; /* index into the file vie iovec */ - - mca_common_ompio_decode_datatype (fh, - datatype, - count, - buf, - &max_data, - &decoded_iov, - &iov_count); - - // Non-blocking operations have to occur in a single cycle - j = fh->f_index_in_file_view; - - 
mca_common_ompio_build_io_array ( fh, + size_t total_bytes_read = 0; /* total bytes that have been read*/ + uint32_t iov_count = 0; + struct iovec *decoded_iov = NULL; + + size_t max_data = 0; + int i = 0; /* index into the decoded iovec of the buffer */ + int j = 0; /* index into the file vie iovec */ + +#if OPAL_CUDA_SUPPORT + int is_gpu=0, is_managed=0; + mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed); + if ( is_gpu && !is_managed ) { + char *tbuf=NULL; + + OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&ompio_req->req_convertor,max_data,decoded_iov,iov_count); + + ompio_req->req_tbuf = tbuf; + ompio_req->req_size = max_data; + } + else { + mca_common_ompio_decode_datatype (fh, + datatype, + count, + buf, + &max_data, + &decoded_iov, + &iov_count); + } +#else + mca_common_ompio_decode_datatype (fh, + datatype, + count, + buf, + &max_data, + &decoded_iov, + &iov_count); +#endif + // Non-blocking operations have to occur in a single cycle + j = fh->f_index_in_file_view; + + mca_common_ompio_build_io_array ( fh, 0, // index 1, // no. 
of cyces max_data, // setting bytes per cycle to match data diff --git a/ompi/mca/common/ompio/common_ompio_file_write.c b/ompi/mca/common/ompio/common_ompio_file_write.c index c7d0c32e3d..3bb96d972d 100644 --- a/ompi/mca/common/ompio/common_ompio_file_write.c +++ b/ompi/mca/common/ompio/common_ompio_file_write.c @@ -34,6 +34,10 @@ #include "math.h" #include +#if OPAL_CUDA_SUPPORT +#include "common_ompio_cuda.h" +#endif + int mca_common_ompio_file_write (ompio_file_t *fh, const void *buf, int count, @@ -55,12 +59,35 @@ int mca_common_ompio_file_write (ompio_file_t *fh, int j = 0; /* index into the file view iovec */ if ( 0 == count ) { - if ( MPI_STATUS_IGNORE != status ) { - status->_ucount = 0; - } - return ret; + if ( MPI_STATUS_IGNORE != status ) { + status->_ucount = 0; + } + return ret; } +#if OPAL_CUDA_SUPPORT + int is_gpu=0, is_managed=0; + mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed); + if ( is_gpu && !is_managed ) { + size_t pos=0; + char *tbuf=NULL; + opal_convertor_t convertor; + + OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count); + + opal_convertor_pack (&convertor, decoded_iov, &iov_count, &pos ); + opal_convertor_cleanup ( &convertor); + } + else { + mca_common_ompio_decode_datatype (fh, + datatype, + count, + buf, + &max_data, + &decoded_iov, + &iov_count); + } +#else mca_common_ompio_decode_datatype (fh, datatype, count, @@ -68,9 +95,9 @@ int mca_common_ompio_file_write (ompio_file_t *fh, &max_data, &decoded_iov, &iov_count); - - if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size) ) { - bytes_per_cycle = max_data; +#endif + if ( -1 == OMPIO_MCA_GET(fh, cycle_buffer_size )) { + bytes_per_cycle = max_data; } else { bytes_per_cycle = OMPIO_MCA_GET(fh, cycle_buffer_size); @@ -83,7 +110,7 @@ int mca_common_ompio_file_write (ompio_file_t *fh, j = fh->f_index_in_file_view; for (index = 0; index < cycles; index++) { - mca_common_ompio_build_io_array ( fh, + mca_common_ompio_build_io_array ( fh, 
index, cycles, bytes_per_cycle, @@ -97,9 +124,9 @@ int mca_common_ompio_file_write (ompio_file_t *fh, if (fh->f_num_of_io_entries) { ret_code =fh->f_fbtl->fbtl_pwritev (fh); - if ( 0<= ret_code ) { - real_bytes_written+= (size_t)ret_code; - } + if ( 0<= ret_code ) { + real_bytes_written+= (size_t)ret_code; + } } fh->f_num_of_io_entries = 0; @@ -108,6 +135,11 @@ int mca_common_ompio_file_write (ompio_file_t *fh, fh->f_io_array = NULL; } } +#if OPAL_CUDA_SUPPORT + if ( is_gpu && !is_managed ) { + mca_common_ompio_unregister_buf (fh, decoded_iov->iov_base); + } +#endif if (NULL != decoded_iov) { free (decoded_iov); @@ -115,7 +147,7 @@ int mca_common_ompio_file_write (ompio_file_t *fh, } if ( MPI_STATUS_IGNORE != status ) { - status->_ucount = real_bytes_written; + status->_ucount = real_bytes_written; } return ret; @@ -158,35 +190,62 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh, mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_WRITE); if ( 0 == count ) { - ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; - ompio_req->req_ompi.req_status._ucount = 0; - ompi_request_complete (&ompio_req->req_ompi, false); + ompio_req->req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; + ompio_req->req_ompi.req_status._ucount = 0; + ompi_request_complete (&ompio_req->req_ompi, false); *request = (ompi_request_t *) ompio_req; - - return OMPI_SUCCESS; + + return OMPI_SUCCESS; } if ( NULL != fh->f_fbtl->fbtl_ipwritev ) { - /* This fbtl has support for non-blocking operations */ + /* This fbtl has support for non-blocking operations */ + + uint32_t iov_count = 0; + struct iovec *decoded_iov = NULL; + size_t max_data = 0; + size_t total_bytes_written =0; + int i = 0; /* index into the decoded iovec of the buffer */ + int j = 0; /* index into the file vie iovec */ - uint32_t iov_count = 0; - struct iovec *decoded_iov = NULL; - size_t max_data = 0; - size_t total_bytes_written =0; - int i = 0; /* index into the decoded iovec of the buffer */ - int j = 0; /* index into 
the file vie iovec */ +#if OPAL_CUDA_SUPPORT + int is_gpu=0, is_managed=0; + mca_common_ompio_check_gpu_buf ( fh, buf, &is_gpu, &is_managed); + if ( is_gpu && !is_managed ) { + size_t pos=0; + char *tbuf=NULL; + opal_convertor_t convertor; - mca_common_ompio_decode_datatype (fh, - datatype, - count, - buf, - &max_data, - &decoded_iov, - &iov_count); - j = fh->f_index_in_file_view; + OMPIO_CUDA_PREPARE_BUF(fh,buf,count,datatype,tbuf,&convertor,max_data,decoded_iov,iov_count); + + opal_convertor_pack (&convertor, decoded_iov, &iov_count, &pos ); + opal_convertor_cleanup (&convertor); - /* Non blocking operations have to occur in a single cycle */ - mca_common_ompio_build_io_array ( fh, + ompio_req->req_tbuf = tbuf; + ompio_req->req_size = max_data; + } + else { + mca_common_ompio_decode_datatype (fh, + datatype, + count, + buf, + &max_data, + &decoded_iov, + &iov_count); + } +#else + mca_common_ompio_decode_datatype (fh, + datatype, + count, + buf, + &max_data, + &decoded_iov, + &iov_count); +#endif + j = fh->f_index_in_file_view; + + /* Non blocking operations have to occur in a single cycle */ + mca_common_ompio_build_io_array ( fh, 0, // index of current cycle iteration 1, // number of cycles max_data, // setting bytes_per_cycle to max_data @@ -199,9 +258,9 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh, &spc); if (fh->f_num_of_io_entries) { - fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req); + fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req); } - + mca_common_ompio_register_progress (); fh->f_num_of_io_entries = 0; @@ -209,19 +268,19 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh, free (fh->f_io_array); fh->f_io_array = NULL; } - if (NULL != decoded_iov) { - free (decoded_iov); - decoded_iov = NULL; - } + if (NULL != decoded_iov) { + free (decoded_iov); + decoded_iov = NULL; + } } else { - // This fbtl does not support non-blocking write operations - ompi_status_public_t status; - ret = 
mca_common_ompio_file_write(fh,buf,count,datatype, &status); - - ompio_req->req_ompi.req_status.MPI_ERROR = ret; - ompio_req->req_ompi.req_status._ucount = status._ucount; - ompi_request_complete (&ompio_req->req_ompi, false); + // This fbtl does not support non-blocking write operations + ompi_status_public_t status; + ret = mca_common_ompio_file_write(fh,buf,count,datatype, &status); + + ompio_req->req_ompi.req_status.MPI_ERROR = ret; + ompio_req->req_ompi.req_status._ucount = status._ucount; + ompi_request_complete (&ompio_req->req_ompi, false); } *request = (ompi_request_t *) ompio_req; diff --git a/ompi/mca/common/ompio/common_ompio_request.c b/ompi/mca/common/ompio/common_ompio_request.c index 821e9fc9a4..e385f5bfa2 100644 --- a/ompi/mca/common/ompio/common_ompio_request.c +++ b/ompi/mca/common/ompio/common_ompio_request.c @@ -19,6 +19,9 @@ */ #include "common_ompio_request.h" +#if OPAL_CUDA_SUPPORT +#include "common_ompio_cuda.h" +#endif static void mca_common_ompio_request_construct(mca_ompio_request_t* req); static void mca_common_ompio_request_destruct(mca_ompio_request_t *req); @@ -34,6 +37,20 @@ opal_list_t mca_common_ompio_pending_requests = {{0}}; static int mca_common_ompio_request_free ( struct ompi_request_t **req) { mca_ompio_request_t *ompio_req = ( mca_ompio_request_t *)*req; +#if OPAL_CUDA_SUPPORT + if ( NULL != ompio_req->req_tbuf ) { + if ( MCA_OMPIO_REQUEST_READ == ompio_req->req_type ){ + struct iovec decoded_iov; + uint32_t iov_count=1; + size_t pos=0; + + decoded_iov.iov_base = ompio_req->req_tbuf; + decoded_iov.iov_len = ompio_req->req_size; + opal_convertor_unpack (&ompio_req->req_convertor, &decoded_iov, &iov_count, &pos ); + } + mca_common_ompio_unregister_buf ( NULL, ompio_req->req_tbuf ); + } +#endif if ( NULL != ompio_req->req_free_fn ) { ompio_req->req_free_fn (ompio_req ); } @@ -60,6 +77,10 @@ void mca_common_ompio_request_construct(mca_ompio_request_t* req) req->req_ompi.req_cancel = mca_common_ompio_request_cancel; 
req->req_ompi.req_type = OMPI_REQUEST_IO; req->req_data = NULL; +#if OPAL_CUDA_SUPPORT + req->req_tbuf = NULL; + req->req_size = 0; +#endif req->req_progress_fn = NULL; req->req_free_fn = NULL; diff --git a/ompi/mca/common/ompio/common_ompio_request.h b/ompi/mca/common/ompio/common_ompio_request.h index 50508e99e1..d019ca68a8 100644 --- a/ompi/mca/common/ompio/common_ompio_request.h +++ b/ompi/mca/common/ompio/common_ompio_request.h @@ -52,6 +52,11 @@ struct mca_ompio_request_t { mca_ompio_request_type_t req_type; void *req_data; opal_list_item_t req_item; +#if OPAL_CUDA_SUPPORT + void *req_tbuf; + size_t req_size; + opal_convertor_t req_convertor; +#endif mca_fbtl_base_module_progress_fn_t req_progress_fn; mca_fbtl_base_module_request_free_fn_t req_free_fn; };