Add some basic support for sending and receiving CUDA device memory. Feature is disabled by default and has no effect on default code paths.
This commit was SVN r24659.
Этот коммит содержится в:
родитель
0ff0d20e72
Коммит
2634f6401a
@ -16,6 +16,7 @@ dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
|
||||
dnl reserved.
|
||||
dnl Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
dnl Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
dnl
|
||||
dnl $COPYRIGHT$
|
||||
dnl
|
||||
@ -564,4 +565,78 @@ fi
|
||||
AC_DEFINE_UNQUOTED([OPAL_ENABLE_CRDEBUG], [$ompi_want_prd],
|
||||
[Whether we want checkpoint/restart enabled debugging functionality or not])
|
||||
|
||||
#
|
||||
# Check to see if user wants CUDA support in datatype and convertor code.
|
||||
#
|
||||
AC_ARG_WITH([cuda],
|
||||
[AC_HELP_STRING([--with-cuda(=DIR)],
|
||||
[Build cuda support, optionally adding DIR/include, DIR/lib, and DIR/lib64])])
|
||||
AC_MSG_CHECKING([if --with-cuda is set])
|
||||
|
||||
# CUDA support is off by default. User has to request it.
|
||||
AS_IF([test "$with_cuda" = "no" -o "x$with_cuda" = "x"],
|
||||
[opal_check_cuda_happy="no"
|
||||
AC_MSG_RESULT([not set (--with-cuda=$with_cuda)])],
|
||||
[AS_IF([test "$with_cuda" = "yes"],
|
||||
[AS_IF([test "x`ls /usr/local/cuda/include/cuda.h 2> /dev/null`" = "x"],
|
||||
[AC_MSG_RESULT([not found in standard location])
|
||||
AC_MSG_WARN([Expected file /usr/local/cuda/include/cuda.h not found])
|
||||
AC_MSG_ERROR([Cannot continue])],
|
||||
[AC_MSG_RESULT([found])
|
||||
opal_check_cuda_happy="yes"
|
||||
with_cuda="/usr/local/cuda"])],
|
||||
[AS_IF([test ! -d "$with_cuda"],
|
||||
[AC_MSG_RESULT([not found])
|
||||
AC_MSG_WARN([Directory $with_cuda not found])
|
||||
AC_MSG_ERROR([Cannot continue])],
|
||||
[AS_IF([test "x`ls $with_cuda/include/cuda.h 2> /dev/null`" = "x"],
|
||||
[AC_MSG_RESULT([not found])
|
||||
AC_MSG_WARN([Expected file $with_cuda/include/cuda.h not found])
|
||||
AC_MSG_ERROR([Cannot continue])],
|
||||
[opal_check_cuda_happy="yes"
|
||||
AC_MSG_RESULT([found ($with_cuda/include/cuda.h)])])])])])
|
||||
|
||||
# Check for optional libdir setting
|
||||
AC_ARG_WITH([cuda-libdir],
|
||||
[AC_HELP_STRING([--with-cuda-libdir=DIR],
|
||||
[Search for cuda libraries in DIR])])
|
||||
AC_MSG_CHECKING([if --with-cuda-libdir is set])
|
||||
|
||||
# Only check for the extra cuda libdir if we have passed the --with-cuda tests.
|
||||
AS_IF([test "$opal_check_cuda_happy" = "yes"],
|
||||
[AS_IF([test "$with_cuda_libdir" != "yes" -a "$with_cuda_libdir" != "no" -a "x$with_cuda_libdir" != "x"],
|
||||
[AS_IF([test ! -d "$with_cuda_libdir"],
|
||||
[AC_MSG_RESULT([not found])
|
||||
AC_MSG_WARN([Directory $with_cuda_libdir not found])
|
||||
AC_MSG_ERROR([Cannot continue])],
|
||||
[AS_IF([test "x`ls $with_cuda_libdir/libcuda.* 2> /dev/null`" = "x"],
|
||||
[AC_MSG_RESULT([not found])
|
||||
AC_MSG_WARN([Expected file $with_cuda_libdir/libcuda.* not found])
|
||||
AC_MSG_ERROR([Cannot continue])],
|
||||
[AC_MSG_RESULT([ok - found directory ($with_cuda_libdir)])])])],
|
||||
[with_cuda_libdir=/usr/lib64
|
||||
AS_IF([test "x`ls $with_cuda_libdir/libcuda.* 2> /dev/null`" = "x"],
|
||||
[AC_MSG_RESULT([not found])
|
||||
AC_MSG_WARN([Expected file $with_cuda_libdir/libcuda.* not found])
|
||||
AC_MSG_ERROR([Cannot continue])],
|
||||
[AC_MSG_RESULT([ok - found directory ($with_cuda_libdir)])])])],
|
||||
[AC_MSG_RESULT([not applicable since --with-cuda is not set])])
|
||||
|
||||
AC_MSG_CHECKING([if have cuda support])
|
||||
if test "$opal_check_cuda_happy" = "yes"; then
|
||||
AC_MSG_RESULT([yes (-I$with_cuda/include -L$with_cuda_libdir -lcuda)])
|
||||
CUDA_SUPPORT=1
|
||||
opal_datatype_CPPFLAGS="-I$with_cuda/include"
|
||||
opal_datatype_LIBS="-L$with_cuda_libdir -lcuda"
|
||||
AC_SUBST([opal_datatype_CPPFLAGS])
|
||||
AC_SUBST([opal_datatype_LIBS])
|
||||
else
|
||||
AC_MSG_RESULT([no])
|
||||
CUDA_SUPPORT=0
|
||||
fi
|
||||
|
||||
AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
|
||||
AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT,
|
||||
[Whether we want cuda device pointer support])
|
||||
|
||||
])dnl
|
||||
|
@ -14,6 +14,7 @@
|
||||
# reserved.
|
||||
# Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -71,3 +72,11 @@ if WANT_INSTALL_HEADERS
|
||||
opaldir = $(includedir)/openmpi/$(subdir)
|
||||
opal_HEADERS = $(headers)
|
||||
endif
|
||||
|
||||
# If we have cuda support, modify file list and flags
|
||||
if OPAL_cuda_support
|
||||
libdatatype_la_SOURCES += opal_datatype_cuda.c
|
||||
headers += opal_datatype_cuda.h
|
||||
AM_CPPFLAGS = $(opal_datatype_CPPFLAGS)
|
||||
libdatatype_la_LIBADD += $(opal_datatype_LIBS)
|
||||
endif
|
||||
|
@ -11,6 +11,7 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -37,6 +38,11 @@
|
||||
#include "opal/datatype/opal_datatype_checksum.h"
|
||||
#include "opal/datatype/opal_datatype_prototypes.h"
|
||||
#include "opal/datatype/opal_convertor_internal.h"
|
||||
#if OPAL_CUDA_SUPPORT
#include "opal/datatype/opal_datatype_cuda.h"
/* Copy BLENGTH bytes from SRC to DST through the convertor's cbmemcpy
 * callback instead of calling memcpy() directly.  CONVERTOR is
 * parenthesized so that any pointer expression (e.g. &conv) binds
 * correctly to the -> operator. */
#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH) )
#endif
|
||||
|
||||
extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor,
|
||||
int starting_point, const int* sizes );
|
||||
@ -48,6 +54,9 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
|
||||
convertor->partial_length = 0;
|
||||
convertor->remoteArch = opal_local_arch;
|
||||
convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
convertor->cbmemcpy = &memcpy;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -235,7 +244,11 @@ int32_t opal_convertor_pack( opal_convertor_t* pConv,
|
||||
if( OPAL_LIKELY(NULL == iov[i].iov_base) )
|
||||
iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
|
||||
else
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
|
||||
#else
|
||||
MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
|
||||
#endif
|
||||
pending_length -= iov[i].iov_len;
|
||||
base_pointer += iov[i].iov_len;
|
||||
}
|
||||
@ -248,7 +261,11 @@ complete_contiguous_data_pack:
|
||||
if( OPAL_LIKELY(NULL == iov[i].iov_base) )
|
||||
iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
|
||||
else
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
|
||||
#else
|
||||
MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
|
||||
#endif
|
||||
pConv->bConverted = pConv->local_size;
|
||||
*out_size = i + 1;
|
||||
pConv->flags |= CONVERTOR_COMPLETED;
|
||||
@ -282,7 +299,11 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,
|
||||
if( iov[i].iov_len >= pending_length ) {
|
||||
goto complete_contiguous_data_unpack;
|
||||
}
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
|
||||
#else
|
||||
MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
|
||||
#endif
|
||||
pending_length -= iov[i].iov_len;
|
||||
base_pointer += iov[i].iov_len;
|
||||
}
|
||||
@ -292,7 +313,11 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,
|
||||
|
||||
complete_contiguous_data_unpack:
|
||||
iov[i].iov_len = pending_length;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
|
||||
#else
|
||||
MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
|
||||
#endif
|
||||
pConv->bConverted = pConv->local_size;
|
||||
*out_size = i + 1;
|
||||
pConv->flags |= CONVERTOR_COMPLETED;
|
||||
@ -519,6 +544,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
|
||||
/* Here I should check that the data is not overlapping */
|
||||
|
||||
convertor->flags |= CONVERTOR_RECV;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_cuda_convertor_init(convertor, pUserBuf);
|
||||
#endif
|
||||
|
||||
OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
|
||||
|
||||
@ -555,6 +583,9 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
|
||||
const void* pUserBuf )
|
||||
{
|
||||
convertor->flags |= CONVERTOR_SEND;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
mca_cuda_convertor_init(convertor, pUserBuf);
|
||||
#endif
|
||||
|
||||
OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
|
||||
|
||||
@ -623,6 +654,9 @@ int opal_convertor_clone( const opal_convertor_t* source,
|
||||
destination->bConverted = source->bConverted;
|
||||
destination->stack_pos = source->stack_pos;
|
||||
}
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
destination->cbmemcpy = source->cbmemcpy;
|
||||
#endif
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -34,6 +34,9 @@
|
||||
#ifdef HAVE_NET_UIO_H
|
||||
#include <net/uio.h>
|
||||
#endif
|
||||
#if HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/constants.h"
|
||||
#include "opal/datatype/opal_datatype.h"
|
||||
@ -51,6 +54,7 @@ BEGIN_C_DECLS
|
||||
#define CONVERTOR_HOMOGENEOUS 0x00080000
|
||||
#define CONVERTOR_NO_OP 0x00100000
|
||||
#define CONVERTOR_WITH_CHECKSUM 0x00200000
|
||||
#define CONVERTOR_CUDA 0x00400000
|
||||
#define CONVERTOR_TYPE_MASK 0x00FF0000
|
||||
#define CONVERTOR_STATE_START 0x01000000
|
||||
#define CONVERTOR_STATE_COMPLETE 0x02000000
|
||||
@ -65,6 +69,7 @@ typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor,
|
||||
uint32_t* out_size,
|
||||
size_t* max_data );
|
||||
typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata );
|
||||
typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n );
|
||||
|
||||
/* The master convertor struct (defined in convertor_internal.h) */
|
||||
struct opal_convertor_master_t;
|
||||
@ -109,6 +114,9 @@ struct opal_convertor_t {
|
||||
dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */
|
||||
/* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */
|
||||
#endif
|
||||
/* size: 248, cachelines: 4, members: 20 */
|
||||
/* last cacheline: 56 bytes */
|
||||
};
|
||||
@ -156,6 +164,9 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor )
|
||||
convertor->pStack = convertor->static_stack;
|
||||
convertor->stack_size = DT_STATIC_STACK_SIZE;
|
||||
}
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
convertor->cbmemcpy = &memcpy;
|
||||
#endif
|
||||
convertor->pDesc = NULL;
|
||||
convertor->stack_pos = 0;
|
||||
convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
|
||||
@ -176,6 +187,9 @@ static inline int32_t opal_convertor_need_buffers( const opal_convertor_t* pConv
|
||||
{
|
||||
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
if (OPAL_UNLIKELY(0 == (pConvertor->flags & CONVERTOR_HOMOGENEOUS))) return 1;
|
||||
#endif
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
if( pConvertor->flags & CONVERTOR_CUDA ) return 1;
|
||||
#endif
|
||||
if( pConvertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS ) return 0;
|
||||
if( (pConvertor->count == 1) && (pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) return 0;
|
||||
|
@ -11,6 +11,7 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -71,6 +72,31 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
|
||||
#define MEM_OP MEMMOVE
|
||||
#include "opal_datatype_copy.h"
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
#include "opal_datatype_cuda.h"
|
||||
|
||||
#undef MEM_OP_NAME
|
||||
#define MEM_OP_NAME non_overlap_cuda
|
||||
#undef MEM_OP
|
||||
#define MEM_OP opal_cuda_memcpy
|
||||
#include "opal_datatype_copy.h"
|
||||
|
||||
#undef MEM_OP_NAME
|
||||
#define MEM_OP_NAME overlap_cuda
|
||||
#undef MEM_OP
|
||||
#define MEM_OP opal_cuda_memmove
|
||||
#include "opal_datatype_copy.h"
|
||||
|
||||
/* Replace the copy routine stored in `fct` with `copy_function` when
 * `cuda_device_bufs` is true; leave `fct` untouched otherwise.  Every
 * argument is parenthesized so that compound expressions expand safely,
 * and the do/while(0) wrapper keeps the macro usable as a single
 * statement (including in an unbraced if/else). */
#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function)     \
    do {                                                            \
        if (cuda_device_bufs) {                                     \
            (fct) = (copy_function);                                \
        }                                                           \
    } while(0)
|
||||
#else
|
||||
/* Variant used when OPAL_CUDA_SUPPORT is off: expands to nothing, so the
 * previously selected copy routine is kept. */
#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function)
|
||||
#endif
|
||||
|
||||
int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count,
|
||||
char* destination_base, char* source_base )
|
||||
{
|
||||
@ -78,6 +104,10 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
|
||||
size_t iov_len_local;
|
||||
int32_t (*fct)( const opal_datatype_t*, int32_t, char*, char*);
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
bool cuda_device_bufs = opal_cuda_check_bufs(destination_base, source_base);
|
||||
#endif
|
||||
|
||||
DO_DEBUG( opal_output( 0, "opal_datatype_copy_content_same_ddt( %p, %d, dst %p, src %p )\n",
|
||||
(void*)datatype, count, destination_base, source_base ); );
|
||||
|
||||
@ -95,15 +125,18 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
|
||||
extent = (datatype->true_ub - datatype->true_lb) + (count - 1) * (datatype->ub - datatype->lb);
|
||||
|
||||
fct = non_overlap_copy_content_same_ddt;
|
||||
SET_CUDA_COPY_FCT(cuda_device_bufs, fct, non_overlap_cuda_copy_content_same_ddt);
|
||||
if( destination_base < source_base ) {
|
||||
if( (destination_base + extent) > source_base ) {
|
||||
/* memmove */
|
||||
fct = overlap_copy_content_same_ddt;
|
||||
SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
|
||||
}
|
||||
} else {
|
||||
if( (source_base + extent) > destination_base ) {
|
||||
/* memmove */
|
||||
fct = overlap_copy_content_same_ddt;
|
||||
SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
|
||||
}
|
||||
}
|
||||
return fct( datatype, count, destination_base, source_base );
|
||||
|
154
opal/datatype/opal_datatype_cuda.c
Исполняемый файл
154
opal/datatype/opal_datatype_cuda.c
Исполняемый файл
@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <cuda.h>
|
||||
|
||||
#include "opal/align.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/datatype/opal_datatype_cuda.h"
|
||||
|
||||
static bool initialized = false;
|
||||
static int opal_cuda_verbose;
|
||||
static int opal_cuda_output = 0;
|
||||
static void opal_cuda_support_init(void);
|
||||
|
||||
/**
 * Prepare a convertor for a user buffer that may be CUDA device memory.
 *
 * Runs the one-time CUDA support initialization on first use, then asks
 * the driver for the memory type of pUserBuf.  When the buffer is device
 * memory, the convertor's copy callback is switched to opal_cuda_memcpy
 * and CONVERTOR_CUDA is set in its flags.  When the query fails or the
 * buffer is host memory, the convertor is left untouched.
 *
 * @param convertor  Convertor to configure.
 * @param pUserBuf   User buffer the convertor will operate on.
 */
void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
{
    CUmemorytype mem_kind;
    CUresult status;

    if (!initialized) {
        opal_cuda_support_init();
    }

    status = cuPointerGetAttribute(&mem_kind,
                                   CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                   (CUdeviceptr)pUserBuf);

    /* A failed query is treated like host memory: nothing to set up.
     * The short-circuit keeps mem_kind from being read after a failure. */
    if (CUDA_SUCCESS != status || CU_MEMORYTYPE_HOST == mem_kind) {
        return;
    }

    /* Anything that is not host memory is expected to be device memory */
    assert(CU_MEMORYTYPE_DEVICE == mem_kind);

    convertor->flags |= CONVERTOR_CUDA;
    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;
}
|
||||
|
||||
/* Checks the type of pointer
|
||||
*
|
||||
* @param dest One pointer to check
|
||||
* @param source Another pointer to check
|
||||
*/
|
||||
bool opal_cuda_check_bufs(char *dest, char *src)
|
||||
{
|
||||
int res;
|
||||
CUmemorytype memType;
|
||||
|
||||
res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)dest);
|
||||
if( memType == CU_MEMORYTYPE_DEVICE){
|
||||
return true;
|
||||
}
|
||||
res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)src);
|
||||
if( memType == CU_MEMORYTYPE_DEVICE){
|
||||
return true;
|
||||
}
|
||||
/* Assuming it is a host pointer for all other situations */
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Need intermediate cuMemcpy function so we can check the return code
|
||||
* of the call. If we see an error, abort as there is no recovery at
|
||||
* this point.
|
||||
*/
|
||||
void *opal_cuda_memcpy(void *dest, void *src, size_t size)
|
||||
{
|
||||
int res;
|
||||
res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
|
||||
res, dest, src, (int)size);
|
||||
abort();
|
||||
} else {
|
||||
return dest;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* In some cases, need an implementation of memmove. This is not fast, but
|
||||
* it is not often needed.
|
||||
*/
|
||||
void *opal_cuda_memmove(void *dest, void *src, size_t size)
|
||||
{
|
||||
CUdeviceptr tmp;
|
||||
int res;
|
||||
|
||||
res = cuMemAlloc(&tmp,size);
|
||||
res = cuMemcpy(tmp, (CUdeviceptr) src, size);
|
||||
if(res != CUDA_SUCCESS){
|
||||
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
|
||||
res, (void *)tmp, src, (int)size);
|
||||
abort();
|
||||
}
|
||||
res = cuMemcpy((CUdeviceptr) dest, tmp, size);
|
||||
if(res != CUDA_SUCCESS){
|
||||
opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
|
||||
res, dest, (void *)tmp, (int)size);
|
||||
abort();
|
||||
}
|
||||
cuMemFree(tmp);
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function gets called once to check if the program is running in a cuda
|
||||
* environment.
|
||||
*/
|
||||
static void opal_cuda_support_init(void)
|
||||
{
|
||||
int id;
|
||||
CUresult res;
|
||||
CUcontext cuContext;
|
||||
|
||||
if (initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Set different levels of verbosity in the cuda related code. */
|
||||
id = mca_base_param_reg_int_name("opal", "cuda_verbose",
|
||||
"Set level of opal cuda verbosity",
|
||||
false, false, 0, &opal_cuda_verbose);
|
||||
opal_cuda_output = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);
|
||||
|
||||
/* Check to see if this process is running in a CUDA context. If so,
|
||||
* all is good. Currently, just print out a message in verbose mode
|
||||
* to help with debugging. */
|
||||
res = cuCtxGetCurrent(&cuContext);
|
||||
if (CUDA_SUCCESS != res) {
|
||||
opal_output_verbose(10, opal_cuda_output,
|
||||
"CUDA: cuCtxGetCurrent failed, CUDA device pointers will not work");
|
||||
} else {
|
||||
opal_output_verbose(10, opal_cuda_output,
|
||||
"CUDA: cuCtxGetCurrent succeeded, CUDA device pointers will work");
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
}
|
20
opal/datatype/opal_datatype_cuda.h
Исполняемый файл
20
opal/datatype/opal_datatype_cuda.h
Исполняемый файл
@ -0,0 +1,20 @@
|
||||
/*
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef _OPAL_DATATYPE_CUDA_H
#define _OPAL_DATATYPE_CUDA_H

#include "cuda.h"

/* Brings opal_convertor_t into scope so that this header can be included
 * on its own.
 * NOTE(review): bool and size_t are presumably provided through the same
 * include chain - confirm. */
#include "opal/datatype/opal_convertor.h"

/**
 * Inspect pUserBuf and, when it is CUDA device memory, switch the
 * convertor's copy callback to opal_cuda_memcpy and set CONVERTOR_CUDA in
 * its flags.  Host buffers leave the convertor unchanged.
 */
void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);

/**
 * Return true when dest or src refers to CUDA device memory.
 */
bool opal_cuda_check_bufs(char *dest, char *src);

/**
 * memcpy()-shaped wrapper around cuMemcpy().  Returns dest; aborts the
 * process if the copy fails.
 */
void* opal_cuda_memcpy(void * dest, void * src, size_t size);

/**
 * memmove()-shaped copy that stages the data through a temporary
 * cuMemAlloc() buffer.  Returns dest; aborts the process if a copy fails.
 */
void* opal_cuda_memmove(void * dest, void * src, size_t size);

#endif
|
@ -4,6 +4,7 @@
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -21,6 +22,13 @@
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
/* Make use of existing macro to do CUDA style memcpy: when checksums are
 * not being computed, route the copy through the convertor's cbmemcpy
 * callback.  CONVERTOR is parenthesized so that any pointer expression
 * binds correctly to the -> operator. */
#undef MEMCPY_CSUM
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH) )
#endif
|
||||
|
||||
static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
|
||||
dt_elem_desc_t* ELEM,
|
||||
uint32_t* COUNT,
|
||||
|
@ -11,6 +11,7 @@
|
||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2009 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -318,8 +319,16 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
|
||||
memset( temporary, unused_byte, data_length );
|
||||
MEMCPY( temporary + start_position, partial_data, (end_position - start_position) );
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
/* In the case where the data is being unpacked from device
|
||||
* memory, need to use the special host to device memory copy.
|
||||
* Note this code path was only seen on large receives of
|
||||
* noncontiguous data via buffered sends. */
|
||||
pConvertor->cbmemcpy(saved_data, real_data, data_length );
|
||||
#else
|
||||
/* Save the content of the user memory */
|
||||
MEMCPY( saved_data, real_data, data_length );
|
||||
#endif
|
||||
|
||||
/* Then unpack the data into the user memory */
|
||||
UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
|
||||
@ -331,10 +340,25 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
|
||||
/* For every occurrence of the unused byte move data from the saved
|
||||
* buffer back into the user memory.
|
||||
*/
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
/* Need to copy the modified real_data again so we can see which
|
||||
* bytes need to be converted back to their original values. Note
|
||||
* this code path was only seen on large receives of noncontiguous
|
||||
* data via buffered sends. */
|
||||
{
|
||||
char resaved_data[16];
|
||||
pConvertor->cbmemcpy(resaved_data, real_data, data_length );
|
||||
for( i = 0; i < data_length; i++ ) {
|
||||
if( unused_byte == resaved_data[i] )
|
||||
pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1);
|
||||
}
|
||||
}
|
||||
#else
|
||||
for( i = 0; i < data_length; i++ ) {
|
||||
if( unused_byte == real_data[i] )
|
||||
real_data[i] = saved_data[i];
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -21,6 +22,13 @@
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
/* Make use of existing macro to do CUDA style memcpy: when checksums are
 * not being computed, route the copy through the convertor's cbmemcpy
 * callback.  CONVERTOR is parenthesized so that any pointer expression
 * binds correctly to the -> operator. */
#undef MEMCPY_CSUM
#define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
    (CONVERTOR)->cbmemcpy( (DST), (SRC), (BLENGTH) )
#endif
|
||||
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user