Add support for CUDA Unified memory. Basically, add a new flag and disable some
optimizations when that flag is detected. Lightly reviewed by bosilca.
This commit is contained in:
parent 52ed5a9bf8
commit f471b09ae9
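In outline, the change is two moves: the GPU-buffer probe gains a convertor argument so it can tag managed (unified) allocations with a new CONVERTOR_CUDA_UNIFIED flag, and the convertor's need-buffers test is widened so tagged buffers take the same staged, host-buffer path as device memory. A condensed sketch assembled from the hunks below (tag_and_route is a hypothetical name; the real control flow is split across the files touched here):

    /* Hypothetical condensation of the commit's two halves. */
    static void tag_and_route(opal_convertor_t *convertor, const void *pUserBuf)
    {
        /* Detection: may set CONVERTOR_CUDA_UNIFIED on the convertor as a side effect. */
        if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
            convertor->flags |= CONVERTOR_CUDA;
        }
        /* Routing: unified memory is treated like device memory, which disables
         * the zero-copy optimizations guarded by opal_convertor_need_buffers(). */
        if (convertor->flags & (CONVERTOR_CUDA | CONVERTOR_CUDA_UNIFIED)) {
            /* stage through host buffers */
        }
    }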
@@ -11,6 +11,7 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2009      Oak Ridge National Labs. All rights reserved.
+ * Copyright (c) 2014      NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -45,11 +46,12 @@ BEGIN_C_DECLS
 #define CONVERTOR_WITH_CHECKSUM   0x00200000
 #define CONVERTOR_CUDA            0x00400000
 #define CONVERTOR_CUDA_ASYNC      0x00800000
-#define CONVERTOR_TYPE_MASK       0x00FF0000
+#define CONVERTOR_TYPE_MASK       0x10FF0000
 #define CONVERTOR_STATE_START     0x01000000
 #define CONVERTOR_STATE_COMPLETE  0x02000000
 #define CONVERTOR_STATE_ALLOC     0x04000000
 #define CONVERTOR_COMPLETED       0x08000000
+#define CONVERTOR_CUDA_UNIFIED    0x10000000

 union dt_elem_desc;
 typedef struct opal_convertor_t opal_convertor_t;
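The mask edit is plain bit arithmetic: the new flag takes bit 0x10000000, which the old CONVERTOR_TYPE_MASK (0x00FF0000) did not cover, so the mask widens to 0x10FF0000 = 0x00FF0000 | 0x10000000. A compile-time check of that relationship (illustrative only; uses C11 _Static_assert, which the tree itself may not):

    _Static_assert((0x00FF0000 | 0x10000000) == 0x10FF0000,
                   "CONVERTOR_TYPE_MASK must cover the CONVERTOR_CUDA_UNIFIED bit");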
@@ -177,7 +179,7 @@ static inline int32_t opal_convertor_need_buffers( const opal_convertor_t* pConvertor )
     if (OPAL_UNLIKELY(0 == (pConvertor->flags & CONVERTOR_HOMOGENEOUS))) return 1;
 #endif
 #if OPAL_CUDA_SUPPORT
-    if( pConvertor->flags & CONVERTOR_CUDA ) return 1;
+    if( pConvertor->flags & (CONVERTOR_CUDA | CONVERTOR_CUDA_UNIFIED)) return 1;
 #endif
     if( pConvertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS ) return 0;
     if( (pConvertor->count == 1) && (pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) return 0;
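This hunk is where the "disable some optimizations" of the commit message happens: opal_convertor_need_buffers() now returns 1 for unified memory as well, so control never reaches the OPAL_DATATYPE_FLAG_NO_GAPS and contiguous shortcuts below it. A hedged sketch of the caller's perspective (both helper names are hypothetical):

    if (opal_convertor_need_buffers(pConvertor)) {
        pack_through_host_buffer(pConvertor);  /* staged copy; taken for CUDA and unified memory */
    } else {
        send_in_place(pConvertor);             /* zero-copy fast path; now skipped for unified memory */
    }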
@@ -57,7 +57,7 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
         return;
     }

-    if (ftable.gpu_is_gpu_buffer(pUserBuf)) {
+    if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
         convertor->flags |= CONVERTOR_CUDA;
     }
 }
@@ -78,7 +78,7 @@ bool opal_cuda_check_bufs(char *dest, char *src)
         return false;
     }

-    if (ftable.gpu_is_gpu_buffer(dest) || ftable.gpu_is_gpu_buffer(src)) {
+    if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) {
         return true;
     } else {
         return false;
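opal_cuda_check_bufs() has no convertor to tag, so it now passes NULL; the NULL != convertor guard added further down makes that safe. Callers that only want a yes/no probe can do the same:

    ftable.gpu_is_gpu_buffer(ptr, NULL);  /* probe only; no CONVERTOR_CUDA_UNIFIED flag is recorded */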
@@ -14,7 +14,7 @@
  * common cuda code is initialized. This removes any dependency on <cuda.h>
  * in the opal cuda datatype code. */
 struct opal_common_cuda_function_table {
-    int (*gpu_is_gpu_buffer)(const void*);
+    int (*gpu_is_gpu_buffer)(const void*, opal_convertor_t*);
     int (*gpu_cu_memcpy_async)(void*, const void*, size_t, opal_convertor_t*);
     int (*gpu_cu_memcpy)(void*, const void*, size_t);
     int (*gpu_memmove)(void*, void*, size_t);
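For context on why both sides change together: this table is populated at initialization with the mca_common_cuda_* implementations declared in the next hunk, so the function-pointer type and the implementation signature must move in lockstep. A sketch of the wiring implied by the names in this diff (the exact init routine may differ):

    ftable.gpu_is_gpu_buffer   = &mca_common_cuda_is_gpu_buffer;
    ftable.gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async;
    ftable.gpu_cu_memcpy       = &mca_common_cuda_cu_memcpy;
    ftable.gpu_memmove         = &mca_common_cuda_memmove;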
@@ -127,7 +127,7 @@ static opal_mutex_t common_cuda_dtoh_lock;
 static opal_mutex_t common_cuda_ipc_lock;

 /* Functions called by opal layer - plugged into opal function table */
-static int mca_common_cuda_is_gpu_buffer(const void*);
+static int mca_common_cuda_is_gpu_buffer(const void*, opal_convertor_t*);
 static int mca_common_cuda_memmove(void*, void*, size_t);
 static int mca_common_cuda_cu_memcpy_async(void*, const void*, size_t, opal_convertor_t*);
 static int mca_common_cuda_cu_memcpy(void*, const void*, size_t);
@@ -1700,7 +1700,7 @@ static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
 #endif /* OPAL_CUDA_SUPPORT_41 */

 /* Routines that get plugged into the opal datatype code */
-static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
+static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
 {
     int res;
     CUmemorytype memType = 0;
@@ -1715,6 +1715,15 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
     void *attrdata[] = {(void *)&memType, (void *)&ctx, (void *)&isManaged};

     res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
+
+    /* Mark unified memory buffers with a flag. This will allow all unified
+     * memory to be forced through host buffers. Note that this memory can
+     * be either host or device so we need to set this flag prior to that check. */
+    if (1 == isManaged) {
+        if (NULL != convertor) {
+            convertor->flags |= CONVERTOR_CUDA_UNIFIED;
+        }
+    }
     if (res != CUDA_SUCCESS) {
         /* If we cannot determine it is device pointer,
          * just assume it is not. */
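The detection itself leans on the CUDA driver API's cuPointerGetAttributes(), which answers several pointer queries in one call, filling one result slot per requested attribute. A self-contained version of the same query outside Open MPI (error handling reduced to the return code):

    #include <cuda.h>
    #include <stdint.h>

    /* Returns 1 if buf points into CUDA managed (unified) memory, else 0. */
    static int is_unified_memory(const void *buf)
    {
        CUpointer_attribute attributes[3] = {
            CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
            CU_POINTER_ATTRIBUTE_CONTEXT,
            CU_POINTER_ATTRIBUTE_IS_MANAGED,
        };
        CUmemorytype mem_type = 0;
        CUcontext ctx = NULL;
        unsigned int is_managed = 0;
        void *attrdata[3] = { &mem_type, &ctx, &is_managed };

        CUresult res = cuPointerGetAttributes(3, attributes, attrdata,
                                              (CUdeviceptr)(uintptr_t)buf);
        return (CUDA_SUCCESS == res) && (1 == is_managed);
    }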
@@ -1779,15 +1788,6 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
         }
     }

-#if OPAL_CUDA_GET_ATTRIBUTES
-    if (1 == isManaged) {
-        /* Currently cannot support managed memory */
-        opal_output(0, "CUDA: ptr=%p: CUDA-aware Open MPI detected managed memory but there "
-                    "is no support for it. Result will be unpredictable.", pUserBuf);
-        return OPAL_ERROR;
-    }
-#endif /* OPAL_CUDA_GET_ATTRIBUTES */
-
     /* First access on a device pointer finalizes CUDA support initialization.
      * If initialization fails, disable support. */
     if (!stage_three_init_complete) {
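Net effect of this removal: a cudaMallocManaged() buffer no longer trips the "no support" error and can be handed to MPI directly, with the convertor silently staging it through host memory. A minimal usage sketch (CUDA runtime API plus MPI; assumes a CUDA-aware build and at least two ranks):

    #include <mpi.h>
    #include <cuda_runtime.h>

    int main(int argc, char **argv)
    {
        int rank;
        double *buf;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* Unified memory: reachable from both host and device. */
        cudaMallocManaged((void **)&buf, 1024 * sizeof(double), cudaMemAttachGlobal);

        /* With this commit the convertor tags buf CONVERTOR_CUDA_UNIFIED and
         * stages it through host buffers instead of returning OPAL_ERROR. */
        if (0 == rank) {
            MPI_Send(buf, 1024, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
        } else if (1 == rank) {
            MPI_Recv(buf, 1024, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        cudaFree(buf);
        MPI_Finalize();
        return 0;
    }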