diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h
index ea1546393a..878fe724c0 100644
--- a/opal/datatype/opal_convertor.h
+++ b/opal/datatype/opal_convertor.h
@@ -11,6 +11,7 @@
  * Copyright (c) 2004-2006 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -45,11 +46,12 @@ BEGIN_C_DECLS
 #define CONVERTOR_WITH_CHECKSUM   0x00200000
 #define CONVERTOR_CUDA            0x00400000
 #define CONVERTOR_CUDA_ASYNC      0x00800000
-#define CONVERTOR_TYPE_MASK       0x00FF0000
+#define CONVERTOR_TYPE_MASK       0x10FF0000
 #define CONVERTOR_STATE_START     0x01000000
 #define CONVERTOR_STATE_COMPLETE  0x02000000
 #define CONVERTOR_STATE_ALLOC     0x04000000
 #define CONVERTOR_COMPLETED       0x08000000
+#define CONVERTOR_CUDA_UNIFIED    0x10000000

 union dt_elem_desc;
 typedef struct opal_convertor_t opal_convertor_t;
@@ -177,7 +179,7 @@ static inline int32_t opal_convertor_need_buffers( const opal_convertor_t* pConv
     if (OPAL_UNLIKELY(0 == (pConvertor->flags & CONVERTOR_HOMOGENEOUS))) return 1;
 #endif
 #if OPAL_CUDA_SUPPORT
-    if( pConvertor->flags & CONVERTOR_CUDA ) return 1;
+    if( pConvertor->flags & (CONVERTOR_CUDA | CONVERTOR_CUDA_UNIFIED)) return 1;
 #endif
     if( pConvertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS ) return 0;
     if( (pConvertor->count == 1) && (pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) return 0;
diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c
index 3d51a2201d..d62af3fa8b 100644
--- a/opal/datatype/opal_datatype_cuda.c
+++ b/opal/datatype/opal_datatype_cuda.c
@@ -57,7 +57,7 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
         return;
     }

-    if (ftable.gpu_is_gpu_buffer(pUserBuf)) {
+    if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
         convertor->flags |= CONVERTOR_CUDA;
     }
 }
@@ -78,7 +78,7 @@ bool opal_cuda_check_bufs(char *dest, char *src)
         return false;
     }

-    if (ftable.gpu_is_gpu_buffer(dest) || ftable.gpu_is_gpu_buffer(src)) {
+    if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) {
         return true;
     } else {
         return false;
diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h
index acce495962..676af80273 100644
--- a/opal/datatype/opal_datatype_cuda.h
+++ b/opal/datatype/opal_datatype_cuda.h
@@ -14,7 +14,7 @@
  * common cuda code is initialized.  This removes any dependency on <cuda.h>
  * in the opal cuda datatype code. */
 struct opal_common_cuda_function_table {
-    int (*gpu_is_gpu_buffer)(const void*);
+    int (*gpu_is_gpu_buffer)(const void*, opal_convertor_t*);
     int (*gpu_cu_memcpy_async)(void*, const void*, size_t, opal_convertor_t*);
     int (*gpu_cu_memcpy)(void*, const void*, size_t);
     int (*gpu_memmove)(void*, void*, size_t);
diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index 6209c6b21b..a56a9b5c09 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -127,7 +127,7 @@ static opal_mutex_t common_cuda_dtoh_lock;
 static opal_mutex_t common_cuda_ipc_lock;

 /* Functions called by opal layer - plugged into opal function table */
-static int mca_common_cuda_is_gpu_buffer(const void*);
+static int mca_common_cuda_is_gpu_buffer(const void*, opal_convertor_t*);
 static int mca_common_cuda_memmove(void*, void*, size_t);
 static int mca_common_cuda_cu_memcpy_async(void*, const void*, size_t, opal_convertor_t*);
 static int mca_common_cuda_cu_memcpy(void*, const void*, size_t);
@@ -1700,7 +1700,7 @@ static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
 #endif /* OPAL_CUDA_SUPPORT_41 */

 /* Routines that get plugged into the opal datatype code */
-static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
+static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor)
 {
     int res;
     CUmemorytype memType = 0;
@@ -1715,6 +1715,15 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
     void *attrdata[] = {(void *)&memType, (void *)&ctx, (void *)&isManaged};

     res = cuFunc.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
+
+    /* Mark unified memory buffers with a flag.  This will allow all unified
+     * memory to be forced through host buffers.  Note that this memory can
+     * be either host or device, so we need to set this flag prior to that check. */
+    if (1 == isManaged) {
+        if (NULL != convertor) {
+            convertor->flags |= CONVERTOR_CUDA_UNIFIED;
+        }
+    }
     if (res != CUDA_SUCCESS) {
         /* If we cannot determine it is a device pointer,
          * just assume it is not. */
@@ -1779,15 +1788,6 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf)
         }
     }

-#if OPAL_CUDA_GET_ATTRIBUTES
-    if (1 == isManaged) {
-        /* Currently cannot support managed memory */
-        opal_output(0, "CUDA: ptr=%p: CUDA-aware Open MPI detected managed memory but there "
-                    "is no support for it.  Result will be unpredictable.", pUserBuf);
-        return OPAL_ERROR;
-    }
-#endif /* OPAL_CUDA_GET_ATTRIBUTES */
-
     /* First access on a device pointer finalizes CUDA support initialization.
      * If initialization fails, disable support. */
     if (!stage_three_init_complete) {
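
Note on the detection pattern (commentary, not part of the patch): the new CONVERTOR_CUDA_UNIFIED flag relies on cuPointerGetAttributes() reporting the memory type, the owning context, and CU_POINTER_ATTRIBUTE_IS_MANAGED in a single call. Because a managed allocation may be resident on either the host or the device at any given moment, the flag is set before the memory-type check, and opal_convertor_need_buffers() then forces such buffers through host staging. The following is a minimal standalone sketch of the same query, assuming a CUDA driver API that provides cuPointerGetAttributes() (CUDA 7.0 or later); the helper name buffer_is_managed and the main() scaffolding are hypothetical, not Open MPI code. Build with e.g. cc detect_managed.c -lcuda.

/* Hypothetical standalone sketch of the managed-memory query used in the
 * patch.  Mirrors the single cuPointerGetAttributes() call in
 * mca_common_cuda_is_gpu_buffer(); everything else is illustrative. */
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>

static int buffer_is_managed(const void *buf)
{
    CUmemorytype mem_type = 0;
    CUcontext ctx = NULL;
    unsigned int is_managed = 0;
    CUpointer_attribute attributes[3] = {
        CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
        CU_POINTER_ATTRIBUTE_CONTEXT,
        CU_POINTER_ATTRIBUTE_IS_MANAGED,
    };
    void *attrdata[] = {(void *)&mem_type, (void *)&ctx, (void *)&is_managed};

    /* One call fills all three outputs.  A failure means CUDA does not
     * recognize the pointer, i.e. it is ordinary host memory. */
    if (CUDA_SUCCESS != cuPointerGetAttributes(3, attributes, attrdata,
                                               (CUdeviceptr)(uintptr_t)buf)) {
        return 0;
    }
    return (1 == is_managed);
}

int main(void)
{
    CUdevice dev;
    CUcontext ctx;
    CUdeviceptr managed;

    /* Bare-bones driver-API setup; real code would check each call. */
    if (CUDA_SUCCESS != cuInit(0) ||
        CUDA_SUCCESS != cuDeviceGet(&dev, 0) ||
        CUDA_SUCCESS != cuCtxCreate(&ctx, 0, dev) ||
        CUDA_SUCCESS != cuMemAllocManaged(&managed, 1024, CU_MEM_ATTACH_GLOBAL)) {
        fprintf(stderr, "CUDA setup failed\n");
        return 1;
    }

    printf("managed: %s\n",
           buffer_is_managed((const void *)(uintptr_t)managed) ? "yes" : "no");

    cuMemFree(managed);
    cuCtxDestroy(ctx);
    return 0;
}

This also shows why opal_cuda_check_bufs() can pass NULL for the convertor: that caller only needs the yes/no answer, so the patch guards the flag update with NULL != convertor.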