diff --git a/ompi/mca/common/cuda/common_cuda.c b/ompi/mca/common/cuda/common_cuda.c index 7ea3c27a7c..4c71894624 100644 --- a/ompi/mca/common/cuda/common_cuda.c +++ b/ompi/mca/common/cuda/common_cuda.c @@ -91,6 +91,9 @@ struct cudaFunctionTable { int (*cuCtxGetDevice)(CUdevice *); int (*cuDeviceCanAccessPeer)(int *, CUdevice, CUdevice); int (*cuDeviceGet)(CUdevice *, int); +#if OMPI_CUDA_SUPPORT_60 + int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr); +#endif /* OMPI_CUDA_SUPPORT_60 */ } cudaFunctionTable; typedef struct cudaFunctionTable cudaFunctionTable_t; cudaFunctionTable_t cuFunc; @@ -446,6 +449,9 @@ int mca_common_cuda_stage_one_init(void) OMPI_CUDA_DLSYM(libcuda_handle, cuCtxGetDevice); OMPI_CUDA_DLSYM(libcuda_handle, cuDeviceCanAccessPeer); OMPI_CUDA_DLSYM(libcuda_handle, cuDeviceGet); +#if OMPI_CUDA_SUPPORT_60 + OMPI_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute); +#endif /* OMPI_CUDA_SUPPORT_60 */ return 0; } @@ -832,6 +838,21 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne cuda_reg->base.bound = (unsigned char *)pbase + psize - 1; memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle)); +#if OMPI_CUDA_SUPPORT_60 + /* With CUDA 6.0, we can set an attribute on the memory pointer that will + * ensure any synchronous copies are completed prior to any other access + * of the memory region. This means we do not need to record an event + * and send to the remote side. + */ + memType = 1; /* Just use this variable since we already have it */ + result = cuFunc.cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)base); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed", + true, result, base); + return OMPI_ERROR; + } +#else /* Need to record the event to ensure that any memcopies into the * device memory have completed. 
The event handle associated with * this event is sent to the remote process so that it will wait @@ -845,6 +866,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne true, result, base); return OMPI_ERROR; } +#endif /* OMPI_CUDA_SUPPORT_60 */ return OMPI_SUCCESS; } @@ -968,6 +990,10 @@ void mca_common_cuda_destruct_event(uint64_t *event) */ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg) { +#if OMPI_CUDA_SUPPORT_60 + /* No need for any of this with CUDA 6.0 */ + return; +#else /* OMPI_CUDA_SUPPORT_60 */ CUipcEventHandle evtHandle; CUevent event; CUresult result; @@ -1005,6 +1031,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg) opal_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed", true, result); } +#endif /* OMPI_CUDA_SUPPORT_60 */ } /* diff --git a/ompi/mca/common/cuda/help-mpi-common-cuda.txt b/ompi/mca/common/cuda/help-mpi-common-cuda.txt index d834ea5705..747ff13807 100644 --- a/ompi/mca/common/cuda/help-mpi-common-cuda.txt +++ b/ompi/mca/common/cuda/help-mpi-common-cuda.txt @@ -172,3 +172,10 @@ could cause incorrect results. Turn of GPU Direct RDMA support by running with --mca btl_openib_cuda_want_gdr_support 0. cuPointerGetAttribute return value: %d Check the cuda.h file for what the return value means. +[cuPointerSetAttribute failed] +The call to cuPointerSetAttribute failed. This is an unrecoverable error and will +cause the program to abort. + cuPointerSetAttribute return value: %d + Address: %p +Check the cuda.h file for what the return value means. +#