From 9f3bf4747d530a5e0f932869a78f481dcabc5441 Mon Sep 17 00:00:00 2001 From: Rolf vandeVaart Date: Thu, 23 Jan 2014 15:47:20 +0000 Subject: [PATCH] Provide option to have synchronous copy be asynchronous with a wait. For now, this has to be selected at runtime. Also fix up some error messages to have node name in them. This commit was SVN r30396. --- ompi/mca/common/cuda/common_cuda.c | 184 +++++++++++++++--- ompi/mca/common/cuda/help-mpi-common-cuda.txt | 12 +- 2 files changed, 165 insertions(+), 31 deletions(-) diff --git a/ompi/mca/common/cuda/common_cuda.c b/ompi/mca/common/cuda/common_cuda.c index 66468ac419..7b1c71195f 100644 --- a/ompi/mca/common/cuda/common_cuda.c +++ b/ompi/mca/common/cuda/common_cuda.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2011-2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -95,6 +95,7 @@ struct cudaFunctionTable { int (*cuPointerSetAttribute)(const void *, CUpointer_attribute, CUdeviceptr); #endif /* OPAL_CUDA_GDR_SUPPORT */ int (*cuCtxSetCurrent)(CUcontext); + int (*cuEventSynchronize)(CUevent); } cudaFunctionTable; typedef struct cudaFunctionTable cudaFunctionTable_t; cudaFunctionTable_t cuFunc; @@ -111,6 +112,8 @@ static opal_list_t common_cuda_memory_registrations; static CUstream ipcStream; static CUstream dtohStream; static CUstream htodStream; +static CUstream memcpyStream; +static CUevent memcpyEvent; /* Functions called by opal layer - plugged into opal function table */ static int mca_common_cuda_is_gpu_buffer(const void*); @@ -135,6 +138,8 @@ OBJ_CLASS_INSTANCE(common_cuda_mem_regs_t, #if OPAL_CUDA_SUPPORT_41 static int mca_common_cuda_async = 1; +static int mca_common_cuda_cumemcpy_async; +static int mca_common_cuda_cumemcpy_timing; /* Array of CUDA events to be queried for IPC stream, sending side and * receiving side. */ @@ -167,7 +172,7 @@ static int cuda_event_htod_most = 0; opal_lt_dlhandle libcuda_handle; #define CUDA_COMMON_TIMING 0 -#if CUDA_COMMON_TIMING +#if CUDA_COMMON_TIMING || OPAL_ENABLE_DEBUG /* Some timing support structures. Enable this to help analyze * internal performance issues. */ static struct timespec ts_start; @@ -292,6 +297,25 @@ int mca_common_cuda_stage_one_init(void) &cuda_event_max); #endif /* OPAL_CUDA_SUPPORT_41 */ + /* Use this flag to test cuMemcpyAsync vs cuMemcpy */ + mca_common_cuda_cumemcpy_async = 0; + (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_async", + "Set to 0 to force CUDA cuMemcpy instead of cuMemcpyAsync/cuEventRecord/cuEventSynchronize", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_common_cuda_cumemcpy_async); + + /* Use this flag to dump out timing of cumempcy sync and async */ + mca_common_cuda_cumemcpy_timing = 0; + (void) mca_base_var_register("ompi", "mpi", "common_cuda", "cumemcpy_timing", + "Set to 1 to dump timing of eager copies", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_common_cuda_cumemcpy_timing); + + mca_common_cuda_output = opal_output_open(NULL); opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose); @@ -463,6 +487,7 @@ int mca_common_cuda_stage_one_init(void) OMPI_CUDA_DLSYM(libcuda_handle, cuPointerSetAttribute); #endif /* OPAL_CUDA_GDR_SUPPORT */ OMPI_CUDA_DLSYM(libcuda_handle, cuCtxSetCurrent); + OMPI_CUDA_DLSYM(libcuda_handle, cuEventSynchronize); return 0; } @@ -559,7 +584,7 @@ static int mca_common_cuda_stage_three_init(void) res = cuFunc.cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING); if (CUDA_SUCCESS != res) { opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", - true, res); + true, ompi_process_info.nodename, res); return OMPI_ERROR; } } @@ -597,7 +622,7 @@ static int mca_common_cuda_stage_three_init(void) res = cuFunc.cuEventCreate(&cuda_event_dtoh_array[i], CU_EVENT_DISABLE_TIMING); if (CUDA_SUCCESS != res) { opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", - true, res); + true, ompi_process_info.nodename, res); return OMPI_ERROR; } } @@ -632,7 +657,7 @@ static int mca_common_cuda_stage_three_init(void) res = cuFunc.cuEventCreate(&cuda_event_htod_array[i], CU_EVENT_DISABLE_TIMING); if (CUDA_SUCCESS != res) { opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", - true, res); + true, ompi_process_info.nodename, res); return OMPI_ERROR; } } @@ -675,7 +700,7 @@ static int mca_common_cuda_stage_three_init(void) res = cuFunc.cuStreamCreate(&ipcStream, 0); if (res != CUDA_SUCCESS) { opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", - true, res); + true, ompi_process_info.nodename, res); return OMPI_ERROR; } @@ -683,7 +708,7 @@ static int mca_common_cuda_stage_three_init(void) res = cuFunc.cuStreamCreate(&dtohStream, 0); if (res != CUDA_SUCCESS) { opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", - true, res); + true, ompi_process_info.nodename, res); return OMPI_ERROR; } @@ -692,11 +717,26 @@ static int mca_common_cuda_stage_three_init(void) res = cuFunc.cuStreamCreate(&htodStream, 0); if (res != CUDA_SUCCESS) { opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", - true, res); + true, ompi_process_info.nodename, res); return OMPI_ERROR; } + /* Create stream for use in cuMemcpyAsync synchronous copies */ + res = cuFunc.cuStreamCreate(&memcpyStream, 0); + if (res != CUDA_SUCCESS) { + opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed", + true, ompi_process_info.nodename, res); + return OMPI_ERROR; + } + /* Create event for use in cuMemcpyAsync synchronous copies */ + res = cuFunc.cuEventCreate(&memcpyEvent, 0); + if (res != CUDA_SUCCESS){ + opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", + true, ompi_process_info.nodename, res); + return OMPI_ERROR; + } + opal_output_verbose(30, mca_common_cuda_output, "CUDA: initialized"); common_cuda_initialized = true; @@ -970,7 +1010,7 @@ void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle) result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); if (CUDA_SUCCESS != result) { opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", - true, result); + true, ompi_process_info.nodename, result); } result = cuFunc.cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event); @@ -1026,7 +1066,7 @@ void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg) result = cuFunc.cuEventRecord(event, 0); if (CUDA_SUCCESS != result) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", - true, result); + true, ompi_process_info.nodename, result); } /* END of Workaround */ @@ -1089,7 +1129,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream); if (CUDA_SUCCESS != result) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", - true, result); + true, ompi_process_info.nodename, result); return OMPI_ERROR; } cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag; @@ -1119,7 +1159,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream); if (CUDA_SUCCESS != result) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", - true, result); + true, ompi_process_info.nodename, result); return OMPI_ERROR; } @@ -1192,7 +1232,7 @@ int mca_common_cuda_record_dtoh_event(char *msg, struct mca_btl_base_descriptor_ result = cuFunc.cuEventRecord(cuda_event_dtoh_array[cuda_event_dtoh_first_avail], dtohStream); if (CUDA_SUCCESS != result) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", - true, result); + true, ompi_process_info.nodename, result); return OMPI_ERROR; } cuda_event_dtoh_frag_array[cuda_event_dtoh_first_avail] = frag; @@ -1236,7 +1276,7 @@ int mca_common_cuda_record_htod_event(char *msg, struct mca_btl_base_descriptor_ result = cuFunc.cuEventRecord(cuda_event_htod_array[cuda_event_htod_first_avail], htodStream); if (CUDA_SUCCESS != result) { opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", - true, result); + true, ompi_process_info.nodename, result); return OMPI_ERROR; } cuda_event_htod_frag_array[cuda_event_htod_first_avail] = frag; @@ -1480,7 +1520,7 @@ static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) { * opal_output(0, "Function took %7.2f usecs\n", accum); * */ -#if CUDA_COMMON_TIMING +#if CUDA_COMMON_TIMING || OPAL_ENABLE_DEBUG static float mydifftime(struct timespec ts_start, struct timespec ts_end) { float seconds; float microseconds; @@ -1581,31 +1621,115 @@ static int mca_common_cuda_cu_memcpy_async(void *dest, const void *src, size_t s (CUstream)convertor->stream); } +/** + * This function is plugged into various areas where a cuMemcpy would be called. + * This is a synchronous operation that will not return until the copy is complete. + */ static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size) { - return cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); + CUresult result; +#if OPAL_ENABLE_DEBUG + CUmemorytype memTypeSrc, memTypeDst; + if (mca_common_cuda_cumemcpy_timing) { + /* Nice to know type of source and destination for timing output. Do + * not care about return code as memory type will just be set to 0 */ + result = cuFunc.cuPointerGetAttribute(&memTypeDst, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)dest); + result = cuFunc.cuPointerGetAttribute(&memTypeSrc, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)src); + clock_gettime(CLOCK_MONOTONIC, &ts_start); + } +#endif + if (mca_common_cuda_cumemcpy_async) { + result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, memcpyStream); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", + true, dest, src, size, result); + return OMPI_ERROR; + } + result = cuFunc.cuEventRecord(memcpyEvent, memcpyStream); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", + true, ompi_process_info.nodename, result); + return 0; + } + result = cuFunc.cuEventSynchronize(memcpyEvent); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventSynchronize failed", + true, ompi_process_info.nodename, result); + return OMPI_ERROR; + } + } else { + result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed", + true, result); + return OMPI_ERROR; + } + } +#if OPAL_ENABLE_DEBUG + if (mca_common_cuda_cumemcpy_timing) { + clock_gettime(CLOCK_MONOTONIC, &ts_end); + accum = mydifftime(ts_start, ts_end); + if (mca_common_cuda_cumemcpy_async) { + opal_output(0, "cuMemcpyAsync took %7.2f usecs (src=%p (%d), dst=%p (%d))\n", + accum, src, memTypeSrc, dest, memTypeDst); + } else { + opal_output(0, "cuMemcpy took %7.2f usecs (src=%p (%d), dst=%p (%d))\n", + accum, src, memTypeSrc, dest, memTypeDst); + } + } +#endif + return OMPI_SUCCESS; } static int mca_common_cuda_memmove(void *dest, void *src, size_t size) { CUdeviceptr tmp; - int res; + int result; - res = cuFunc.cuMemAlloc(&tmp,size); - res = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size); - if(res != CUDA_SUCCESS){ - opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - res, (void *)tmp, src, (int)size); - return res; - } - res = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size); - if(res != CUDA_SUCCESS){ - opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - res, dest, (void *)tmp, (int)size); - return res; + result = cuFunc.cuMemAlloc(&tmp,size); + if (mca_common_cuda_cumemcpy_async) { + result = cuFunc.cuMemcpyAsync(tmp, (CUdeviceptr)src, size, memcpyStream); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", + true, tmp, src, size, result); + return OMPI_ERROR; + } + result = cuFunc.cuMemcpyAsync((CUdeviceptr)dest, tmp, size, memcpyStream); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", + true, dest, tmp, size, result); + return OMPI_ERROR; + } + result = cuFunc.cuEventRecord(memcpyEvent, memcpyStream); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", + true, ompi_process_info.nodename, result); + return OMPI_ERROR; + } + result = cuFunc.cuEventSynchronize(memcpyEvent); + if (CUDA_SUCCESS != result) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventSynchronize failed", + true, ompi_process_info.nodename, result); + return OMPI_ERROR; + } + } else { + result = cuFunc.cuMemcpy(tmp, (CUdeviceptr)src, size); + if(result != CUDA_SUCCESS){ + opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", + result, (void *)tmp, src, (int)size); + return OMPI_ERROR; + } + result = cuFunc.cuMemcpy((CUdeviceptr)dest, tmp, size); + if(result != CUDA_SUCCESS){ + opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", + result, dest, (void *)tmp, (int)size); + return OMPI_ERROR; + } } cuFunc.cuMemFree(tmp); - return 0; + return OMPI_SUCCESS; } int mca_common_cuda_get_device(int *devicenum) diff --git a/ompi/mca/common/cuda/help-mpi-common-cuda.txt b/ompi/mca/common/cuda/help-mpi-common-cuda.txt index 75173a4b1b..c77b45ffee 100644 --- a/ompi/mca/common/cuda/help-mpi-common-cuda.txt +++ b/ompi/mca/common/cuda/help-mpi-common-cuda.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2011-2013 NVIDIA. All rights reserved. +# Copyright (c) 2011-2014 NVIDIA. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -96,12 +96,14 @@ Check the cuda.h file for what the return value means. [cuEventCreate failed] The call to cuEventCreate failed. This is a unrecoverable error and will cause the program to abort. + Hostname: %s cuEventCreate return value: %d Check the cuda.h file for what the return value means. # [cuEventRecord failed] The call to cuEventRecord failed. This is a unrecoverable error and will cause the program to abort. + Hostname: %s cuEventRecord return value: %d Check the cuda.h file for what the return value means. # @@ -138,6 +140,7 @@ Check the cuda.h file for what the return value means. [cuStreamCreate failed] The call to cuStreamCreate failed. This is a unrecoverable error and will cause the program to abort. + Hostname: %s cuStreamCreate return value: %d Check the cuda.h file for what the return vale means. # @@ -182,3 +185,10 @@ continue, but report this error to the Open MPI developers. Address: %p Check the cuda.h file for what the return value means. # +[cuEventSynchronize failed] +The call to cuEventSynchronize failed. This is highly unusual and should +not happen. Please report this error to the Open MPI developers. + Hostname: %s + cuEventSynchronize return value: %d +Check the cuda.h file for what the return value means. +#