1
1

Rename a few things for clarity. Add a stream.

This commit was SVN r26447.
Этот коммит содержится в:
Rolf vandeVaart 2012-05-17 18:10:59 +00:00
родитель 5fb48bafda
Коммит f8ace21366
2 изменённых файлов: 68 добавлений и 52 удалений

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California. * Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -47,6 +47,7 @@ static bool mca_common_cuda_enabled = false;
static bool mca_common_cuda_register_memory = true; static bool mca_common_cuda_register_memory = true;
static bool mca_common_cuda_warning = true; static bool mca_common_cuda_warning = true;
static opal_list_t common_cuda_memory_registrations; static opal_list_t common_cuda_memory_registrations;
static CUstream ipcStream;
/* Structure to hold memory registrations that are delayed until first /* Structure to hold memory registrations that are delayed until first
* call to send or receive a GPU pointer */ * call to send or receive a GPU pointer */
@ -66,21 +67,21 @@ OBJ_CLASS_INSTANCE( common_cuda_mem_regs_t,
#if OMPI_CUDA_SUPPORT_41 #if OMPI_CUDA_SUPPORT_41
static int mca_common_cuda_async = 1; static int mca_common_cuda_async = 1;
/* Array of CUDA events to be queried */ /* Array of CUDA events to be queried for IPC stream */
CUevent *cuda_event_status_array; CUevent *cuda_event_ipc_array;
/* Array of fragments currently being moved by cuda async non-blocking /* Array of fragments currently being moved by cuda async non-blocking
* operations */ * operations */
struct mca_btl_base_descriptor_t **cuda_event_frag_array; struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;
/* First free/available location in cuda_event_status_array */ /* First free/available location in cuda_event_status_array */
int cuda_event_status_first_avail; int cuda_event_ipc_first_avail;
/* First currently-being used location in the cuda_event_status_array */ /* First currently-being used location in the cuda_event_status_array */
int cuda_event_status_first_used; int cuda_event_ipc_first_used;
/* Number of status items currently in use */ /* Number of status items currently in use */
int cuda_event_status_num_used; int cuda_event_ipc_num_used;
/* Size of array holding events */ /* Size of array holding events */
int cuda_event_max = 200; int cuda_event_max = 200;
@ -195,14 +196,14 @@ static int mca_common_cuda_init(void)
#if OMPI_CUDA_SUPPORT_41 #if OMPI_CUDA_SUPPORT_41
if (true == mca_common_cuda_enabled) { if (true == mca_common_cuda_enabled) {
/* Set up an array to store outstanding async copy events */ /* Set up an array to store outstanding async copy events */
cuda_event_status_array = NULL; cuda_event_ipc_array = NULL;
cuda_event_frag_array = NULL; cuda_event_ipc_frag_array = NULL;
cuda_event_status_num_used = 0; cuda_event_ipc_num_used = 0;
cuda_event_status_first_avail = 0; cuda_event_ipc_first_avail = 0;
cuda_event_status_first_used = 0; cuda_event_ipc_first_used = 0;
cuda_event_status_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max); cuda_event_ipc_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
if (NULL == cuda_event_status_array) { if (NULL == cuda_event_ipc_array) {
orte_show_help("help-mpi-common-cuda.txt", "No memory", orte_show_help("help-mpi-common-cuda.txt", "No memory",
true, errno, strerror(errno)); true, errno, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
@ -210,7 +211,7 @@ static int mca_common_cuda_init(void)
/* Create the events since they can be reused. */ /* Create the events since they can be reused. */
for (i = 0; i < cuda_event_max; i++) { for (i = 0; i < cuda_event_max; i++) {
res = cuEventCreate(&cuda_event_status_array[i], CU_EVENT_DISABLE_TIMING); res = cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) { if (CUDA_SUCCESS != res) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res); true, res);
@ -220,9 +221,9 @@ static int mca_common_cuda_init(void)
/* The first available status index is 0. Make an empty frag /* The first available status index is 0. Make an empty frag
array. */ array. */
cuda_event_frag_array = (struct mca_btl_base_descriptor_t **) cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max); malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
if (NULL == cuda_event_frag_array) { if (NULL == cuda_event_ipc_frag_array) {
orte_show_help("help-mpi-common-cuda.txt", "No memory", orte_show_help("help-mpi-common-cuda.txt", "No memory",
true, errno, strerror(errno)); true, errno, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
@ -254,6 +255,14 @@ static int mca_common_cuda_init(void)
OBJ_RELEASE(mem_reg); OBJ_RELEASE(mem_reg);
} }
/* Create stream for use in ipc asynchronous copies */
res = cuStreamCreate(&ipcStream, 0);
if (res != CUDA_SUCCESS) {
orte_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
return OMPI_ERROR;
}
opal_output_verbose(30, mca_common_cuda_output, opal_output_verbose(30, mca_common_cuda_output,
"CUDA: initialized"); "CUDA: initialized");
common_cuda_initialized = true; common_cuda_initialized = true;
@ -395,11 +404,13 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
cuda_reg->base.bound = (unsigned char *)pbase + psize - 1; cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle)); memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle));
/* Need to record the event to ensure that any memcopies /* Need to record the event to ensure that any memcopies into the
* into the device memory have completed. The event handle * device memory have completed. The event handle associated with
* associated with this event is sent to the remote process * this event is sent to the remote process so that it will wait
* so that it will wait on this event prior to copying data * on this event prior to copying data out of the device memory.
* out of the device memory. */ * Note that this needs to be the NULL stream to make since it is
* unknown what stream any copies into the device memory were done
* with. */
result = cuEventRecord((CUevent)cuda_reg->event, 0); result = cuEventRecord((CUevent)cuda_reg->event, 0);
if (CUDA_SUCCESS != result) { if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
@ -581,7 +592,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
/* First make sure there is room to store the event. If not, then /* First make sure there is room to store the event. If not, then
* return an error. The error message will tell the user to try and * return an error. The error message will tell the user to try and
* run again, but with a larger array for storing events. */ * run again, but with a larger array for storing events. */
if (cuda_event_status_num_used == cuda_event_max) { if (cuda_event_ipc_num_used == cuda_event_max) {
orte_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles", orte_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
true, cuda_event_max, cuda_event_max+100, cuda_event_max+100); true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
@ -590,7 +601,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
/* This is the standard way to run. Running with synchronous copies is available /* This is the standard way to run. Running with synchronous copies is available
* to measure the advantages of asynchronous copies. */ * to measure the advantages of asynchronous copies. */
if (OPAL_LIKELY(mca_common_cuda_async)) { if (OPAL_LIKELY(mca_common_cuda_async)) {
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0); result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
if (CUDA_SUCCESS != result) { if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dst, src, amount, result); true, dst, src, amount, result);
@ -600,25 +611,25 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d", "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
dst, src, (int)amount); dst, src, (int)amount);
} }
result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0); result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) { if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result); true, result);
return OMPI_ERROR; return OMPI_ERROR;
} }
cuda_event_frag_array[cuda_event_status_first_avail] = frag; cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
/* Bump up the first available slot and number used by 1 */ /* Bump up the first available slot and number used by 1 */
cuda_event_status_first_avail++; cuda_event_ipc_first_avail++;
if (cuda_event_status_first_avail >= cuda_event_max) { if (cuda_event_ipc_first_avail >= cuda_event_max) {
cuda_event_status_first_avail = 0; cuda_event_ipc_first_avail = 0;
} }
cuda_event_status_num_used++; cuda_event_ipc_num_used++;
*done = 0; *done = 0;
} else { } else {
/* Mimic the async function so they use the same memcpy call. */ /* Mimic the async function so they use the same memcpy call. */
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0); result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
if (CUDA_SUCCESS != result) { if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed", orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dst, src, amount, result); true, dst, src, amount, result);
@ -630,23 +641,23 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
} }
/* Record an event, then wait for it to complete with calls to cuEventQuery */ /* Record an event, then wait for it to complete with calls to cuEventQuery */
result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0); result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) { if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed", orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result); true, result);
return OMPI_ERROR; return OMPI_ERROR;
} }
cuda_event_frag_array[cuda_event_status_first_avail] = frag; cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
/* Bump up the first available slot and number used by 1 */ /* Bump up the first available slot and number used by 1 */
cuda_event_status_first_avail++; cuda_event_ipc_first_avail++;
if (cuda_event_status_first_avail >= cuda_event_max) { if (cuda_event_ipc_first_avail >= cuda_event_max) {
cuda_event_status_first_avail = 0; cuda_event_ipc_first_avail = 0;
} }
cuda_event_status_num_used++; cuda_event_ipc_num_used++;
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]); result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) { if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
true, result); true, result);
@ -658,7 +669,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
if (0 == (iter % 10)) { if (0 == (iter % 10)) {
opal_output(-1, "EVENT NOT DONE (iter=%d)", iter); opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
} }
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]); result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) { if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed", orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
true, result); true, result);
@ -667,10 +678,10 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
iter++; iter++;
} }
--cuda_event_status_num_used; --cuda_event_ipc_num_used;
++cuda_event_status_first_used; ++cuda_event_ipc_first_used;
if (cuda_event_status_first_used >= cuda_event_max) { if (cuda_event_ipc_first_used >= cuda_event_max) {
cuda_event_status_first_used = 0; cuda_event_ipc_first_used = 0;
} }
*done = 1; *done = 1;
} }
@ -685,12 +696,12 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) { int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
CUresult result; CUresult result;
if (cuda_event_status_num_used > 0) { if (cuda_event_ipc_num_used > 0) {
opal_output_verbose(20, mca_common_cuda_output, opal_output_verbose(20, mca_common_cuda_output,
"CUDA: progress_one_cuda_event, outstanding_events=%d", "CUDA: progress_one_cuda_event, outstanding_events=%d",
cuda_event_status_num_used); cuda_event_ipc_num_used);
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]); result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
/* We found an event that is not ready, so return. */ /* We found an event that is not ready, so return. */
if (CUDA_ERROR_NOT_READY == result) { if (CUDA_ERROR_NOT_READY == result) {
@ -705,15 +716,15 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
return OMPI_ERROR; return OMPI_ERROR;
} }
*frag = cuda_event_frag_array[cuda_event_status_first_used]; *frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
opal_output_verbose(10, mca_common_cuda_output, opal_output_verbose(10, mca_common_cuda_output,
"CUDA: cuEventQuery returned %d", result); "CUDA: cuEventQuery returned %d", result);
/* Bump counters, loop around the circular buffer if necessary */ /* Bump counters, loop around the circular buffer if necessary */
--cuda_event_status_num_used; --cuda_event_ipc_num_used;
++cuda_event_status_first_used; ++cuda_event_ipc_first_used;
if (cuda_event_status_first_used >= cuda_event_max) { if (cuda_event_ipc_first_used >= cuda_event_max) {
cuda_event_status_first_used = 0; cuda_event_ipc_first_used = 0;
} }
/* A return value of 1 indicates an event completed and a frag was returned */ /* A return value of 1 indicates an event completed and a frag was returned */
return 1; return 1;

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*- # -*- text -*-
# #
# Copyright (c) 2011 NVIDIA. All rights reserved. # Copyright (c) 2011-2012 NVIDIA. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
# Additional copyrights may follow # Additional copyrights may follow
@ -135,3 +135,8 @@ cause the program to abort.
cuEventDestory return value: %d cuEventDestory return value: %d
Check the cuda.h file for what the return value means. Check the cuda.h file for what the return value means.
# #
[cuStreamCreate failed]
The call to cuStreamCreate failed. This is a unrecoverable error and will
cause the program to abort.
cuStreamCreate return value: %d
Check the cuda.h file for what the return vale means.