
Rename a few things for clarity. Add a stream.

This commit was SVN r26447.
This commit is contained in:
Rolf vandeVaart 2012-05-17 18:10:59 +00:00
parent 5fb48bafda
commit f8ace21366
2 changed files with 68 additions and 52 deletions
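
For context on the one-line summary above: the change gives the CUDA IPC path its own stream (ipcStream in the first file below) instead of queueing copies on the default stream, and renames the event-tracking variables from cuda_event_status_* to cuda_event_ipc_* to match. A minimal sketch of the resulting copy pattern, assuming the CUDA driver API (#include <cuda.h>) and an already-created stream and event; names and error handling here are illustrative, not the Open MPI code:

#include <stddef.h>
#include <cuda.h>

/* Sketch only: queue a copy and its completion event on a dedicated stream
 * rather than the default stream. */
static int queue_ipc_copy(CUstream ipc_stream, CUevent done_event,
                          CUdeviceptr dst, CUdeviceptr src, size_t bytes)
{
    /* The copy runs asynchronously on ipc_stream... */
    if (CUDA_SUCCESS != cuMemcpyAsync(dst, src, bytes, ipc_stream)) {
        return -1;
    }
    /* ...and an event recorded on the same stream marks its completion,
     * which can later be polled with cuEventQuery() without touching or
     * blocking the default stream. */
    if (CUDA_SUCCESS != cuEventRecord(done_event, ipc_stream)) {
        return -1;
    }
    return 0;
}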

View file

@@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -47,6 +47,7 @@ static bool mca_common_cuda_enabled = false;
static bool mca_common_cuda_register_memory = true;
static bool mca_common_cuda_warning = true;
static opal_list_t common_cuda_memory_registrations;
static CUstream ipcStream;
/* Structure to hold memory registrations that are delayed until first
* call to send or receive a GPU pointer */
@@ -66,21 +67,21 @@ OBJ_CLASS_INSTANCE( common_cuda_mem_regs_t,
#if OMPI_CUDA_SUPPORT_41
static int mca_common_cuda_async = 1;
/* Array of CUDA events to be queried */
CUevent *cuda_event_status_array;
/* Array of CUDA events to be queried for IPC stream */
CUevent *cuda_event_ipc_array;
/* Array of fragments currently being moved by cuda async non-blocking
* operations */
struct mca_btl_base_descriptor_t **cuda_event_frag_array;
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;
/* First free/available location in cuda_event_status_array */
int cuda_event_status_first_avail;
int cuda_event_ipc_first_avail;
/* First currently-being used location in the cuda_event_status_array */
int cuda_event_status_first_used;
int cuda_event_ipc_first_used;
/* Number of status items currently in use */
int cuda_event_status_num_used;
int cuda_event_ipc_num_used;
/* Size of array holding events */
int cuda_event_max = 200;
@@ -195,14 +196,14 @@ static int mca_common_cuda_init(void)
#if OMPI_CUDA_SUPPORT_41
if (true == mca_common_cuda_enabled) {
/* Set up an array to store outstanding async copy events */
cuda_event_status_array = NULL;
cuda_event_frag_array = NULL;
cuda_event_status_num_used = 0;
cuda_event_status_first_avail = 0;
cuda_event_status_first_used = 0;
cuda_event_ipc_array = NULL;
cuda_event_ipc_frag_array = NULL;
cuda_event_ipc_num_used = 0;
cuda_event_ipc_first_avail = 0;
cuda_event_ipc_first_used = 0;
cuda_event_status_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
if (NULL == cuda_event_status_array) {
cuda_event_ipc_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
if (NULL == cuda_event_ipc_array) {
orte_show_help("help-mpi-common-cuda.txt", "No memory",
true, errno, strerror(errno));
return OMPI_ERROR;
@@ -210,7 +211,7 @@ static int mca_common_cuda_init(void)
/* Create the events since they can be reused. */
for (i = 0; i < cuda_event_max; i++) {
res = cuEventCreate(&cuda_event_status_array[i], CU_EVENT_DISABLE_TIMING);
res = cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
if (CUDA_SUCCESS != res) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
true, res);
@@ -220,9 +221,9 @@ static int mca_common_cuda_init(void)
/* The first available status index is 0. Make an empty frag
array. */
cuda_event_frag_array = (struct mca_btl_base_descriptor_t **)
cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
if (NULL == cuda_event_frag_array) {
if (NULL == cuda_event_ipc_frag_array) {
orte_show_help("help-mpi-common-cuda.txt", "No memory",
true, errno, strerror(errno));
return OMPI_ERROR;
@@ -254,6 +255,14 @@ static int mca_common_cuda_init(void)
OBJ_RELEASE(mem_reg);
}
/* Create stream for use in ipc asynchronous copies */
res = cuStreamCreate(&ipcStream, 0);
if (res != CUDA_SUCCESS) {
orte_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
true, res);
return OMPI_ERROR;
}
opal_output_verbose(30, mca_common_cuda_output,
"CUDA: initialized");
common_cuda_initialized = true;
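
A condensed sketch of the initialization done in the hunks above: pre-create a fixed pool of reusable, timing-disabled events plus, with this commit, one dedicated stream for the IPC copies. Same assumptions as the sketch near the top (CUDA driver API, <cuda.h>); the size and names are illustrative, not the Open MPI code:

#define MAX_EVENTS 200              /* stands in for cuda_event_max */

static CUevent  event_pool[MAX_EVENTS];
static CUstream ipc_stream;

static int init_ipc_resources(void)
{
    int i;
    for (i = 0; i < MAX_EVENTS; i++) {
        /* CU_EVENT_DISABLE_TIMING keeps the events cheap to record and query. */
        if (CUDA_SUCCESS != cuEventCreate(&event_pool[i], CU_EVENT_DISABLE_TIMING)) {
            return -1;
        }
    }
    /* One stream dedicated to the asynchronous IPC copies. */
    if (CUDA_SUCCESS != cuStreamCreate(&ipc_stream, 0)) {
        return -1;
    }
    return 0;
}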
@@ -395,11 +404,13 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle));
/* Need to record the event to ensure that any memcopies
* into the device memory have completed. The event handle
* associated with this event is sent to the remote process
* so that it will wait on this event prior to copying data
* out of the device memory. */
/* Need to record the event to ensure that any memcopies into the
* device memory have completed. The event handle associated with
* this event is sent to the remote process so that it will wait
* on this event prior to copying data out of the device memory.
* Note that this needs to be the NULL stream, since it is
* unknown what stream any copies into the device memory were done
* with. */
result = cuEventRecord((CUevent)cuda_reg->event, 0);
if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
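
The rewritten comment above describes the exporting side of the CUDA IPC handshake: hand the remote process a memory handle plus an event handle, with the event recorded on the NULL stream because it is unknown which stream earlier copies into that memory used. A hedged sketch of that export step (same <cuda.h> assumption; names hypothetical, error reporting omitted):

/* Sketch of the exporter side of a CUDA IPC handshake; not the OMPI code. */
static int export_gpu_region(CUdeviceptr base, CUipcMemHandle *mem_out,
                             CUipcEventHandle *evt_out)
{
    CUevent ev;

    /* Handle the remote process can map with cuIpcOpenMemHandle(). */
    if (CUDA_SUCCESS != cuIpcGetMemHandle(mem_out, base)) return -1;

    /* Interprocess events require CU_EVENT_INTERPROCESS, which in turn
     * requires CU_EVENT_DISABLE_TIMING. */
    if (CUDA_SUCCESS != cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING |
                                           CU_EVENT_INTERPROCESS)) return -1;

    /* Record on the NULL stream so that any earlier copies into this
     * memory, whatever stream they used, complete before the remote
     * process copies data out. */
    if (CUDA_SUCCESS != cuEventRecord(ev, 0)) return -1;

    return (CUDA_SUCCESS == cuIpcGetEventHandle(evt_out, ev)) ? 0 : -1;
}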
@@ -581,7 +592,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
/* First make sure there is room to store the event. If not, then
* return an error. The error message will tell the user to try and
* run again, but with a larger array for storing events. */
if (cuda_event_status_num_used == cuda_event_max) {
if (cuda_event_ipc_num_used == cuda_event_max) {
orte_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
return OMPI_ERR_OUT_OF_RESOURCE;
@@ -590,7 +601,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
/* This is the standard way to run. Running with synchronous copies is available
* to measure the advantages of asynchronous copies. */
if (OPAL_LIKELY(mca_common_cuda_async)) {
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dst, src, amount, result);
@@ -600,25 +611,25 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
dst, src, (int)amount);
}
result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
return OMPI_ERROR;
}
cuda_event_frag_array[cuda_event_status_first_avail] = frag;
cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
/* Bump up the first available slot and number used by 1 */
cuda_event_status_first_avail++;
if (cuda_event_status_first_avail >= cuda_event_max) {
cuda_event_status_first_avail = 0;
cuda_event_ipc_first_avail++;
if (cuda_event_ipc_first_avail >= cuda_event_max) {
cuda_event_ipc_first_avail = 0;
}
cuda_event_status_num_used++;
cuda_event_ipc_num_used++;
*done = 0;
} else {
/* Mimic the async function so they use the same memcpy call. */
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
true, dst, src, amount, result);
@@ -630,23 +641,23 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
}
/* Record an event, then wait for it to complete with calls to cuEventQuery */
result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
if (CUDA_SUCCESS != result) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
true, result);
return OMPI_ERROR;
}
cuda_event_frag_array[cuda_event_status_first_avail] = frag;
cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
/* Bump up the first available slot and number used by 1 */
cuda_event_status_first_avail++;
if (cuda_event_status_first_avail >= cuda_event_max) {
cuda_event_status_first_avail = 0;
cuda_event_ipc_first_avail++;
if (cuda_event_ipc_first_avail >= cuda_event_max) {
cuda_event_ipc_first_avail = 0;
}
cuda_event_status_num_used++;
cuda_event_ipc_num_used++;
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
true, result);
@@ -658,7 +669,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
if (0 == (iter % 10)) {
opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
}
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
true, result);
@@ -667,10 +678,10 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
iter++;
}
--cuda_event_status_num_used;
++cuda_event_status_first_used;
if (cuda_event_status_first_used >= cuda_event_max) {
cuda_event_status_first_used = 0;
--cuda_event_ipc_num_used;
++cuda_event_ipc_first_used;
if (cuda_event_ipc_first_used >= cuda_event_max) {
cuda_event_ipc_first_used = 0;
}
*done = 1;
}
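
The bookkeeping in the two branches above is a fixed-size circular buffer of events: record at the first available slot, query and retire from the first used slot. A stripped-down sketch of that pattern (same <cuda.h> assumption; sizes and names are illustrative, and errors are folded into the not-ready case for brevity):

#define N_EVENTS 200                      /* stands in for cuda_event_max */

static CUevent events[N_EVENTS];
static void   *payload[N_EVENTS];         /* e.g. the frag tied to each copy */
static int first_avail = 0, first_used = 0, num_used = 0;

/* Track one queued copy: record its event on the given stream.
 * Returns -1 when the ring is full or the record fails. */
static int track_copy(CUstream stream, void *frag)
{
    if (num_used == N_EVENTS) return -1;
    if (CUDA_SUCCESS != cuEventRecord(events[first_avail], stream)) return -1;
    payload[first_avail] = frag;
    first_avail = (first_avail + 1) % N_EVENTS;
    num_used++;
    return 0;
}

/* Poll the oldest outstanding event; return its frag once it has completed,
 * or NULL if nothing is outstanding or the copy is still in flight. */
static void *poll_oldest(void)
{
    void *frag;
    if (0 == num_used) return NULL;
    if (CUDA_SUCCESS != cuEventQuery(events[first_used])) return NULL;
    frag = payload[first_used];
    first_used = (first_used + 1) % N_EVENTS;
    num_used--;
    return frag;
}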
@@ -685,12 +696,12 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
CUresult result;
if (cuda_event_status_num_used > 0) {
if (cuda_event_ipc_num_used > 0) {
opal_output_verbose(20, mca_common_cuda_output,
"CUDA: progress_one_cuda_event, outstanding_events=%d",
cuda_event_status_num_used);
cuda_event_ipc_num_used);
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
/* We found an event that is not ready, so return. */
if (CUDA_ERROR_NOT_READY == result) {
@@ -705,15 +716,15 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
return OMPI_ERROR;
}
*frag = cuda_event_frag_array[cuda_event_status_first_used];
*frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
opal_output_verbose(10, mca_common_cuda_output,
"CUDA: cuEventQuery returned %d", result);
/* Bump counters, loop around the circular buffer if necessary */
--cuda_event_status_num_used;
++cuda_event_status_first_used;
if (cuda_event_status_first_used >= cuda_event_max) {
cuda_event_status_first_used = 0;
--cuda_event_ipc_num_used;
++cuda_event_ipc_first_used;
if (cuda_event_ipc_first_used >= cuda_event_max) {
cuda_event_ipc_first_used = 0;
}
/* A return value of 1 indicates an event completed and a frag was returned */
return 1;

View file

@@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2011 NVIDIA. All rights reserved.
# Copyright (c) 2011-2012 NVIDIA. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -135,3 +135,8 @@ cause the program to abort.
cuEventDestory return value: %d
Check the cuda.h file for what the return value means.
#
[cuStreamCreate failed]
The call to cuStreamCreate failed. This is an unrecoverable error and will
cause the program to abort.
cuStreamCreate return value: %d
Check the cuda.h file for what the return value means.