Rename a few things for clarity. Add a stream.
This commit was SVN r26447.
Этот коммит содержится в:
родитель
5fb48bafda
Коммит
f8ace21366
@ -9,7 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2006 The Regents of the University of California.
|
* Copyright (c) 2004-2006 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -47,6 +47,7 @@ static bool mca_common_cuda_enabled = false;
|
|||||||
static bool mca_common_cuda_register_memory = true;
|
static bool mca_common_cuda_register_memory = true;
|
||||||
static bool mca_common_cuda_warning = true;
|
static bool mca_common_cuda_warning = true;
|
||||||
static opal_list_t common_cuda_memory_registrations;
|
static opal_list_t common_cuda_memory_registrations;
|
||||||
|
static CUstream ipcStream;
|
||||||
|
|
||||||
/* Structure to hold memory registrations that are delayed until first
|
/* Structure to hold memory registrations that are delayed until first
|
||||||
* call to send or receive a GPU pointer */
|
* call to send or receive a GPU pointer */
|
||||||
@ -66,21 +67,21 @@ OBJ_CLASS_INSTANCE( common_cuda_mem_regs_t,
|
|||||||
#if OMPI_CUDA_SUPPORT_41
|
#if OMPI_CUDA_SUPPORT_41
|
||||||
static int mca_common_cuda_async = 1;
|
static int mca_common_cuda_async = 1;
|
||||||
|
|
||||||
/* Array of CUDA events to be queried */
|
/* Array of CUDA events to be queried for IPC stream */
|
||||||
CUevent *cuda_event_status_array;
|
CUevent *cuda_event_ipc_array;
|
||||||
|
|
||||||
/* Array of fragments currently being moved by cuda async non-blocking
|
/* Array of fragments currently being moved by cuda async non-blocking
|
||||||
* operations */
|
* operations */
|
||||||
struct mca_btl_base_descriptor_t **cuda_event_frag_array;
|
struct mca_btl_base_descriptor_t **cuda_event_ipc_frag_array;
|
||||||
|
|
||||||
/* First free/available location in cuda_event_status_array */
|
/* First free/available location in cuda_event_ipc_array */
|
||||||
int cuda_event_status_first_avail;
|
int cuda_event_ipc_first_avail;
|
||||||
|
|
||||||
/* First currently-being used location in the cuda_event_status_array */
|
/* First currently-being used location in the cuda_event_ipc_array */
|
||||||
int cuda_event_status_first_used;
|
int cuda_event_ipc_first_used;
|
||||||
|
|
||||||
/* Number of status items currently in use */
|
/* Number of status items currently in use */
|
||||||
int cuda_event_status_num_used;
|
int cuda_event_ipc_num_used;
|
||||||
|
|
||||||
/* Size of array holding events */
|
/* Size of array holding events */
|
||||||
int cuda_event_max = 200;
|
int cuda_event_max = 200;
|
||||||
@ -195,14 +196,14 @@ static int mca_common_cuda_init(void)
|
|||||||
#if OMPI_CUDA_SUPPORT_41
|
#if OMPI_CUDA_SUPPORT_41
|
||||||
if (true == mca_common_cuda_enabled) {
|
if (true == mca_common_cuda_enabled) {
|
||||||
/* Set up an array to store outstanding async copy events */
|
/* Set up an array to store outstanding async copy events */
|
||||||
cuda_event_status_array = NULL;
|
cuda_event_ipc_array = NULL;
|
||||||
cuda_event_frag_array = NULL;
|
cuda_event_ipc_frag_array = NULL;
|
||||||
cuda_event_status_num_used = 0;
|
cuda_event_ipc_num_used = 0;
|
||||||
cuda_event_status_first_avail = 0;
|
cuda_event_ipc_first_avail = 0;
|
||||||
cuda_event_status_first_used = 0;
|
cuda_event_ipc_first_used = 0;
|
||||||
|
|
||||||
cuda_event_status_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
|
cuda_event_ipc_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
|
||||||
if (NULL == cuda_event_status_array) {
|
if (NULL == cuda_event_ipc_array) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "No memory",
|
orte_show_help("help-mpi-common-cuda.txt", "No memory",
|
||||||
true, errno, strerror(errno));
|
true, errno, strerror(errno));
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
@ -210,7 +211,7 @@ static int mca_common_cuda_init(void)
|
|||||||
|
|
||||||
/* Create the events since they can be reused. */
|
/* Create the events since they can be reused. */
|
||||||
for (i = 0; i < cuda_event_max; i++) {
|
for (i = 0; i < cuda_event_max; i++) {
|
||||||
res = cuEventCreate(&cuda_event_status_array[i], CU_EVENT_DISABLE_TIMING);
|
res = cuEventCreate(&cuda_event_ipc_array[i], CU_EVENT_DISABLE_TIMING);
|
||||||
if (CUDA_SUCCESS != res) {
|
if (CUDA_SUCCESS != res) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
|
||||||
true, res);
|
true, res);
|
||||||
@ -220,9 +221,9 @@ static int mca_common_cuda_init(void)
|
|||||||
|
|
||||||
/* The first available status index is 0. Make an empty frag
|
/* The first available status index is 0. Make an empty frag
|
||||||
array. */
|
array. */
|
||||||
cuda_event_frag_array = (struct mca_btl_base_descriptor_t **)
|
cuda_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **)
|
||||||
malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
|
malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
|
||||||
if (NULL == cuda_event_frag_array) {
|
if (NULL == cuda_event_ipc_frag_array) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "No memory",
|
orte_show_help("help-mpi-common-cuda.txt", "No memory",
|
||||||
true, errno, strerror(errno));
|
true, errno, strerror(errno));
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
@ -254,6 +255,14 @@ static int mca_common_cuda_init(void)
|
|||||||
OBJ_RELEASE(mem_reg);
|
OBJ_RELEASE(mem_reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Create stream for use in ipc asynchronous copies */
|
||||||
|
res = cuStreamCreate(&ipcStream, 0);
|
||||||
|
if (res != CUDA_SUCCESS) {
|
||||||
|
orte_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
|
||||||
|
true, res);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
opal_output_verbose(30, mca_common_cuda_output,
|
opal_output_verbose(30, mca_common_cuda_output,
|
||||||
"CUDA: initialized");
|
"CUDA: initialized");
|
||||||
common_cuda_initialized = true;
|
common_cuda_initialized = true;
|
||||||
@ -395,11 +404,13 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
|
|||||||
cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
|
cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
|
||||||
memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle));
|
memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle));
|
||||||
|
|
||||||
/* Need to record the event to ensure that any memcopies
|
/* Need to record the event to ensure that any memcopies into the
|
||||||
* into the device memory have completed. The event handle
|
* device memory have completed. The event handle associated with
|
||||||
* associated with this event is sent to the remote process
|
* this event is sent to the remote process so that it will wait
|
||||||
* so that it will wait on this event prior to copying data
|
* on this event prior to copying data out of the device memory.
|
||||||
* out of the device memory. */
|
* Note that this needs to be the NULL stream to make sense, since it is
|
||||||
|
* unknown what stream any copies into the device memory were done
|
||||||
|
* with. */
|
||||||
result = cuEventRecord((CUevent)cuda_reg->event, 0);
|
result = cuEventRecord((CUevent)cuda_reg->event, 0);
|
||||||
if (CUDA_SUCCESS != result) {
|
if (CUDA_SUCCESS != result) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
|
||||||
@ -581,7 +592,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
|
|||||||
/* First make sure there is room to store the event. If not, then
|
/* First make sure there is room to store the event. If not, then
|
||||||
* return an error. The error message will tell the user to try and
|
* return an error. The error message will tell the user to try and
|
||||||
* run again, but with a larger array for storing events. */
|
* run again, but with a larger array for storing events. */
|
||||||
if (cuda_event_status_num_used == cuda_event_max) {
|
if (cuda_event_ipc_num_used == cuda_event_max) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
|
orte_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
|
||||||
true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
|
true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
@ -590,7 +601,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
|
|||||||
/* This is the standard way to run. Running with synchronous copies is available
|
/* This is the standard way to run. Running with synchronous copies is available
|
||||||
* to measure the advantages of asynchronous copies. */
|
* to measure the advantages of asynchronous copies. */
|
||||||
if (OPAL_LIKELY(mca_common_cuda_async)) {
|
if (OPAL_LIKELY(mca_common_cuda_async)) {
|
||||||
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
|
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
|
||||||
if (CUDA_SUCCESS != result) {
|
if (CUDA_SUCCESS != result) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
|
||||||
true, dst, src, amount, result);
|
true, dst, src, amount, result);
|
||||||
@ -600,25 +611,25 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
|
|||||||
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
|
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
|
||||||
dst, src, (int)amount);
|
dst, src, (int)amount);
|
||||||
}
|
}
|
||||||
result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
|
result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
|
||||||
if (CUDA_SUCCESS != result) {
|
if (CUDA_SUCCESS != result) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
|
||||||
true, result);
|
true, result);
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
cuda_event_frag_array[cuda_event_status_first_avail] = frag;
|
cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
|
||||||
|
|
||||||
/* Bump up the first available slot and number used by 1 */
|
/* Bump up the first available slot and number used by 1 */
|
||||||
cuda_event_status_first_avail++;
|
cuda_event_ipc_first_avail++;
|
||||||
if (cuda_event_status_first_avail >= cuda_event_max) {
|
if (cuda_event_ipc_first_avail >= cuda_event_max) {
|
||||||
cuda_event_status_first_avail = 0;
|
cuda_event_ipc_first_avail = 0;
|
||||||
}
|
}
|
||||||
cuda_event_status_num_used++;
|
cuda_event_ipc_num_used++;
|
||||||
|
|
||||||
*done = 0;
|
*done = 0;
|
||||||
} else {
|
} else {
|
||||||
/* Mimic the async function so they use the same memcpy call. */
|
/* Mimic the async function so they use the same memcpy call. */
|
||||||
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
|
result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
|
||||||
if (CUDA_SUCCESS != result) {
|
if (CUDA_SUCCESS != result) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
|
||||||
true, dst, src, amount, result);
|
true, dst, src, amount, result);
|
||||||
@ -630,23 +641,23 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Record an event, then wait for it to complete with calls to cuEventQuery */
|
/* Record an event, then wait for it to complete with calls to cuEventQuery */
|
||||||
result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
|
result = cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
|
||||||
if (CUDA_SUCCESS != result) {
|
if (CUDA_SUCCESS != result) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
|
||||||
true, result);
|
true, result);
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
cuda_event_frag_array[cuda_event_status_first_avail] = frag;
|
cuda_event_ipc_frag_array[cuda_event_ipc_first_avail] = frag;
|
||||||
|
|
||||||
/* Bump up the first available slot and number used by 1 */
|
/* Bump up the first available slot and number used by 1 */
|
||||||
cuda_event_status_first_avail++;
|
cuda_event_ipc_first_avail++;
|
||||||
if (cuda_event_status_first_avail >= cuda_event_max) {
|
if (cuda_event_ipc_first_avail >= cuda_event_max) {
|
||||||
cuda_event_status_first_avail = 0;
|
cuda_event_ipc_first_avail = 0;
|
||||||
}
|
}
|
||||||
cuda_event_status_num_used++;
|
cuda_event_ipc_num_used++;
|
||||||
|
|
||||||
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
|
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
|
||||||
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
|
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
|
||||||
true, result);
|
true, result);
|
||||||
@ -658,7 +669,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
|
|||||||
if (0 == (iter % 10)) {
|
if (0 == (iter % 10)) {
|
||||||
opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
|
opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
|
||||||
}
|
}
|
||||||
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
|
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
|
||||||
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
|
if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
|
||||||
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
|
orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
|
||||||
true, result);
|
true, result);
|
||||||
@ -667,10 +678,10 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
|
|||||||
iter++;
|
iter++;
|
||||||
}
|
}
|
||||||
|
|
||||||
--cuda_event_status_num_used;
|
--cuda_event_ipc_num_used;
|
||||||
++cuda_event_status_first_used;
|
++cuda_event_ipc_first_used;
|
||||||
if (cuda_event_status_first_used >= cuda_event_max) {
|
if (cuda_event_ipc_first_used >= cuda_event_max) {
|
||||||
cuda_event_status_first_used = 0;
|
cuda_event_ipc_first_used = 0;
|
||||||
}
|
}
|
||||||
*done = 1;
|
*done = 1;
|
||||||
}
|
}
|
||||||
@ -685,12 +696,12 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
|
|||||||
int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
|
int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
|
||||||
CUresult result;
|
CUresult result;
|
||||||
|
|
||||||
if (cuda_event_status_num_used > 0) {
|
if (cuda_event_ipc_num_used > 0) {
|
||||||
opal_output_verbose(20, mca_common_cuda_output,
|
opal_output_verbose(20, mca_common_cuda_output,
|
||||||
"CUDA: progress_one_cuda_event, outstanding_events=%d",
|
"CUDA: progress_one_cuda_event, outstanding_events=%d",
|
||||||
cuda_event_status_num_used);
|
cuda_event_ipc_num_used);
|
||||||
|
|
||||||
result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
|
result = cuEventQuery(cuda_event_ipc_array[cuda_event_ipc_first_used]);
|
||||||
|
|
||||||
/* We found an event that is not ready, so return. */
|
/* We found an event that is not ready, so return. */
|
||||||
if (CUDA_ERROR_NOT_READY == result) {
|
if (CUDA_ERROR_NOT_READY == result) {
|
||||||
@ -705,15 +716,15 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
|
|||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
*frag = cuda_event_frag_array[cuda_event_status_first_used];
|
*frag = cuda_event_ipc_frag_array[cuda_event_ipc_first_used];
|
||||||
opal_output_verbose(10, mca_common_cuda_output,
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
"CUDA: cuEventQuery returned %d", result);
|
"CUDA: cuEventQuery returned %d", result);
|
||||||
|
|
||||||
/* Bump counters, loop around the circular buffer if necessary */
|
/* Bump counters, loop around the circular buffer if necessary */
|
||||||
--cuda_event_status_num_used;
|
--cuda_event_ipc_num_used;
|
||||||
++cuda_event_status_first_used;
|
++cuda_event_ipc_first_used;
|
||||||
if (cuda_event_status_first_used >= cuda_event_max) {
|
if (cuda_event_ipc_first_used >= cuda_event_max) {
|
||||||
cuda_event_status_first_used = 0;
|
cuda_event_ipc_first_used = 0;
|
||||||
}
|
}
|
||||||
/* A return value of 1 indicates an event completed and a frag was returned */
|
/* A return value of 1 indicates an event completed and a frag was returned */
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# -*- text -*-
|
# -*- text -*-
|
||||||
#
|
#
|
||||||
# Copyright (c) 2011 NVIDIA. All rights reserved.
|
# Copyright (c) 2011-2012 NVIDIA. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -135,3 +135,8 @@ cause the program to abort.
|
|||||||
cuEventDestory return value: %d
|
cuEventDestory return value: %d
|
||||||
Check the cuda.h file for what the return value means.
|
Check the cuda.h file for what the return value means.
|
||||||
#
|
#
|
||||||
|
[cuStreamCreate failed]
|
||||||
|
The call to cuStreamCreate failed. This is an unrecoverable error and will
|
||||||
|
cause the program to abort.
|
||||||
|
cuStreamCreate return value: %d
|
||||||
|
Check the cuda.h file for what the return value means.
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user