Some additional CUDA-specific code.
Adding a few more support functions that will be used in future development.

This commit was SVN r25684.
This commit is contained in:
parent e0139a2d7e
commit 8073f5002a
@@ -17,6 +17,12 @@
  * $HEADER$
  */
 
+/**
+ * This file contains various support functions for doing CUDA
+ * operations.  Some of the features are only available in CUDA 4.1
+ * and later, so some code is conditionalized around the
+ * OMPI_CUDA_SUPPORT_41 macro.
+ */
 #include "ompi_config.h"
 
 #include <errno.h>
@@ -28,6 +34,7 @@
 #include "opal/datatype/opal_convertor.h"
 #include "opal/datatype/opal_datatype_cuda.h"
 #include "opal/util/output.h"
+#include "ompi/mca/mpool/base/base.h"
 #include "orte/util/show_help.h"
 #include "common_cuda.h"
 
@@ -55,8 +62,54 @@ OBJ_CLASS_INSTANCE( common_cuda_mem_regs_t,
                     NULL,
                     NULL );
 
-static void mca_common_cuda_init(void)
+#if OMPI_CUDA_SUPPORT_41
+static int mca_common_cuda_async = 1;
+
+/* Array of CUDA events to be queried */
+CUevent *cuda_event_status_array;
+
+/* Array of fragments currently being moved by cuda async non-blocking
+ * operations */
+struct mca_btl_base_descriptor_t **cuda_event_frag_array;
+
+/* First free/available location in cuda_event_status_array */
+int cuda_event_status_first_avail;
+
+/* First currently-being used location in the cuda_event_status_array */
+int cuda_event_status_first_used;
+
+/* Number of status items currently in use */
+int cuda_event_status_num_used;
+
+/* Size of array holding events */
+int cuda_event_max = 200;
+
+#define CUDA_COMMON_TIMING 0
+#if CUDA_COMMON_TIMING
+/* Some timing support structures.  Enable this to help analyze
+ * internal performance issues. */
+static struct timespec ts_start;
+static struct timespec ts_end;
+static double accum;
+#define THOUSAND 1000L
+#define MILLION  1000000L
+static float mydifftime(struct timespec ts_start, struct timespec ts_end);
+#endif /* CUDA_COMMON_TIMING */
+
+/* These functions are typically unused in the optimized builds. */
+static void cuda_dump_evthandle(int, void *, char *) __opal_attribute_unused__ ;
+static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
+#if OPAL_ENABLE_DEBUG
+#define CUDA_DUMP_MEMHANDLE(a) cuda_dump_memhandle a
+#define CUDA_DUMP_EVTHANDLE(a) cuda_dump_evthandle a
+#else
+#define CUDA_DUMP_MEMHANDLE(a)
+#define CUDA_DUMP_EVTHANDLE(a)
+#endif /* OPAL_ENABLE_DEBUG */
+
+#endif /* OMPI_CUDA_SUPPORT_41 */
+
+static int mca_common_cuda_init(void)
 {
     int id, value, i, s;
     CUresult res;
@@ -64,7 +117,7 @@ static void mca_common_cuda_init(void)
     common_cuda_mem_regs_t *mem_reg;
 
     if (common_cuda_initialized) {
-        return;
+        return OMPI_SUCCESS;
     }
 
     /* Set different levels of verbosity in the cuda related code. */
@@ -91,6 +144,20 @@ static void mca_common_cuda_init(void)
                                      (int) mca_common_cuda_warning, &value);
     mca_common_cuda_warning = OPAL_INT_TO_BOOL(value);
 
+#if OMPI_CUDA_SUPPORT_41
+    /* Use this flag to test async vs sync copies */
+    id = mca_base_param_reg_int_name("mpi", "common_cuda_memcpy_async",
+                                     "Set to 0 to force CUDA sync copy instead of async",
+                                     false, false, mca_common_cuda_async, &i);
+    mca_common_cuda_async = i;
+
+    /* Use this parameter to increase the number of outstanding events allowed */
+    id = mca_base_param_reg_int_name("mpi", "common_cuda_event_max",
+                                     "Set number of outstanding CUDA events",
+                                     false, false, cuda_event_max, &i);
+    cuda_event_max = i;
+#endif /* OMPI_CUDA_SUPPORT_41 */
+
     /* Check to see if this process is running in a CUDA context.  If
      * so, all is good.  If not, then disable registration of memory. */
     res = cuCtxGetCurrent(&cuContext);
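Note on usage: both parameters above are registered under the "mpi" prefix, so at run time they should surface as mpi_common_cuda_memcpy_async and mpi_common_cuda_event_max (the help text added near the end of this commit already refers to the latter). A run that forces synchronous copies would presumably be launched with --mca mpi_common_cuda_memcpy_async 0, and the event limit raised with --mca mpi_common_cuda_event_max <n>.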
@@ -124,6 +191,45 @@ static void mca_common_cuda_init(void)
                             "CUDA: cuCtxGetCurrent succeeded");
     }
 
+#if OMPI_CUDA_SUPPORT_41
+    if (true == mca_common_cuda_enabled) {
+        /* Set up an array to store outstanding async copy events */
+        cuda_event_status_array = NULL;
+        cuda_event_frag_array = NULL;
+        cuda_event_status_num_used = 0;
+        cuda_event_status_first_avail = 0;
+        cuda_event_status_first_used = 0;
+
+        cuda_event_status_array = (CUevent *) malloc(sizeof(CUevent) * cuda_event_max);
+        if (NULL == cuda_event_status_array) {
+            orte_show_help("help-mpi-common-cuda.txt", "No memory",
+                           true, errno, strerror(errno));
+            return OMPI_ERROR;
+        }
+
+        /* Create the events since they can be reused. */
+        for (i = 0; i < cuda_event_max; i++) {
+            res = cuEventCreate(&cuda_event_status_array[i], CU_EVENT_DISABLE_TIMING);
+            if (CUDA_SUCCESS != res) {
+                orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
+                               true, res);
+                return OMPI_ERROR;
+            }
+        }
+
+        /* The first available status index is 0.  Make an empty frag
+           array. */
+        cuda_event_frag_array = (struct mca_btl_base_descriptor_t **)
+            malloc(sizeof(struct mca_btl_base_descriptor_t *) * cuda_event_max);
+        if (NULL == cuda_event_frag_array) {
+            orte_show_help("help-mpi-common-cuda.txt", "No memory",
+                           true, errno, strerror(errno));
+            return OMPI_ERROR;
+        }
+    }
+
+#endif /* OMPI_CUDA_SUPPORT_41 */
+
     s = opal_list_get_size(&common_cuda_memory_registrations);
     for(i = 0; i < s; i++) {
         mem_reg = (common_cuda_mem_regs_t *)
@@ -149,9 +255,9 @@ static void mca_common_cuda_init(void)
     opal_output_verbose(30, mca_common_cuda_output,
                         "CUDA: initialized");
     common_cuda_initialized = true;
+    return OMPI_SUCCESS;
 }
 
 
 /**
  * Call the CUDA register function so we pin the memory in the CUDA
  * space.
@@ -196,11 +302,24 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) {
  * space.
  */
 void mca_common_cuda_unregister(void *ptr, char *msg) {
-    int res;
+    int res, i, s;
+    common_cuda_mem_regs_t *mem_reg;
 
-    assert(true == common_cuda_initialized);
+    /* This can happen if memory was queued up to be registered, but
+     * no CUDA operations happened, so it never was registered.
+     * Therefore, just release any of the resources. */
+    if (false == common_cuda_initialized) {
+        s = opal_list_get_size(&common_cuda_memory_registrations);
+        for(i = 0; i < s; i++) {
+            mem_reg = (common_cuda_mem_regs_t *)
+                opal_list_remove_first(&common_cuda_memory_registrations);
+            free(mem_reg->msg);
+            OBJ_RELEASE(mem_reg);
+        }
+        return;
+    }
 
     if (mca_common_cuda_enabled && mca_common_cuda_register_memory) {
         res = cuMemHostUnregister(ptr);
         if (res != CUDA_SUCCESS) {
             /* If unregistering the memory fails, print a message and continue.
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if OMPI_CUDA_SUPPORT_41
|
||||||
|
/*
|
||||||
|
* Get the memory handle of a local section of memory that can be sent
|
||||||
|
* to the remote size so it can access the memory. This is the
|
||||||
|
* registration function for the sending side of a message transfer.
|
||||||
|
*/
|
||||||
|
int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg,
|
||||||
|
mca_mpool_base_registration_t *hdrreg)
|
||||||
|
|
||||||
|
{
|
||||||
|
CUmemorytype memType;
|
||||||
|
CUresult result;
|
||||||
|
CUipcMemHandle memHandle;
|
||||||
|
CUdeviceptr pbase;
|
||||||
|
size_t psize;
|
||||||
|
|
||||||
|
mca_mpool_rcuda_reg_t *cuda_reg = (mca_mpool_rcuda_reg_t*)newreg;
|
||||||
|
|
||||||
|
/* We should only be there if this is a CUDA device pointer */
|
||||||
|
result = cuPointerGetAttribute(&memType,
|
||||||
|
CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)base);
|
||||||
|
assert(CUDA_SUCCESS == result);
|
||||||
|
assert(CU_MEMORYTYPE_DEVICE == memType);
|
||||||
|
|
||||||
|
/* Get the memory handle so we can send it to the remote process. */
|
||||||
|
result = cuIpcGetMemHandle(&memHandle, (CUdeviceptr)base);
|
||||||
|
CUDA_DUMP_MEMHANDLE((100, &memHandle, "GetMemHandle-After"));
|
||||||
|
|
||||||
|
if (CUDA_SUCCESS != result) {
|
||||||
|
orte_show_help("help-mpi-common-cuda.txt", "cuIpcGetMemHandle failed",
|
||||||
|
true, result, base);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
} else {
|
||||||
|
opal_output_verbose(20, mca_common_cuda_output,
|
||||||
|
"CUDA: cuIpcGetMemHandle passed: base=%p",
|
||||||
|
base);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Need to get the real base and size of the memory handle. This is
|
||||||
|
* how the remote side saves the handles in a cache. */
|
||||||
|
result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
|
||||||
|
if (CUDA_SUCCESS != result) {
|
||||||
|
orte_show_help("help-mpi-common-cuda.txt", "cuMemGetAddressRange failed",
|
||||||
|
true, result, base);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
} else {
|
||||||
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
|
"CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ",
|
||||||
|
base, (int)size, (void *)pbase, (int)psize);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Store all the information in the registration */
|
||||||
|
cuda_reg->base.base = (void *)pbase;
|
||||||
|
cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
|
||||||
|
memcpy(&cuda_reg->memHandle, &memHandle, sizeof(memHandle));
|
||||||
|
|
||||||
|
/* Need to record the event to ensure that any memcopies
|
||||||
|
* into the device memory have completed. The event handle
|
||||||
|
* associated with this event is sent to the remote process
|
||||||
|
* so that it will wait on this event prior to copying data
|
||||||
|
* out of the device memory. */
|
||||||
|
result = cuEventRecord((CUevent)cuda_reg->event, 0);
|
||||||
|
if (CUDA_SUCCESS != result) {
|
||||||
|
orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
|
||||||
|
true, result, base);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This function is called by the local side that called the cuda_getmemhandle.
|
||||||
|
* There is nothing to be done so just return.
|
||||||
|
*/
|
||||||
|
int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg)
|
||||||
|
{
|
||||||
|
CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_rcuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
|
||||||
|
opal_output_verbose(5, mca_common_cuda_output,
|
||||||
|
"CUDA: cuda_ungetmemhandle: base=%p",
|
||||||
|
reg_data);
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Open a memory handle that refers to remote memory so we can get an address
|
||||||
|
* that works on the local side. This is the registration function for the
|
||||||
|
* remote side of a transfer. newreg contains the new handle. hddrreg contains
|
||||||
|
* the memory handle that was received from the remote side.
|
||||||
|
*/
|
||||||
|
int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg,
|
||||||
|
mca_mpool_base_registration_t *hdrreg)
|
||||||
|
{
|
||||||
|
CUresult result;
|
||||||
|
CUipcMemHandle memHandle;
|
||||||
|
mca_mpool_rcuda_reg_t *cuda_newreg = (mca_mpool_rcuda_reg_t*)newreg;
|
||||||
|
|
||||||
|
/* Need to copy into memory handle for call into CUDA library. */
|
||||||
|
memcpy(&memHandle, cuda_newreg->memHandle, sizeof(memHandle));
|
||||||
|
CUDA_DUMP_MEMHANDLE((100, &memHandle, "Before call to cuIpcOpenMemHandle"));
|
||||||
|
|
||||||
|
/* Open the memory handle and store it into the registration structure. */
|
||||||
|
result = cuIpcOpenMemHandle((CUdeviceptr *)&newreg->alloc_base, memHandle,
|
||||||
|
CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
|
||||||
|
|
||||||
|
/* If there are some stale entries in the cache, they can cause other
|
||||||
|
* registrations to fail. Let the caller know that so that can attempt
|
||||||
|
* to clear them out. */
|
||||||
|
if (CUDA_ERROR_ALREADY_MAPPED == result) {
|
||||||
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
|
"Failed to get handle for p=%p, signal upper layer\n", base);
|
||||||
|
return OMPI_ERR_WOULD_BLOCK;
|
||||||
|
}
|
||||||
|
if (CUDA_SUCCESS != result) {
|
||||||
|
orte_show_help("help-mpi-common-cuda.txt", "cuIpcOpenMemHandle failed",
|
||||||
|
true, result, base);
|
||||||
|
/* Currently, this is a non-recoverable error */
|
||||||
|
return OMPI_ERROR;
|
||||||
|
} else {
|
||||||
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
|
"CUDA: cuIpcOpenMemHandle passed: base=%p",
|
||||||
|
newreg->alloc_base);
|
||||||
|
CUDA_DUMP_MEMHANDLE((200, &memHandle, "cuIpcOpenMemHandle"));
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Close a memory handle that refers to remote memory.
|
||||||
|
*/
|
||||||
|
int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
|
||||||
|
{
|
||||||
|
CUresult result;
|
||||||
|
mca_mpool_rcuda_reg_t *cuda_reg = (mca_mpool_rcuda_reg_t*)reg;
|
||||||
|
|
||||||
|
result = cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
|
||||||
|
if (CUDA_SUCCESS != result) {
|
||||||
|
orte_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
|
||||||
|
true, result, cuda_reg->base.alloc_base);
|
||||||
|
/* We will just continue on and hope things continue to work. */
|
||||||
|
} else {
|
||||||
|
opal_output_verbose(10, mca_common_cuda_output,
|
||||||
|
"CUDA: cuIpcCloseMemHandle passed: base=%p",
|
||||||
|
cuda_reg->base.alloc_base);
|
||||||
|
CUDA_DUMP_MEMHANDLE((10, cuda_reg->memHandle, "cuIpcCloseMemHandle"));
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
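Taken together, cuda_getmemhandle/cuda_openmemhandle/cuda_closememhandle implement the usual CUDA IPC handshake. A minimal stand-alone sketch of that pattern (not part of this commit; error handling omitted, and the channel that carries the 64-byte handle between the two processes is assumed to exist):

    #include <cuda.h>

    /* Owning process: export a handle for a device allocation. */
    CUdeviceptr dbuf;                 /* device memory owned by this process */
    CUipcMemHandle handle;
    cuMemAlloc(&dbuf, 1 << 20);
    cuIpcGetMemHandle(&handle, dbuf); /* handle is plain bytes; ship it to the peer */

    /* Peer process: map the exporter's allocation into its own address space. */
    CUdeviceptr peer_dbuf;
    cuIpcOpenMemHandle(&peer_dbuf, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
    /* ... cuMemcpyAsync() into or out of peer_dbuf ... */
    cuIpcCloseMemHandle(peer_dbuf);   /* unmap when the transfer is complete */

The cuMemGetAddressRange call in cuda_getmemhandle widens the registration to the full allocation, which is what lets the remote side cache one opened handle per allocation rather than one per request. The diff continues below.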
+
+void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle)
+{
+    CUresult result;
+
+    result = cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
+    if (CUDA_SUCCESS != result) {
+        orte_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed",
+                       true, result);
+    }
+
+    result = cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
+    if (CUDA_SUCCESS != result){
+        orte_show_help("help-mpi-common-cuda.txt", "cuIpcGetEventHandle failed",
+                       true, result);
+    }
+
+    CUDA_DUMP_EVTHANDLE((10, handle, "construct_event_and_handle"));
+
+}
+
+void mca_common_cuda_destruct_event(uint64_t *event)
+{
+    CUresult result;
+
+    result = cuEventDestroy((CUevent)event);
+    if (CUDA_SUCCESS != result) {
+        orte_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
+                       true, result);
+    }
+}
+
+
+/*
+ * Put remote event on stream to ensure that the start of the
+ * copy does not start until the completion of the event.
+ */
+void mca_common_wait_stream_synchronize(mca_mpool_rcuda_reg_t *rget_reg)
+{
+    CUipcEventHandle evtHandle;
+    CUevent event;
+    CUresult result;
+
+    memcpy(&evtHandle, rget_reg->evtHandle, sizeof(evtHandle));
+    CUDA_DUMP_EVTHANDLE((2, &evtHandle, "stream_synchronize"));
+
+    result = cuIpcOpenEventHandle(&event, evtHandle);
+    if (CUDA_SUCCESS != result){
+        orte_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed",
+                       true, result);
+    }
+
+    /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
+     * versions.  Need to record an event on the stream, even though
+     * it is not used, to make sure we do not short circuit our way
+     * out of the cuStreamWaitEvent test.
+     */
+    result = cuEventRecord(event, 0);
+    if (CUDA_SUCCESS != result) {
+        orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
+                       true, result);
+    }
+    /* END of Workaround */
+
+    result = cuStreamWaitEvent(0, event, 0);
+    if (CUDA_SUCCESS != result) {
+        orte_show_help("help-mpi-common-cuda.txt", "cuStreamWaitEvent failed",
+                       true, result);
+    }
+
+    /* All done with this event. */
+    result = cuEventDestroy(event);
+    if (CUDA_SUCCESS != result) {
+        orte_show_help("help-mpi-common-cuda.txt", "cuEventDestroy failed",
+                       true, result);
+    }
+}
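The event side follows the same export/import shape; a rough sketch (again not from this commit) of how the two processes would cooperate so the importer's copy cannot start before the exporter's writes complete:

    /* Owning process: create a shareable event, export it, and record it. */
    CUevent evt;
    CUipcEventHandle evt_handle;
    cuEventCreate(&evt, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
    cuIpcGetEventHandle(&evt_handle, evt);
    cuEventRecord(evt, 0);             /* marks completion of the producer's work on stream 0 */
    /* ... evt_handle is shipped to the peer ... */

    /* Peer process: import the event and make its stream wait on it. */
    CUevent peer_evt;
    cuIpcOpenEventHandle(&peer_evt, evt_handle);
    cuStreamWaitEvent(0, peer_evt, 0); /* later work on stream 0 waits for evt to complete */
    cuEventDestroy(peer_evt);

The diff continues below.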
+
+/*
+ * Start the asynchronous copy.  Then record and save away an event that will
+ * be queried to indicate the copy has completed.
+ */
+int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
+                           struct mca_btl_base_descriptor_t *frag, int *done)
+{
+    CUresult result;
+    int iter;
+
+    /* First make sure there is room to store the event.  If not, then
+     * return an error.  The error message will tell the user to try and
+     * run again, but with a larger array for storing events. */
+    if (cuda_event_status_num_used == cuda_event_max) {
+        orte_show_help("help-mpi-common-cuda.txt", "Out of cuEvent handles",
+                       true, cuda_event_max, cuda_event_max+100, cuda_event_max+100);
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* This is the standard way to run.  Running with synchronous copies is available
+     * to measure the advantages of asynchronous copies. */
+    if (OPAL_LIKELY(mca_common_cuda_async)) {
+        result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
+        if (CUDA_SUCCESS != result) {
+            orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
+                           true, dst, src, amount, result);
+            return OMPI_ERROR;
+        } else {
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
+                                dst, src, (int)amount);
+        }
+        result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
+        if (CUDA_SUCCESS != result) {
+            orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
+                           true, result);
+            return OMPI_ERROR;
+        }
+        cuda_event_frag_array[cuda_event_status_first_avail] = frag;
+
+        /* Bump up the first available slot and number used by 1 */
+        cuda_event_status_first_avail++;
+        if (cuda_event_status_first_avail >= cuda_event_max) {
+            cuda_event_status_first_avail = 0;
+        }
+        cuda_event_status_num_used++;
+
+        *done = 0;
+    } else {
+        /* Mimic the async function so they use the same memcpy call. */
+        result = cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, 0);
+        if (CUDA_SUCCESS != result) {
+            orte_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
+                           true, dst, src, amount, result);
+            return OMPI_ERROR;
+        } else {
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
+                                dst, src, (int)amount);
+        }
+
+        /* Record an event, then wait for it to complete with calls to cuEventQuery */
+        result = cuEventRecord(cuda_event_status_array[cuda_event_status_first_avail], 0);
+        if (CUDA_SUCCESS != result) {
+            orte_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
+                           true, result);
+            return OMPI_ERROR;
+        }
+
+        cuda_event_frag_array[cuda_event_status_first_avail] = frag;
+
+        /* Bump up the first available slot and number used by 1 */
+        cuda_event_status_first_avail++;
+        if (cuda_event_status_first_avail >= cuda_event_max) {
+            cuda_event_status_first_avail = 0;
+        }
+        cuda_event_status_num_used++;
+
+        result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
+        if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
+            orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
+                           true, result);
+            return OMPI_ERROR;
+        }
+
+        iter = 0;
+        while (CUDA_ERROR_NOT_READY == result) {
+            if (0 == (iter % 10)) {
+                opal_output(-1, "EVENT NOT DONE (iter=%d)", iter);
+            }
+            result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
+            if ((CUDA_SUCCESS != result) && (CUDA_ERROR_NOT_READY != result)) {
+                orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
+                               true, result);
+                return OMPI_ERROR;
+            }
+            iter++;
+        }
+
+        --cuda_event_status_num_used;
+        ++cuda_event_status_first_used;
+        if (cuda_event_status_first_used >= cuda_event_max) {
+            cuda_event_status_first_used = 0;
+        }
+        *done = 1;
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Function is called every time progress is called with the sm BTL.  If there
+ * are outstanding events, check to see if one has completed.  If so, hand
+ * back the fragment for further processing.
+ */
+int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
+    CUresult result;
+
+    if (cuda_event_status_num_used > 0) {
+        opal_output_verbose(20, mca_common_cuda_output,
+                            "CUDA: progress_one_cuda_event, outstanding_events=%d",
+                            cuda_event_status_num_used);
+
+        result = cuEventQuery(cuda_event_status_array[cuda_event_status_first_used]);
+
+        /* We found an event that is not ready, so return. */
+        if (CUDA_ERROR_NOT_READY == result) {
+            opal_output_verbose(20, mca_common_cuda_output,
+                                "CUDA: cuEventQuery returned CUDA_ERROR_NOT_READY");
+            *frag = NULL;
+            return 0;
+        } else if (CUDA_SUCCESS != result) {
+            orte_show_help("help-mpi-common-cuda.txt", "cuEventQuery failed",
+                           true, result);
+            *frag = NULL;
+            return OMPI_ERROR;
+        }
+
+        *frag = cuda_event_frag_array[cuda_event_status_first_used];
+        opal_output_verbose(5, mca_common_cuda_output,
+                            "CUDA: cuEventQuery returned %d", result);
+
+        /* Bump counters, loop around the circular buffer if necessary */
+        --cuda_event_status_num_used;
+        ++cuda_event_status_first_used;
+        if (cuda_event_status_first_used >= cuda_event_max) {
+            cuda_event_status_first_used = 0;
+        }
+        /* A return value of 1 indicates an event completed and a frag was returned */
+        return 1;
+    }
+    return 0;
+}
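Stripped of the MCA bookkeeping, the non-blocking path in mca_common_cuda_memcpy plus progress_one_cuda_event is the standard record-then-poll idiom. A minimal sketch under that reading (not part of the commit; dst, src and nbytes are assumed to be set up elsewhere):

    CUdeviceptr dst, src;             /* assumed to point at valid buffers */
    size_t nbytes;                    /* assumed transfer size */
    CUevent evt;                      /* the code above draws these from a pre-created pool */

    cuEventCreate(&evt, CU_EVENT_DISABLE_TIMING);
    cuMemcpyAsync(dst, src, nbytes, 0);  /* queue the copy on stream 0, do not block */
    cuEventRecord(evt, 0);               /* tag the copy with an event */

    /* Later, from the progress loop, test for completion without blocking. */
    CUresult rc = cuEventQuery(evt);
    if (CUDA_SUCCESS == rc) {
        /* copy finished: hand the associated fragment back for further processing */
    } else if (CUDA_ERROR_NOT_READY == rc) {
        /* still in flight: check again on the next progress call */
    }

Keeping the events in a fixed-size circular array (cuda_event_status_first_avail/first_used/num_used above) avoids creating and destroying an event per copy; the trade-off is the hard cap reported by the "Out of cuEvent handles" message. The diff continues below.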
+
+/**
+ * Need to make sure the handle we are retrieving from the cache is still
+ * valid.  Compare the cached handle to the one received.
+ */
+int mca_common_cuda_memhandle_matches(mca_mpool_rcuda_reg_t *new_reg,
+                                      mca_mpool_rcuda_reg_t *old_reg)
+{
+
+    if (0 == memcmp(new_reg->memHandle, old_reg->memHandle, sizeof(new_reg->memHandle))) {
+        return 1;
+    } else {
+        return 0;
+    }
+
+}
+
+/*
+ * Function to dump memory handle information.  This is based on
+ * definitions from cuiinterprocess_private.h.
+ */
+static void cuda_dump_memhandle(int verbose, void *memHandle, char *str) {
+
+    struct InterprocessMemHandleInternal
+    {
+        /* The first two entries are the CUinterprocessCtxHandle */
+        int64_t ctxId;  /* unique (within a process) id of the sharing context */
+        int     pid;    /* pid of sharing context */
+
+        int64_t size;
+        int64_t blocksize;
+        int64_t offset;
+        int     gpuId;
+        int     subDeviceIndex;
+        int64_t serial;
+    } memH;
+
+    if (NULL == str) {
+        str = "CUDA";
+    }
+    memcpy(&memH, memHandle, sizeof(memH));
+    opal_output_verbose(verbose, mca_common_cuda_output,
+                        "%s:ctxId=%d, pid=%d, size=%d, blocksize=%d, offset=%d, gpuId=%d, "
+                        "subDeviceIndex=%d, serial=%d",
+                        str, (int)memH.ctxId, memH.pid, (int)memH.size, (int)memH.blocksize, (int)memH.offset,
+                        memH.gpuId, memH.subDeviceIndex, (int)memH.serial);
+}
+
+/*
+ * Function to dump event handle information.  This is based on
+ * definitions from cuiinterprocess_private.h.
+ */
+static void cuda_dump_evthandle(int verbose, void *evtHandle, char *str) {
+
+    struct InterprocessEventHandleInternal
+    {
+        /* The first two entries are the CUinterprocessCtxHandle */
+        int64_t ctxId;  /* unique (within a process) id of the sharing context */
+        int     pid;    /* pid of sharing context */
+
+        int     pad;    /* pad to match the structure */
+        int     index;
+    } evtH;
+
+    if (NULL == str) {
+        str = "CUDA";
+    }
+    memcpy(&evtH, evtHandle, sizeof(evtH));
+    opal_output_verbose(verbose, mca_common_cuda_output,
+                        "%s:ctxId=%d, pid=%d, index=%d",
+                        str, (int)evtH.ctxId, evtH.pid, (int)evtH.index);
+}
+
+
+/* Return microseconds of elapsed time.  Microseconds are relevant when
+ * trying to understand the fixed overhead of the communication.  Used
+ * when trying to time various functions.
+ *
+ * Cut and paste the following to get timings where wanted.
+ *
+ *   clock_gettime(CLOCK_MONOTONIC, &ts_start);
+ *   FUNCTION OF INTEREST
+ *   clock_gettime(CLOCK_MONOTONIC, &ts_end);
+ *   accum = mydifftime(ts_start, ts_end);
+ *   opal_output(0, "Function took   %7.2f usecs\n", accum);
+ *
+ */
+#if CUDA_COMMON_TIMING
+static float mydifftime(struct timespec ts_start, struct timespec ts_end) {
+    float seconds;
+    float microseconds;
+    float nanoseconds;
+
+    /* If we did not rollover the seconds clock, then we just take
+     * the difference between the nanoseconds clock for actual time */
+    if (0 == (ts_end.tv_sec - ts_start.tv_sec)) {
+        nanoseconds = (float)(ts_end.tv_nsec - ts_start.tv_nsec);
+        return nanoseconds / THOUSAND;
+    } else {
+        seconds = (float)(ts_end.tv_sec - ts_start.tv_sec);
+
+        /* Note that this value can be negative or positive
+         * which is fine.  In the case that it is negative, it
+         * just gets subtracted from the difference which is what
+         * we want. */
+        nanoseconds = (float)(ts_end.tv_nsec - ts_start.tv_nsec);
+        microseconds = (seconds * MILLION) + (nanoseconds/THOUSAND);
+        return microseconds;
+    }
+}
+#endif /* CUDA_COMMON_TIMING */
+
+#endif /* OMPI_CUDA_SUPPORT_41 */
@@ -19,9 +19,40 @@
 
 #ifndef OMPI_MCA_COMMON_CUDA_H
 #define OMPI_MCA_COMMON_CUDA_H
+#include "ompi/mca/btl/btl.h"
+
+struct mca_mpool_rcuda_reg_t {
+    mca_mpool_base_registration_t base;
+    uint64_t memHandle[8];
+    uint64_t evtHandle[8];
+    uint64_t event;
+};
+typedef struct mca_mpool_rcuda_reg_t mca_mpool_rcuda_reg_t;
+
 
 OMPI_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg);
 
 OMPI_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg);
 
+OMPI_DECLSPEC void mca_common_wait_stream_synchronize(mca_mpool_rcuda_reg_t *rget_reg);
+
+OMPI_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
+                                         struct mca_btl_base_descriptor_t *, int *done);
+
+OMPI_DECLSPEC int progress_one_cuda_event(struct mca_btl_base_descriptor_t **);
+
+OMPI_DECLSPEC int mca_common_cuda_memhandle_matches(mca_mpool_rcuda_reg_t *new_reg,
+                                                    mca_mpool_rcuda_reg_t *old_reg);
+
+OMPI_DECLSPEC void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle);
+OMPI_DECLSPEC void mca_common_cuda_destruct_event(uint64_t *event);
+
+OMPI_DECLSPEC int cuda_getmemhandle(void *base, size_t, mca_mpool_base_registration_t *newreg,
+                                    mca_mpool_base_registration_t *hdrreg);
+OMPI_DECLSPEC int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg);
+OMPI_DECLSPEC int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg,
+                                     mca_mpool_base_registration_t *hdrreg);
+OMPI_DECLSPEC int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg);
+
+
 #endif /* OMPI_MCA_COMMON_CUDA_H */
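The memHandle and evtHandle members in mca_mpool_rcuda_reg_t are sized as eight uint64_t values, i.e. 64 bytes each, which matches the size of the opaque CUDA IPC handle types (CU_IPC_HANDLE_SIZE). A compile-time check along these lines (my assumption, not in the commit) would document that relationship:

    #include <cuda.h>
    #include <stdint.h>

    /* Fails to compile if a raw CUipcMemHandle/CUipcEventHandle ever stops
     * fitting in the uint64_t[8] fields used to ship the handles around. */
    typedef char assert_memhandle_fits[(sizeof(CUipcMemHandle)   <= 8 * sizeof(uint64_t)) ? 1 : -1];
    typedef char assert_evthandle_fits[(sizeof(CUipcEventHandle) <= 8 * sizeof(uint64_t)) ? 1 : -1];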
@@ -25,6 +25,11 @@ AC_DEFUN([MCA_ompi_common_cuda_CONFIG],[
                           [$1],
                           [$2])
 
+    # Check to see if we have features of CUDA 4.1 available as well.
+    AM_CONDITIONAL([MCA_ompi_cuda_support_41], [test "x$CUDA_SUPPORT_41" = "x1"])
+    AC_DEFINE_UNQUOTED([OMPI_CUDA_SUPPORT_41],$CUDA_SUPPORT_41,
+                       [Whether we want support CUDA 4.1 features])
+
     # Copy over the includes and libs needed to build CUDA
     common_cuda_CPPFLAGS=$opal_datatype_CPPFLAGS
     common_cuda_LIBS=$opal_datatype_LIBS
@@ -46,3 +46,90 @@ The call to cuMemHostUnregister(%p) failed.
 cuMemHostUnregister return value:   %d
 Memory Pool: %s
 #
+[cuIpcGetMemHandle failed]
+The call to cuIpcGetMemHandle failed.  This means the GPU RDMA protocol
+cannot be used.
+cuIpcGetMemHandle return value:   %d
+address: %p
+Check the cuda.h file for what the return value means.  Perhaps a reboot
+of the node will clear the problem.
+#
+[cuMemGetAddressRange failed]
+The call to cuMemGetAddressRange failed.  This means the GPU RDMA protocol
+cannot be used.
+cuMemGetAddressRange return value:   %d
+address: %p
+Check the cuda.h file for what the return value means.  Perhaps a reboot
+of the node will clear the problem.
+#
+[Out of cuEvent handles]
+The library has exceeded its number of outstanding event handles.
+For better performance, this number should be increased.
+Current maximum handles:   %4d
+Suggested new maximum:     %4d
+Rerun with --mca mpi_common_cuda_event_max %d
+#
+[cuIpcOpenMemHandle failed]
+The call to cuIpcOpenMemHandle failed.  This is an unrecoverable error
+and will cause the program to abort.
+cuIpcOpenMemHandle return value:   %d
+address: %p
+Check the cuda.h file for what the return value means.  Perhaps a reboot
+of the node will clear the problem.
+#
+[cuIpcCloseMemHandle failed]
+The call to cuIpcCloseMemHandle failed.  This is a warning and the program
+will continue to run.
+cuIpcCloseMemHandle return value:   %d
+address: %p
+Check the cuda.h file for what the return value means.  Perhaps a reboot
+of the node will clear the problem.
+#
+[cuMemcpyAsync failed]
+The call to cuMemcpyAsync failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuMemcpyAsync(%p, %p, %d) returned value %d
+Check the cuda.h file for what the return value means.
+#
+[cuEventCreate failed]
+The call to cuEventCreate failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuEventCreate return value:   %d
+Check the cuda.h file for what the return value means.
+#
+[cuEventRecord failed]
+The call to cuEventRecord failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuEventRecord return value:   %d
+Check the cuda.h file for what the return value means.
+#
+[cuEventQuery failed]
+The call to cuEventQuery failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuEventQuery return value:   %d
+Check the cuda.h file for what the return value means.
+#
+[cuIpcGetEventHandle failed]
+The call to cuIpcGetEventHandle failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuIpcGetEventHandle return value:   %d
+Check the cuda.h file for what the return value means.
+#
+[cuIpcOpenEventHandle failed]
+The call to cuIpcOpenEventHandle failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuIpcOpenEventHandle return value:   %d
+Check the cuda.h file for what the return value means.
+#
+[cuStreamWaitEvent failed]
+The call to cuStreamWaitEvent failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuStreamWaitEvent return value:   %d
+Check the cuda.h file for what the return value means.
+#
+[cuEventDestroy failed]
+The call to cuEventDestroy failed.  This is an unrecoverable error and will
+cause the program to abort.
+cuEventDestroy return value:   %d
+Check the cuda.h file for what the return value means.
+#
@@ -24,14 +24,14 @@ static bool initialized = false;
 static int opal_cuda_verbose;
 static int opal_cuda_output = 0;
 static void opal_cuda_support_init(void);
-static void (*common_cuda_initialization_function)(void) = NULL;
+static int (*common_cuda_initialization_function)(void) = NULL;
 
 /* This function allows the common cuda code to register an
  * initialization function that gets called the first time an attempt
  * is made to send or receive a GPU pointer.  This allows us to delay
  * some CUDA initialization until after MPI_Init().
  */
-void opal_cuda_add_initialization_function(void (*fptr)(void)) {
+void opal_cuda_add_initialization_function(int (*fptr)(void)) {
     common_cuda_initialization_function = fptr;
 }
 
@@ -14,6 +14,6 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
 bool opal_cuda_check_bufs(char *dest, char *src);
 void* opal_cuda_memcpy(void * dest, void * src, size_t size);
 void* opal_cuda_memmove(void * dest, void * src, size_t size);
-void opal_cuda_add_initialization_function(void (*fptr)(void));
+void opal_cuda_add_initialization_function(int (*fptr)(void));
 
 #endif