diff --git a/ompi/mca/common/cuda/common_cuda.c b/ompi/mca/common/cuda/common_cuda.c index 0535b37bda..68faf3fe86 100644 --- a/ompi/mca/common/cuda/common_cuda.c +++ b/ompi/mca/common/cuda/common_cuda.c @@ -25,24 +25,45 @@ #include "opal/align.h" #include "opal/mca/base/mca_base_param.h" +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" #include "opal/util/output.h" #include "orte/util/show_help.h" #include "common_cuda.h" -static bool initialized = false; +static bool common_cuda_initialized = false; +static bool common_cuda_init_function_added = false; static int mca_common_cuda_verbose; static int mca_common_cuda_output = 0; static bool mca_common_cuda_enabled = false; static bool mca_common_cuda_register_memory = true; static bool mca_common_cuda_warning = true; +static opal_list_t common_cuda_memory_registrations; -void mca_common_cuda_init(void) +/* Structure to hold memory registrations that are delayed until first + * call to send or receive a GPU pointer */ +struct common_cuda_mem_regs_t { + opal_list_item_t super; + void *ptr; + size_t amount; + char *msg; +}; +typedef struct common_cuda_mem_regs_t common_cuda_mem_regs_t; +OBJ_CLASS_DECLARATION(common_cuda_mem_regs_t); +OBJ_CLASS_INSTANCE( common_cuda_mem_regs_t, + opal_list_item_t, + NULL, + NULL ); + + +static void mca_common_cuda_init(void) { - int id, value; + int id, value, i, s; CUresult res; CUcontext cuContext; + common_cuda_mem_regs_t *mem_reg; - if (initialized) { + if (common_cuda_initialized) { return; } @@ -96,16 +117,38 @@ void mca_common_cuda_init(void) mca_common_cuda_register_memory = false; } else { /* All is good. mca_common_cuda_register_memory will retain its original - * value. Normally, that is 1, but the user can override it to disable - * registration of the internal buffers. */ + * value. Normally, that is 1, but the user can override it to disable + * registration of the internal buffers. */ mca_common_cuda_enabled = true; opal_output_verbose(20, mca_common_cuda_output, "CUDA: cuCtxGetCurrent succeeded"); } + s = opal_list_get_size(&common_cuda_memory_registrations); + for(i = 0; i < s; i++) { + mem_reg = (common_cuda_mem_regs_t *) + opal_list_remove_first(&common_cuda_memory_registrations); + if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { + res = cuMemHostRegister(mem_reg->ptr, mem_reg->amount, 0); + if (res != CUDA_SUCCESS) { + /* If registering the memory fails, print a message and continue. + * This is not a fatal error. */ + orte_show_help("help-mpi-common-cuda.txt", "cuMemHostRegister failed", + true, mem_reg->ptr, mem_reg->amount, res, mem_reg->msg); + } else { + opal_output_verbose(20, mca_common_cuda_output, + "CUDA: cuMemHostRegister OK on mpool %s: " + "address=%p, bufsize=%d", + mem_reg->msg, mem_reg->ptr, (int)mem_reg->amount); + } + } + free(mem_reg->msg); + OBJ_RELEASE(mem_reg); + } + opal_output_verbose(30, mca_common_cuda_output, "CUDA: initialized"); - initialized = true; + common_cuda_initialized = true; } @@ -116,8 +159,20 @@ void mca_common_cuda_init(void) void mca_common_cuda_register(void *ptr, size_t amount, char *msg) { int res; - if (!initialized) { - mca_common_cuda_init(); + if (!common_cuda_initialized) { + common_cuda_mem_regs_t *regptr; + if (!common_cuda_init_function_added) { + opal_cuda_add_initialization_function(&mca_common_cuda_init); + OBJ_CONSTRUCT(&common_cuda_memory_registrations, opal_list_t); + common_cuda_init_function_added = true; + } + regptr = OBJ_NEW(common_cuda_mem_regs_t); + regptr->ptr = ptr; + regptr->amount = amount; + regptr->msg = strdup(msg); + opal_list_append(&common_cuda_memory_registrations, + (opal_list_item_t*)regptr); + return; } if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { @@ -143,9 +198,7 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) { void mca_common_cuda_unregister(void *ptr, char *msg) { int res; - if (!initialized) { - mca_common_cuda_init(); - } + assert(true == common_cuda_initialized); if (mca_common_cuda_enabled && mca_common_cuda_register_memory) { res = cuMemHostUnregister(ptr); diff --git a/ompi/mca/common/cuda/common_cuda.h b/ompi/mca/common/cuda/common_cuda.h index b23efbe934..2194c33e89 100644 --- a/ompi/mca/common/cuda/common_cuda.h +++ b/ompi/mca/common/cuda/common_cuda.h @@ -20,8 +20,6 @@ #ifndef OMPI_MCA_COMMON_CUDA_H #define OMPI_MCA_COMMON_CUDA_H -OMPI_DECLSPEC void mca_common_cuda_init(void); - OMPI_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg); OMPI_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 121fa60a42..f1f62331f0 100755 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -24,6 +24,16 @@ static bool initialized = false; static int opal_cuda_verbose; static int opal_cuda_output = 0; static void opal_cuda_support_init(void); +static void (*common_cuda_initialization_function)(void) = NULL; + +/* This function allows the common cuda code to register an + * initialization function that gets called the first time an attempt + * is made to send or receive a GPU pointer. This allows us to delay + * some CUDA initialization until after MPI_Init(). + */ +void opal_cuda_add_initialization_function(void (*fptr)(void)) { + common_cuda_initialization_function = fptr; +} void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) { @@ -31,10 +41,6 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) CUmemorytype memType; CUdeviceptr dbuf = (CUdeviceptr)pUserBuf; - if (!initialized) { - opal_cuda_support_init(); - } - res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); if (res != CUDA_SUCCESS) { @@ -48,6 +54,11 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) /* Must be a device pointer */ assert(memType == CU_MEMORYTYPE_DEVICE); + /* Only do the initialization on the first GPU access */ + if (!initialized) { + opal_cuda_support_init(); + } + convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy; convertor->flags |= CONVERTOR_CUDA; } @@ -132,6 +143,12 @@ static void opal_cuda_support_init(void) return; } + /* Callback into the common cuda initialization routine. This is only + * set if some work had been done already in the common cuda code.*/ + if (NULL != common_cuda_initialization_function) { + common_cuda_initialization_function(); + } + /* Set different levels of verbosity in the cuda related code. */ id = mca_base_param_reg_int_name("opal", "cuda_verbose", "Set level of opal cuda verbosity", diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index f9de625484..09f59fefc0 100755 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -10,11 +10,10 @@ #ifndef _OPAL_DATATYPE_CUDA_H #define _OPAL_DATATYPE_CUDA_H -#include "cuda.h" - void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); bool opal_cuda_check_bufs(char *dest, char *src); void* opal_cuda_memcpy(void * dest, void * src, size_t size); void* opal_cuda_memmove(void * dest, void * src, size_t size); +void opal_cuda_add_initialization_function(void (*fptr)(void)); #endif