diff --git a/ompi/mca/common/cuda/common_cuda.c b/ompi/mca/common/cuda/common_cuda.c index 1b9b18a227..242950566e 100644 --- a/ompi/mca/common/cuda/common_cuda.c +++ b/ompi/mca/common/cuda/common_cuda.c @@ -1614,7 +1614,13 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) } #if OPAL_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT -int mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg) +/* Check to see if the memory was freed between the time it was stored in + * the registration cache and now. Return true if the memory was previously + * freed. This is indicated by the BUFFER_ID value in the registration cache + * not matching the BUFFER_ID of the buffer we are checking. Return false + * if the registration is still good. + */ +bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg) { int res; unsigned long long bufID; @@ -1622,17 +1628,20 @@ int mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg) res = cuFunc.cuPointerGetAttribute(&bufID, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr)dbuf); + /* If we cannot determine the BUFFER_ID, then print a message and default + * to forcing the registration to be kicked out. */ if (res != CUDA_SUCCESS) { - opal_show_help("help-mpi-common-cuda.txt", "bufferID failed", true, res); - return 0; + opal_show_help("help-mpi-common-cuda.txt", "bufferID failed", + true, ompi_process_info.nodename, res); + return true; } opal_output_verbose(50, mca_common_cuda_output, "CUDA: base=%p, bufID=%llu, reg->gpu_bufID=%llu, %s", dbuf, bufID, reg->gpu_bufID, (reg->gpu_bufID == bufID ? 
"BUFFER_ID match":"BUFFER_ID do not match")); if (bufID != reg->gpu_bufID) { - return 1; + return true; } else { - return 0; + return false; } } diff --git a/ompi/mca/common/cuda/common_cuda.h b/ompi/mca/common/cuda/common_cuda.h index 343734de85..e404027419 100644 --- a/ompi/mca/common/cuda/common_cuda.h +++ b/ompi/mca/common/cuda/common_cuda.h @@ -76,7 +76,7 @@ OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void); OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); #if OPAL_CUDA_SUPPORT_60 && OMPI_GDR_SUPPORT -OMPI_DECLSPEC int mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); +OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); #endif /* OPAL_CUDA_SUPPORT_60 */ /** diff --git a/ompi/mca/common/cuda/help-mpi-common-cuda.txt b/ompi/mca/common/cuda/help-mpi-common-cuda.txt index fc2d53a039..75173a4b1b 100644 --- a/ompi/mca/common/cuda/help-mpi-common-cuda.txt +++ b/ompi/mca/common/cuda/help-mpi-common-cuda.txt @@ -167,9 +167,10 @@ An error occurred while trying to map in the address of a function. CUDA-aware support is disabled. # [bufferID failed] -An error occurred while trying to get the BUFFER_ID of a GPU memory regiion. This -could cause incorrect results. Turn of GPU Direct RDMA support by running with ---mca btl_openib_cuda_want_gdr_support 0. +An error occurred while trying to get the BUFFER_ID of a GPU memory +region. This could cause incorrect results. Turn off GPU Direct RDMA +support by running with --mca btl_openib_cuda_want_gdr_support 0. + Hostname: %s cuPointerGetAttribute return value: %d Check the cuda.h file for what the return value means. [cuPointerSetAttribute failed]