From b955dbd6d96ebbbf2424b14586f7a56572bd2bf7 Mon Sep 17 00:00:00 2001 From: Rolf vandeVaart Date: Fri, 13 Dec 2013 21:25:07 +0000 Subject: [PATCH] Fix various items discovered by review of ticket #3951. This commit was SVN r29900. --- ompi/mca/btl/base/btl_base_mca.c | 4 ++-- ompi/mca/btl/openib/btl_openib_mca.c | 6 +++--- ompi/mca/common/cuda/common_cuda.c | 5 +---- ompi/mca/common/cuda/common_cuda.h | 3 +-- ompi/mca/mpool/grdma/mpool_grdma_module.c | 4 ++-- ompi/mca/pml/ob1/help-mpi-pml-ob1.txt | 4 ++-- ompi/mca/pml/ob1/pml_ob1.c | 2 ++ 7 files changed, 13 insertions(+), 15 deletions(-) diff --git a/ompi/mca/btl/base/btl_base_mca.c b/ompi/mca/btl/base/btl_base_mca.c index 4b250d4f75..b5ad3e7bd1 100644 --- a/ompi/mca/btl/base/btl_base_mca.c +++ b/ompi/mca/btl/base/btl_base_mca.c @@ -84,12 +84,12 @@ int mca_btl_base_param_register(mca_base_component_t *version, } (void) mca_base_component_var_register(version, "cuda_eager_limit", "Maximum size (in bytes, including header) of \"GPU short\" messages (must be >= 1).", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, - OPAL_INFO_LVL_9, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &module->btl_cuda_eager_limit); (void) mca_base_component_var_register(version, "cuda_rdma_limit", "Size (in bytes, including header) of GPU buffer when switch to rndv protocol and pipeline.", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, - OPAL_INFO_LVL_9, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &module->btl_cuda_rdma_limit); #endif /* OPAL_CUDA_GDR_SUPPORT */ diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index 02aa2b869a..036423e351 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -588,7 +588,7 @@ int btl_openib_register_mca_params(void) "Whether CUDA GPU Direct RDMA support is built into library or not", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_4, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_CONSTANT, 
&mca_btl_openib_component.cuda_have_gdr); @@ -602,14 +602,14 @@ int btl_openib_register_mca_params(void) "Whether Infiniband driver has GPU Direct RDMA support", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_4, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_CONSTANT, &mca_btl_openib_component.driver_have_gdr); /* Default for GPU Direct RDMA is off for now */ CHECK(reg_bool("want_cuda_gdr", NULL, "Enable or disable CUDA GPU Direct RDMA support " - "(true = yes; false = no)", + "(true = enabled; false = disabled)", false, &mca_btl_openib_component.cuda_want_gdr)); if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) { diff --git a/ompi/mca/common/cuda/common_cuda.c b/ompi/mca/common/cuda/common_cuda.c index 62c14a0168..eb4c459102 100644 --- a/ompi/mca/common/cuda/common_cuda.c +++ b/ompi/mca/common/cuda/common_cuda.c @@ -1644,7 +1644,7 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) return 0; } -#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT +#if OPAL_CUDA_GDR_SUPPORT /* Check to see if the memory was freed between the time it was stored in * the registration cache and now. Return true if the memory was previously * freed. 
This is indicated by the BUFFER_ID value in the registration cache @@ -1702,10 +1702,7 @@ void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg) if (CUDA_SUCCESS != res) { opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed", true, ompi_process_info.nodename, res, dbuf); - return OMPI_ERROR; } - - } #endif /* OPAL_CUDA_GDR_SUPPORT */ diff --git a/ompi/mca/common/cuda/common_cuda.h b/ompi/mca/common/cuda/common_cuda.h index ea2aca48bd..cfc8f32027 100644 --- a/ompi/mca/common/cuda/common_cuda.h +++ b/ompi/mca/common/cuda/common_cuda.h @@ -32,7 +32,6 @@ struct mca_mpool_common_cuda_reg_t { }; typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t; extern bool mca_common_cuda_enabled; -#define OMPI_GDR_SUPPORT 1 OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void); @@ -75,7 +74,7 @@ OMPI_DECLSPEC int mca_common_cuda_get_device(int *devicenum); OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2); OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void); OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); -#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT +#if OPAL_CUDA_GDR_SUPPORT OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); #endif /* OPAL_CUDA_GDR_SUPPORT */ diff --git a/ompi/mca/mpool/grdma/mpool_grdma_module.c b/ompi/mca/mpool/grdma/mpool_grdma_module.c index 8f207634ed..437d815caf 100644 --- a/ompi/mca/mpool/grdma/mpool_grdma_module.c +++ b/ompi/mca/mpool/grdma/mpool_grdma_module.c @@ -476,12 +476,12 @@ static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *add mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg); if (NULL == reg) { - return 0; + return OMPI_SUCCESS; } /* If not previously freed memory, just return 0 */ if (!(mca_common_cuda_previously_freed_memory(reg))) 
{ - return 0; + return OMPI_SUCCESS; } /* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */ diff --git a/ompi/mca/pml/ob1/help-mpi-pml-ob1.txt b/ompi/mca/pml/ob1/help-mpi-pml-ob1.txt index e7828bca6c..4278bd2059 100644 --- a/ompi/mca/pml/ob1/help-mpi-pml-ob1.txt +++ b/ompi/mca/pml/ob1/help-mpi-pml-ob1.txt @@ -17,7 +17,7 @@ name and its corresponding minimum value is shown below. BTL eager limit value: %d (set via btl_%s_eager_limit) BTL eager limit minimum: %d MCA parameter name: btl_%s_eager_limit - +# [cuda_eager_limit_too_small] The "CUDA eager limit" MCA parameter in the %s BTL was set to a value which is too low for Open MPI to function properly. Please re-run your job @@ -29,7 +29,7 @@ name and its corresponding minimum value is shown below. BTL CUDA eager limit value: %d (set via btl_%s_cuda_eager_limit) BTL CUDA eager limit minimum: %d MCA parameter name: btl_%s_cuda_eager_limit - +# [cuda_rdma_limit_too_small] The "CUDA rdma limit" MCA parameter in the %s BTL was set to a value which is too low for Open MPI to function properly. Please re-run your job diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index 25c69075ef..bfb975afa5 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -372,6 +372,8 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs) if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) { sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t); } + /* If size is 0, then this value is unused. If it is non-zero then do some + * extra checking of it. */ if (0 != sm->btl_module->btl_cuda_eager_limit) { if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) { opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",