1
1

Fix various items discovered by review of ticket #3951.

This commit was SVN r29900.
Этот коммит содержится в:
Rolf vandeVaart 2013-12-13 21:25:07 +00:00
родитель 1bc8f41edb
Коммит b955dbd6d9
7 изменённых файлов: 13 добавлений и 15 удалений

Просмотреть файл

@ -84,12 +84,12 @@ int mca_btl_base_param_register(mca_base_component_t *version,
} }
(void) mca_base_component_var_register(version, "cuda_eager_limit", "Maximum size (in bytes, including header) of \"GPU short\" messages (must be >= 1).", (void) mca_base_component_var_register(version, "cuda_eager_limit", "Maximum size (in bytes, including header) of \"GPU short\" messages (must be >= 1).",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_cuda_eager_limit); &module->btl_cuda_eager_limit);
(void) mca_base_component_var_register(version, "cuda_rdma_limit", "Size (in bytes, including header) of GPU buffer when switch to rndv protocol and pipeline.", (void) mca_base_component_var_register(version, "cuda_rdma_limit", "Size (in bytes, including header) of GPU buffer when switch to rndv protocol and pipeline.",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_cuda_rdma_limit); &module->btl_cuda_rdma_limit);
#endif /* OPAL_CUDA_GDR_SUPPORT */ #endif /* OPAL_CUDA_GDR_SUPPORT */

Просмотреть файл

@ -588,7 +588,7 @@ int btl_openib_register_mca_params(void)
"Whether CUDA GPU Direct RDMA support is built into library or not", "Whether CUDA GPU Direct RDMA support is built into library or not",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY, MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_4, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT, MCA_BASE_VAR_SCOPE_CONSTANT,
&mca_btl_openib_component.cuda_have_gdr); &mca_btl_openib_component.cuda_have_gdr);
@ -602,14 +602,14 @@ int btl_openib_register_mca_params(void)
"Whether Infiniband driver has GPU Direct RDMA support", "Whether Infiniband driver has GPU Direct RDMA support",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY, MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_4, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT, MCA_BASE_VAR_SCOPE_CONSTANT,
&mca_btl_openib_component.driver_have_gdr); &mca_btl_openib_component.driver_have_gdr);
/* Default for GPU Direct RDMA is off for now */ /* Default for GPU Direct RDMA is off for now */
CHECK(reg_bool("want_cuda_gdr", NULL, CHECK(reg_bool("want_cuda_gdr", NULL,
"Enable or disable CUDA GPU Direct RDMA support " "Enable or disable CUDA GPU Direct RDMA support "
"(true = yes; false = no)", "(true = enabled; false = disabled)",
false, &mca_btl_openib_component.cuda_want_gdr)); false, &mca_btl_openib_component.cuda_want_gdr));
if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) { if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) {

Просмотреть файл

@ -1644,7 +1644,7 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
return 0; return 0;
} }
#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT #if OPAL_CUDA_GDR_SUPPORT
/* Check to see if the memory was freed between the time it was stored in /* Check to see if the memory was freed between the time it was stored in
* the registration cache and now. Return true if the memory was previously * the registration cache and now. Return true if the memory was previously
* freed. This is indicated by the BUFFER_ID value in the registration cache * freed. This is indicated by the BUFFER_ID value in the registration cache
@ -1702,10 +1702,7 @@ void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg)
if (CUDA_SUCCESS != res) { if (CUDA_SUCCESS != res) {
opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed", opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
true, ompi_process_info.nodename, res, dbuf); true, ompi_process_info.nodename, res, dbuf);
return OMPI_ERROR;
} }
} }
#endif /* OPAL_CUDA_GDR_SUPPORT */ #endif /* OPAL_CUDA_GDR_SUPPORT */

Просмотреть файл

@ -32,7 +32,6 @@ struct mca_mpool_common_cuda_reg_t {
}; };
typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t; typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
extern bool mca_common_cuda_enabled; extern bool mca_common_cuda_enabled;
#define OMPI_GDR_SUPPORT 1
OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void); OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void);
@ -75,7 +74,7 @@ OMPI_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2); OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void); OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void);
OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT #if OPAL_CUDA_GDR_SUPPORT
OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
#endif /* OPAL_CUDA_GDR_SUPPORT */ #endif /* OPAL_CUDA_GDR_SUPPORT */

Просмотреть файл

@ -476,12 +476,12 @@ static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *add
mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg); mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg);
if (NULL == reg) { if (NULL == reg) {
return 0; return OMPI_SUCCESS;
} }
/* If not previously freed memory, just return 0 */ /* If not previously freed memory, just return 0 */
if (!(mca_common_cuda_previously_freed_memory(reg))) { if (!(mca_common_cuda_previously_freed_memory(reg))) {
return 0; return OMPI_SUCCESS;
} }
/* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */ /* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */

Просмотреть файл

@ -17,7 +17,7 @@ name and its corresponding minimum value is shown below.
BTL eager limit value: %d (set via btl_%s_eager_limit) BTL eager limit value: %d (set via btl_%s_eager_limit)
BTL eager limit minimum: %d BTL eager limit minimum: %d
MCA parameter name: btl_%s_eager_limit MCA parameter name: btl_%s_eager_limit
#
[cuda_eager_limit_too_small] [cuda_eager_limit_too_small]
The "CUDA eager limit" MCA parameter in the %s BTL was set to a value which The "CUDA eager limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job is too low for Open MPI to function properly. Please re-run your job
@ -29,7 +29,7 @@ name and its corresponding minimum value is shown below.
BTL CUDA eager limit value: %d (set via btl_%s_cuda_eager_limit) BTL CUDA eager limit value: %d (set via btl_%s_cuda_eager_limit)
BTL CUDA eager limit minimum: %d BTL CUDA eager limit minimum: %d
MCA parameter name: btl_%s_cuda_eager_limit MCA parameter name: btl_%s_cuda_eager_limit
#
[cuda_rdma_limit_too_small] [cuda_rdma_limit_too_small]
The "CUDA rdma limit" MCA parameter in the %s BTL was set to a value which The "CUDA rdma limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job is too low for Open MPI to function properly. Please re-run your job

Просмотреть файл

@ -372,6 +372,8 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) { if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t); sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t);
} }
/* If size is 0, then this value is unused. If it is non-zero then do some
* extra checking of it. */
if (0 != sm->btl_module->btl_cuda_eager_limit) { if (0 != sm->btl_module->btl_cuda_eager_limit) {
if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) { if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small", opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",