Fix various items discovered by review of ticket #3951.
This commit was SVN r29900.
Этот коммит содержится в:
родитель
1bc8f41edb
Коммит
b955dbd6d9
@ -84,12 +84,12 @@ int mca_btl_base_param_register(mca_base_component_t *version,
|
|||||||
}
|
}
|
||||||
(void) mca_base_component_var_register(version, "cuda_eager_limit", "Maximum size (in bytes, including header) of \"GPU short\" messages (must be >= 1).",
|
(void) mca_base_component_var_register(version, "cuda_eager_limit", "Maximum size (in bytes, including header) of \"GPU short\" messages (must be >= 1).",
|
||||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_9,
|
OPAL_INFO_LVL_5,
|
||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&module->btl_cuda_eager_limit);
|
&module->btl_cuda_eager_limit);
|
||||||
(void) mca_base_component_var_register(version, "cuda_rdma_limit", "Size (in bytes, including header) of GPU buffer when switch to rndv protocol and pipeline.",
|
(void) mca_base_component_var_register(version, "cuda_rdma_limit", "Size (in bytes, including header) of GPU buffer when switch to rndv protocol and pipeline.",
|
||||||
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
|
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_9,
|
OPAL_INFO_LVL_5,
|
||||||
MCA_BASE_VAR_SCOPE_READONLY,
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
&module->btl_cuda_rdma_limit);
|
&module->btl_cuda_rdma_limit);
|
||||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||||
|
@ -588,7 +588,7 @@ int btl_openib_register_mca_params(void)
|
|||||||
"Whether CUDA GPU Direct RDMA support is built into library or not",
|
"Whether CUDA GPU Direct RDMA support is built into library or not",
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
||||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||||
OPAL_INFO_LVL_4,
|
OPAL_INFO_LVL_5,
|
||||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||||
&mca_btl_openib_component.cuda_have_gdr);
|
&mca_btl_openib_component.cuda_have_gdr);
|
||||||
|
|
||||||
@ -602,14 +602,14 @@ int btl_openib_register_mca_params(void)
|
|||||||
"Whether Infiniband driver has GPU Direct RDMA support",
|
"Whether Infiniband driver has GPU Direct RDMA support",
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
||||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||||
OPAL_INFO_LVL_4,
|
OPAL_INFO_LVL_5,
|
||||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||||
&mca_btl_openib_component.driver_have_gdr);
|
&mca_btl_openib_component.driver_have_gdr);
|
||||||
|
|
||||||
/* Default for GPU Direct RDMA is off for now */
|
/* Default for GPU Direct RDMA is off for now */
|
||||||
CHECK(reg_bool("want_cuda_gdr", NULL,
|
CHECK(reg_bool("want_cuda_gdr", NULL,
|
||||||
"Enable or disable CUDA GPU Direct RDMA support "
|
"Enable or disable CUDA GPU Direct RDMA support "
|
||||||
"(true = yes; false = no)",
|
"(true = enabled; false = disabled)",
|
||||||
false, &mca_btl_openib_component.cuda_want_gdr));
|
false, &mca_btl_openib_component.cuda_want_gdr));
|
||||||
|
|
||||||
if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) {
|
if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) {
|
||||||
|
@ -1644,7 +1644,7 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT
|
#if OPAL_CUDA_GDR_SUPPORT
|
||||||
/* Check to see if the memory was freed between the time it was stored in
|
/* Check to see if the memory was freed between the time it was stored in
|
||||||
* the registration cache and now. Return true if the memory was previously
|
* the registration cache and now. Return true if the memory was previously
|
||||||
* freed. This is indicated by the BUFFER_ID value in the registration cache
|
* freed. This is indicated by the BUFFER_ID value in the registration cache
|
||||||
@ -1702,10 +1702,7 @@ void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg)
|
|||||||
if (CUDA_SUCCESS != res) {
|
if (CUDA_SUCCESS != res) {
|
||||||
opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
|
opal_show_help("help-mpi-common-cuda.txt", "cuPointerSetAttribute failed",
|
||||||
true, ompi_process_info.nodename, res, dbuf);
|
true, ompi_process_info.nodename, res, dbuf);
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||||
|
|
||||||
|
@ -32,7 +32,6 @@ struct mca_mpool_common_cuda_reg_t {
|
|||||||
};
|
};
|
||||||
typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
|
typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
|
||||||
extern bool mca_common_cuda_enabled;
|
extern bool mca_common_cuda_enabled;
|
||||||
#define OMPI_GDR_SUPPORT 1
|
|
||||||
|
|
||||||
OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void);
|
OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void);
|
||||||
|
|
||||||
@ -75,7 +74,7 @@ OMPI_DECLSPEC int mca_common_cuda_get_device(int *devicenum);
|
|||||||
OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
|
OMPI_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2);
|
||||||
OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void);
|
OMPI_DECLSPEC int mca_common_cuda_stage_one_init(void);
|
||||||
OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
|
OMPI_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base);
|
||||||
#if OPAL_CUDA_GDR_SUPPORT && OMPI_GDR_SUPPORT
|
#if OPAL_CUDA_GDR_SUPPORT
|
||||||
OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
|
OMPI_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg);
|
||||||
OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
|
OMPI_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg);
|
||||||
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
#endif /* OPAL_CUDA_GDR_SUPPORT */
|
||||||
|
@ -476,12 +476,12 @@ static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *add
|
|||||||
|
|
||||||
mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg);
|
mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg);
|
||||||
if (NULL == reg) {
|
if (NULL == reg) {
|
||||||
return 0;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If not previously freed memory, just return 0 */
|
/* If not previously freed memory, just return 0 */
|
||||||
if (!(mca_common_cuda_previously_freed_memory(reg))) {
|
if (!(mca_common_cuda_previously_freed_memory(reg))) {
|
||||||
return 0;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */
|
/* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */
|
||||||
|
@ -17,7 +17,7 @@ name and its corresponding minimum value is shown below.
|
|||||||
BTL eager limit value: %d (set via btl_%s_eager_limit)
|
BTL eager limit value: %d (set via btl_%s_eager_limit)
|
||||||
BTL eager limit minimum: %d
|
BTL eager limit minimum: %d
|
||||||
MCA parameter name: btl_%s_eager_limit
|
MCA parameter name: btl_%s_eager_limit
|
||||||
|
#
|
||||||
[cuda_eager_limit_too_small]
|
[cuda_eager_limit_too_small]
|
||||||
The "CUDA eager limit" MCA parameter in the %s BTL was set to a value which
|
The "CUDA eager limit" MCA parameter in the %s BTL was set to a value which
|
||||||
is too low for Open MPI to function properly. Please re-run your job
|
is too low for Open MPI to function properly. Please re-run your job
|
||||||
@ -29,7 +29,7 @@ name and its corresponding minimum value is shown below.
|
|||||||
BTL CUDA eager limit value: %d (set via btl_%s_cuda_eager_limit)
|
BTL CUDA eager limit value: %d (set via btl_%s_cuda_eager_limit)
|
||||||
BTL CUDA eager limit minimum: %d
|
BTL CUDA eager limit minimum: %d
|
||||||
MCA parameter name: btl_%s_cuda_eager_limit
|
MCA parameter name: btl_%s_cuda_eager_limit
|
||||||
|
#
|
||||||
[cuda_rdma_limit_too_small]
|
[cuda_rdma_limit_too_small]
|
||||||
The "CUDA rdma limit" MCA parameter in the %s BTL was set to a value which
|
The "CUDA rdma limit" MCA parameter in the %s BTL was set to a value which
|
||||||
is too low for Open MPI to function properly. Please re-run your job
|
is too low for Open MPI to function properly. Please re-run your job
|
||||||
|
@ -372,6 +372,8 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
|
|||||||
if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
|
if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
|
||||||
sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t);
|
sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t);
|
||||||
}
|
}
|
||||||
|
/* If size is 0, then this value is unused. If it is non-zero then do some
|
||||||
|
* extra checking of it. */
|
||||||
if (0 != sm->btl_module->btl_cuda_eager_limit) {
|
if (0 != sm->btl_module->btl_cuda_eager_limit) {
|
||||||
if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
|
if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
|
||||||
opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",
|
opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user