diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index b964901822..f1f2744b2e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -965,9 +965,17 @@ cannot_pack: /* makes sure that we don't exceed BTL max send size */ if(bml_btl->btl->btl_max_send_size != 0) { +#if OPAL_CUDA_SUPPORT + size_t max_send_size; + if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && (bml_btl->btl->btl_cuda_max_send_size != 0)) { + max_send_size = bml_btl->btl->btl_cuda_max_send_size - sizeof(mca_pml_ob1_frag_hdr_t); + } else { + max_send_size = bml_btl->btl->btl_max_send_size - sizeof(mca_pml_ob1_frag_hdr_t); + } +#else /* OPAL_CUDA_SUPPORT */ size_t max_send_size = bml_btl->btl->btl_max_send_size - sizeof(mca_pml_ob1_frag_hdr_t); - +#endif /* OPAL_CUDA_SUPPORT */ if (size > max_send_size) { size = max_send_size; } diff --git a/opal/mca/btl/base/btl_base_mca.c b/opal/mca/btl/base/btl_base_mca.c index 0e46d591fd..3d8a2e54c6 100644 --- a/opal/mca/btl/base/btl_base_mca.c +++ b/opal/mca/btl/base/btl_base_mca.c @@ -135,6 +135,14 @@ int mca_btl_base_param_register(mca_base_component_t *version, MCA_BASE_VAR_SCOPE_READONLY, &module->btl_cuda_rdma_limit); #endif /* OPAL_CUDA_GDR_SUPPORT */ +#if OPAL_CUDA_SUPPORT + module->btl_cuda_max_send_size = 0; + (void) mca_base_component_var_register(version, "cuda_max_send_size", "Maximum size (in bytes) of a single GPU \"phase 2\" fragment of a long message when using the pipeline protocol (must be >= 1) (only valid on smcuda btl)", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &module->btl_cuda_max_send_size); +#endif /* OPAL_CUDA_SUPPORT */ (void) mca_base_component_var_register(version, "max_send_size", "Maximum size (in bytes) of a single \"phase 2\" fragment of a long message when using the pipeline protocol (must be >= 1)", MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, diff --git a/opal/mca/btl/btl.h 
b/opal/mca/btl/btl.h index b41e54b353..885a6fc0f4 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -1170,6 +1170,9 @@ struct mca_btl_base_module_t { size_t btl_cuda_eager_limit; /**< switch from eager to RDMA */ size_t btl_cuda_rdma_limit; /**< switch from RDMA to rndv pipeline */ #endif /* OPAL_CUDA_GDR_SUPPORT */ +#if OPAL_CUDA_SUPPORT + size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */ +#endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_base_module_t mca_btl_base_module_t; diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 19fa22c338..1c3edc5a7e 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -815,6 +815,11 @@ int btl_openib_verify_mca_params (void) } } #endif /* Workaround */ + if (0 != mca_btl_openib_module.super.btl_cuda_max_send_size) { + opal_show_help("help-mpi-btl-openib.txt", "do_not_set_openib_value", + true, opal_process_info.nodename); + mca_btl_openib_module.super.btl_cuda_max_send_size = 0; + } #endif #if BTL_OPENIB_MALLOC_HOOKS_ENABLED diff --git a/opal/mca/btl/openib/help-mpi-btl-openib.txt b/opal/mca/btl/openib/help-mpi-btl-openib.txt index 94dcc7b8f5..7266893b6e 100644 --- a/opal/mca/btl/openib/help-mpi-btl-openib.txt +++ b/opal/mca/btl/openib/help-mpi-btl-openib.txt @@ -700,3 +700,9 @@ with CUDA GPU Direct RDMA. Either disable GPU Direct RDMA support or enable "leave pinned" support. Deactivating the openib BTL. Local host: %s +# +[do_not_set_openib_value] +Open MPI has detected that you have attempted to set the btl_openib_cuda_max_send_size +value. This is not supported. Setting back to default value of 0. 
+ + Local host: %s diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index f9639f13d3..f18e95e079 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -173,7 +173,7 @@ static int smcuda_register(void) #endif /* OPAL_CUDA_SUPPORT */ mca_btl_smcuda.super.btl_eager_limit = 4*1024; mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024; - mca_btl_smcuda.super.btl_max_send_size = 128*1024; + mca_btl_smcuda.super.btl_max_send_size = 32*1024; mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024; mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024; mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024; @@ -185,7 +185,12 @@ static int smcuda_register(void) /* Call the BTL based to register its MCA params */ mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version, &mca_btl_smcuda.super); - +#if OPAL_CUDA_SUPPORT + /* If user has not set the value, then set to the default */ + if (0 == mca_btl_smcuda.super.btl_cuda_max_send_size) { + mca_btl_smcuda.super.btl_cuda_max_send_size = 128*1024; + } +#endif /* OPAL_CUDA_SUPPORT */ return mca_btl_smcuda_component_verify(); } @@ -214,6 +219,17 @@ static int mca_btl_smcuda_component_open(void) mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size; mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit; +#if OPAL_CUDA_SUPPORT + /* Possibly adjust max_frag_size if the cuda size is bigger */ + if (mca_btl_smcuda.super.btl_cuda_max_send_size > mca_btl_smcuda.super.btl_max_send_size) { + mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_cuda_max_send_size; + } + opal_output_verbose(10, opal_btl_base_framework.framework_output, + "btl: smcuda: cuda_max_send_size=%d, max_send_size=%d, max_frag_size=%d", + (int)mca_btl_smcuda.super.btl_cuda_max_send_size, (int)mca_btl_smcuda.super.btl_max_send_size, + (int)mca_btl_smcuda_component.max_frag_size); 
+#endif /* OPAL_CUDA_SUPPORT */ + /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, opal_free_list_t);