Merge pull request #687 from rolfv/pr/fix-smcuda-perfprob

Add the ability to use different size buffers for host and CUDA buffers
2015-07-02 18:42:41 -04:00 · 2015-07-02 18:42:41 -04:00 · 77367ca02c
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@ -965,9 +965,17 @@ cannot_pack:

        /* makes sure that we don't exceed BTL max send size */
        if(bml_btl->btl->btl_max_send_size != 0) {
+#if OPAL_CUDA_SUPPORT
+            size_t max_send_size;
+            if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && (bml_btl->btl->btl_cuda_max_send_size != 0)) {
+                max_send_size = bml_btl->btl->btl_cuda_max_send_size - sizeof(mca_pml_ob1_frag_hdr_t);
+            } else {
+                max_send_size = bml_btl->btl->btl_max_send_size - sizeof(mca_pml_ob1_frag_hdr_t);
+            }
+#else /* OPAL_CUDA_SUPPORT */
            size_t max_send_size = bml_btl->btl->btl_max_send_size -
                sizeof(mca_pml_ob1_frag_hdr_t);
-
+#endif /* OPAL_CUDA_SUPPORT */
            if (size > max_send_size) {
                size = max_send_size;
            }
--- a/opal/mca/btl/base/btl_base_mca.c
+++ b/opal/mca/btl/base/btl_base_mca.c
@ -135,6 +135,14 @@ int mca_btl_base_param_register(mca_base_component_t *version,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &module->btl_cuda_rdma_limit);
 #endif /* OPAL_CUDA_GDR_SUPPORT */
+#if OPAL_CUDA_SUPPORT
+    module->btl_cuda_max_send_size = 0;
+    (void) mca_base_component_var_register(version, "cuda_max_send_size", "Maximum size (in bytes) of a single GPU \"phase 2\" fragment of a long message when using the pipeline protocol (must be >= 1) (only valid on smcuda btl)",
+                                           MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
+                                           OPAL_INFO_LVL_4,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &module->btl_cuda_max_send_size);
+#endif /* OPAL_CUDA_SUPPORT */

    (void) mca_base_component_var_register(version, "max_send_size", "Maximum size (in bytes) of a single \"phase 2\" fragment of a long message when using the pipeline protocol (must be >= 1)",
                                           MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
--- a/opal/mca/btl/btl.h
+++ b/opal/mca/btl/btl.h
@ -1170,6 +1170,9 @@ struct mca_btl_base_module_t {
    size_t      btl_cuda_eager_limit;  /**< switch from eager to RDMA */
    size_t      btl_cuda_rdma_limit;   /**< switch from RDMA to rndv pipeline */
 #endif /* OPAL_CUDA_GDR_SUPPORT */
+#if OPAL_CUDA_SUPPORT
+    size_t      btl_cuda_max_send_size;   /**< set if CUDA max send_size is different from host max send size */
+#endif /* OPAL_CUDA_SUPPORT */
 };
 typedef struct mca_btl_base_module_t mca_btl_base_module_t;

--- a/opal/mca/btl/openib/btl_openib_mca.c
+++ b/opal/mca/btl/openib/btl_openib_mca.c
@ -815,6 +815,11 @@ int btl_openib_verify_mca_params (void)
        }
    }
 #endif /* Workaround */
+    if (0 != mca_btl_openib_module.super.btl_cuda_max_send_size) {
+        opal_show_help("help-mpi-btl-openib.txt", "do_not_set_openib_value",
+                       true, opal_process_info.nodename);
+        mca_btl_openib_module.super.btl_cuda_max_send_size = 0;
+    }
 #endif

 #if BTL_OPENIB_MALLOC_HOOKS_ENABLED
--- a/opal/mca/btl/openib/help-mpi-btl-openib.txt
+++ b/opal/mca/btl/openib/help-mpi-btl-openib.txt
@ -700,3 +700,9 @@ with CUDA GPU Direct RDMA. Either disable GPU Direct RDMA support or
 enable "leave pinned" support. Deactivating the openib BTL.

  Local host:              %s
+#
+[do_not_set_openib_value]
+Open MPI has detected that you have attempted to set the btl_openib_cuda_max_send_size
+value. This is not supported. Setting back to default value of 0.
+
+  Local host:              %s
--- a/opal/mca/btl/smcuda/btl_smcuda_component.c
+++ b/opal/mca/btl/smcuda/btl_smcuda_component.c
@ -173,7 +173,7 @@ static int smcuda_register(void)
 #endif /* OPAL_CUDA_SUPPORT */
    mca_btl_smcuda.super.btl_eager_limit = 4*1024;
    mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
-    mca_btl_smcuda.super.btl_max_send_size = 128*1024;
+    mca_btl_smcuda.super.btl_max_send_size = 32*1024;
    mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024;
    mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
    mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
@ -185,7 +185,12 @@ static int smcuda_register(void)
    /* Call the BTL based to register its MCA params */
    mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version,
                                &mca_btl_smcuda.super);
-
+#if OPAL_CUDA_SUPPORT
+    /* If the user has not set the value, then set it to the default */
+    if (0 == mca_btl_smcuda.super.btl_cuda_max_send_size) {
+        mca_btl_smcuda.super.btl_cuda_max_send_size = 128*1024;
+    }
+#endif /* OPAL_CUDA_SUPPORT */
    return mca_btl_smcuda_component_verify();
 }

@ -214,6 +219,17 @@ static int mca_btl_smcuda_component_open(void)
    mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size;
    mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit;

+#if OPAL_CUDA_SUPPORT
+    /* Possibly adjust max_frag_size if the cuda size is bigger */
+    if (mca_btl_smcuda.super.btl_cuda_max_send_size > mca_btl_smcuda.super.btl_max_send_size) {
+        mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_cuda_max_send_size;
+    }
+    opal_output_verbose(10, opal_btl_base_framework.framework_output,
+                        "btl: smcuda: cuda_max_send_size=%d, max_send_size=%d, max_frag_size=%d",
+                        (int)mca_btl_smcuda.super.btl_cuda_max_send_size, (int)mca_btl_smcuda.super.btl_max_send_size,
+                        (int)mca_btl_smcuda_component.max_frag_size);
+#endif /* OPAL_CUDA_SUPPORT */
+
    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, opal_free_list_t);