
Per this RFC from October 8, 2013, and as discussed in the telecon.

http://www.open-mpi.org/community/lists/devel/2013/10/13072.php

Add support for pinning GPU memory for GPU Direct RDMA in the openib BTL, for better small-message latency of GPU buffers.
Note that none of this is compiled in unless CUDA-aware support is requested.

This commit was SVN r29680.
This commit is contained in:
Rolf vandeVaart 2013-11-13 13:22:39 +00:00
parent 840e2cb4a2
commit 4964a5e98b
13 changed files with 234 additions and 7 deletions
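For orientation before the per-file hunks: the patch adds two size limits to the BTL module (btl_cuda_eager_limit and btl_cuda_rdma_limit) and uses them in the ob1 PML to pick a protocol for GPU buffers. Below is a minimal sketch of that decision logic; choose_cuda_protocol() and the enum are illustrative names, not part of the commit, and the header subtraction mirrors the checks added in the pml_ob1 send-request hunks further down.

#include <stddef.h>

/* Illustrative sketch only -- not part of this commit. */
typedef enum {
    CUDA_PROTO_EAGER,          /* small GPU message: eager copy-in/copy-out */
    CUDA_PROTO_RGET,           /* GPU Direct RDMA get */
    CUDA_PROTO_RNDV_PIPELINE   /* rendezvous + asynchronous copy pipeline */
} cuda_proto_t;

static cuda_proto_t choose_cuda_protocol(size_t msg_size, size_t hdr_size,
                                         size_t cuda_eager_limit,
                                         size_t cuda_rdma_limit)
{
    if (0 == cuda_eager_limit) {
        return CUDA_PROTO_RNDV_PIPELINE;  /* GDR disabled: pre-existing CUDA path */
    }
    if (msg_size <= cuda_eager_limit - hdr_size) {
        return CUDA_PROTO_EAGER;
    }
    if (msg_size <= cuda_rdma_limit - hdr_size) {
        return CUDA_PROTO_RGET;
    }
    return CUDA_PROTO_RNDV_PIPELINE;
}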

View file

@@ -12,6 +12,7 @@
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
@@ -75,6 +76,23 @@ int mca_btl_base_param_register(mca_base_component_t *version,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_eager_limit);
#if OPAL_CUDA_SUPPORT_60
/* If the BTL has no CUDA RDMA (GET) support, disable these limits */
if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
module->btl_cuda_eager_limit = 0;
module->btl_cuda_rdma_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "cuda_eager_limit", "Maximum size (in bytes, including header) of \"GPU short\" messages (must be >= 1).",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_cuda_eager_limit);
(void) mca_base_component_var_register(version, "cuda_rdma_limit", "Size (in bytes, including header) of GPU buffer when switch to rndv protocol and pipeline.",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_cuda_rdma_limit);
#endif /* OPAL_CUDA_SUPPORT_60 */
(void) mca_base_component_var_register(version, "max_send_size", "Maximum size (in bytes) of a single \"phase 2\" fragment of a long message when using the pipeline protocol (must be >= 1)",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
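Since mca_base_component_var_register() derives the user-visible name from the registering component, these two knobs should surface for the openib BTL as btl_openib_cuda_eager_limit and btl_openib_cuda_rdma_limit; that naming is inferred from the standard MCA scheme, not spelled out in the diff.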

View file

@@ -13,7 +13,7 @@
* Copyright (c) 2006-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -843,6 +843,10 @@ struct mca_btl_base_module_t {
mca_btl_base_module_register_error_fn_t btl_register_error;
/** fault tolerant event notification */
mca_btl_base_module_ft_event_fn_t btl_ft_event;
#if OPAL_CUDA_SUPPORT_60
size_t btl_cuda_eager_limit; /**< switch from eager to RDMA */
size_t btl_cuda_rdma_limit; /**< switch from RDMA to rndv pipeline */
#endif /* OPAL_CUDA_SUPPORT_60 */
};
typedef struct mca_btl_base_module_t mca_btl_base_module_t;

View file

@@ -1219,7 +1219,11 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
openib_btl = (mca_btl_openib_module_t*)btl;
#if OPAL_CUDA_SUPPORT_60
if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) {
#else
if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) {
#endif /* OPAL_CUDA_SUPPORT_60 */
/* GMS bloody HACK! */
if(registration != NULL || max_data > btl->btl_max_send_size) {
frag = alloc_send_user_frag();
@@ -1376,7 +1380,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
/* we didn't get a memory registration passed in, so we have to
* register the region ourselves
*/
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, 0,
uint32_t mflags = 0;
#if OPAL_CUDA_SUPPORT
if (convertor->flags & CONVERTOR_CUDA) {
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
}
#endif /* OPAL_CUDA_SUPPORT */
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags,
&registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(frag);
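The mflags plumbing above is how the BTL tells the memory pool that a registration covers GPU rather than host memory; the grdma changes below key off MCA_MPOOL_FLAGS_CUDA_GPU_MEM to widen the registration to the full CUDA allocation and to stamp it with a buffer id.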

View file

@@ -311,9 +311,11 @@ struct mca_btl_openib_component_t {
size_t memalign_threshold;
void* (*previous_malloc_hook)(size_t __size, const void*);
#endif
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
#if OPAL_CUDA_SUPPORT
bool cuda_async_send;
bool cuda_async_recv;
bool cuda_have_gdr;
bool cuda_want_gdr;
#endif /* OPAL_CUDA_SUPPORT */
#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
bool rroce_enable;

View file

@@ -580,6 +580,37 @@ int btl_openib_register_mca_params(void)
mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
/* Turn off message coalescing - not sure if it works with GPU buffers */
mca_btl_openib_component.use_message_coalescing = 0;
/* Indicates whether the library was built with GPU Direct RDMA support. Not changeable. */
mca_btl_openib_component.cuda_have_gdr = OPAL_INT_TO_BOOL(OPAL_CUDA_SUPPORT_60);
(void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_cuda_gdr_support",
"Whether CUDA GPU Direct RDMA support is built into library or not",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_CONSTANT,
&mca_btl_openib_component.cuda_have_gdr);
/* Default for GPU Direct RDMA is off for now */
CHECK(reg_bool("cuda_want_gdr_support", NULL,
"Enable or disable CUDA GPU Direct RDMA support "
"(true = yes; false = no)",
false, &mca_btl_openib_component.cuda_want_gdr));
if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) {
opal_output(0, "GDR support requested but library does not have it built in.");
return OMPI_ERROR;
}
#if OPAL_CUDA_SUPPORT_60
if (mca_btl_openib_component.cuda_want_gdr) {
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
mca_btl_openib_module.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number - indicates set it to minimum */
mca_btl_openib_module.super.btl_cuda_rdma_limit = 1024 * 20; /* default switchover is 20K to pipeline */
} else {
mca_btl_openib_module.super.btl_cuda_eager_limit = 0; /* Turns off any of the GPU Direct RDMA code */
mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */
}
#endif /* OPAL_CUDA_SUPPORT_60 */
#endif /* OPAL_CUDA_SUPPORT */
CHECK(mca_btl_base_param_register(
&mca_btl_openib_component.super.btl_version,

View file

@@ -32,7 +32,7 @@ struct mca_mpool_common_cuda_reg_t {
};
typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
extern bool mca_common_cuda_enabled;
#define OMPI_GDR_SUPPORT 0
#define OMPI_GDR_SUPPORT 1
OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void);

View file

@@ -47,6 +47,10 @@ mcacomponent_LTLIBRARIES = $(component_install)
mca_mpool_grdma_la_SOURCES = $(sources)
mca_mpool_grdma_la_LDFLAGS = -module -avoid-version
mca_mpool_grdma_la_LIBADD = $(mpool_grdma_LIBS)
if OPAL_cuda_support
mca_mpool_grdma_la_LIBADD += \
$(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
endif
noinst_LTLIBRARIES = $(component_noinst)
libmca_mpool_grdma_la_SOURCES = $(sources)

View file

@@ -36,6 +36,9 @@
#include "opal/align.h"
#if OPAL_CUDA_SUPPORT_60
#include "ompi/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT_60 */
#include "ompi/mca/rcache/rcache.h"
#include "ompi/mca/rcache/base/base.h"
#include "ompi/mca/rte/rte.h"
@@ -44,6 +47,9 @@
#include "ompi/mca/mpool/base/base.h"
#include "mpool_grdma.h"
#if OPAL_CUDA_SUPPORT_60
static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size);
#endif /* OPAL_CUDA_SUPPORT_60 */
static void mca_mpool_grdma_pool_contructor (mca_mpool_grdma_pool_t *pool)
{
memset ((void *)((uintptr_t)pool + sizeof (pool->super)), 0, sizeof (*pool) - sizeof (pool->super));
@@ -230,6 +236,17 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
do_unregistration_gc(mpool);
#if OPAL_CUDA_SUPPORT_60
if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
size_t psize;
mca_common_cuda_get_address_range(&base, &psize, addr);
bound = base + psize - 1;
/* Check to see if this memory is in the cache and if it has been freed. If so,
* this call will boot it out of the cache. */
check_for_cuda_freed_memory(mpool, base, psize);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
/* look through existing regs if not persistent registration requested.
* Persistent registration are always registered and placed in the cache */
if(!(bypass_cache || persist)) {
@@ -270,6 +287,11 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
grdma_reg->base = base;
grdma_reg->bound = bound;
grdma_reg->flags = flags;
#if OPAL_CUDA_SUPPORT_60
if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
mca_common_cuda_get_buffer_id(grdma_reg);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
if (false == bypass_cache) {
rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);
@@ -440,6 +462,61 @@ int mca_mpool_grdma_release_memory(struct mca_mpool_base_module_t *mpool,
return rc;
}
/* Make sure this registration request is not stale. In other words, ensure
* that we do not have a cuMemAlloc, cuMemFree, cuMemAlloc state. If we do,
* kick out the registrations and deregister. This function needs to be called
* with the mpool->rcache->lock held. */
#if OPAL_CUDA_SUPPORT_60
static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size)
{
mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool;
mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
int reg_cnt, i, rc = OMPI_SUCCESS;
mca_mpool_base_registration_t *reg;
mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg);
if (NULL == reg) {
return 0;
}
/* If the memory was not previously freed, just return 0 */
if (!(mca_common_cuda_previously_freed_memory(reg))) {
return 0;
}
/* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */
/* This memory has been freed. Find all registrations and delete */
do {
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, reg->base, reg->bound - reg->base + 1,
regs, GRDMA_MPOOL_NREGS);
for(i = 0 ; i < reg_cnt ; ++i) {
regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID;
if (regs[i]->ref_count) {
opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d",
regs[i]->ref_count, regs[i]->base, regs[i]->bound,
(int) (regs[i]->bound - regs[i]->base + 1));
/* memory is being freed, but there are registrations in use that
* cover the memory. This can happen even in a correct program,
* but may also be a user error. We can't tell. Mark the
* registration as invalid. It will not be used any more and
* will be unregistered when its ref_count drops to zero */
rc = OMPI_ERROR; /* tell caller that something was wrong */
} else {
opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]);
/* Now deregister. Do not use gc_list as we need to kick this out now. */
dereg_mem(regs[i]);
}
}
} while(reg_cnt == GRDMA_MPOOL_NREGS);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "After free");*/
return rc;
}
#endif /* OPAL_CUDA_SUPPORT_60 */
void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
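The gpu_bufID bookkeeping added here guards against device-address reuse: back-to-back cuMemFree/cuMemAlloc can hand out the same pointer again, so address and size alone cannot tell a live cached registration from a stale one. A minimal sketch of the idea, assuming the common_cuda helpers are built on the CUDA driver's per-allocation buffer id (the helper name below is hypothetical):

#include <stdbool.h>
#include <cuda.h>

/* Illustrative sketch only. CU_POINTER_ATTRIBUTE_BUFFER_ID yields an id
 * that is unique per allocation, so it changes whenever an address is
 * handed out again by a later cuMemAlloc. */
static bool gpu_buffer_was_reallocated(CUdeviceptr addr,
                                       unsigned long long cached_buf_id)
{
    unsigned long long current_id = 0;
    if (CUDA_SUCCESS != cuPointerGetAttribute(&current_id,
                                              CU_POINTER_ATTRIBUTE_BUFFER_ID,
                                              addr)) {
        return true;  /* cannot query: treat the cached entry as stale */
    }
    return current_id != cached_buf_id;
}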

View file

@@ -38,6 +38,7 @@
#define MCA_MPOOL_FLAGS_INVALID 0x8
#define MCA_MPOOL_FLAGS_SO_MEM 0x10
#define MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM 0x20
#define MCA_MPOOL_FLAGS_CUDA_GPU_MEM 0x40
struct mca_mpool_base_resources_t;
@@ -49,6 +50,9 @@ struct mca_mpool_base_registration_t {
unsigned char* alloc_base;
int32_t ref_count;
uint32_t flags;
#if OPAL_CUDA_SUPPORT_60
unsigned long long gpu_bufID;
#endif /* OPAL_CUDA_SUPPORT_60 */
};
typedef struct mca_mpool_base_registration_t mca_mpool_base_registration_t;

View file

@@ -12,9 +12,32 @@ The "eager limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job
with a higher eager limit value for this BTL; the exact MCA parameter
name and its corresponding minimum value are shown below.
Local host: %s
BTL name: %s
BTL eager limit value: %d (set via btl_%s_eager_limit)
BTL eager limit minimum: %d
MCA parameter name: btl_%s_eager_limit
[cuda_eager_limit_too_small]
The "CUDA eager limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job
with a higher CUDA eager limit value for this BTL; the exact MCA parameter
name and its corresponding minimum value are shown below.
Local host: %s
BTL name: %s
BTL CUDA eager limit value: %d (set via btl_%s_cuda_eager_limit)
BTL CUDA eager limit minimum: %d
MCA parameter name: btl_%s_cuda_eager_limit
[cuda_rdma_limit_too_small]
The "CUDA rdma limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job
with a higher CUDA rdma limit value for this BTL; the exact MCA parameter
name and its corresponding minimum value are shown below.
Local host: %s
BTL name: %s
BTL CUDA rndv limit value: %d (set via btl_%s_cuda_rdma_limit)
BTL CUDA rndv limit minimum: %d
MCA parameter name: btl_%s_cuda_rdma_limit

View file

@@ -366,6 +366,46 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
rc = OMPI_ERR_BAD_PARAM;
goto cleanup_and_return;
}
#if OPAL_CUDA_SUPPORT_60
/* If size is SIZE_MAX, then we know we want to set this to the minimum possible
* value, which is the size of the PML header. */
if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t);
}
if (0 != sm->btl_module->btl_cuda_eager_limit) {
if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",
true,
sm->btl_component->btl_version.mca_component_name,
ompi_process_info.nodename,
sm->btl_component->btl_version.mca_component_name,
sm->btl_module->btl_cuda_eager_limit,
sm->btl_component->btl_version.mca_component_name,
sizeof(mca_pml_ob1_hdr_t),
sm->btl_component->btl_version.mca_component_name);
rc = OMPI_ERR_BAD_PARAM;
goto cleanup_and_return;
}
}
if (0 == sm->btl_module->btl_cuda_rdma_limit) {
/* All is fine. 0 means to ignore the value, so set it to SIZE_MAX */
sm->btl_module->btl_cuda_rdma_limit = SIZE_MAX;
} else {
if (sm->btl_module->btl_cuda_rdma_limit < sm->btl_module->btl_cuda_eager_limit) {
opal_show_help("help-mpi-pml-ob1.txt", "cuda_rdma_limit_too_small",
true,
sm->btl_component->btl_version.mca_component_name,
ompi_process_info.nodename,
sm->btl_component->btl_version.mca_component_name,
sm->btl_module->btl_cuda_rdma_limit,
sm->btl_component->btl_version.mca_component_name,
sm->btl_module->btl_cuda_eager_limit,
sm->btl_component->btl_version.mca_component_name);
rc = OMPI_ERR_BAD_PARAM;
goto cleanup_and_return;
}
}
#endif /* OPAL_CUDA_SUPPORT_60 */
}
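To summarize the normalization above: SIZE_MAX in btl_cuda_eager_limit is a sentinel meaning "clamp to the smallest legal value" (a bare PML header), 0 disables the GPU eager/RGET path entirely, and a btl_cuda_rdma_limit of 0 is rewritten to SIZE_MAX so that the switchover to the rendezvous pipeline never triggers.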

View file

@@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -54,6 +54,14 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
size_t size) {
int rc;
#if OPAL_CUDA_SUPPORT_41
#if OPAL_CUDA_SUPPORT_60
/* With some BTLs, switch to RNDV from RGET at large messages */
if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
(sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
unsigned char *base;
@@ -120,7 +128,7 @@ size_t mca_pml_ob1_rdma_cuda_btls(
if( NULL != btl_mpool ) {
/* register the memory */
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);
}
if(NULL == reg)

View file

@@ -366,6 +366,12 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_ob1_hdr_t);
int rc;
#if OPAL_CUDA_SUPPORT_60
if (btl->btl_cuda_eager_limit && (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) {
eager_limit = btl->btl_cuda_eager_limit - sizeof(mca_pml_ob1_hdr_t);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
if( OPAL_LIKELY(size <= eager_limit) ) {
switch(sendreq->req_send.req_send_mode) {
case MCA_PML_BASE_SEND_SYNCHRONOUS: