
Two new mpools. They are not used now (and by default, not compiled), but they will be soon. They provide support for GPU buffer transfers within a node.

This commit was SVN r26008.
This commit is contained in:
Rolf vandeVaart 2012-02-22 23:32:36 +00:00
parent 94549d024b
commit c7a0ce2755
12 changed files with 1432 additions and 14 deletions
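Background for readers (not part of the commit itself): both new mpools wrap the CUDA 4.1 IPC interface that common_cuda.c already calls into. The gpusm pool exports a memory/event handle on the sending side, and the rgpusm pool opens (and optionally caches) that handle on the receiving side. A minimal sketch of the underlying driver-API mechanism, assuming two processes on the same node that exchange the opaque 64-byte handle out of band, with hypothetical helper names:

#include <cuda.h>
#include <stddef.h>

/* Sender side: export an IPC handle for a device buffer.  The handle is
 * an opaque 64-byte blob that can be shipped to the peer in any control
 * message (this is roughly what cuda_getmemhandle() does). */
int export_gpu_buffer(CUdeviceptr dbuf, CUipcMemHandle *handle_out)
{
    return (CUDA_SUCCESS == cuIpcGetMemHandle(handle_out, dbuf)) ? 0 : -1;
}

/* Receiver side: map the peer's buffer, copy from it, and unmap (roughly
 * what cuda_openmemhandle()/cuda_closememhandle() do). */
int import_and_copy(CUipcMemHandle handle, CUdeviceptr local_dst, size_t len)
{
    CUdeviceptr remote_src;
    if (CUDA_SUCCESS != cuIpcOpenMemHandle(&remote_src, handle,
                                           CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)) {
        return -1;
    }
    if (CUDA_SUCCESS != cuMemcpyDtoD(local_dst, remote_src, len)) {
        cuIpcCloseMemHandle(remote_src);
        return -1;
    }
    return (CUDA_SUCCESS == cuIpcCloseMemHandle(remote_src)) ? 0 : -1;
}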


@@ -355,7 +355,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
 CUdeviceptr pbase;
 size_t psize;
-mca_mpool_rcuda_reg_t *cuda_reg = (mca_mpool_rcuda_reg_t*)newreg;
+mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg;
 /* We should only be there if this is a CUDA device pointer */
 result = cuPointerGetAttribute(&memType,
@@ -416,7 +416,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne
 */
 int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg)
 {
-CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_rcuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
+CUDA_DUMP_EVTHANDLE((10, ((mca_mpool_common_cuda_reg_t *)reg)->evtHandle, "cuda_ungetmemhandle"));
 opal_output_verbose(5, mca_common_cuda_output,
 "CUDA: cuda_ungetmemhandle: base=%p",
 reg_data);
@@ -434,7 +434,7 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
 {
 CUresult result;
 CUipcMemHandle memHandle;
-mca_mpool_rcuda_reg_t *cuda_newreg = (mca_mpool_rcuda_reg_t*)newreg;
+mca_mpool_common_cuda_reg_t *cuda_newreg = (mca_mpool_common_cuda_reg_t*)newreg;
 /* Need to copy into memory handle for call into CUDA library. */
 memcpy(&memHandle, cuda_newreg->memHandle, sizeof(memHandle));
@@ -473,7 +473,7 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n
 int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg)
 {
 CUresult result;
-mca_mpool_rcuda_reg_t *cuda_reg = (mca_mpool_rcuda_reg_t*)reg;
+mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)reg;
 result = cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
 if (CUDA_SUCCESS != result) {
@ -526,7 +526,7 @@ void mca_common_cuda_destruct_event(uint64_t *event)
* Put remote event on stream to ensure that the the start of the
* copy does not start until the completion of the event.
*/
void mca_common_wait_stream_synchronize(mca_mpool_rcuda_reg_t *rget_reg)
void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg)
{
CUipcEventHandle evtHandle;
CUevent event;
@@ -724,8 +724,8 @@ int progress_one_cuda_event(struct mca_btl_base_descriptor_t **frag) {
 * Need to make sure the handle we are retrieving from the cache is still
 * valid. Compare the cached handle to the one received.
 */
-int mca_common_cuda_memhandle_matches(mca_mpool_rcuda_reg_t *new_reg,
-mca_mpool_rcuda_reg_t *old_reg)
+int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg,
+mca_mpool_common_cuda_reg_t *old_reg)
 {
 if (0 == memcmp(new_reg->memHandle, old_reg->memHandle, sizeof(new_reg->memHandle))) {


@@ -21,28 +21,30 @@
 #define OMPI_MCA_COMMON_CUDA_H
 #include "ompi/mca/btl/btl.h"
-struct mca_mpool_rcuda_reg_t {
+#define MEMHANDLE_SIZE 8
+#define EVTHANDLE_SIZE 8
+struct mca_mpool_common_cuda_reg_t {
 mca_mpool_base_registration_t base;
-uint64_t memHandle[8];
-uint64_t evtHandle[8];
+uint64_t memHandle[MEMHANDLE_SIZE];
+uint64_t evtHandle[EVTHANDLE_SIZE];
 uint64_t event;
 };
-typedef struct mca_mpool_rcuda_reg_t mca_mpool_rcuda_reg_t;
+typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
 OMPI_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg);
 OMPI_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg);
-OMPI_DECLSPEC void mca_common_wait_stream_synchronize(mca_mpool_rcuda_reg_t *rget_reg);
+OMPI_DECLSPEC void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg);
 OMPI_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
 struct mca_btl_base_descriptor_t *, int *done);
 OMPI_DECLSPEC int progress_one_cuda_event(struct mca_btl_base_descriptor_t **);
-OMPI_DECLSPEC int mca_common_cuda_memhandle_matches(mca_mpool_rcuda_reg_t *new_reg,
-mca_mpool_rcuda_reg_t *old_reg);
+OMPI_DECLSPEC int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg,
+mca_mpool_common_cuda_reg_t *old_reg);
 OMPI_DECLSPEC void mca_common_cuda_construct_event_and_handle(uint64_t **event, void **handle);
 OMPI_DECLSPEC void mca_common_cuda_destruct_event(uint64_t *event);
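A note for readers (not part of the diff): the 8-element uint64_t arrays give 64 bytes of storage, which matches the opaque CUipcMemHandle and CUipcEventHandle types (CU_IPC_HANDLE_SIZE in cuda.h), so the handles can be copied in and out with memcpy() as common_cuda.c does. A hypothetical compile-time check along these lines would make that assumption explicit:

#include <cuda.h>
#include <stdint.h>

/* Hypothetical sanity check (mirrors MEMHANDLE_SIZE/EVTHANDLE_SIZE above):
 * the 8 x uint64_t arrays must be large enough to hold the opaque CUDA
 * IPC handle types that are memcpy()'d into them. */
typedef char memhandle_fits[(8 * sizeof(uint64_t) >= sizeof(CUipcMemHandle)) ? 1 : -1];
typedef char evthandle_fits[(8 * sizeof(uint64_t) >= sizeof(CUipcEventHandle)) ? 1 : -1];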

ompi/mca/mpool/gpusm/Makefile.am (new file)

@@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = $(mpool_gpusm_CPPFLAGS)
sources = \
mpool_gpusm_module.c \
mpool_gpusm_component.c
if WANT_INSTALL_HEADERS
ompidir = $(includedir)/openmpi/$(subdir)
ompi_HEADERS = mpool_gpusm.h
endif
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_ompi_mpool_gpusm_DSO
component_noinst =
component_install = mca_mpool_gpusm.la
else
component_noinst = libmca_mpool_gpusm.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_mpool_gpusm_la_SOURCES = $(sources)
mca_mpool_gpusm_la_LDFLAGS = -module -avoid-version
mca_mpool_gpusm_la_LIBADD = $(mpool_gpusm_LIBS)
if MCA_ompi_cuda_support
mca_mpool_gpusm_la_LIBADD += \
$(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
endif
noinst_LTLIBRARIES = $(component_noinst)
libmca_mpool_gpusm_la_SOURCES = $(sources)
libmca_mpool_gpusm_la_LDFLAGS = -module -avoid-version
libmca_mpool_gpusm_la_LIBADD = $(mpool_gpusm_LIBS)

ompi/mca/mpool/gpusm/configure.m4 (new file)

@@ -0,0 +1,25 @@
# -*- shell-script -*-
#
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#
# If CUDA support was requested, then build the CUDA memory pools.
# This code checks the variable CUDA_SUPPORT which was set earlier in
# the configure sequence by the opal_configure_options.m4 code.
#
AC_DEFUN([MCA_ompi_mpool_gpusm_CONFIG],[
AC_CONFIG_FILES([ompi/mca/mpool/gpusm/Makefile])
# Use CUDA_SUPPORT which was filled in by the opal configure code.
AS_IF([test "x$CUDA_SUPPORT_41" = "x1"],
[$1],
[$2])
])dnl

ompi/mca/mpool/gpusm/mpool_gpusm.h (new file)

@@ -0,0 +1,103 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_MPOOL_GPUSM_H
#define MCA_MPOOL_GPUSM_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/mca/mpool/mpool.h"
BEGIN_C_DECLS
#define MEMHANDLE_SIZE 8
#define EVTHANDLE_SIZE 8
struct mca_mpool_gpusm_registration_t {
mca_mpool_base_registration_t base;
uint64_t memHandle[MEMHANDLE_SIZE];
uint64_t evtHandle[EVTHANDLE_SIZE];
uint64_t event;
};
typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t);
struct mca_mpool_gpusm_component_t {
mca_mpool_base_component_t super;
};
typedef struct mca_mpool_gpusm_component_t mca_mpool_gpusm_component_t;
OMPI_DECLSPEC extern mca_mpool_gpusm_component_t mca_mpool_gpusm_component;
struct mca_mpool_base_resources_t {
void *reg_data;
size_t sizeof_reg;
int (*register_mem)(void *base, size_t size, mca_mpool_base_registration_t *newreg,
mca_mpool_base_registration_t *hdrreg);
int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg);
};
typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t;
struct mca_mpool_gpusm_module_t {
mca_mpool_base_module_t super;
struct mca_mpool_base_resources_t resources;
ompi_free_list_t reg_list;
}; typedef struct mca_mpool_gpusm_module_t mca_mpool_gpusm_module_t;
/*
* Initializes the mpool module.
*/
void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t *mpool);
/**
* register block of memory
*/
int mca_mpool_gpusm_register(mca_mpool_base_module_t* mpool, void *addr,
size_t size, uint32_t flags, mca_mpool_base_registration_t **reg);
/**
* deregister memory
*/
int mca_mpool_gpusm_deregister(mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg);
/**
* find registration for a given block of memory
*/
int mca_mpool_gpusm_find(struct mca_mpool_base_module_t* mpool, void* addr,
size_t size, mca_mpool_base_registration_t **reg);
/**
* finalize mpool
*/
void mca_mpool_gpusm_finalize(struct mca_mpool_base_module_t *mpool);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_mpool_gpusm_ft_event(int state);
END_C_DECLS
#endif
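To make the intended call pattern concrete, here is a hypothetical sending-side sketch (not part of this commit) that registers a GPU buffer through this mpool and pulls out the handles that would be shipped to the peer; send_gpu_handles and the way the handles travel are assumptions:

#include "ompi/mca/mpool/gpusm/mpool_gpusm.h"

/* Hypothetical helper: register a device buffer with the gpusm mpool and
 * expose the IPC handles that a BTL would pack into its control message. */
static int send_gpu_handles(mca_mpool_base_module_t *gpusm, void *gpu_buf, size_t len)
{
    mca_mpool_base_registration_t *reg = NULL;
    mca_mpool_gpusm_registration_t *gpusm_reg;
    int rc;

    /* For this pool, find() and register() both just grab a fresh handle. */
    rc = gpusm->mpool_register(gpusm, gpu_buf, len, 0, &reg);
    if (OMPI_SUCCESS != rc || NULL == reg) {
        return rc;
    }
    gpusm_reg = (mca_mpool_gpusm_registration_t *)reg;

    /* gpusm_reg->memHandle and gpusm_reg->evtHandle are the opaque blobs the
     * remote side opens via the rgpusm mpool; how they are transmitted is
     * up to the caller (e.g. inside a fragment header). */
    (void)gpusm_reg->memHandle;
    (void)gpusm_reg->evtHandle;

    /* Once the transfer is complete, return the registration to the free list. */
    return gpusm->mpool_deregister(gpusm, reg);
}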

ompi/mca/mpool/gpusm/mpool_gpusm_component.c (new file)

@@ -0,0 +1,103 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "mpool_gpusm.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
/*
* Local functions
*/
static int gpusm_open(void);
static int gpusm_close(void);
static int gpusm_register(void);
static mca_mpool_base_module_t* gpusm_init(struct mca_mpool_base_resources_t* resources);
mca_mpool_gpusm_component_t mca_mpool_gpusm_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
MCA_MPOOL_BASE_VERSION_2_0_0,
"gpusm", /* MCA component name */
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
gpusm_open, /* component open */
gpusm_close,
NULL,
gpusm_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
gpusm_init
}
};
/**
* Component open/close/init/register functions. Most do not do anything,
* but are kept around as placeholders.
*/
static int gpusm_open(void)
{
return OMPI_SUCCESS;
}
static int gpusm_register(void)
{
return OMPI_SUCCESS;
}
static int gpusm_close(void)
{
return OMPI_SUCCESS;
}
static mca_mpool_base_module_t* gpusm_init(struct mca_mpool_base_resources_t *resources)
{
mca_mpool_gpusm_module_t* mpool_module;
mpool_module =
(mca_mpool_gpusm_module_t*)malloc(sizeof(mca_mpool_gpusm_module_t));
mpool_module->resources = *resources;
mca_mpool_gpusm_module_init(mpool_module);
return &mpool_module->super;
}

ompi/mca/mpool/gpusm/mpool_gpusm_module.c (new file)

@@ -0,0 +1,197 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
*
* This file implements a simple memory pool that is used by the GPU
* buffer on the sending side. It just gets a memory handle and event
* handle that can be sent to the remote side which can then use the
* handles to get access to the memory and the event to determine when
* it can start accessing the memory. There is no caching of the
* memory handles as getting new ones is fast. The event handles are
* cached by the cuda_common code.
*/
#include "ompi_config.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/gpusm/mpool_gpusm.h"
#include "ompi/runtime/params.h"
#include "ompi/mca/common/cuda/common_cuda.h"
/**
* Called when the registration free list is created. An event is created
* for each entry.
*/
static void mca_mpool_gpusm_registration_constructor( mca_mpool_gpusm_registration_t *item )
{
mca_common_cuda_construct_event_and_handle((uint64_t **)&item->event,
(void **)&item->evtHandle);
}
/**
* Called when the program is exiting. This destroys the events.
*/
static void mca_mpool_gpusm_registration_destructor( mca_mpool_gpusm_registration_t *item )
{
mca_common_cuda_destruct_event((uint64_t *)item->event);
}
OBJ_CLASS_INSTANCE(mca_mpool_gpusm_registration_t, mca_mpool_base_registration_t,
mca_mpool_gpusm_registration_constructor,
mca_mpool_gpusm_registration_destructor);
/*
* Initializes the mpool module.
*/
void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t* mpool)
{
mpool->super.mpool_component = &mca_mpool_gpusm_component.super;
mpool->super.mpool_base = NULL;
mpool->super.mpool_alloc = NULL;
mpool->super.mpool_realloc = NULL;
mpool->super.mpool_free = NULL;
mpool->super.mpool_register = mca_mpool_gpusm_register;
mpool->super.mpool_find = mca_mpool_gpusm_find;
mpool->super.mpool_deregister = mca_mpool_gpusm_deregister;
mpool->super.mpool_release_memory = NULL;
mpool->super.mpool_finalize = mca_mpool_gpusm_finalize;
mpool->super.mpool_ft_event = mca_mpool_gpusm_ft_event;
mpool->super.rcache = NULL;
mpool->super.flags = 0;
mpool->resources.reg_data = NULL;
mpool->resources.sizeof_reg = sizeof(struct mca_mpool_common_cuda_reg_t);
mpool->resources.register_mem = cuda_getmemhandle;
mpool->resources.deregister_mem = cuda_ungetmemhandle;
OBJ_CONSTRUCT(&mpool->reg_list, ompi_free_list_t);
/* Start with 0 entries in the free list since CUDA may not have
* been initialized when this free list is created and there is
* some CUDA specific activities that need to be done. */
ompi_free_list_init_new(&mpool->reg_list, mpool->resources.sizeof_reg,
opal_cache_line_size,
OBJ_CLASS(mca_mpool_gpusm_registration_t),
0,opal_cache_line_size,
0, -1, 64, NULL);
}
/**
* Just go ahead and get a new registration. The find and register
* functions are the same thing for this memory pool.
*/
int mca_mpool_gpusm_find(mca_mpool_base_module_t *mpool, void *addr,
size_t size,
mca_mpool_base_registration_t **reg)
{
return mca_mpool_gpusm_register(mpool, addr, size, 0, reg);
}
/*
* This is the one function that does all the work. It will call into
* the register function to get the memory handle for the sending
* buffer. There is no need to deregister the memory handle so the
* deregister function is a no-op.
*/
int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr,
size_t size, uint32_t flags,
mca_mpool_base_registration_t **reg)
{
mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t*)mpool;
mca_mpool_base_registration_t *gpusm_reg;
ompi_free_list_item_t *item;
unsigned char *base, *bound;
int rc;
/* In spite of the fact we return an error code, the existing code
* checks the registration for a NULL value rather than looking at
* the return code. So, initialize the registration to NULL in
* case we run into a failure. */
*reg = NULL;
base = addr;
bound = (unsigned char *)addr + size - 1;
OMPI_FREE_LIST_GET(&mpool_gpusm->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
return rc;
}
gpusm_reg = (mca_mpool_base_registration_t*)item;
gpusm_reg->mpool = mpool;
gpusm_reg->base = base;
gpusm_reg->bound = bound;
gpusm_reg->flags = flags;
rc = mpool_gpusm->resources.register_mem(base, size, gpusm_reg, NULL);
if(rc != OMPI_SUCCESS) {
OMPI_FREE_LIST_RETURN(&mpool_gpusm->reg_list, item);
return rc;
}
*reg = gpusm_reg;
(*reg)->ref_count++;
return OMPI_SUCCESS;
}
/*
* Return the registration to the free list.
*/
int mca_mpool_gpusm_deregister(struct mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg)
{
int rc;
mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t *)mpool;
rc = mpool_gpusm->resources.deregister_mem(mpool, reg);
OMPI_FREE_LIST_RETURN(&mpool_gpusm->reg_list, (ompi_free_list_item_t*)reg);
return OMPI_SUCCESS;
}
/**
* Free up the resources.
*/
void mca_mpool_gpusm_finalize(struct mca_mpool_base_module_t *mpool)
{
ompi_free_list_item_t *item;
mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t *)mpool;
/* Need to run the destructor on each item in the free list explicitly.
* The destruction of the free list only runs the destructor on the
* main free list, not each item. */
while (NULL != (item = (ompi_free_list_item_t *)opal_atomic_lifo_pop(&(mpool_gpusm->reg_list.super)))) {
OBJ_DESTRUCT(item);
}
OBJ_DESTRUCT(&mpool_gpusm->reg_list);
return;
}
int mca_mpool_gpusm_ft_event(int state) {
return OMPI_SUCCESS;
}

ompi/mca/mpool/rgpusm/Makefile.am (new file)

@@ -0,0 +1,57 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
AM_CPPFLAGS = $(mpool_rgpusm_CPPFLAGS)
sources = \
mpool_rgpusm_module.c \
mpool_rgpusm_component.c
if WANT_INSTALL_HEADERS
ompidir = $(includedir)/openmpi/$(subdir)
ompi_HEADERS = mpool_rgpusm.h
endif
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_ompi_mpool_rgpusm_DSO
component_noinst =
component_install = mca_mpool_rgpusm.la
else
component_noinst = libmca_mpool_rgpusm.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_mpool_rgpusm_la_SOURCES = $(sources)
mca_mpool_rgpusm_la_LDFLAGS = -module -avoid-version
mca_mpool_rgpusm_la_LIBADD = $(mpool_rgpusm_LIBS)
if MCA_ompi_cuda_support
mca_mpool_rgpusm_la_LIBADD += \
$(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
endif
noinst_LTLIBRARIES = $(component_noinst)
libmca_mpool_rgpusm_la_SOURCES = $(sources)
libmca_mpool_rgpusm_la_LDFLAGS = -module -avoid-version
libmca_mpool_rgpusm_la_LIBADD = $(mpool_rgpusm_LIBS)

ompi/mca/mpool/rgpusm/configure.m4 (new file)

@@ -0,0 +1,25 @@
# -*- shell-script -*-
#
# Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#
# If CUDA support was requested, then build the CUDA memory pools.
# This code checks the variable CUDA_SUPPORT which was set earlier in
# the configure sequence by the opal_configure_options.m4 code.
#
AC_DEFUN([MCA_ompi_mpool_rgpusm_CONFIG],[
AC_CONFIG_FILES([ompi/mca/mpool/rgpusm/Makefile])
# Use CUDA_SUPPORT which was filled in by the opal configure code.
AS_IF([test "x$CUDA_SUPPORT_41" = "x1"],
[$1],
[$2])
])dnl

ompi/mca/mpool/rgpusm/mpool_rgpusm.h (new file)

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_MPOOL_RGPUSM_H
#define MCA_MPOOL_RGPUSM_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/mca/mpool/mpool.h"
BEGIN_C_DECLS
struct mca_mpool_rgpusm_component_t {
mca_mpool_base_component_t super;
char* rcache_name;
size_t rcache_size_limit;
bool print_stats;
uint32_t leave_pinned;
int output;
};
typedef struct mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component_t;
OMPI_DECLSPEC extern mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component;
struct mca_mpool_base_resources_t {
void *reg_data;
size_t sizeof_reg;
int (*register_mem)(void *base, size_t size, mca_mpool_base_registration_t *newreg,
mca_mpool_base_registration_t *hdrreg);
int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg);
};
typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t;
struct mca_mpool_rgpusm_module_t {
mca_mpool_base_module_t super;
struct mca_mpool_base_resources_t resources;
ompi_free_list_t reg_list;
opal_list_t lru_list;
uint32_t stat_cache_hit;
uint32_t stat_cache_valid;
uint32_t stat_cache_invalid;
uint32_t stat_cache_miss;
uint32_t stat_evicted;
uint32_t stat_cache_found;
uint32_t stat_cache_notfound;
}; typedef struct mca_mpool_rgpusm_module_t mca_mpool_rgpusm_module_t;
/*
* Initializes the mpool module.
*/
void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t *mpool);
/**
* register block of memory
*/
int mca_mpool_rgpusm_register(mca_mpool_base_module_t* mpool, void *addr,
size_t size, uint32_t flags, mca_mpool_base_registration_t **reg);
/**
* deregister memory
*/
int mca_mpool_rgpusm_deregister(mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg);
/**
* free memory allocated by alloc function
*/
void mca_mpool_rgpusm_free(mca_mpool_base_module_t *mpool, void * addr,
mca_mpool_base_registration_t *reg);
/**
* find registration for a given block of memory
*/
int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t* mpool, void* addr,
size_t size, mca_mpool_base_registration_t **reg);
/**
* unregister all registration covering the block of memory
*/
int mca_mpool_rgpusm_release_memory(mca_mpool_base_module_t* mpool, void *base,
size_t size);
/**
* finalize mpool
*/
void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_mpool_rgpusm_ft_event(int state);
END_C_DECLS
#endif

@@ -0,0 +1,142 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "mpool_rgpusm.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
/*
* Local functions
*/
static int rgpusm_open(void);
static int rgpusm_close(void);
static int rgpusm_register(void);
static mca_mpool_base_module_t* rgpusm_init(struct mca_mpool_base_resources_t* resources);
mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component = {
{
/* First, the mca_base_component_t struct containing meta
information about the component itself */
{
MCA_MPOOL_BASE_VERSION_2_0_0,
"rgpusm", /* MCA component name */
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
rgpusm_open, /* component open */
rgpusm_close,
NULL,
rgpusm_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
rgpusm_init
}
};
/**
* component open/close/init function
*/
static int rgpusm_open(void)
{
return OMPI_SUCCESS;
}
static int rgpusm_register(void)
{
int val;
mca_base_param_reg_string(&mca_mpool_rgpusm_component.super.mpool_version,
"rcache_name",
"The name of the registration cache the mpool should use",
false, false, "vma", &mca_mpool_rgpusm_component.rcache_name);
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"rcache_size_limit",
"the maximum size of registration cache in bytes. "
"0 is unlimited (default 0)", false, false, 0, &val);
mca_mpool_rgpusm_component.rcache_size_limit = (size_t)val;
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"leave_pinned",
"Whether to keep memory handles around or release them when done. ",
false, false, 1, &val);
mca_mpool_rgpusm_component.leave_pinned = (size_t)val;
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"print_stats",
"print pool usage statistics at the end of the run",
false, false, 0, &val);
mca_mpool_rgpusm_component.print_stats = val?true:false;
/* Set different levels of verbosity in the rgpusm related code. */
mca_base_param_reg_int(&mca_mpool_rgpusm_component.super.mpool_version,
"verbose",
"Set level of mpool rgpusm verbosity",
false, false, 0, &val);
mca_mpool_rgpusm_component.output = opal_output_open(NULL);
opal_output_set_verbosity(mca_mpool_rgpusm_component.output, val);
return OMPI_SUCCESS;
}
static int rgpusm_close(void)
{
if (NULL != mca_mpool_rgpusm_component.rcache_name) {
free(mca_mpool_rgpusm_component.rcache_name);
}
return OMPI_SUCCESS;
}
static mca_mpool_base_module_t* rgpusm_init(
struct mca_mpool_base_resources_t *resources)
{
mca_mpool_rgpusm_module_t* mpool_module;
mpool_module =
(mca_mpool_rgpusm_module_t*)malloc(sizeof(mca_mpool_rgpusm_module_t));
mpool_module->resources = *resources;
mca_mpool_rgpusm_module_init(mpool_module);
return &mpool_module->super;
}

ompi/mca/mpool/rgpusm/mpool_rgpusm_module.c (new file)

@@ -0,0 +1,590 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file:
*
* This memory pool is used for getting the memory handle of remote
* GPU memory when using CUDA. Hence, the name is "rgpusm" for "remote
* CUDA" GPU memory. There is a cache that can be used to store the
* remote handles in case they are reused to save on the registration
* cost as that can be expensive, on the order of 100 usecs. The
* cache can also be used just to track how many handles are in use at
* a time. It is best to look at this with the four different
* scenarios that are possible.
* 1. mpool_rgpusm_leave_pinned=0, cache_size=unlimited
* 2. mpool_rgpusm_leave_pinned=0, cache_size=limited
* 3. mpool_rgpusm_leave_pinned=1, cache_size=unlimited (default)
* 4. mpool_rgpusm_leave_pinned=1, cache_size=limited.
*
* Case 1: The cache is unused and remote memory is registered and
* unregistered for each transaction. The amount of outstanding
* registered memory is unlimited.
* Case 2: The cache keeps track of how much memory is registered at a
* time. Since leave pinned is 0, any memory that is registered is in
* use. If the amount to register exceeds the limit, we will error
* out. This could be handled more gracefully, but this is not a
* common way to run, so we will leave as is.
* Case 3: The cache is needed to track current and past transactions.
* However, there is no limit on the number that can be stored.
* Therefore, once memory enters the cache, and gets registered, it
* stays that way forever.
* Case 4: The cache is needed to track current and past transactions.
* In addition, a list of most recently used (but no longer in use)
* registrations is stored so that it can be used to evict
* registrations from the cache. In addition, these registrations are
* deregistered.
*
* I also want to capture how we can run into the case where we do not
* find something in the cache, but when we try to register it, we get
* an error back from the CUDA library saying the memory is in use.
* This can happen in the following scenario. The application mallocs
* a buffer of size 32K. The library loads this in the cache and
* registers it. The application then frees the buffer. It then
* mallocs a buffer of size 64K. This malloc returns the same base
* address as the first 32K allocation. The library searches the
* cache, but since the size is larger than the original allocation it
* does not find the registration. It then attempts to register this.
* The CUDA library returns an error saying it is already mapped. To
* handle this, we return an error of OMPI_ERR_WOULD_BLOCK to the
* memory pool. The memory pool then looks for the registration based
* on the base address and a size of 4. We use the small size to make
* sure that we find the registration. This registration is evicted,
* and we try to register again.
*/
#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
#include "ompi_config.h"
#include "opal/align.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "ompi/mca/mpool/rgpusm/mpool_rgpusm.h"
#include <errno.h>
#include <string.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "ompi/mca/rcache/rcache.h"
#include "ompi/mca/rcache/base/base.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/runtime/params.h"
#include "ompi/mca/common/cuda/common_cuda.h"
/* A hack so that page alignment is disabled in my instantiation of
* the rcache. This needs to be fixed. */
static size_t saved_page_size;
#define SET_PAGE_ALIGNMENT_TO_ZERO() \
saved_page_size = mca_mpool_base_page_size_log; \
mca_mpool_base_page_size_log = 0;
#define RESTORE_PAGE_ALIGNMENT() \
mca_mpool_base_page_size_log = saved_page_size;
static inline bool mca_mpool_rgpusm_deregister_lru (mca_mpool_base_module_t *mpool) {
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *) mpool;
mca_mpool_base_registration_t *old_reg;
int rc;
/* Remove the registration from the cache and list before
deregistering the memory */
old_reg = (mca_mpool_base_registration_t*)
opal_list_remove_first (&mpool_rgpusm->lru_list);
if (NULL == old_reg) {
return false;
}
mpool->rcache->rcache_delete(mpool->rcache, old_reg);
/* Drop the rcache lock while we deregister the memory */
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
assert(old_reg->ref_count == 0);
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
old_reg);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
/* This introduces a potential leak of registrations if
the deregistration fails to occur as we no longer have
a reference to it. Is this possible? */
if (OMPI_SUCCESS != rc) {
return false;
}
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list,
(ompi_free_list_item_t*)old_reg);
mpool_rgpusm->stat_evicted++;
return true;
}
/*
* Initializes the mpool module.
*/
void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t* mpool)
{
mpool->super.mpool_component = &mca_mpool_rgpusm_component.super;
mpool->super.mpool_base = NULL; /* no base .. */
mpool->super.mpool_alloc = NULL;
mpool->super.mpool_realloc = NULL;
mpool->super.mpool_free = mca_mpool_rgpusm_free;
mpool->super.mpool_register = mca_mpool_rgpusm_register;
mpool->super.mpool_find = mca_mpool_rgpusm_find;
mpool->super.mpool_deregister = mca_mpool_rgpusm_deregister;
mpool->super.mpool_release_memory = NULL;
mpool->super.mpool_finalize = mca_mpool_rgpusm_finalize;
mpool->super.mpool_ft_event = mca_mpool_rgpusm_ft_event;
mpool->super.rcache =
mca_rcache_base_module_create(mca_mpool_rgpusm_component.rcache_name);
mpool->super.flags = 0;
mpool->resources.reg_data = NULL;
mpool->resources.sizeof_reg = sizeof(struct mca_mpool_common_cuda_reg_t);
mpool->resources.register_mem = cuda_openmemhandle;
mpool->resources.deregister_mem = cuda_closememhandle;
OBJ_CONSTRUCT(&mpool->reg_list, ompi_free_list_t);
ompi_free_list_init_new(&mpool->reg_list, mpool->resources.sizeof_reg,
opal_cache_line_size,
OBJ_CLASS(mca_mpool_base_registration_t),
0,opal_cache_line_size,
0, -1, 32, NULL);
OBJ_CONSTRUCT(&mpool->lru_list, opal_list_t);
mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0;
mpool->stat_cache_found = mpool->stat_cache_notfound = 0;
mpool->stat_cache_valid = mpool->stat_cache_invalid = 0;
}
/*
* This function opens a handle using the handle that was received
* from the remote memory. It uses the addr and size of the remote
* memory for caching the registration.
*/
int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
size_t size, uint32_t flags,
mca_mpool_base_registration_t **reg)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
mca_mpool_common_cuda_reg_t *rgpusm_reg;
mca_mpool_common_cuda_reg_t *rget_reg;
ompi_free_list_item_t *item;
int rc;
int mypeer; /* just for debugging */
/* In order to preserve the signature of the mca_mpool_rgpusm_register
* function, we are using the **reg variable to not only get back the
* registration information, but to hand in the memory handle received
* from the remote side. */
rget_reg = (mca_mpool_common_cuda_reg_t *)*reg;
mypeer = flags;
flags = 0;
/* No need to support MCA_MPOOL_FLAGS_CACHE_BYPASS in here. It is not used. */
assert(0 == (flags & MCA_MPOOL_FLAGS_CACHE_BYPASS));
/* This chunk of code handles the case where leave pinned is not
* set and we do not use the cache. This is not typically how we
* will be running. This means that one can have an unlimited
* number of registrations occurring at the same time. Since we
* are not leaving the registrations pinned, the number of
* registrations is unlimited and there is no need for a cache. */
if(!mca_mpool_rgpusm_component.leave_pinned && 0 == mca_mpool_rgpusm_component.rcache_size_limit) {
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
return rc;
}
rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;
rgpusm_reg->base.mpool = mpool;
rgpusm_reg->base.base = addr;
rgpusm_reg->base.bound = (unsigned char *)addr + size - 1;
rgpusm_reg->base.flags = flags;
/* Copy the memory handle received into the registration */
memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle));
/* The rget_reg registration is holding the memory handle needed
* to register the remote memory. This was received from the remote
* process. A pointer to the memory is returned in the alloc_base field. */
rc = mpool_rgpusm->resources.register_mem(addr, size,
(mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
/* This error should not happen with no cache in use. */
assert(OMPI_ERR_WOULD_BLOCK != rc);
if(rc != OMPI_SUCCESS) {
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
return rc;
}
rgpusm_reg->base.ref_count++;
*reg = (mca_mpool_base_registration_t *)rgpusm_reg;
return OMPI_SUCCESS;
}
/* Check to see if memory is registered and stored in the cache. */
OPAL_THREAD_LOCK(&mpool->rcache->lock);
SET_PAGE_ALIGNMENT_TO_ZERO();
mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
RESTORE_PAGE_ALIGNMENT();
/* If *reg is not NULL, we have a registration. Let us see if the
* memory handle matches the one we were looking for. If not, the
* registration is invalid and needs to be removed. This happens
* if memory was allocated, freed, and allocated again and ends up
* with the same virtual address and within the limits of the
* previous registration. The memory handle check will catch that
* scenario as the handles have unique serial numbers. */
if (*reg != NULL) {
mpool_rgpusm->stat_cache_hit++;
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"Found addr=%p, size=%d (base=%p,size=%d)in cache",
addr, (int)size, (*reg)->base,
(int)((*reg)->bound - (*reg)->base));
if (mca_common_cuda_memhandle_matches((mca_mpool_common_cuda_reg_t *)*reg, rget_reg)) {
/* Registration matches what was requested. All is good. */
mpool_rgpusm->stat_cache_valid++;
} else {
/* This is an old registration. Need to boot it. */
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"Mismatched Handle: Evicting addr=%p, size=%d in cache",
addr, (int)size);
/* The ref_count has to be zero as this memory cannot possibly
* be in use. Assert on that just to make sure. */
assert(0 == (*reg)->ref_count);
if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)(*reg));
}
/* Bump the reference count to keep things copacetic in deregister */
(*reg)->ref_count++;
/* Invalidate the registration so it will get booted out. */
(*reg)->flags |= MCA_MPOOL_FLAGS_INVALID;
mca_mpool_rgpusm_deregister(mpool, *reg);
*reg = NULL;
mpool_rgpusm->stat_cache_invalid++;
}
} else {
/* Nothing was found in the cache. */
mpool_rgpusm->stat_cache_miss++;
}
/* If we have a registration here, then we know it is valid. */
if (*reg != NULL) {
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"CACHE HIT is good: ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
/* When using leave pinned, we keep an LRU list. */
if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) {
opal_output_verbose(20, mca_mpool_rgpusm_component.output,
"POP OFF LRU: ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)(*reg));
}
(*reg)->ref_count++;
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
"Found entry in cache addr=%p, size=%d", addr, (int)size);
return OMPI_SUCCESS;
}
/* If we are here, then we did not find a registration, or it was invalid,
* so this is a new one, and we are going to use the cache. */
assert(NULL == *reg);
opal_output_verbose(10, mca_mpool_rgpusm_component.output,
"New registration ep=%d, addr=%p, size=%d in cache",
mypeer, addr, (int)size);
OMPI_FREE_LIST_GET(&mpool_rgpusm->reg_list, item, rc);
if(OMPI_SUCCESS != rc) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;
rgpusm_reg->base.mpool = mpool;
rgpusm_reg->base.base = addr;
rgpusm_reg->base.bound = (unsigned char *)addr + size - 1;
rgpusm_reg->base.flags = flags;
/* Need the memory handle saved in the registration */
memcpy(rgpusm_reg->memHandle, rget_reg->memHandle, sizeof(rget_reg->memHandle));
/* Actually register the memory, which opens the memory handle.
* Need to do this prior to putting in the cache as the base and
* bound values may be changed by the registration. The memory
* associated with the handle comes back in the alloc_base
* value. */
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
/* There is a chance we can get the OMPI_ERR_WOULD_BLOCK from the
* CUDA codes attempt to register the memory. The case that this
* can happen is as follows. A block of memory is registered.
* Then the sending side frees the memory. The sending side then
* cuMemAllocs memory again and gets the same base
* address. However, it cuMemAllocs a block that is larger than
* the one in the cache. The cache will return that memory is not
* registered and call into CUDA to register it. However, that
* will fail with CUDA_ERROR_ALREADY_MAPPED. Therefore we need to
* boot that previous allocation out and deregister it first.
*/
if (OMPI_ERR_WOULD_BLOCK == rc) {
mca_mpool_base_registration_t *oldreg;
SET_PAGE_ALIGNMENT_TO_ZERO();
/* Need to make sure it is at least 4 bytes in size. This will
* ensure we get the hit in the cache. */
mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg);
RESTORE_PAGE_ALIGNMENT();
/* The ref_count has to be zero as this memory cannot possibly
* be in use. Assert on that just to make sure. */
assert(0 == oldreg->ref_count);
if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)oldreg);
}
/* Bump the reference count to keep things copacetic in deregister */
oldreg->ref_count++;
/* Invalidate the registration so it will get booted out. */
oldreg->flags |= MCA_MPOOL_FLAGS_INVALID;
mca_mpool_rgpusm_deregister(mpool, oldreg);
mpool_rgpusm->stat_evicted++;
/* And try again. This only needs to be attempted one other time. */
rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg,
(mca_mpool_base_registration_t *)rget_reg);
}
if(rc != OMPI_SUCCESS) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
return rc;
}
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
"About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
SET_PAGE_ALIGNMENT_TO_ZERO();
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
mca_mpool_rgpusm_component.rcache_size_limit)) ==
OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
opal_output(-1, "No room in the cache - boot one out");
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
break;
}
}
RESTORE_PAGE_ALIGNMENT();
if(rc != OMPI_SUCCESS) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list, item);
/* We cannot recover from this. We can be here if the size of the cache
* is smaller than the amount of memory we are trying to register in a single
* transfer. In that case, rc is MPI_ERR_OUT_OF_RESOURCES, but everything is
* stuck at that point. Therefore, just error out completely.
*/
return OMPI_ERROR;
}
rgpusm_reg->base.ref_count++;
*reg = (mca_mpool_base_registration_t *)rgpusm_reg;
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* Cleanup any vmas that we have deferred deletion on */
mpool->rcache->rcache_clean(mpool->rcache);
return OMPI_SUCCESS;
}
/**
* free function
*/
void mca_mpool_rgpusm_free(mca_mpool_base_module_t *mpool, void *addr,
mca_mpool_base_registration_t *registration)
{
void *alloc_base = registration->alloc_base;
mca_mpool_rgpusm_deregister(mpool, registration);
free(alloc_base);
}
int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t *mpool, void *addr,
size_t size, mca_mpool_base_registration_t **reg)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
int rc;
unsigned char *base, *bound;
base = addr;
bound = base + size - 1; /* To keep cache hits working correctly */
OPAL_THREAD_LOCK(&mpool->rcache->lock);
opal_output(-1, "Looking for addr=%p, size=%d", addr, (int)size);
SET_PAGE_ALIGNMENT_TO_ZERO();
rc = mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
RESTORE_PAGE_ALIGNMENT();
if(*reg != NULL && mca_mpool_rgpusm_component.leave_pinned) {
if(0 == (*reg)->ref_count && mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list, (opal_list_item_t*)(*reg));
}
mpool_rgpusm->stat_cache_found++;
(*reg)->ref_count++;
} else {
mpool_rgpusm->stat_cache_notfound++;
}
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return rc;
}
static inline bool registration_is_cachebale(mca_mpool_base_registration_t *reg)
{
return !(reg->flags &
(MCA_MPOOL_FLAGS_CACHE_BYPASS |
MCA_MPOOL_FLAGS_INVALID));
}
int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
mca_mpool_base_registration_t *reg)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
int rc = OMPI_SUCCESS;
assert(reg->ref_count > 0);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
reg->ref_count--;
opal_output(-1, "Deregister: reg->ref_count=%d", (int)reg->ref_count);
if(reg->ref_count > 0) {
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
return OMPI_SUCCESS;
}
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg))
{
/* if leave_pinned is set don't deregister memory, but put it
* on LRU list for future use */
opal_list_prepend(&mpool_rgpusm->lru_list, (opal_list_item_t*)reg);
} else {
/* Remove from rcache first */
if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS))
mpool->rcache->rcache_delete(mpool->rcache, reg);
/* Drop the rcache lock before deregistring the memory */
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *)mpool;
assert(reg->ref_count == 0);
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
reg);
}
OPAL_THREAD_LOCK(&mpool->rcache->lock);
if(OMPI_SUCCESS == rc) {
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list,
(ompi_free_list_item_t*)reg);
}
}
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* Cleanup any vmas that we have deferred deletion on */
mpool->rcache->rcache_clean(mpool->rcache);
return rc;
}
#define RGPUSM_MPOOL_NREGS 100
void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
mca_mpool_base_registration_t *reg;
mca_mpool_base_registration_t *regs[RGPUSM_MPOOL_NREGS];
int reg_cnt, i;
int rc;
/* Statistic */
if(true == mca_mpool_rgpusm_component.print_stats) {
opal_output(0, "%s rgpusm: stats "
"(hit/valid/invalid/miss/evicted): %d/%d/%d/%d/%d\n",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
mpool_rgpusm->stat_cache_hit, mpool_rgpusm->stat_cache_valid,
mpool_rgpusm->stat_cache_invalid, mpool_rgpusm->stat_cache_miss,
mpool_rgpusm->stat_evicted);
}
OPAL_THREAD_LOCK(&mpool->rcache->lock);
do {
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
regs, RGPUSM_MPOOL_NREGS);
opal_output(-1, "Registration size at finalize = %d", reg_cnt);
for(i = 0; i < reg_cnt; i++) {
reg = regs[i];
if(reg->ref_count) {
reg->ref_count = 0; /* otherwise dereg will fail on assert */
} else if (mca_mpool_rgpusm_component.leave_pinned) {
opal_list_remove_item(&mpool_rgpusm->lru_list,
(opal_list_item_t*)reg);
}
/* Remove from rcache first */
mpool->rcache->rcache_delete(mpool->rcache, reg);
/* Drop lock before deregistering memory */
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
assert(reg->ref_count == 0);
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
reg);
OPAL_THREAD_LOCK(&mpool->rcache->lock);
if(rc != OMPI_SUCCESS) {
/* Potentially lose track of registrations
do we have to put it back? */
continue;
}
OMPI_FREE_LIST_RETURN(&mpool_rgpusm->reg_list,
(ompi_free_list_item_t*)reg);
}
} while(reg_cnt == RGPUSM_MPOOL_NREGS);
OBJ_DESTRUCT(&mpool_rgpusm->lru_list);
OBJ_DESTRUCT(&mpool_rgpusm->reg_list);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* Cleanup any vmas that we have deferred deletion on */
mpool->rcache->rcache_clean(mpool->rcache);
}
int mca_mpool_rgpusm_ft_event(int state) {
return OMPI_SUCCESS;
}
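For completeness (not part of the commit): a hypothetical receiving-side sketch of how a caller would drive this mpool. As documented in mca_mpool_rgpusm_register(), the **reg argument is also an input: it carries the registration holding the remote memory handle in, and comes back pointing at the opened local registration. The helper name open_remote_gpu_memory is an assumption:

#include "ompi/mca/mpool/rgpusm/mpool_rgpusm.h"
#include "ompi/mca/common/cuda/common_cuda.h"

/* Hypothetical helper: open the peer's GPU memory through the rgpusm
 * mpool.  rget_reg holds the memHandle/evtHandle received from the remote
 * process; on success *local_reg describes the locally opened mapping
 * (the mapped pointer is returned in (*local_reg)->alloc_base). */
static int open_remote_gpu_memory(mca_mpool_base_module_t *rgpusm,
                                  void *remote_addr, size_t len,
                                  mca_mpool_common_cuda_reg_t *rget_reg,
                                  mca_mpool_base_registration_t **local_reg)
{
    /* The **reg argument doubles as input: pass the handle-bearing
     * registration in, get the cached/opened registration back. */
    *local_reg = (mca_mpool_base_registration_t *)rget_reg;
    return rgpusm->mpool_register(rgpusm, remote_addr, len,
                                  0 /* flags slot carries the peer id, debug only */,
                                  local_reg);
}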