New btl that extends sm btl to support GPU transfers within a node.

Uses new CUDA IPC support. Also, a few minor changes in PML to take advantage of it. This code has no effect unless user asks for it explicitly via configure arguments. Otherwise, it is either #ifdef'ed out or not compiled. This commit was SVN r26039.
2012-02-24 02:13:33 +00:00 · 2012-02-24 02:13:33 +00:00 · b0a84b0a7d
--- a/contrib/check-btl-sm-diffs.pl
+++ b/contrib/check-btl-sm-diffs.pl
@ -0,0 +1,245 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This script is run to see the differences between some other BTL and
+# the sm BTL.  By default, it looks at the smcuda BTL but it can also be
+# used with other BTLs.  Prior to running the diff, it does some
+# preprocessing on the files.  First, it removes all the copyright
+# headers as differences in them are of no concern.  Secondly, it
+# converts all the BTL specific strings in the BTL to be compared to
+# "sm".  For example, with the smcuda, all "smcuda" strings are converted
+# to "sm" and all "SMCUDA" strings are converted to "SM".  In this way,
+# we avoid any spurious differences just related to the difference in
+# the names of the functions and variables.
+#
+# Lastly, in the case of smcuda only, it can strip out all code within
+# the BTL that is contained within specific #ifdef strings.  See the
+# code and comments below to see how it works.
+#
+# This script must be run from this directory as it makes assumptions
+# about where the BTL directories are located.  Here are some
+# examples.
+#
+# Run using all defaults.
+#  > check-btl-sm-diffs.pl
+#
+# Do not remove the SMCUDA specific code
+#  > check-btl-sm-diffs.pl -s
+#
+# Do not remove the SMCUDA specific code and save results in DIFFS
+#  > check-btl-sm-diffs.pl -s -o DIFFS
+#
+#
+use strict;
+
+use File::Copy;
+use File::Path;
+use Getopt::Long;
+
+my $diffdir = "diffdir";
+my $btlsdir = "../ompi/mca/btl";
+my $cmd;
+my $cmd_output;
+my $contents;
+my $smcudafile;
+my @smcudafiles;
+my $smfile;
+my @smfiles;
+my $alloutput;
+
+
+# Command line parsing.  Each variable below holds the value used when
+# the corresponding switch is absent from the command line.
+my $verbose_arg = 0;
+# NOTE: -S is accepted, but $show_arg is not consulted anywhere else in
+# this script; stripping the CUDA specific code is simply the default.
+my $show_arg = 1;
+my $showall_arg = 0;
+my $help_arg = 0;
+my $btl_arg = "smcuda";
+my $output_arg = "";
+
+# "bundling" lets single-letter switches be combined (e.g. -sv) and
+# treats them case sensitively, which is what keeps -s and -S apart.
+Getopt::Long::Configure("bundling");
+my $ok = Getopt::Long::GetOptions("verbose|v!" => \$verbose_arg,
+                                  "showall|s!" => \$showall_arg,
+                                  "show|S!" => \$show_arg,
+                                  "btl|p=s" => \$btl_arg,
+                                  "output|o=s" => \$output_arg,
+                                  "help|h!" => \$help_arg);
+
+if (!$ok || $help_arg) {
+    print "
+Usage: $0 [--show|-S] [--showall|-s] [--btl|-p=BTL]  [--output|-o=OUTPUTFILE]
+        [--verbose|-v] [--help|-h]
+
+Runs a diff between the files in the sm and the smcuda directory
+and prints the output to stdout.  Prior to checking the differences,
+the script removes all copyright header code.  It also first removes
+all CUDA specific code in the smcuda files.  Specifically, the script
+removes all code that is within the following ifdefs.
+#if OMPI_CUDA_SUPPORT
+ ...
+#endif /* OMPI_CUDA_SUPPORT */
+To view the smcuda specific code in the diff, run with the -s switch.
+
+-s   Show all the differences between the files.
+-S   Show all the differences between the files that are not
+     within \"\#if OMPI_CUDA_SUPPORT\" statements.  (default: -S)
+-p   BTL - which BTL to compare to sm (default: smcuda)
+-o   File name where to write the output to (instead of stdout).
+-v   Verbose - show more details of script activities.
+-h   This help
+\n";
+    # Asking for help is a success; an unparsable command line is not.
+    exit($ok ? 0 : 1);
+}
+
+# Lower and upper case spellings of the BTL name; both are rewritten
+# to sm/SM later on.
+my $btl = $btl_arg;
+my $BTL = uc($btl);
+
+# Change into BTL directory that is being compared to sm.  The path is
+# relative, so this only works when the script is started from the
+# directory it lives in.  Everything that follows (including the final
+# removal of the temp directory) uses relative paths, so stop right
+# here if the directory cannot be entered.
+chdir("$btlsdir/$btl")
+    or die("Couldn't change into $btlsdir/$btl: $!\n");
+
+print "\nStarting script to check differences between $btl and sm...\n";
+
+if (! -d $diffdir) {
+  mkdir($diffdir, 0777)
+      or die("Couldn't create temp directory $diffdir: $!\n");
+}
+
+# Copy the .h and .c files of the BTL into temp directory.
+@smcudafiles = <*.[hc]>;
+foreach $smcudafile (@smcudafiles) {
+  copy($smcudafile, $diffdir)
+      or die("Couldn't copy $smcudafile to $diffdir: $!\n");
+}
+if ($verbose_arg) {
+  print "Copied all $btl files to temp directory\n";
+}
+
+chdir($diffdir)
+    or die("Couldn't change into $diffdir: $!\n");
+
+# Using crude preprocessor, strip out all SMCUDA specific code.
+# If -s switch is provided, then leave SMCUDA specific code.
+foreach $smcudafile (@smcudafiles) {
+  $contents = Read($smcudafile);
+  # Test definedness rather than truth so that a file whose entire
+  # content is "0" is not mistaken for a failed read.
+  die("Couldn't Read $smcudafile!\n") if (!defined($contents));
+
+  if (!$showall_arg) {
+    # The first substitution removes both of these forms:
+    #   #if OMPI_CUDA_SUPPORT
+    #   ...stuff...
+    #   #else /* OMPI_CUDA_SUPPORT */
+    # and
+    #   #if OMPI_CUDA_SUPPORT
+    #   ...stuff...
+    #   #endif /* OMPI_CUDA_SUPPORT */
+    # The second substitution removes the #endif that is left over
+    # from the #if-#else form.
+    # Some notes about the regular expression.
+    #   1. Need the .*? so the #endif is matched with the closest if.
+    #   2. Added the comment OMPI_CUDA_SUPPORT on the #endif to get the right match.
+    #   3. Need the \n at the end to avoid leaving extra newlines.
+    $contents =~ s/#if OMPI_CUDA_SUPPORT(.*?)((#else \/\* OMPI_CUDA_SUPPORT \*\/\n)|(#endif \/\* OMPI_CUDA_SUPPORT \*\/\n))//gis;
+    $contents =~ s/#endif \/\* OMPI_CUDA_SUPPORT \*\/\n//gis;
+  }
+
+  # Strip off the copyright header also.
+  $contents =~ s/\/\*(.*?)\$HEADER\$\n \*\/\n//is;
+
+  # Now replace the string $btl with sm so we do not get spurious
+  # diffs when comparing to sm.  \Q...\E makes the BTL name match
+  # literally even if it contains regex metacharacters.
+  $contents =~ s/\Q$btl\E/sm/g;
+  $contents =~ s/\Q$BTL\E/SM/g;
+
+  Write($smcudafile, $contents);
+}
+if ($verbose_arg) {
+  print "All $btl specific code and copyrights has been removed from $btl files\n";
+  print "All $btl/$BTL strings converted to sm/SM strings in $btl files\n";
+}
+
+# Copy sm files into temp directory.  We are currently inside the temp
+# directory of the BTL being compared, so sm is two levels up.
+chdir("../../sm")
+    or die("Couldn't change into the sm directory: $!\n");
+@smfiles = <*.[hc]>;
+foreach $smfile (@smfiles) {
+  copy($smfile, "../$btl/$diffdir")
+      or die("Couldn't copy $smfile to ../$btl/$diffdir: $!\n");
+}
+if ($verbose_arg) {
+  print "Copied all sm files to temp directory\n";
+}
+
+chdir("../$btl/$diffdir")
+    or die("Couldn't change into ../$btl/$diffdir: $!\n");
+
+# Strip off copyright from sm files.
+foreach $smfile (@smfiles) {
+  $contents = Read($smfile);
+  # Test definedness rather than truth so that a file whose entire
+  # content is "0" is not mistaken for a failed read.
+  die("Couldn't Read $smfile!\n") if (!defined($contents));
+  # Strip off the copyright header.
+  $contents =~ s/\/\*(.*?)\$HEADER\$\n \*\/\n//is;
+  # Strip away KNEM as that is not in smcuda.  The first substitution
+  # removes everything from the #if up to the closest #else or #endif;
+  # the second one removes the #endif left behind by the #if-#else form.
+  $contents =~ s/#if OMPI_BTL_SM_HAVE_KNEM(.*?)((#else\n)|(#endif\n)|(#endif  \/\* OMPI_BTL_SM_HAVE_KNEM \*\/\n))//gis;
+  $contents =~ s/#endif  \/\* OMPI_BTL_SM_HAVE_KNEM \*\/\n//gis;
+  Write($smfile, $contents);
+}
+if ($verbose_arg) {
+  print "Removed copyright strings from all sm files\n";
+}
+
+
+# Now do a diff on the files.
+if ($verbose_arg) {
+  print "Now running diffs on all the files...\n\n";
+}
+foreach $smfile (@smfiles) {
+  # Derive the name of the matching $btl file by replacing the first
+  # "sm" in the sm file name with the BTL name.
+  $smcudafile = $smfile;
+  $smcudafile =~ s/sm/$btl/;
+  # Use the list form of a pipe open so that no shell is involved and
+  # the file names reach diff exactly as they are.
+  open(my $diff_fh, "-|", "diff", "-c", $smfile, $smcudafile)
+      or die("Couldn't run diff on $smfile and $smcudafile: $!\n");
+  $cmd_output = "";
+  $cmd_output .= $_
+      while (<$diff_fh>);
+  # diff exits with a non-zero status when the files differ, which
+  # makes close() return false; that is not an error here.
+  close($diff_fh);
+  if ($output_arg eq "") {
+    print "Files Compared: $smfile and $smcudafile\n";
+    print "$cmd_output";
+  } else {
+    # NOTE(review): presumably the message some diff implementations
+    # print for identical files; it is kept out of the saved results.
+    if ($cmd_output ne "No differences encountered\n") {
+      $alloutput .= $cmd_output;
+    }
+  }
+}
+
+# Step back out of the temp directory before it is removed.  The
+# removal below uses a relative path, so do not continue if this fails.
+chdir("..")
+    or die("Couldn't leave the temp directory $diffdir: $!\n");
+if ($output_arg ne "") {
+  # Remove a stale results file.  Only plain files and symlinks are
+  # removed so that a directory named by mistake is never deleted
+  # recursively.
+  if (-f $output_arg || -l $output_arg) {
+    unlink($output_arg)
+        or die("Couldn't remove old $output_arg: $!\n");
+  }
+  # Nothing was collected when no file produced any diff output.
+  $alloutput = "" if (!defined($alloutput));
+  Write($output_arg, $alloutput);
+}
+rmtree("$diffdir");
+
+# Function to read file into a string.
+#
+# Takes the name of the file to read.  Returns the complete content of
+# the file as one string ("" for an empty file).  If the file cannot
+# be opened, a warning is printed and undef is returned.
+sub Read {
+    my ($file) = @_;
+
+    # Three-argument open: the file name is never interpreted as a mode
+    # or a command.
+    my $input;
+    if (!open($input, "<", $file)) {
+        warn "Can't open $file: $!";
+        return;
+    }
+
+    # With $/ undefined a single read returns the whole file.
+    my $contents = do { local $/; <$input> };
+    $contents = "" if (!defined($contents));
+
+    close($input) or warn "Can't close $file: $!";
+    return $contents;
+}
+
+# Function to write string to a file.
+#
+# Takes the name of the file and the string to store in it.  The file
+# is created if needed and any previous content is replaced.  Dies if
+# the file cannot be opened or if the data cannot be written out.
+sub Write {
+    my ($filename, $body) = @_;
+
+    # Three-argument open: the file name is never interpreted as part
+    # of the mode.
+    open(my $output, ">", $filename)
+        or die("Failed to write to file: $!");
+
+    # An undefined body results in an empty file.
+    print $output (defined($body) ? $body : "");
+
+    # Buffered write errors only show up when the handle is closed.
+    close($output)
+        or die("Failed to write to file: $!");
+}
--- a/ompi/mca/btl/btl.h
+++ b/ompi/mca/btl/btl.h
@ -13,6 +13,7 @@
 * Copyright (c) 2006-2007 Los Alamos National Security, LLC.  All rights
 *                         reserved. 
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
@ -195,6 +196,10 @@ typedef uint8_t mca_btl_base_tag_t;
 /* btl can support failover if enabled */
 #define MCA_BTL_FLAGS_FAILOVER_SUPPORT 0x0200

+#define MCA_BTL_FLAGS_CUDA_PUT        0x0400
+#define MCA_BTL_FLAGS_CUDA_GET        0x0800
+#define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
+
 /* Default exclusivity levels */
 #define MCA_BTL_EXCLUSIVITY_HIGH     (64*1024) /* internal loopback */
 #define MCA_BTL_EXCLUSIVITY_DEFAULT  1024      /* GM/IB/etc. */
@ -241,7 +246,16 @@ struct mca_btl_base_segment_t {
        uint32_t  key32[4];
        uint64_t  key64[2];
        uint8_t   key8[16];
+#if OMPI_CUDA_SUPPORT
+        uint8_t cudakey[128]; /* 64 bytes for CUDA mem handle, 64 bytes for CUDA event handle */
+#endif /* OMPI_CUDA_SUPPORT */
    } seg_key;
+#if OMPI_CUDA_SUPPORT
+    /** Address of the entire memory handle */
+    ompi_ptr_t memh_seg_addr;        
+     /** Length in bytes of entire memory handle */
+    uint32_t   memh_seg_len;           
+#endif /* OMPI_CUDA_SUPPORT */
 };
 typedef struct mca_btl_base_segment_t mca_btl_base_segment_t;

--- a/ompi/mca/btl/smcuda/Makefile.am
+++ b/ompi/mca/btl/smcuda/Makefile.am
@ -0,0 +1,62 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2009 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, 
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+# $COPYRIGHT$
+# 
+# Additional copyrights may follow
+# 
+# $HEADER$
+#
+
+dist_pkgdata_DATA = help-mpi-btl-smcuda.txt
+
+libmca_btl_smcuda_la_sources = \
+    btl_smcuda.c \
+    btl_smcuda.h \
+    btl_smcuda_component.c \
+    btl_smcuda_endpoint.h \
+    btl_smcuda_fifo.h \
+    btl_smcuda_frag.c \
+    btl_smcuda_frag.h 
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_ompi_btl_smcuda_DSO
+component_noinst =
+component_install = mca_btl_smcuda.la
+else
+component_noinst = libmca_btl_smcuda.la
+component_install =
+endif
+
+# See ompi/mca/common/cuda/Makefile.am for an explanation of
+# libmca_common_sm.la.
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
+mca_btl_smcuda_la_LDFLAGS = -module -avoid-version
+mca_btl_smcuda_la_LIBADD = \
+    $(top_ompi_builddir)/ompi/mca/common/sm/libmca_common_sm.la
+mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
+if MCA_ompi_cuda_support
+mca_btl_smcuda_la_LIBADD += \
+    $(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
+endif
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
+libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version
+libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
--- a/ompi/mca/btl/smcuda/btl_smcuda.c
+++ b/ompi/mca/btl/smcuda/btl_smcuda.c
--- a/ompi/mca/btl/smcuda/btl_smcuda.h
+++ b/ompi/mca/btl/smcuda/btl_smcuda.h
@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2010      Los Alamos National Security, LLC.  
+ *                         All rights reserved. 
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_BTL_SMCUDA_H
+#define MCA_BTL_SMCUDA_H
+
+#include "ompi_config.h"
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif  /* HAVE_STDINT_H */
+#ifdef HAVE_SCHED_H
+#include <sched.h>
+#endif  /* HAVE_SCHED_H */
+
+#include "opal/util/bit_ops.h"
+#include "opal/class/opal_free_list.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/common/sm/common_sm.h"
+
+BEGIN_C_DECLS
+
+/*
+ * Shared Memory FIFOs
+ *
+ * The FIFO is implemented as a circular queue with head and tail pointers
+ * (integer indices).  For efficient wraparound indexing, the size of the
+ * queue is constrained to be a power of two and we "&" indices with a "mask".
+ *
+ * More than one process can write to the FIFO head.  Therefore, there is a head
+ * lock.  One cannot write until the head slot is empty, indicated by the special
+ * queue entry SM_FIFO_FREE.
+ *
+ * Only the receiver can read the FIFO tail.  Therefore, the tail lock is
+ * required only in multithreaded applications.  If a tail read returns the
+ * SM_FIFO_FREE value, that means the FIFO is empty.  Once a non-FREE value
+ * has been read, the queue slot is *not* automatically reset to SM_FIFO_FREE.
+ * Rather, read tail slots are reset "lazily" (see "lazy_free" and "num_to_clear")
+ * to reduce the number of memory barriers and improve performance.
+ *
+ * Since the FIFO lives in shared memory that is mapped differently into
+ * each address space, the "queue" pointer is relative (each process must
+ * add its own offset) and the queue_recv pointer is meaningful only in the
+ * receiver's address space.
+ *
+ * Since multiple processes access different parts of the FIFO structure in
+ * different ways, we introduce padding to keep different parts on different
+ * cachelines.
+ */
+
+#define SM_FIFO_FREE  (void *) (-2)
+/* We can't use opal_cache_line_size here because we need a
+   compile-time constant for padding the struct.  We can't really have
+   a compile-time constant that is portable, either (e.g., compile on
+   one machine and run on another).  So just use a big enough cache
+   line that should hopefully be good in most places. */
+#define SM_CACHE_LINE_PAD 128
+
+/* One shared-memory FIFO.  Each group of fields is followed by a pad
+   array sized so that the group occupies SM_CACHE_LINE_PAD bytes. */
+struct sm_fifo_t {
+    /* This queue pointer is used only by the heads.  It is stored in
+       relative form (see VIRTUAL2RELATIVE in sm_fifo_init) and is
+       converted back with RELATIVE2VIRTUAL in sm_fifo_write. */
+    volatile void **queue;           
+    char pad0[SM_CACHE_LINE_PAD - sizeof(void **)];
+    /* This lock is used by the heads. */
+    opal_atomic_lock_t head_lock;    
+    char pad1[SM_CACHE_LINE_PAD - sizeof(opal_atomic_lock_t)];
+    /* This index is used by the head holding the head lock. */
+    volatile int head;               
+    char pad2[SM_CACHE_LINE_PAD - sizeof(int)];
+    /* This mask is used "read only" by all processes.  It is set to
+       (queue size - 1) in sm_fifo_init and is "&"-ed with the indices. */
+    unsigned int mask;               
+    char pad3[SM_CACHE_LINE_PAD - sizeof(int)];
+    /* The following are used only by the tail. */
+    /* Address of the queue array as returned by the allocation in
+       sm_fifo_init (not made relative). */
+    volatile void **queue_recv;
+    opal_atomic_lock_t tail_lock;
+    /* Index of the next slot that sm_fifo_read looks at. */
+    volatile int tail;
+    /* Number of slots that have been read but not yet reset to
+       SM_FIFO_FREE. */
+    int num_to_clear;
+    /* Once num_to_clear reaches this value, sm_fifo_read resets the
+       read slots. */
+    int lazy_free;                   
+    char pad4[SM_CACHE_LINE_PAD - sizeof(void **) -
+              sizeof(opal_atomic_lock_t) -
+              sizeof(int) * 3];
+};
+typedef struct sm_fifo_t sm_fifo_t;
+
+/*
+ * Shared Memory resource management
+ */
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+#define DATA (char)0
+#define DONE (char)1
+#endif
+
+typedef struct mca_btl_smcuda_mem_node_t {
+    mca_mpool_base_module_t* sm_mpool; /**< shared memory pool */
+} mca_btl_smcuda_mem_node_t;
+
+/**
+ * Shared Memory (SM) BTL module.
+ */
+struct mca_btl_smcuda_component_t {
+    mca_btl_base_component_2_0_0_t super;  /**< base BTL component */
+    int sm_free_list_num;              /**< initial size of free lists */
+    int sm_free_list_max;              /**< maximum size of free lists */
+    int sm_free_list_inc;              /**< number of elements to alloc when growing free lists */
+    int32_t sm_max_procs;              /**< upper limit on the number of processes using the shared memory pool */
+    int sm_extra_procs;                /**< number of extra procs to allow */
+    char* sm_mpool_name;               /**< name of shared memory pool module */
+    mca_mpool_base_module_t **sm_mpools; /**< shared memory pools (one for each memory node) */
+    mca_mpool_base_module_t *sm_mpool; /**< mpool on local node */
+    void* sm_mpool_base;               /**< base address of shared memory pool */
+    size_t eager_limit;                /**< first fragment size */
+    size_t max_frag_size;              /**< maximum (second and beyond) fragment size */
+    opal_mutex_t sm_lock;
+    mca_common_sm_module_t *sm_seg;   /**< description of shared memory segment */
+    volatile sm_fifo_t **shm_fifo;     /**< pointer to fifo 2D array in shared memory */
+    char **shm_bases;                  /**< pointer to base pointers in shared memory */
+    uint16_t *shm_mem_nodes;           /**< pointer to mem nodes in shared memory */
+    sm_fifo_t **fifo;                  /**< cached copy of the pointer to the 2D
+                                          fifo array.  The address in the shared
+                                          memory segment sm_ctl_header is relative,
+                                          but this one, in process private memory, is
+                                          a real virtual address */
+    uint16_t *mem_nodes;               /**< cached copy of mem nodes of each local rank */
+    size_t fifo_size;                  /**< number of FIFO queue entries */
+    size_t fifo_lazy_free;             /**< number of reads before lazy fifo free is triggered */
+    int nfifos;                        /**< number of FIFOs per receiver */
+    int32_t num_smp_procs;             /**< current number of smp procs on this host */
+    int32_t my_smp_rank;               /**< My SMP process rank.  Used for accessing
+                                        *   SMP specific data structures. */
+    ompi_free_list_t sm_frags_eager;   /**< free list of sm first */
+    ompi_free_list_t sm_frags_max;     /**< free list of sm second */
+    ompi_free_list_t sm_frags_user;
+    ompi_free_list_t sm_first_frags_to_progress;  /**< list of first
+                                                    fragments that are
+                                                    awaiting resources */
+    struct mca_btl_base_endpoint_t **sm_peers;
+
+    opal_free_list_t pending_send_fl;
+    int num_outstanding_frags;         /**< number of fragments sent but not yet returned to free list */
+    int num_pending_sends;             /**< total number on all of my pending-send queues */
+    int mem_node;
+    int num_mem_nodes;
+    
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    char sm_fifo_path[PATH_MAX];   /**< path to fifo used to signal this process */
+    int  sm_fifo_fd;               /**< file descriptor corresponding to opened fifo */
+    opal_thread_t sm_fifo_thread;
+#endif
+    struct mca_btl_smcuda_t      **sm_btls;
+    struct mca_btl_smcuda_frag_t **table;
+    size_t sm_num_btls;
+    size_t sm_max_btls;
+
+
+    /** MCA: should we be using knem or not?  neg=try but continue if
+        not available, 0=don't try, 1=try and fail if not available */
+    int use_knem;
+
+    /** MCA: minimal message size (bytes) to offload on DMA engine
+        when using knem */
+    uint32_t knem_dma_min;
+
+    /** MCA: how many simultaneous ongoing knem operations to
+        support */
+    int knem_max_simultaneous;
+
+    /** If we want DMA and DMA is supported, this will be loaded with
+        KNEM_FLAG_DMA.  Otherwise, it'll be 0. */
+    int knem_dma_flag;
+};
+typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t;
+OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component;
+
+/**
+ * SM BTL Interface
+ */
+struct mca_btl_smcuda_t {
+    mca_btl_base_module_t  super;       /**< base BTL interface */
+    bool btl_inited;  /**< flag indicating if btl has been inited */
+    mca_btl_base_module_error_cb_fn_t error_cb;
+
+};
+typedef struct mca_btl_smcuda_t mca_btl_smcuda_t;
+OMPI_MODULE_DECLSPEC extern mca_btl_smcuda_t mca_btl_smcuda;
+
+
+
+
+
+struct btl_smcuda_pending_send_item_t
+{
+    opal_free_list_item_t super;
+    void *data;
+};
+typedef struct btl_smcuda_pending_send_item_t btl_smcuda_pending_send_item_t;
+
+/***
+ * FIFO support for sm BTL.
+ */
+
+/***
+ * One or more FIFO components may be a pointer that must be
+ * accessed by multiple processes.  Since the shared region may
+ * be mmapped differently into each process's address space,
+ * these pointers will be relative to some base address.  Here,
+ * we define macros to translate between relative addresses and
+ * virtual addresses.
+ */
+#define VIRTUAL2RELATIVE(VADDR ) ((long)(VADDR)  - (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
+#define RELATIVE2VIRTUAL(OFFSET) ((long)(OFFSET) + (long)mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank])
+
+/**
+ * Allocate the queue of a FIFO and put the FIFO into its empty state.
+ *
+ * @param fifo_size (IN)  Requested number of queue entries; rounded by
+ *                        opal_next_poweroftwo_inclusive().
+ * @param mpool (IN)      Memory pool the queue array is allocated from.
+ * @param fifo (IN/OUT)   FIFO structure to initialize.
+ * @param lazy_free (IN)  Value stored in fifo->lazy_free.
+ * @return                OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if
+ *                        the queue could not be allocated.
+ */
+static inline int sm_fifo_init(int fifo_size, mca_mpool_base_module_t *mpool,
+                               sm_fifo_t *fifo, int lazy_free)
+{
+    int i, qsize;
+
+    /* figure out the queue size (a power of two that is at least 1) */
+    qsize = opal_next_poweroftwo_inclusive (fifo_size);
+
+    /* allocate the queue in the receiver's address space */
+    fifo->queue_recv = (volatile void **)mpool->mpool_alloc(
+            mpool, sizeof(void *) * qsize, opal_cache_line_size, 0, NULL);
+    if(NULL == fifo->queue_recv) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* initialize the queue: every slot starts out free */
+    for ( i = 0; i < qsize; i++ )
+        fifo->queue_recv[i] = SM_FIFO_FREE;
+
+    /* shift queue address to be relative */
+    fifo->queue = (volatile void **) VIRTUAL2RELATIVE(fifo->queue_recv);
+
+    /* initialize the locks */
+    opal_atomic_init(&(fifo->head_lock), OPAL_ATOMIC_UNLOCKED);
+    opal_atomic_init(&(fifo->tail_lock), OPAL_ATOMIC_UNLOCKED);
+    opal_atomic_unlock(&(fifo->head_lock));  /* should be unnecessary */
+    opal_atomic_unlock(&(fifo->tail_lock));  /* should be unnecessary */
+
+    /* other initializations */
+    fifo->head = 0;
+    /* qsize is a power of two, so qsize - 1 is the wraparound mask */
+    fifo->mask = qsize - 1;
+    fifo->tail = 0;
+    fifo->num_to_clear = 0;
+    fifo->lazy_free = lazy_free;
+
+    return OMPI_SUCCESS;
+}
+
+
+/**
+ * Store a value in the head slot of a FIFO.
+ *
+ * No lock is taken in this function.
+ * NOTE(review): presumably the caller holds fifo->head_lock - confirm
+ * against the call sites.
+ *
+ * @param value (IN)     Value to store in the queue.
+ * @param fifo (IN/OUT)  FIFO to write to.
+ * @return               OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the
+ *                       head slot does not hold SM_FIFO_FREE.
+ */
+static inline int sm_fifo_write(void *value, sm_fifo_t *fifo)
+{
+    /* the queue pointer is stored in relative form; convert it first */
+    volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo->queue);
+
+    /* if there is no free slot to write, report exhausted resource */
+    opal_atomic_rmb();
+    if ( SM_FIFO_FREE != q[fifo->head] )
+        return OMPI_ERR_OUT_OF_RESOURCE;
+
+    /* otherwise, write to the slot and advance the head index */
+    q[fifo->head] = value;
+    opal_atomic_wmb();
+    fifo->head = (fifo->head + 1) & fifo->mask;
+    return OMPI_SUCCESS;
+}
+
+
+/**
+ * Read the value in the tail slot of a FIFO.
+ *
+ * No lock is taken in this function.
+ * NOTE(review): presumably the caller holds fifo->tail_lock where one
+ * is needed - confirm against the call sites.
+ *
+ * @param fifo (IN/OUT)  FIFO to read from.
+ * @return               The value found in the tail slot.  This is
+ *                       SM_FIFO_FREE if the slot is empty, in which case
+ *                       the FIFO is left unchanged.
+ */
+static inline void *sm_fifo_read(sm_fifo_t *fifo)
+{
+    void *value;
+
+    /* read the next queue entry */
+    value = (void *) fifo->queue_recv[fifo->tail];
+
+    opal_atomic_rmb();
+
+    /* if you read a non-empty slot, advance the tail pointer */
+    if ( SM_FIFO_FREE != value ) {
+
+        fifo->tail = ( fifo->tail + 1 ) & fifo->mask;
+        fifo->num_to_clear += 1;
+
+        /* check if it's time to free slots, which we do lazily */
+        if ( fifo->num_to_clear >= fifo->lazy_free ) {
+            /* index of the oldest slot that was read but not yet reset */
+            int i = (fifo->tail - fifo->num_to_clear ) & fifo->mask;
+
+            while ( fifo->num_to_clear > 0 ) {
+                fifo->queue_recv[i] = SM_FIFO_FREE;
+                i = (i+1) & fifo->mask;
+                fifo->num_to_clear -= 1;
+            }
+            opal_atomic_wmb();
+        }
+    }
+
+    return value;
+}
+
+/**
+ * shared memory component progress.
+ */
+extern int mca_btl_smcuda_component_progress(void);
+
+
+
+/**
+ * Register a callback function that is called on error.
+ *
+ * @param btl (IN)     BTL module
+ * @return             Status indicating if cleanup was successful
+ */
+
+int mca_btl_smcuda_register_error_cb(
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_module_error_cb_fn_t cbfunc
+);
+
+/**
+ * Cleanup any resources held by the BTL.
+ *
+ * @param btl  BTL instance.
+ * @return     OMPI_SUCCESS or error status on failure.
+ */
+
+extern int mca_btl_smcuda_finalize(
+    struct mca_btl_base_module_t* btl
+);
+
+
+/**
+ * PML->BTL notification of change in the process list.
+ * PML->BTL Notification that a receive fragment has been matched.
+ * Called for message that is send from process with the virtual
+ * address of the shared memory segment being different than that of
+ * the receiver.
+ *
+ * @param btl (IN)
+ * @param proc (IN)
+ * @param peer (OUT)
+ * @return     OMPI_SUCCESS or error status on failure.
+ *
+ */
+
+extern int mca_btl_smcuda_add_procs(
+    struct mca_btl_base_module_t* btl,
+    size_t nprocs,
+    struct ompi_proc_t **procs,
+    struct mca_btl_base_endpoint_t** peers,
+    struct opal_bitmap_t* reachability
+);
+
+
+/**
+ * PML->BTL notification of change in the process list.
+ *
+ * @param btl (IN)     BTL instance
+ * @param proc (IN)    Peer process
+ * @param peer (IN)    Peer addressing information.
+ * @return             Status indicating if cleanup was successful
+ *
+ */
+extern int mca_btl_smcuda_del_procs(
+    struct mca_btl_base_module_t* btl,
+    size_t nprocs,
+    struct ompi_proc_t **procs,
+    struct mca_btl_base_endpoint_t **peers
+);
+
+
+/**
+ * Allocate a segment.
+ *
+ * @param btl (IN)      BTL module
+ * @param size (IN)     Request segment size.
+ */
+extern mca_btl_base_descriptor_t* mca_btl_smcuda_alloc(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    uint8_t order,
+    size_t size,
+    uint32_t flags
+);
+
+/**
+ * Return a segment allocated by this BTL.
+ *
+ * @param btl (IN)      BTL module
+ * @param segment (IN)  Allocated segment.
+ */
+extern int mca_btl_smcuda_free(
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_descriptor_t* segment
+);
+
+
+/**
+ * Pack data
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_src(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    mca_mpool_base_registration_t* registration,
+    struct opal_convertor_t* convertor,
+    uint8_t order,
+    size_t reserve,
+    size_t* size,
+    uint32_t flags
+);
+
+
+/**
+ * Initiate an inlined send to the peer or return a descriptor.
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+extern int mca_btl_smcuda_sendi( struct mca_btl_base_module_t* btl,
+                             struct mca_btl_base_endpoint_t* endpoint,
+                             struct opal_convertor_t* convertor,
+                             void* header,
+                             size_t header_size,
+                             size_t payload_size,
+                             uint8_t order,
+                             uint32_t flags,
+                             mca_btl_base_tag_t tag,
+                             mca_btl_base_descriptor_t** descriptor );
+
+/**
+ * Initiate a send to the peer.
+ *
+ * @param btl (IN)      BTL module
+ * @param peer (IN)     BTL peer addressing
+ */
+extern int mca_btl_smcuda_send(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    struct mca_btl_base_descriptor_t* descriptor,
+    mca_btl_base_tag_t tag
+);
+
+#if OMPI_CUDA_SUPPORT
+/**
+ * Remote get using device memory.
+ */
+extern int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t* btl,
+							   struct mca_btl_base_endpoint_t* ep,
+							   struct mca_btl_base_descriptor_t* descriptor);
+
+extern struct mca_btl_base_descriptor_t* mca_btl_smcuda_prepare_dst(
+		struct mca_btl_base_module_t* btl,
+		struct mca_btl_base_endpoint_t* endpoint,
+		struct mca_mpool_base_registration_t* registration,
+		struct opal_convertor_t* convertor,
+		uint8_t order,
+		size_t reserve,
+		size_t* size,
+		uint32_t flags);
+#endif /* OMPI_CUDA_SUPPORT */
+
+/**
+ * Fault Tolerance Event Notification Function
+ * @param state Checkpoint State
+ * @return OMPI_SUCCESS or failure status
+ */
+int mca_btl_smcuda_ft_event(int state);
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+void mca_btl_smcuda_component_event_thread(opal_object_t*);
+#endif
+
+/*
+ * MCA_BTL_SMCUDA_SIGNAL_PEER(peer)
+ *
+ * When OMPI_ENABLE_PROGRESS_THREADS is 1, writes a single DATA byte to
+ * peer->fifo_fd and reports through opal_output() if the write does
+ * not transfer exactly one byte.  Otherwise the macro expands to
+ * nothing.
+ *
+ * The enabled form expands to a bare { ... } block and uses "peer"
+ * without parentheses, so pass a plain pointer expression.
+ */
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer) \
+{ \
+    unsigned char cmd = DATA; \
+    if(write(peer->fifo_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) { \
+        opal_output(0, "mca_btl_smcuda_send: write fifo failed: errno=%d\n", errno); \
+    } \
+}
+#else
+#define MCA_BTL_SMCUDA_SIGNAL_PEER(peer)
+#endif
+
+END_C_DECLS
+
+#endif
+
--- a/ompi/mca/btl/smcuda/btl_smcuda_component.c
+++ b/ompi/mca/btl/smcuda/btl_smcuda_component.c
@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2010-2011 Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * Copyright (c) 2011-2012 NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif  /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif  /* HAVE_STRING_H */
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif  /* HAVE_FCNTL_H */
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif  /* HAVE_SYS_TYPES_H */
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif  /* HAVE_SYS_MMAN_H */
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>  /* for mkfifo */
+#endif  /* HAVE_SYS_STAT_H */
+
+#include "ompi/constants.h"
+#include "opal/mca/event/event.h"
+#include "opal/util/bit_ops.h"
+#include "opal/util/output.h"
+#include "orte/util/proc_info.h"
+#include "orte/util/show_help.h"
+#include "orte/runtime/orte_globals.h"
+
+#include "opal/mca/base/mca_base_param.h"
+#include "ompi/mca/mpool/base/base.h"
+#if OMPI_CUDA_SUPPORT
+#include "ompi/runtime/params.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
+#include "ompi/mca/common/sm/common_sm.h"
+#include "ompi/mca/btl/base/btl_base_error.h"
+
+#if OPAL_ENABLE_FT_CR    == 1
+#include "opal/runtime/opal_cr.h"
+#endif
+
+#include "btl_smcuda.h"
+#include "btl_smcuda_frag.h"
+#include "btl_smcuda_fifo.h"
+
+static int mca_btl_smcuda_component_open(void);
+static int mca_btl_smcuda_component_close(void);
+static int smcuda_register(void);
+static mca_btl_base_module_t** mca_btl_smcuda_component_init(
+    int *num_btls,
+    bool enable_progress_threads,
+    bool enable_mpi_threads
+);
+
+
+/*
+ * Shared Memory (SM) component instance.
+ *
+ * Only the embedded base part ("super") is initialized statically here.
+ * The remaining members of mca_btl_smcuda_component are assigned at run
+ * time by smcuda_register(), mca_btl_smcuda_component_open() and
+ * mca_btl_smcuda_component_init().
+ */
+mca_btl_smcuda_component_t mca_btl_smcuda_component = {
+    {  /* super is being filled in */
+        /* First, the mca_base_component_t struct containing meta information
+          about the component itself */
+        {
+            MCA_BTL_BASE_VERSION_2_0_0,
+
+            "smcuda", /* MCA component name */
+            OMPI_MAJOR_VERSION,  /* MCA component major version */
+            OMPI_MINOR_VERSION,  /* MCA component minor version */
+            OMPI_RELEASE_VERSION,  /* MCA component release version */
+            mca_btl_smcuda_component_open,  /* component open */
+            mca_btl_smcuda_component_close,  /* component close */
+            NULL,  /* hook left unset (presumably the component query function -- confirm against mca_base_component_t) */
+            smcuda_register,  /* registers the component's MCA parameters */
+        },
+        {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        },
+
+        mca_btl_smcuda_component_init,  /* component initialization */
+        mca_btl_smcuda_component_progress,  /* component progress */
+    }  /* end super */
+};
+
+
+/*
+ * utility routines for parameter registration
+ */
+
+/**
+ * Register a string-valued MCA parameter and return its current value.
+ *
+ * @param param_name     name of the parameter
+ * @param default_value  default handed to the registration call; may be NULL
+ * @return  the string stored by mca_base_param_lookup_string().  If the
+ *          lookup stores nothing, a strdup() copy of default_value is
+ *          returned instead (NULL when default_value is NULL or the copy
+ *          fails).  The fallback is heap-allocated because the component
+ *          close routine passes the value obtained here for the mpool name
+ *          to free().
+ *
+ * NOTE(review): the parameter is registered under component name "sm" even
+ * though this component is named "smcuda" -- confirm that sharing the
+ * parameter names is intended.
+ */
+static inline char* mca_btl_smcuda_param_register_string(
+    const char* param_name,
+    const char* default_value)
+{
+    /* Start from NULL so that a failed lookup can never hand an
+     * uninitialized pointer back to the caller (the integer helper
+     * likewise starts from its default). */
+    char *param_value = NULL;
+    int id = mca_base_param_register_string("btl","sm",param_name,NULL,default_value);
+
+    mca_base_param_lookup_string(id, &param_value);
+    if (NULL == param_value && NULL != default_value) {
+        param_value = strdup(default_value);
+    }
+    return param_value;
+}
+
+/**
+ * Register an integer MCA parameter (type "btl", component "sm") and return
+ * the value found by the subsequent lookup.  The result variable starts out
+ * as default_value, so that is what comes back if the lookup stores nothing.
+ */
+static inline int mca_btl_smcuda_param_register_int(
+    const char* param_name,
+    int default_value)
+{
+    int value = default_value;
+    int index;
+
+    index = mca_base_param_register_int("btl", "sm", param_name, NULL, default_value);
+    mca_base_param_lookup_int(index, &value);
+
+    return value;
+}
+
+
+static int smcuda_register(void)
+{
+    /* Register the SM component parameters and cache their current values
+     * in the component structure.  Each helper call below registers the
+     * named parameter with the given default and returns its value.
+     * This function always returns OMPI_SUCCESS. */
+    mca_btl_smcuda_component.sm_free_list_num =
+        mca_btl_smcuda_param_register_int("free_list_num", 8);
+    mca_btl_smcuda_component.sm_free_list_max =
+        mca_btl_smcuda_param_register_int("free_list_max", -1);
+    mca_btl_smcuda_component.sm_free_list_inc =
+        mca_btl_smcuda_param_register_int("free_list_inc", 64);
+    mca_btl_smcuda_component.sm_max_procs =
+        mca_btl_smcuda_param_register_int("max_procs", -1);
+    mca_btl_smcuda_component.sm_mpool_name =
+        mca_btl_smcuda_param_register_string("mpool", "sm");
+    mca_btl_smcuda_component.fifo_size =
+        mca_btl_smcuda_param_register_int("fifo_size", 4096);
+    mca_btl_smcuda_component.nfifos =
+        mca_btl_smcuda_param_register_int("num_fifos", 1);
+
+    mca_btl_smcuda_component.fifo_lazy_free =
+        mca_btl_smcuda_param_register_int("fifo_lazy_free", 120);
+
+    /* default number of extra procs to allow for future growth */
+    mca_btl_smcuda_component.sm_extra_procs =
+        mca_btl_smcuda_param_register_int("sm_extra_procs", 0);
+
+#if OMPI_CUDA_SUPPORT
+    mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;  /* one above the non-CUDA setting */
+#else /* OMPI_CUDA_SUPPORT */
+    mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH-1;
+#endif /* OMPI_CUDA_SUPPORT */
+    mca_btl_smcuda.super.btl_eager_limit = 4*1024;
+    mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024;
+    mca_btl_smcuda.super.btl_max_send_size = 32*1024;
+    mca_btl_smcuda.super.btl_rdma_pipeline_send_length = 64*1024;
+    mca_btl_smcuda.super.btl_rdma_pipeline_frag_size = 64*1024;
+    mca_btl_smcuda.super.btl_min_rdma_pipeline_size = 64*1024;
+    mca_btl_smcuda.super.btl_flags = MCA_BTL_FLAGS_SEND;
+#if OMPI_CUDA_SUPPORT
+    mca_btl_smcuda.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
+#endif /* OMPI_CUDA_SUPPORT */
+    mca_btl_smcuda.super.btl_bandwidth = 9000;  /* Mbs */
+    mca_btl_smcuda.super.btl_latency   = 1;     /* Microsecs */
+
+    /* Call the BTL base to register its MCA params for this module.  The
+     * module fields assigned above are handed over through
+     * mca_btl_smcuda.super (presumably acting as the defaults -- confirm
+     * against mca_btl_base_param_register). */
+    mca_btl_base_param_register(&mca_btl_smcuda_component.super.btl_version,
+                                &mca_btl_smcuda.super);
+
+    return OMPI_SUCCESS;
+}
+
+/*
+ *  Called by MCA framework to open the component.
+ *
+ *  Parameter registration itself happens in smcuda_register(); this
+ *  function only sanitizes the fifo-related values, copies the module's
+ *  send-size limits into the component, and constructs the component's
+ *  lock and free lists.  It always returns OMPI_SUCCESS.
+ */
+
+static int mca_btl_smcuda_component_open(void)
+{
+    mca_btl_smcuda_component.sm_max_btls = 1;
+
+    /* make sure the number of fifos is a power of 2 */
+    mca_btl_smcuda_component.nfifos = opal_next_poweroftwo_inclusive (mca_btl_smcuda_component.nfifos);
+
+    /* make sure that queue size and lazy free parameter are compatible:
+     * fifo_lazy_free is capped at half of fifo_size and is then forced
+     * to be at least 1 */
+    if (mca_btl_smcuda_component.fifo_lazy_free >= (mca_btl_smcuda_component.fifo_size >> 1) )
+        mca_btl_smcuda_component.fifo_lazy_free  = (mca_btl_smcuda_component.fifo_size >> 1);
+    if (mca_btl_smcuda_component.fifo_lazy_free <= 0)
+        mca_btl_smcuda_component.fifo_lazy_free  = 1;
+
+    mca_btl_smcuda_component.max_frag_size = mca_btl_smcuda.super.btl_max_send_size;
+    mca_btl_smcuda_component.eager_limit = mca_btl_smcuda.super.btl_eager_limit;
+
+    /* initialize objects */
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_eager, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_max, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * component cleanup
+ *
+ * Destroys the component lock, tears down the shared memory segment (when
+ * one was set up), shuts down the progress-thread fifo when progress
+ * threads are enabled, and frees the mpool name string.
+ *
+ * Returns OMPI_SUCCESS, or OMPI_ERROR if mca_common_sm_fini() fails.
+ *
+ * NOTE(review): the failure path jumps straight to CLEANUP and therefore
+ * skips the unlink/OBJ_RELEASE of the segment, the fifo shutdown and the
+ * free of sm_mpool_name -- confirm that this is intended.
+ */
+
+static int mca_btl_smcuda_component_close(void)
+{
+    int return_value = OMPI_SUCCESS;
+
+
+    OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
+    /**
+     * We don't have to destroy the fragment lists. They are allocated
+     * directly into the mmapped file, they will auto-magically disappear
+     * when the file get unmapped.
+     */
+    /*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_eager);*/
+    /*OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_frags_max);*/
+
+    /* unmap the shared memory control structure */
+    if(mca_btl_smcuda_component.sm_seg != NULL) {
+        return_value = mca_common_sm_fini( mca_btl_smcuda_component.sm_seg );
+        if( OMPI_SUCCESS != return_value ) {
+            return_value=OMPI_ERROR;
+            opal_output(0," mca_common_sm_fini failed\n");
+            goto CLEANUP;
+        }
+
+        /* unlink file, so that it will be deleted when all references
+         * to it are gone - no error checking, since we want all procs
+         * to call this, so that in an abnormal termination scenario,
+         * this file will still get cleaned up */
+#if OPAL_ENABLE_FT_CR    == 1
+        /* Only unlink the file if we are *not* restarting
+         * If we are restarting the file will be unlinked at a later time.
+         */
+        if(OPAL_CR_STATUS_RESTART_PRE  != opal_cr_checkpointing_state &&
+           OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
+            unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
+        }
+#else
+        unlink(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
+#endif
+        OBJ_RELEASE(mca_btl_smcuda_component.sm_seg);
+    }
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    /* close/cleanup fifo create for event notification.
+     * NOTE(review): the test is "> 0", so a descriptor value of 0 would be
+     * treated as "no fifo" -- confirm this cannot occur. */
+    if(mca_btl_smcuda_component.sm_fifo_fd > 0) {
+        /* write a done message down the pipe; the event thread returns
+         * when it reads DONE, which lets the join below complete */
+        unsigned char cmd = DONE;
+        if( write(mca_btl_smcuda_component.sm_fifo_fd,&cmd,sizeof(cmd)) !=
+                sizeof(cmd)){
+            opal_output(0, "mca_btl_smcuda_component_close: write fifo failed: errno=%d\n",
+                    errno);
+        }
+        opal_thread_join(&mca_btl_smcuda_component.sm_fifo_thread, NULL);
+        close(mca_btl_smcuda_component.sm_fifo_fd);
+        unlink(mca_btl_smcuda_component.sm_fifo_path);
+    }
+#endif
+
+    if (NULL != mca_btl_smcuda_component.sm_mpool_name) {
+        free(mca_btl_smcuda_component.sm_mpool_name);
+    }
+
+CLEANUP:
+
+    /* return */
+    return return_value;
+}
+
+/*
+ *  SM component initialization
+ *
+ *  Hands the single, statically allocated smcuda module (mca_btl_smcuda) to
+ *  the caller and resets the component's per-run bookkeeping.
+ *
+ *  num_btls (OUT)            number of modules returned: 1 on success,
+ *                            0 on every failure path
+ *  enable_progress_threads   not referenced by this function
+ *  enable_mpi_threads        not referenced by this function
+ *
+ *  Returns a malloc'ed one-element array pointing at mca_btl_smcuda, or
+ *  NULL if no session directory exists, the notification fifo cannot be
+ *  set up, or memory allocation fails.
+ */
+static mca_btl_base_module_t** mca_btl_smcuda_component_init(
+    int *num_btls,
+    bool enable_progress_threads,
+    bool enable_mpi_threads)
+{
+    mca_btl_base_module_t **btls = NULL;
+
+    *num_btls = 0;
+
+    /* if no session directory was created, then we cannot be used */
+    if (!orte_create_session_dirs) {
+        return NULL;
+    }
+
+    /* lookup/create shared memory pool only when used */
+    mca_btl_smcuda_component.sm_mpool = NULL;
+    mca_btl_smcuda_component.sm_mpool_base = NULL;
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    /* create a named pipe to receive events.
+     * NOTE(review): sprintf() does not bound the write into sm_fifo_path;
+     * confirm the buffer is large enough for any session directory name. */
+    sprintf( mca_btl_smcuda_component.sm_fifo_path,
+             "%s"OPAL_PATH_SEP"sm_fifo.%lu", orte_process_info.job_session_dir,
+             (unsigned long)ORTE_PROC_MY_NAME->vpid );
+    if(mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
+        opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n",errno);
+        return NULL;
+    }
+    mca_btl_smcuda_component.sm_fifo_fd = open(mca_btl_smcuda_component.sm_fifo_path, O_RDWR);
+    if(mca_btl_smcuda_component.sm_fifo_fd < 0) {
+        opal_output(0, "mca_btl_smcuda_component_init: open(%s) failed with errno=%d\n",
+                    mca_btl_smcuda_component.sm_fifo_path, errno);
+        /* do not leave the fifo node we just created behind */
+        unlink(mca_btl_smcuda_component.sm_fifo_path);
+        return NULL;
+    }
+
+    OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_fifo_thread, opal_thread_t);
+    mca_btl_smcuda_component.sm_fifo_thread.t_run = (opal_thread_fn_t) mca_btl_smcuda_component_event_thread;
+    opal_thread_start(&mca_btl_smcuda_component.sm_fifo_thread);
+#endif
+
+    mca_btl_smcuda_component.sm_btls = (mca_btl_smcuda_t **) malloc( mca_btl_smcuda_component.sm_max_btls * sizeof (mca_btl_smcuda_t *));
+    if (NULL == mca_btl_smcuda_component.sm_btls) {
+        return NULL;
+    }
+
+    /* allocate the Shared Memory BTL */
+    btls = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*));
+    if (NULL == btls) {
+        /* release the array allocated just above instead of leaking it */
+        free(mca_btl_smcuda_component.sm_btls);
+        mca_btl_smcuda_component.sm_btls = NULL;
+        return NULL;
+    }
+
+    /* report a module only once everything needed for it exists */
+    *num_btls = 1;
+
+    /* get pointer to the btls */
+    btls[0] = (mca_btl_base_module_t*)(&(mca_btl_smcuda));
+    mca_btl_smcuda_component.sm_btls[0] = (mca_btl_smcuda_t*)(&(mca_btl_smcuda));
+
+    /* initialize some BTL data */
+    /* start with no SM procs */
+    mca_btl_smcuda_component.num_smp_procs = 0;
+    mca_btl_smcuda_component.my_smp_rank   = -1;  /* not defined */
+    mca_btl_smcuda_component.sm_num_btls   = 1;
+    /* set flag indicating btl not inited */
+    mca_btl_smcuda.btl_inited = false;
+
+#if OMPI_CUDA_SUPPORT
+    /* Assume CUDA GET works. */
+    mca_btl_smcuda.super.btl_get = mca_btl_smcuda_get_cuda;
+#endif /* OMPI_CUDA_SUPPORT */
+
+    return btls;
+
+}
+
+
+/*
+ *  SM component progress.
+ */
+
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+/*
+ * Body of the event thread: block on the component's notification fifo and
+ * run one pass of mca_btl_smcuda_component_progress() for every command byte
+ * received.  The thread returns on a DONE command, or as soon as a read does
+ * not deliver exactly one byte.
+ */
+void mca_btl_smcuda_component_event_thread(opal_object_t* thread)
+{
+    unsigned char command;
+
+    for (;;) {
+        ssize_t got = read(mca_btl_smcuda_component.sm_fifo_fd,
+                           &command, sizeof(command));
+
+        /* leave on a read error / short read, or on an explicit DONE */
+        if (got != (ssize_t)sizeof(command) || DONE == command) {
+            break;
+        }
+        mca_btl_smcuda_component_progress();
+    }
+}
+#endif
+
+/*
+ * Try to push the sends queued on an endpoint into the peer's fifo, taking
+ * them from the front of the list, until the list is empty or a fifo write
+ * does not succeed.
+ */
+void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep)
+{
+    btl_smcuda_pending_send_item_t *pending;
+    int write_rc;
+
+    /* The list size is sampled without holding the endpoint lock.  A stale
+     * answer is harmless: the removal itself happens under the lock, and
+     * coming back empty-handed is dealt with right after it. */
+    while ( opal_list_get_size(&ep->pending_sends) > 0 ) {
+        OPAL_THREAD_LOCK(&ep->endpoint_lock);
+        pending = (btl_smcuda_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
+        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
+
+        /* another thread emptied the list first; nothing left for us */
+        if (NULL == pending) {
+            break;
+        }
+
+        OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_pending_sends, -1);
+
+        /* resend is true here, so data that still does not fit goes back
+         * to the head of the pending list */
+        MCA_BTL_SMCUDA_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, pending->data,
+                          true, false, write_rc);
+
+        OPAL_FREE_LIST_RETURN(&mca_btl_smcuda_component.pending_send_fl, (opal_list_item_t*)pending);
+
+        if ( OMPI_SUCCESS != write_rc ) {
+            break;
+        }
+    }
+}
+
+int mca_btl_smcuda_component_progress(void)
+{
+    /* One pass of the component's progress engine: retry queued sends,
+     * drain this process's receive fifos and, with CUDA support, complete
+     * finished CUDA events.  The return value is nevents, the number of
+     * fifo entries and CUDA events handled in this pass.
+     *
+     * local variables */
+    mca_btl_smcuda_frag_t *frag;
+    mca_btl_smcuda_frag_t Frag;
+    sm_fifo_t *fifo = NULL;
+    mca_btl_smcuda_hdr_t *hdr;
+    int my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
+    int peer_smp_rank, j, rc = 0, nevents = 0;
+
+    /* first, deal with any pending sends */
+    /* This check should be fast since we only need to check one variable. */
+    if ( 0 < mca_btl_smcuda_component.num_pending_sends ) {
+
+        /* perform a loop to find the endpoints that have pending sends */
+        /* This can take a while longer if there are many endpoints to check. */
+        for ( peer_smp_rank = 0; peer_smp_rank < mca_btl_smcuda_component.num_smp_procs; peer_smp_rank++) {
+            struct mca_btl_base_endpoint_t* endpoint;
+            if ( peer_smp_rank == my_smp_rank )
+                continue;
+            endpoint = mca_btl_smcuda_component.sm_peers[peer_smp_rank];
+            if ( 0 < opal_list_get_size(&endpoint->pending_sends) )
+                btl_smcuda_process_pending_sends(endpoint);
+        }
+    }
+
+    /* poll each fifo */
+    for(j = 0; j < FIFO_MAP_NUM(mca_btl_smcuda_component.num_smp_procs); j++) {
+        fifo = &(mca_btl_smcuda_component.fifo[my_smp_rank][j]);
+      recheck_peer:
+        /* acquire thread lock */
+        if(opal_using_threads()) {
+            opal_atomic_lock(&(fifo->tail_lock));
+        }
+
+        hdr = (mca_btl_smcuda_hdr_t *)sm_fifo_read(fifo);
+
+        /* release thread lock */
+        if(opal_using_threads()) {
+            opal_atomic_unlock(&(fifo->tail_lock));
+        }
+
+        if(SM_FIFO_FREE == hdr) {
+            continue;
+        }
+
+        nevents++;
+        /* dispatch fragment by type; the type is carried in the low bits
+         * of the value read from the fifo */
+        switch(((uintptr_t)hdr) & MCA_BTL_SMCUDA_FRAG_TYPE_MASK) {
+            case MCA_BTL_SMCUDA_FRAG_SEND:
+            {
+                mca_btl_active_message_callback_t* reg;
+                /* change the address from address relative to the shared
+                 * memory address, to a true virtual address */
+                hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
+                peer_smp_rank = hdr->my_smp_rank;
+#if OPAL_ENABLE_DEBUG
+                if ( FIFO_MAP(peer_smp_rank) != j ) {
+                    opal_output(0, "mca_btl_smcuda_component_progress: "
+                                "rank %d got %d on FIFO %d, but this sender should send to FIFO %d\n",
+                                my_smp_rank, peer_smp_rank, j, FIFO_MAP(peer_smp_rank));
+                }
+#endif
+                /* recv upcall: the callback is looked up by the header tag
+                 * and handed a descriptor built on the stack whose single
+                 * destination segment is the payload following the header */
+                reg = mca_btl_base_active_message_trigger + hdr->tag;
+                Frag.segment.seg_addr.pval = ((char*)hdr) +
+                    sizeof(mca_btl_smcuda_hdr_t);
+                Frag.segment.seg_len = hdr->len;
+                Frag.base.des_dst_cnt = 1;
+                Frag.base.des_dst = &(Frag.segment);
+                reg->cbfunc(&mca_btl_smcuda.super, hdr->tag, &(Frag.base),
+                            reg->cbdata);
+                /* return the fragment: hdr->frag is written into the
+                 * sender's fifo, after first retrying any sends still
+                 * pending for that peer */
+                MCA_BTL_SMCUDA_FIFO_WRITE(
+                        mca_btl_smcuda_component.sm_peers[peer_smp_rank],
+                        my_smp_rank, peer_smp_rank, hdr->frag, false, true, rc);
+                break;
+            }
+        case MCA_BTL_SMCUDA_FRAG_ACK:
+            {
+                int status = (uintptr_t)hdr & MCA_BTL_SMCUDA_FRAG_STATUS_MASK;  /* non-zero is reported as OMPI_ERROR */
+                int btl_ownership;
+                struct mca_btl_base_endpoint_t* endpoint;
+
+                frag = (mca_btl_smcuda_frag_t *)((char*)((uintptr_t)hdr &
+                                                     (~(MCA_BTL_SMCUDA_FRAG_TYPE_MASK |
+                                                        MCA_BTL_SMCUDA_FRAG_STATUS_MASK))));
+
+                endpoint = frag->endpoint;
+                btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+                if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
+                    /* completion callback */
+                    frag->base.des_cbfunc(&mca_btl_smcuda.super, frag->endpoint,
+                                          &frag->base, status?OMPI_ERROR:OMPI_SUCCESS);
+                }
+                if( btl_ownership ) {
+                    MCA_BTL_SMCUDA_FRAG_RETURN(frag);
+                }
+                OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_outstanding_frags, -1);
+                if ( 0 < opal_list_get_size(&endpoint->pending_sends) ) {
+                    btl_smcuda_process_pending_sends(endpoint);
+                }
+                goto recheck_peer;  /* read the same fifo again before moving on */
+            }
+            default:
+                /* unknown */
+                /*
+                 * This code path should presumably never be called.
+                 * It's unclear if it should exist or, if so, how it should be written.
+                 * If we want to return it to the sending process,
+                 * we have to figure out who the sender is.
+                 * It seems we need to subtract the mask bits.
+                 * Then, hopefully this is an sm header that has an smp_rank field.
+                 * Presumably that means the received header was relative.
+                 * Or, maybe this code should just be removed.
+                 */
+                opal_output(0, "mca_btl_smcuda_component_progress read an unknown type of header");
+                hdr = (mca_btl_smcuda_hdr_t *) RELATIVE2VIRTUAL(hdr);
+                peer_smp_rank = hdr->my_smp_rank;
+                hdr = (mca_btl_smcuda_hdr_t*)((uintptr_t)hdr->frag |
+                        MCA_BTL_SMCUDA_FRAG_STATUS_MASK);
+                MCA_BTL_SMCUDA_FIFO_WRITE(
+                        mca_btl_smcuda_component.sm_peers[peer_smp_rank],
+                        my_smp_rank, peer_smp_rank, hdr, false, true, rc);
+                break;
+        }
+    }
+
+#if OMPI_CUDA_SUPPORT
+    /* Check to see if there are any outstanding CUDA events that have
+     * completed.  If so, issue the PML callbacks on the fragments.
+     * The loop keeps going for as long as progress_one_cuda_event()
+     * returns 1; the fragment to complete is received through frag.
+     */
+    while (1 == progress_one_cuda_event((mca_btl_base_descriptor_t **)&frag)) {
+        int btl_ownership;
+        btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+        if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
+            frag->base.des_cbfunc(&mca_btl_smcuda.super, 
+                                  frag->endpoint, &frag->base, 
+                                  OMPI_SUCCESS);
+        }
+
+        if (btl_ownership) {
+            if(frag->registration != NULL) {
+                frag->endpoint->mpool->mpool_deregister(frag->endpoint->mpool,
+                                                       (mca_mpool_base_registration_t*)frag->registration);
+                frag->registration = NULL;
+            }
+            MCA_BTL_SMCUDA_FRAG_RETURN(frag);
+        }
+        nevents++;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+    return nevents;
+}
--- a/ompi/mca/btl/smcuda/btl_smcuda_endpoint.h
+++ b/ompi/mca/btl/smcuda/btl_smcuda_endpoint.h
@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_BTL_SMCUDA_ENDPOINT_H
+#define MCA_BTL_SMCUDA_ENDPOINT_H
+
+/**
+ *  An abstraction that represents a connection to an endpoint process.
+ *  An instance of mca_btl_base_endpoint_t is associated w/ each process
+ *  and BTL pair at startup.
+ */
+
+struct mca_btl_base_endpoint_t {
+    int my_smp_rank;    /**< My SMP process rank.  Used for accessing
+                         *   SMP specific data structures. */
+    int peer_smp_rank;  /**< My peer's SMP process rank.  Used for accessing
+                         *   SMP specific data structures. */
+#if OMPI_CUDA_SUPPORT
+    mca_mpool_base_module_t *mpool; /**< mpool for remotely registered memory */
+#endif /* OMPI_CUDA_SUPPORT */
+#if OMPI_ENABLE_PROGRESS_THREADS == 1
+    int fifo_fd;        /**< pipe/fifo used to signal endpoint that data is queued */
+#endif
+    opal_list_t pending_sends; /**< pending data to send */
+
+    /** lock for concurrent access to endpoint state */
+    opal_mutex_t                endpoint_lock;
+
+};
+
+void btl_smcuda_process_pending_sends(struct mca_btl_base_endpoint_t *ep);
+#endif
--- a/ompi/mca/btl/smcuda/btl_smcuda_fifo.h
+++ b/ompi/mca/btl/smcuda/btl_smcuda_fifo.h
@ -0,0 +1,87 @@
+#ifndef MCA_BTL_SMCUDA_FIFO_H
+#define MCA_BTL_SMCUDA_FIFO_H
+
+#include "btl_smcuda.h"
+#include "btl_smcuda_endpoint.h"
+
+/*
+ * Queue a piece of data on an endpoint's pending-send list.
+ *
+ * A list item is taken from the component's pending_send_fl free list,
+ * pointed at the data, and the component-wide num_pending_sends counter is
+ * incremented.  The item goes to the front of the list when resend is true
+ * (to minimize reordering of data that was already pending) and to the back
+ * otherwise.
+ */
+static void
+add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool resend)
+{
+    opal_free_list_item_t *i;
+    btl_smcuda_pending_send_item_t *pending;
+    int rc;
+
+    OPAL_FREE_LIST_GET(&mca_btl_smcuda_component.pending_send_fl, i, rc);
+
+    /* don't handle error for now */
+    assert(i != NULL && rc == OMPI_SUCCESS);
+
+    pending = (btl_smcuda_pending_send_item_t*)i;
+    pending->data = data;
+
+    OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_pending_sends, +1);
+
+    /* the list itself is only touched under the endpoint lock */
+    OPAL_THREAD_LOCK(&ep->endpoint_lock);
+    if (!resend) {
+        opal_list_append(&ep->pending_sends, (opal_list_item_t*)pending);
+    } else {
+        opal_list_prepend(&ep->pending_sends, (opal_list_item_t*)pending);
+    }
+    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
+}
+
+/*
+ * FIFO_MAP(x) defines which FIFO on the receiver should be used
+ * by sender rank x.  The map is some many-to-one hash.
+ *
+ * FIFO_MAP_NUM(n) defines how many FIFOs the receiver has for
+ * n senders.
+ *
+ * That is,
+ *
+ *      for all    0 <= x < n:
+ *
+ *              0 <= FIFO_MAP(x) < FIFO_MAP_NUM(n)
+ *
+ * For example, using some power-of-two nfifos, we could have
+ *
+ *    FIFO_MAP(x)     = x & (nfifos-1)
+ *    FIFO_MAP_NUM(n) = min(nfifos,n)
+ *
+ * Interesting limits include:
+ *
+ *    nfifos very large:  In this case, each sender has its
+ *       own dedicated FIFO on each receiver and the receiver
+ *       has one FIFO per sender.
+ *
+ *    nfifos == 1:  In this case, all senders use the same
+ *       FIFO and each receiver has just one FIFO for all senders.
+ *
+ * The definitions that follow implement exactly the example given
+ * here.  The masking in FIFO_MAP is only a valid mapping when
+ * mca_btl_smcuda_component.nfifos is a power of two; the component
+ * open routine rounds nfifos up to one with
+ * opal_next_poweroftwo_inclusive().
+ */
+#define FIFO_MAP(x)     ((x) & (mca_btl_smcuda_component.nfifos - 1))
+#define FIFO_MAP_NUM(n) ( (mca_btl_smcuda_component.nfifos) < (n) ? (mca_btl_smcuda_component.nfifos) : (n) )
+
+
+#define MCA_BTL_SMCUDA_FIFO_WRITE(endpoint_peer, my_smp_rank,               \
+                              peer_smp_rank, hdr, resend, retry_pending_sends, rc)        \
+do {                                                                    \
+    sm_fifo_t* fifo = &(mca_btl_smcuda_component.fifo[peer_smp_rank][FIFO_MAP(my_smp_rank)]); \
+                                                                        \
+    if ( retry_pending_sends ) {                                        \
+        if ( 0 < opal_list_get_size(&endpoint_peer->pending_sends) ) {  \
+            btl_smcuda_process_pending_sends(endpoint_peer);                \
+        }                                                               \
+    }                                                                   \
+                                                                        \
+    opal_atomic_lock(&(fifo->head_lock));                               \
+    /* post fragment: hdr is written, under the fifo's head lock, into  \
+     * the fifo of peer_smp_rank selected by FIFO_MAP(my_smp_rank).     \
+     * If the write fails, hdr is queued on the endpoint's pending      \
+     * list (at the front when resend is true) and rc is set to         \
+     * OMPI_ERR_RESOURCE_BUSY; otherwise the peer is signalled and rc   \
+     * is set to OMPI_SUCCESS.  When retry_pending_sends is true,       \
+     * sends already queued for the endpoint are retried before this    \
+     * write is attempted.  The macro declares a local named fifo, so   \
+     * its arguments must not refer to a caller variable of that name. */ \
+    if(sm_fifo_write(hdr, fifo) != OMPI_SUCCESS) {                      \
+        add_pending(endpoint_peer, hdr, resend);                        \
+        rc = OMPI_ERR_RESOURCE_BUSY;                                    \
+    } else {                                                            \
+        MCA_BTL_SMCUDA_SIGNAL_PEER(endpoint_peer);                          \
+        rc = OMPI_SUCCESS;                                              \
+    }                                                                   \
+    opal_atomic_unlock(&(fifo->head_lock));                             \
+} while(0)
+
+#endif
--- a/ompi/mca/btl/smcuda/btl_smcuda_frag.c
+++ b/ompi/mca/btl/smcuda/btl_smcuda_frag.c
@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+#include "ompi_config.h"
+#include "btl_smcuda_frag.h"
+
+
+/*
+ * Initialization shared by all fragment constructors.  The caller has to
+ * set frag->size beforehand, since it is copied into the segment length.
+ */
+static inline void mca_btl_smcuda_frag_common_constructor(mca_btl_smcuda_frag_t* frag)
+{
+    mca_btl_smcuda_hdr_t *header = (mca_btl_smcuda_hdr_t*)frag->base.super.ptr;
+
+    frag->hdr = header;
+    if (NULL != header) {
+        /* the header keeps a pointer back to its fragment, tagged as ACK */
+        header->frag = (mca_btl_smcuda_frag_t*)((uintptr_t)frag |
+            MCA_BTL_SMCUDA_FRAG_ACK);
+        header->my_smp_rank = mca_btl_smcuda_component.my_smp_rank;
+        /* the payload area starts right behind the header */
+        frag->segment.seg_addr.pval = ((char*)header) +
+            sizeof(mca_btl_smcuda_hdr_t);
+    }
+
+    /* a single segment is used as both source and destination */
+    frag->base.des_src_cnt = 1;
+    frag->base.des_src = &frag->segment;
+    frag->base.des_dst_cnt = 1;
+    frag->base.des_dst = &frag->segment;
+    frag->segment.seg_len = frag->size;
+    frag->base.des_flags = 0;
+#if OMPI_CUDA_SUPPORT
+    frag->registration = NULL;
+#endif /* OMPI_CUDA_SUPPORT */
+}
+
+/* Constructor for fragments sized by the component's eager limit; they
+ * belong to the sm_frags_eager free list. */
+static void mca_btl_smcuda_frag1_constructor(mca_btl_smcuda_frag_t* frag)
+{
+    frag->my_list = &mca_btl_smcuda_component.sm_frags_eager;
+    frag->size    = mca_btl_smcuda_component.eager_limit;
+
+    mca_btl_smcuda_frag_common_constructor(frag);
+}
+
+/* Constructor for fragments sized by the component's maximum fragment
+ * size; they belong to the sm_frags_max free list. */
+static void mca_btl_smcuda_frag2_constructor(mca_btl_smcuda_frag_t* frag)
+{
+    frag->my_list = &mca_btl_smcuda_component.sm_frags_max;
+    frag->size    = mca_btl_smcuda_component.max_frag_size;
+
+    mca_btl_smcuda_frag_common_constructor(frag);
+}
+
+/* Constructor for "user" fragments: their size is zero and they belong to
+ * the sm_frags_user free list. */
+static void mca_btl_smcuda_user_constructor(mca_btl_smcuda_frag_t* frag)
+{
+    frag->my_list = &mca_btl_smcuda_component.sm_frags_user;
+    frag->size    = 0;
+
+    mca_btl_smcuda_frag_common_constructor(frag);
+}
+
+OBJ_CLASS_INSTANCE(
+    mca_btl_smcuda_frag1_t,
+    mca_btl_base_descriptor_t,
+    mca_btl_smcuda_frag1_constructor,  /* size = eager_limit, list = sm_frags_eager */
+    NULL);
+
+OBJ_CLASS_INSTANCE(
+    mca_btl_smcuda_frag2_t,
+    mca_btl_base_descriptor_t,
+    mca_btl_smcuda_frag2_constructor,  /* size = max_frag_size, list = sm_frags_max */
+    NULL);
+
+OBJ_CLASS_INSTANCE(
+    mca_btl_smcuda_user_t,
+    mca_btl_base_descriptor_t,
+    mca_btl_smcuda_user_constructor,  /* size = 0, list = sm_frags_user */
+    NULL);
--- a/ompi/mca/btl/smcuda/btl_smcuda_frag.h
+++ b/ompi/mca/btl/smcuda/btl_smcuda_frag.h
@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_BTL_SMCUDA_SEND_FRAG_H
+#define MCA_BTL_SMCUDA_SEND_FRAG_H
+
+#include "ompi_config.h"
+#include "btl_smcuda.h"
+
+
+/*
+ * Fragment type codes (values 0..3, covered by the two-bit TYPE_MASK).
+ * The fragment constructor ORs MCA_BTL_SMCUDA_FRAG_ACK into the fragment
+ * pointer it stores in hdr->frag, so these codes presumably travel in the
+ * low bits of that pointer -- NOTE(review): confirm against the code that
+ * reads the FIFO.
+ */
+#define MCA_BTL_SMCUDA_FRAG_TYPE_MASK ((uintptr_t)0x3)
+#define MCA_BTL_SMCUDA_FRAG_SEND ((uintptr_t)0x0)
+#define MCA_BTL_SMCUDA_FRAG_ACK ((uintptr_t)0x1)
+#define MCA_BTL_SMCUDA_FRAG_PUT ((uintptr_t)0x2)
+#define MCA_BTL_SMCUDA_FRAG_GET ((uintptr_t)0x3)
+
+/* Single bit directly above the two type bits.
+ * NOTE(review): presumably a status flag carried with the pointer -- confirm. */
+#define MCA_BTL_SMCUDA_FRAG_STATUS_MASK ((uintptr_t)0x4)
+
+struct mca_btl_smcuda_frag_t;
+
+/**
+ * Header located at the start of a fragment's buffer (frag->hdr).  The
+ * fragment constructor places the payload address immediately after it.
+ */
+struct mca_btl_smcuda_hdr_t {
+    /* owning fragment; the constructor stores it ORed with MCA_BTL_SMCUDA_FRAG_ACK */
+    struct mca_btl_smcuda_frag_t *frag;
+    /* NOTE(review): presumably the payload length -- confirm against the send path */
+    size_t len;
+    /* copied from mca_btl_smcuda_component.my_smp_rank by the constructor */
+    int my_smp_rank;
+    /* NOTE(review): presumably the BTL tag of the message -- confirm */
+    mca_btl_base_tag_t tag;
+};
+typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t;
+
+/**
+ * shared memory send fragment derived type.
+ *
+ * One structure is shared by all fragment flavours; the typedefs below only
+ * give each flavour its own class name so it can have its own constructor.
+ */
+struct mca_btl_smcuda_frag_t {
+    /* base descriptor; the constructor points des_src and des_dst at 'segment' */
+    mca_btl_base_descriptor_t base;
+    /* the fragment's single segment (seg_len is initialised from 'size') */
+    mca_btl_base_segment_t segment;
+    struct mca_btl_base_endpoint_t *endpoint;
+#if OMPI_CUDA_SUPPORT
+    /* initialised to NULL by the constructor */
+    struct mca_mpool_base_registration_t *registration;
+#endif /* OMPI_CUDA_SUPPORT */
+    /* set by the flavour-specific constructor: eager_limit, max_frag_size or 0 */
+    size_t size;
+    /* pointer written to the FIFO, this is the base of the shared memory region */
+    mca_btl_smcuda_hdr_t *hdr;
+    /* free list the fragment is handed back to by MCA_BTL_SMCUDA_FRAG_RETURN */
+    ompi_free_list_t* my_list;
+};
+typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_frag_t;
+typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_frag1_t;
+typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_frag2_t;
+typedef struct mca_btl_smcuda_frag_t mca_btl_smcuda_user_t;
+
+
+/* Class declarations for the generic fragment type and its three flavours. */
+OBJ_CLASS_DECLARATION(mca_btl_smcuda_frag_t);
+OBJ_CLASS_DECLARATION(mca_btl_smcuda_frag1_t);
+OBJ_CLASS_DECLARATION(mca_btl_smcuda_frag2_t);
+OBJ_CLASS_DECLARATION(mca_btl_smcuda_user_t);
+
+/*
+ * Obtain a fragment from the component's sm_frags_eager free list.
+ *   frag - lvalue that receives the item, cast to mca_btl_smcuda_frag_t*
+ *   rc   - passed straight through to OMPI_FREE_LIST_GET
+ * The expansion declares a local named 'item', so do not pass an expression
+ * that refers to a caller variable of that name.
+ */
+#define MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag, rc)                           \
+{                                                                       \
+    ompi_free_list_item_t* item;                                        \
+    OMPI_FREE_LIST_GET(&mca_btl_smcuda_component.sm_frags_eager, item, rc); \
+    frag = (mca_btl_smcuda_frag_t*)item;                                    \
+}
+
+/*
+ * Obtain a fragment from the component's sm_frags_max free list.
+ *   frag - lvalue that receives the item, cast to mca_btl_smcuda_frag_t*
+ *   rc   - passed straight through to OMPI_FREE_LIST_GET
+ * The expansion declares a local named 'item', so do not pass an expression
+ * that refers to a caller variable of that name.
+ */
+#define MCA_BTL_SMCUDA_FRAG_ALLOC_MAX(frag, rc)                             \
+{                                                                       \
+    ompi_free_list_item_t* item;                                        \
+    OMPI_FREE_LIST_GET(&mca_btl_smcuda_component.sm_frags_max, item, rc);   \
+    frag = (mca_btl_smcuda_frag_t*)item;                                    \
+}
+
+/*
+ * Obtain a fragment from the component's sm_frags_user free list.
+ *   frag - lvalue that receives the item, cast to mca_btl_smcuda_frag_t*
+ *   rc   - passed straight through to OMPI_FREE_LIST_GET
+ * The expansion declares a local named 'item', so do not pass an expression
+ * that refers to a caller variable of that name.
+ */
+#define MCA_BTL_SMCUDA_FRAG_ALLOC_USER(frag, rc)                            \
+{                                                                       \
+    ompi_free_list_item_t* item;                                        \
+    OMPI_FREE_LIST_GET(&mca_btl_smcuda_component.sm_frags_user, item, rc);  \
+    frag = (mca_btl_smcuda_frag_t*)item;                                    \
+}
+
+
+/*
+ * Hand a fragment back to the free list recorded in its my_list member
+ * (each fragment constructor sets my_list to the list it belongs to).
+ *   frag - pointer to the fragment; parenthesised in the expansion so that
+ *          arguments such as "*fragp" bind to the whole expression before
+ *          "->my_list" is applied.
+ */
+#define MCA_BTL_SMCUDA_FRAG_RETURN(frag)                                      \
+{                                                                         \
+    OMPI_FREE_LIST_RETURN((frag)->my_list, (ompi_free_list_item_t*)(frag)); \
+}
+#endif
--- a/ompi/mca/btl/smcuda/configure.m4
+++ b/ompi/mca/btl/smcuda/configure.m4
@ -0,0 +1,26 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2009      The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_btl_smcuda_CONFIG([action-if-can-compile],
+#                            [action-if-cant-compile])
+# ------------------------------------------------
+# Runs the first action when $CUDA_SUPPORT_41 is "1", the second otherwise.
+AC_DEFUN([MCA_ompi_btl_smcuda_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/btl/smcuda/Makefile])
+
+    # Only build if CUDA 4.1 support is available
+    AS_IF([test "x$CUDA_SUPPORT_41" = "x1"],
+          [$1],
+          [$2])
+
+])dnl
--- a/ompi/mca/btl/smcuda/help-mpi-btl-smcuda.txt
+++ b/ompi/mca/btl/smcuda/help-mpi-btl-smcuda.txt
@ -0,0 +1,20 @@
+# -*- text -*-
+#
+# Copyright (c) 2004-2009 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2006-2010 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+# $COPYRIGHT$
+# 
+# Additional copyrights may follow
+# 
+# $HEADER$
+#
+# This is the US/English help file for Open MPI's shared memory support.
+#
+[CUDA RDMA requested but not supported]
+WARNING: CUDA RDMA support was requested for the shared memory
+(sm) BTL, but it is not supported.  Continuing without it.
+
+  Local host: %s
--- a/ompi/mca/pml/ob1/Makefile.am
+++ b/ompi/mca/pml/ob1/Makefile.am
@ -11,6 +11,7 @@
 #                         All rights reserved.
 # Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
 # Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
 # $COPYRIGHT$
 # 
 # Additional copyrights may follow
@ -47,6 +48,12 @@ ob1_sources  = \
 	pml_ob1_sendreq.h \
 	pml_ob1_start.c 

+# If we have CUDA support requested, build the CUDA file also
+if MCA_ompi_cuda_support
+ob1_sources += \
+    pml_ob1_cuda.c
+endif
+
 if MCA_BUILD_ompi_pml_ob1_DSO
 component_noinst =
 component_install = mca_pml_ob1.la
--- a/ompi/mca/pml/ob1/pml_ob1_cuda.c
+++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c
@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2008 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, 
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
+ * $COPYRIGHT$
+ * 
+ * Additional copyrights may follow
+ * 
+ * $HEADER$
+ */
+
+
+#include "ompi_config.h"
+#include "opal/prefetch.h"
+#include "ompi/constants.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/btl/btl.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "ompi/mca/mpool/mpool.h" 
+#include "pml_ob1.h"
+#include "pml_ob1_hdr.h"
+#include "pml_ob1_rdmafrag.h"
+#include "pml_ob1_recvreq.h"
+#include "pml_ob1_sendreq.h"
+#include "ompi/mca/bml/base/base.h"
+#include "ompi/memchecker.h"
+
+/* Forward declarations of the helpers defined further down in this file. */
+size_t mca_pml_ob1_rdma_cuda_btls(
+    mca_bml_base_endpoint_t* bml_endpoint,
+    unsigned char* base,
+    size_t size,
+    mca_pml_ob1_com_btl_t* rdma_btls);
+
+int mca_pml_ob1_cuda_need_buffers(void * rreq,
+                                  mca_btl_base_module_t* btl);
+
+/**
+ * Handle the CUDA buffer.
+ *
+ * Picks the protocol used to start a send request.  With
+ * OMPI_CUDA_SUPPORT_41 the request is started via RDMA when
+ * mca_pml_ob1_rdma_cuda_btls() selects at least one BTL, otherwise via
+ * rendezvous; without it a rendezvous carrying no initial data is always
+ * used.
+ *
+ * @param sendreq  send request to start
+ * @param bml_btl  BTL the request is started on
+ * @param size     bytes to send with the rendezvous message when the
+ *                 convertor needs no buffers and no RDMA BTL was selected
+ * @return         return code of the start function that was invoked
+ */
+int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
+                                        mca_bml_base_btl_t* bml_btl,
+                                        size_t size) {
+    int rc;
+#if OMPI_CUDA_SUPPORT_41
+    /* Clear CONVERTOR_CUDA only for the need_buffers query below; both
+     * branches put it back.  The order of these statements matters. */
+    sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
+    if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
+        unsigned char *base;
+        opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
+        /* Set flag back */
+        sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
+        /* A non-zero count means at least one BTL registered the buffer */
+        if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(
+                                                                           sendreq->req_endpoint,
+                                                                           base,
+                                                                           sendreq->req_send.req_bytes_packed,
+                                                                           sendreq->req_rdma))) {
+            rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
+                                                     sendreq->req_send.req_bytes_packed);
+            if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
+                mca_pml_ob1_free_rdma_resources(sendreq);
+            }
+        } else {
+            /* No RDMA BTL: rendezvous, marked CONTIG when the BTL
+             * advertises MCA_BTL_FLAGS_CUDA_PUT */
+            if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_PUT) {
+                rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
+                                                         MCA_PML_OB1_HDR_FLAGS_CONTIG);
+            } else {
+                rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
+            }
+        }
+    } else {
+        /* Do not send anything with first rendezvous message as copying GPU
+         * memory into RNDV message is expensive. */
+        sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
+        rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
+    }
+#else
+    /* Just do the rendezvous but set initial data to be sent to zero */
+    rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
+#endif /* OMPI_CUDA_SUPPORT_41 */
+    return rc;
+}
+
+    
+
+/**
+ * Register a buffer with the CUDA-get capable send BTLs of an endpoint.
+ *
+ * Walks bml_endpoint->btl_send, and for every BTL that advertises
+ * MCA_BTL_FLAGS_CUDA_GET and has an mpool, registers [base, base+size) with
+ * that mpool.  Each successful registration is recorded in rdma_btls, up to
+ * mca_pml_ob1.max_rdma_per_request entries.
+ *
+ * @param bml_endpoint  endpoint whose btl_send array is examined
+ * @param base          start of the buffer to register
+ * @param size          length of the buffer in bytes
+ * @param rdma_btls     output array filled with (bml_btl, registration) pairs
+ * @return              number of entries written to rdma_btls, or 0 when no
+ *                      BTL qualified or the selected BTLs carry too little
+ *                      weight; on a 0 return no registration is left held
+ */
+size_t mca_pml_ob1_rdma_cuda_btls(
+    mca_bml_base_endpoint_t* bml_endpoint,
+    unsigned char* base,
+    size_t size,
+    mca_pml_ob1_com_btl_t* rdma_btls)
+{
+    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
+    double weight_total = 0;
+    int num_btls_used = 0, n;
+
+    /* shortcut when the endpoint has no send btls at all */
+    if(num_btls == 0) {
+        return 0;
+    }
+
+    /* register the memory with every CUDA-get capable btl */
+    for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
+            n++) {
+        mca_bml_base_btl_t* bml_btl =
+            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
+
+        if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
+            mca_mpool_base_registration_t* reg = NULL;
+            mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
+
+            if( NULL != btl_mpool ) {
+                /* register the memory */
+                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
+            }
+
+            /* registration failed or no mpool: this btl cannot be used */
+            if(NULL == reg)
+                continue;
+
+            rdma_btls[num_btls_used].bml_btl = bml_btl;
+            rdma_btls[num_btls_used].btl_reg = reg;
+            weight_total += bml_btl->btl_weight;
+            num_btls_used++;
+        }
+    }
+
+    /* if we don't use leave_pinned and all BTLs that have this memory
+     * registered amount to less than half of available bandwidth - fall back to
+     * pipeline protocol */
+    if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) {
+        /* A zero return tells the caller that no RDMA BTL was selected, so
+         * the registrations taken above would never be released by it.
+         * Give them back here. */
+        for(n = 0; n < num_btls_used; n++) {
+            mca_mpool_base_registration_t* reg = rdma_btls[n].btl_reg;
+            mca_mpool_base_module_t *btl_mpool =
+                rdma_btls[n].bml_btl->btl->btl_mpool;
+
+            if(NULL != reg && NULL != btl_mpool) {
+                btl_mpool->mpool_deregister(btl_mpool, reg);
+            }
+            rdma_btls[n].btl_reg = NULL;
+        }
+        return 0;
+    }
+
+    mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
+                                     weight_total);
+
+    return num_btls_used;
+}
+
+/**
+ * Decide whether a receive must be staged through buffers.
+ *
+ * When the request's convertor carries CONVERTOR_CUDA and the BTL advertises
+ * MCA_BTL_FLAGS_CUDA_GET, the convertor is queried with CONVERTOR_CUDA
+ * temporarily cleared and that answer is returned.  In every other case the
+ * answer is true.
+ *
+ * @param rreq  the receive request (a mca_pml_ob1_recv_request_t*)
+ * @param btl   BTL the data would be fetched through
+ * @return      true when buffers are needed, false otherwise
+ */
+int mca_pml_ob1_cuda_need_buffers(void * rreq,
+                                  mca_btl_base_module_t* btl)
+{
+    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)rreq;
+    int buffers_needed;
+
+    if (0 == (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA)) {
+        return true;
+    }
+    if (0 == (btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) {
+        return true;
+    }
+
+    /* query the convertor with the CUDA flag masked out, then restore it */
+    recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
+    buffers_needed =
+        (opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true)
+        ? true : false;
+    recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA;
+
+    return buffers_needed;
+}
+
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@ -11,6 +11,7 @@
 *                         All rights reserved.
 * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
@ -34,6 +35,11 @@
 #include "opal/util/arch.h"
 #include "ompi/memchecker.h"

+#if OMPI_CUDA_SUPPORT
+/* Defined in pml_ob1_cuda.c.  The parameter list is spelled exactly as in
+ * that definition (first argument is void *) so both declarations of the
+ * function have compatible types. */
+int mca_pml_ob1_cuda_need_buffers(void * rreq,
+                                  mca_btl_base_module_t* btl);
+#endif /* OMPI_CUDA_SUPPORT */
+
 void mca_pml_ob1_recv_request_process_pending(void)
 {
    mca_pml_ob1_recv_request_t* recvreq;
@ -485,8 +491,15 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
     * sender side is already registered. We need to be smarter here, perhaps
     * do couple of RDMA reads */
    if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
+#if OMPI_CUDA_SUPPORT
+        if (mca_pml_ob1_cuda_need_buffers(recvreq, btl)) {
+            mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
+            return;
+        }
+#else /* OMPI_CUDA_SUPPORT */
        mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
        return;
+#endif /* OMPI_CUDA_SUPPORT */
    }
    
    MCA_PML_OB1_RDMA_FRAG_ALLOC(frag,rc);
@ -513,10 +526,29 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
        }
    }
    frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
+#if OMPI_CUDA_SUPPORT
+    /* The BTL is not in the endpoint's btl_rdma array.  For a CUDA receive
+     * buffer a CUDA-get capable BTL may still be found in btl_send; for a
+     * host buffer fall back to the send/receive protocol. */
+    if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
+        if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) {
+            /* Check to see if this is a CUDA get */
+            if (btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
+                frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_send, btl);
+            }
+            if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
+                opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
+                orte_errmgr.abort(-1, NULL);
+            }
+        } else {
+            /* Just default back to send and receive.  Must be mix of GPU and HOST memory. */
+            mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
+            /* The RDMA fragment allocated above is not used on this path;
+             * hand it back instead of leaking it. */
+            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+            return;
+        }
+    }
+#else /* OMPI_CUDA_SUPPORT */
     if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
         opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
         orte_errmgr.abort(-1, NULL);
     }
+#endif /* OMPI_CUDA_SUPPORT */
    frag->rdma_hdr.hdr_rget = *hdr;
    frag->rdma_req = recvreq;
    frag->rdma_ep = bml_endpoint;
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@ -11,6 +11,7 @@
 *                         All rights reserved.
 * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * Copyright (c) 2012      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
@ -637,7 +638,7 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
    int rc;

    bml_btl = sendreq->req_rdma[0].bml_btl;
-    if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & MCA_BTL_FLAGS_GET)) {
+    if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
        mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
        mca_btl_base_descriptor_t* src;
        size_t i;
@ -706,8 +707,15 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
        for( i = 0; i < src->des_src_cnt; i++ ) {
            hdr->hdr_rget.hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(src->des_src[i].seg_addr.pval);
            hdr->hdr_rget.hdr_segs[i].seg_len       = src->des_src[i].seg_len;
+#if OMPI_CUDA_SUPPORT_41
+            memcpy(hdr->hdr_rget.hdr_segs[i].seg_key.cudakey, src->des_src[i].seg_key.cudakey,
+                   sizeof(src->des_src[i].seg_key.cudakey));
+            hdr->hdr_rget.hdr_segs[i].memh_seg_addr.lval = ompi_ptr_ptol(src->des_src[i].memh_seg_addr.pval);
+            hdr->hdr_rget.hdr_segs[i].memh_seg_len       = src->des_src[i].memh_seg_len;
+#else /* OMPI_CUDA_SUPPORT_41 */
            hdr->hdr_rget.hdr_segs[i].seg_key.key64[0] = src->des_src[i].seg_key.key64[0];
            hdr->hdr_rget.hdr_segs[i].seg_key.key64[1] = src->des_src[i].seg_key.key64[1];
+#endif /* OMPI_CUDA_SUPPORT_41 */
        }

        des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@ -10,7 +10,7 @@
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
- * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2011-2012 NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
@ -311,6 +311,13 @@ mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq)
    mca_pml_ob1_send_request_schedule_exclusive(sendreq);
 }

+#if OMPI_CUDA_SUPPORT
+/* Starts a send request whose convertor has CONVERTOR_CUDA set; called from
+ * mca_pml_ob1_send_request_start_btl() below. */
+int mca_pml_ob1_send_request_start_cuda(
+    mca_pml_ob1_send_request_t* sendreq, 
+    mca_bml_base_btl_t* bml_btl,
+    size_t size);
+#endif /* OMPI_CUDA_SUPPORT */
+
 /**
 *  Start the specified request
 */
@ -395,13 +402,11 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
                                                         MCA_PML_OB1_HDR_FLAGS_CONTIG);
            }
        } else {
-#if OPAL_CUDA_SUPPORT
-            /* Do not send anything with first rendezvous message as copying GPU
-             * memory into RNDV message is expensive. */
+#if OMPI_CUDA_SUPPORT
            if (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) {
-                size = 0;
+                return mca_pml_ob1_send_request_start_cuda(sendreq, bml_btl, size);
            }
-#endif
+#endif /* OMPI_CUDA_SUPPORT */
            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
        }
    }