diff --git a/ompi/mca/pml/bfo/Makefile.am b/ompi/mca/pml/bfo/Makefile.am
new file mode 100644
index 0000000000..a5ce5464c1
--- /dev/null
+++ b/ompi/mca/pml/bfo/Makefile.am
@@ -0,0 +1,68 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved.
+# Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+dist_pkgdata_DATA = \
+	help-mpi-pml-bfo.txt
+
+EXTRA_DIST = post_configure.sh .windows
+
+bfo_sources = \
+	pml_bfo.c \
+	pml_bfo.h \
+	pml_bfo_comm.c \
+	pml_bfo_comm.h \
+	pml_bfo_component.c \
+	pml_bfo_component.h \
+	pml_bfo_failover.c \
+	pml_bfo_failover.h \
+	pml_bfo_hdr.h \
+	pml_bfo_iprobe.c \
+	pml_bfo_irecv.c \
+	pml_bfo_isend.c \
+	pml_bfo_progress.c \
+	pml_bfo_rdma.c \
+	pml_bfo_rdma.h \
+	pml_bfo_rdmafrag.c \
+	pml_bfo_rdmafrag.h \
+	pml_bfo_recvfrag.c \
+	pml_bfo_recvfrag.h \
+	pml_bfo_recvreq.c \
+	pml_bfo_recvreq.h \
+	pml_bfo_sendreq.c \
+	pml_bfo_sendreq.h \
+	pml_bfo_start.c
+
+if OMPI_BUILD_pml_bfo_DSO
+component_noinst =
+component_install = mca_pml_bfo.la
+else
+component_noinst = libmca_pml_bfo.la
+component_install =
+endif
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_pml_bfo_la_SOURCES = $(bfo_sources)
+mca_pml_bfo_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_pml_bfo_la_SOURCES = $(bfo_sources)
+libmca_pml_bfo_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/pml/bfo/check-diffs-ob1.sh b/ompi/mca/pml/bfo/check-diffs-ob1.sh
new file mode 100755
index 0000000000..a0ac4ca01a
--- /dev/null
+++ b/ompi/mca/pml/bfo/check-diffs-ob1.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# Copyright (c) 2010 Oracle and/or its affiliates.  All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# This script runs a diff between the ob1 and bfo files.  This
+# allows us to quickly see the differences between the two and
+# how well the bfo files are tracking ob1 as it changes.  You
+# can also modify this and run it from the csum directory.
+
+CP=/bin/cp
+MKDIR=/bin/mkdir
+RM=/bin/rm
+TOUCH=/bin/touch
+pml=bfo
+PML=BFO
+ob1=ob1
+OB1=OB1
+DIFF=$ob1-$pml.diff
+DIFFDIR=diff-dir
+
+$MKDIR $DIFFDIR
+
+# Only diff a subset of files that are known to be different.
+FILES="Makefile.am \
+       pml_NAME.c \
+       pml_NAME.h \
+       pml_NAME_component.c \
+       pml_NAME_component.h \
+       pml_NAME_hdr.h \
+       pml_NAME_rdmafrag.h \
+       pml_NAME_recvfrag.c \
+       pml_NAME_recvreq.c \
+       pml_NAME_recvreq.h \
+       pml_NAME_sendreq.c \
+       pml_NAME_sendreq.h"
+
+# Copy over the files from the bfo directory.
+for name in $FILES
+do
+    $CP `echo $name | sed s/NAME/$pml/` $DIFFDIR
+done
+
+cd $DIFFDIR
+# Convert the pml/PML strings back into ob1/OB1 strings
+# to avoid spurious differences between the files.
+../../../../../contrib/search_replace.pl $pml $ob1
+../../../../../contrib/search_replace.pl $PML $OB1
+
+# Copy over the files from the ob1 directory.
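+# (Editorial note, not in the original script: the sed templating above
+# turns the NAME placeholder into a component name, e.g.
+# `echo pml_NAME.c | sed s/NAME/$pml/` yields pml_bfo.c, and the loop
+# below uses s/NAME/$ob1/ to fetch the matching pml_ob1.c files.)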
+for name in $FILES +do + $CP ../../ob1/`echo $name | sed s/NAME/$ob1/` . +done + +$RM -f $DIFF +$TOUCH $DIFF + +# Now run the diff. +for name in $FILES +do + diff -c `echo $name | sed s/NAME/$ob1/` `echo $name | sed s/NAME/$pml/` >> $DIFF +done + +# Cleanup +mv $DIFF .. +cd .. +$RM -rf $DIFFDIR diff --git a/ompi/mca/pml/bfo/configure.params b/ompi/mca/pml/bfo/configure.params new file mode 100644 index 0000000000..d14bd950fe --- /dev/null +++ b/ompi/mca/pml/bfo/configure.params @@ -0,0 +1,25 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2007 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_CONFIG_FILES="Makefile" diff --git a/ompi/mca/pml/bfo/help-mpi-pml-bfo.txt b/ompi/mca/pml/bfo/help-mpi-pml-bfo.txt new file mode 100644 index 0000000000..b3c44ec80e --- /dev/null +++ b/ompi/mca/pml/bfo/help-mpi-pml-bfo.txt @@ -0,0 +1,20 @@ +# -*- text -*- +# +# Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +[eager_limit_too_small] +The "eager limit" MCA parameter in the %s BTL was set to a value which +is too low for Open MPI to function properly. Please re-run your job +with a higher eager limit value for this BTL; the exact MCA parameter +name and its corresponding minimum value is shown below. + + Local host: %s + BTL name: %s + BTL eager limit value: %d (set via btl_%s_eager_limit) + BTL eager limit minimum: %d + MCA parameter name: btl_%s_eager_limit diff --git a/ompi/mca/pml/bfo/pml_bfo.c b/ompi/mca/pml/bfo/pml_bfo.c new file mode 100644 index 0000000000..f57dab4b48 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo.c @@ -0,0 +1,898 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2006-2008 University of Houston. All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "opal/class/opal_bitmap.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/base/base.h"
+#include "pml_bfo.h"
+#include "pml_bfo_component.h"
+#include "pml_bfo_comm.h"
+#include "pml_bfo_hdr.h"
+#include "pml_bfo_recvfrag.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_rdmafrag.h"
+#include "ompi/mca/bml/base/base.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/util/show_help.h"
+/* BFO FAILOVER CODE - begin */
+#include "pml_bfo_failover.h"
+/* BFO FAILOVER CODE - end */
+
+#include "ompi/runtime/ompi_cr.h"
+
+mca_pml_bfo_t mca_pml_bfo = {
+    {
+        mca_pml_bfo_add_procs,
+        mca_pml_bfo_del_procs,
+        mca_pml_bfo_enable,
+        mca_pml_bfo_progress,
+        mca_pml_bfo_add_comm,
+        mca_pml_bfo_del_comm,
+        mca_pml_bfo_irecv_init,
+        mca_pml_bfo_irecv,
+        mca_pml_bfo_recv,
+        mca_pml_bfo_isend_init,
+        mca_pml_bfo_isend,
+        mca_pml_bfo_send,
+        mca_pml_bfo_iprobe,
+        mca_pml_bfo_probe,
+        mca_pml_bfo_start,
+        mca_pml_bfo_dump,
+        mca_pml_bfo_ft_event,
+        65535,
+        INT_MAX
+    }
+};
+
+
+void mca_pml_bfo_error_handler( struct mca_btl_base_module_t* btl,
+                                int32_t flags, ompi_proc_t* errproc,
+                                char* btlinfo );
+
+int mca_pml_bfo_enable(bool enable)
+{
+    if( false == enable ) {
+        return OMPI_SUCCESS;
+    }
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.lock, opal_mutex_t);
+
+    /* fragments */
+    OBJ_CONSTRUCT(&mca_pml_bfo.rdma_frags, ompi_free_list_t);
+    ompi_free_list_init_new( &mca_pml_bfo.rdma_frags,
+                             sizeof(mca_pml_bfo_rdma_frag_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_rdma_frag_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.recv_frags, ompi_free_list_t);
+
+    ompi_free_list_init_new( &mca_pml_bfo.recv_frags,
+                             sizeof(mca_pml_bfo_recv_frag_t) + mca_pml_bfo.unexpected_limit,
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_recv_frag_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.pending_pckts, ompi_free_list_t);
+    ompi_free_list_init_new( &mca_pml_bfo.pending_pckts,
+                             sizeof(mca_pml_bfo_pckt_pending_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_pckt_pending_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+
+    OBJ_CONSTRUCT(&mca_pml_bfo.buffers, ompi_free_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.send_ranges, ompi_free_list_t);
+    ompi_free_list_init_new( &mca_pml_bfo.send_ranges,
+                             sizeof(mca_pml_bfo_send_range_t) +
+                             (mca_pml_bfo.max_send_per_range - 1) * sizeof(mca_pml_bfo_com_btl_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_send_range_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    /* pending operations */
+    OBJ_CONSTRUCT(&mca_pml_bfo.send_pending, opal_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.recv_pending, opal_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.pckt_pending, opal_list_t);
+    OBJ_CONSTRUCT(&mca_pml_bfo.rdma_pending, opal_list_t);
+    /* missing communicator pending list */
+    OBJ_CONSTRUCT(&mca_pml_bfo.non_existing_communicator_pending, opal_list_t);
+
+    /**
+     * If we get here, this is the PML that was selected for the run.
+     * We should get ownership of the send and receive request lists, and
+     * initialize them with the size of our own requests.
+     */
+    ompi_free_list_init_new( &mca_pml_base_send_requests,
+                             sizeof(mca_pml_bfo_send_request_t) +
+                             (mca_pml_bfo.max_rdma_per_request - 1) *
+                             sizeof(mca_pml_bfo_com_btl_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_send_request_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    ompi_free_list_init_new( &mca_pml_base_recv_requests,
+                             sizeof(mca_pml_bfo_recv_request_t) +
+                             (mca_pml_bfo.max_rdma_per_request - 1) *
+                             sizeof(mca_pml_bfo_com_btl_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_pml_bfo_recv_request_t),
+                             0,opal_cache_line_size,
+                             mca_pml_bfo.free_list_num,
+                             mca_pml_bfo.free_list_max,
+                             mca_pml_bfo.free_list_inc,
+                             NULL );
+
+    mca_pml_bfo.enabled = true;
+    return OMPI_SUCCESS;
+}
+
+int mca_pml_bfo_add_comm(ompi_communicator_t* comm)
+{
+    /* allocate pml specific comm data */
+    mca_pml_bfo_comm_t* pml_comm = OBJ_NEW(mca_pml_bfo_comm_t);
+    opal_list_item_t *item, *next_item;
+    mca_pml_bfo_recv_frag_t* frag;
+    mca_pml_bfo_comm_proc_t* pml_proc;
+    mca_pml_bfo_match_hdr_t* hdr;
+    int i;
+
+    if (NULL == pml_comm) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* should never happen, but it has happened in the past, so check */
+    if (comm->c_contextid > mca_pml_bfo.super.pml_max_contextid) {
+        OBJ_RELEASE(pml_comm);
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    mca_pml_bfo_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
+    comm->c_pml_comm = pml_comm;
+
+    for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
+        pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i);
+        OBJ_RETAIN(pml_comm->procs[i].ompi_proc);
+    }
+    /* Grab all related messages from the non_existing_communicator pending queue */
+    for( item = opal_list_get_first(&mca_pml_bfo.non_existing_communicator_pending);
+         item != opal_list_get_end(&mca_pml_bfo.non_existing_communicator_pending);
+         item = next_item ) {
+        frag = (mca_pml_bfo_recv_frag_t*)item;
+        next_item = opal_list_get_next(item);
+        hdr = &frag->hdr.hdr_match;
+
+        /* Is this fragment for the current communicator ? */
+        if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid )
+            continue;
+
+        /* As we now know we work on a fragment for this communicator
+         * we should remove it from the
+         * non_existing_communicator_pending list. */
+        opal_list_remove_item( &mca_pml_bfo.non_existing_communicator_pending,
+                               item );
+
+      add_fragment_to_unexpected:
+
+        /* We generate the MSG_ARRIVED event as soon as the PML is aware
+         * of a matching fragment arrival, independent of whether it is
+         * received in the correct order or not. This allows the tools to
+         * figure out if messages are received out of order (as can happen
+         * with multiple network interfaces).
+         */
+        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
+                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+        /* There is no matching to be done, and no lock to be held on the communicator as
+         * we know at this point that the communicator has not yet been returned to the user.
+         * The only required protection is around the non_existing_communicator_pending queue.
+         * We just have to push the fragment into the unexpected list of the corresponding
+         * proc, or into the out-of-order (cant_match) list.
+         */
+        pml_proc = &(pml_comm->procs[hdr->hdr_src]);
+
+        if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
+            /* We're now expecting the next sequence number.
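+             * (Illustrative note added in review: with expected_sequence == 5
+             * and fragments 6 and 7 parked on frags_cant_match, accepting 5
+             * here bumps the expected value to 6; the loop below then drains
+             * 6 and, on the next pass through the goto, 7 as well.)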
+             */
+            pml_proc->expected_sequence++;
+            opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
+            PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
+                                   hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+            /* And now the ugly part. As some fragments can be inserted in the cant_match list,
+             * every time we successfully add a fragment to the unexpected list we have to make
+             * sure the next one is not in the cant_match. Otherwise, we will end up in a deadlock
+             * situation as the cant_match is only checked when a new fragment is received from
+             * the network.
+             */
+            for(frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
+                frag != (mca_pml_bfo_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
+                frag = (mca_pml_bfo_recv_frag_t *)opal_list_get_next(frag)) {
+                hdr = &frag->hdr.hdr_match;
+                /* If the message has the next expected seq from that proc...  */
+                if(hdr->hdr_seq != pml_proc->expected_sequence)
+                    continue;
+
+                opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
+                goto add_fragment_to_unexpected;
+            }
+        } else {
+            opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
+        }
+    }
+    return OMPI_SUCCESS;
+}
+
+int mca_pml_bfo_del_comm(ompi_communicator_t* comm)
+{
+    mca_pml_bfo_comm_t* pml_comm = comm->c_pml_comm;
+    int i;
+
+    for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
+        OBJ_RELEASE(pml_comm->procs[i].ompi_proc);
+    }
+    OBJ_RELEASE(comm->c_pml_comm);
+    comm->c_pml_comm = NULL;
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * For each proc, set up a data structure that indicates the BTLs
+ * that can be used to reach the destination.
+ */
+
+int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
+{
+    opal_bitmap_t reachable;
+    int rc;
+    size_t i;
+    opal_list_item_t *item;
+
+    if(nprocs == 0)
+        return OMPI_SUCCESS;
+
+    /* we don't have any endpoint data we need to cache on the
+       ompi_proc_t, so set proc_pml to NULL */
+    for (i = 0 ; i < nprocs ; ++i) {
+        procs[i]->proc_pml = NULL;
+    }
+
+    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
+    rc = opal_bitmap_init(&reachable, (int)nprocs);
+    if(OMPI_SUCCESS != rc)
+        return rc;
+
+    /*
+     * JJH: Disable this in FT enabled builds since
+     * we use a wrapper PML. It will cause this check to
+     * return failure as all processes will return the wrapper PML
+     * component in use instead of the wrapped PML component underneath.
+     */
+#if OPAL_ENABLE_FT_CR == 0
+    /* make sure remote procs are using the same PML as us */
+    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("bfo",
+                                                              procs,
+                                                              nprocs))) {
+        return rc;
+    }
+#endif
+
+    rc = mca_bml.bml_add_procs( nprocs,
+                                procs,
+                                &reachable );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    /* Check that values supplied by all initialized BTLs will work
+       for us.  Note that this is the list of all initialized BTLs,
+       not the ones used for the just-added procs.  This is a little
+       overkill and inaccurate, as we may end up not using the BTL in
+       question and all add_procs calls after the first one are
+       duplicating an already completed check.  But the final
+       initialization of the PML occurs before the final
+       initialization of the BTLs, and iterating through the in-use
+       BTLs requires iterating over the procs, as the BML does not
+       expose all currently in-use BTLs.
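+       (Illustrative note added in review: this is why the eager-limit
+       check below compares against sizeof(mca_pml_bfo_hdr_t) -- a BTL
+       whose eager limit cannot hold even the largest bfo header could
+       not carry a zero-byte MPI message, so it is rejected outright.)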
*/ + + for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ; + item != opal_list_get_end(&mca_btl_base_modules_initialized) ; + item = opal_list_get_next(item)) { + mca_btl_base_selected_module_t *sm = + (mca_btl_base_selected_module_t*) item; + if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_bfo_hdr_t)) { + orte_show_help("help-mpi-pml-bfo.txt", "eager_limit_too_small", + true, + sm->btl_component->btl_version.mca_component_name, + orte_process_info.nodename, + sm->btl_component->btl_version.mca_component_name, + sm->btl_module->btl_eager_limit, + sm->btl_component->btl_version.mca_component_name, + sizeof(mca_pml_bfo_hdr_t), + sm->btl_component->btl_version.mca_component_name); + rc = OMPI_ERR_BAD_PARAM; + goto cleanup_and_return; + } + } + + + /* TODO: Move these callback registration to another place */ + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_MATCH, + mca_pml_bfo_recv_frag_callback_match, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDV, + mca_pml_bfo_recv_frag_callback_rndv, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RGET, + mca_pml_bfo_recv_frag_callback_rget, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_ACK, + mca_pml_bfo_recv_frag_callback_ack, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FRAG, + mca_pml_bfo_recv_frag_callback_frag, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_PUT, + mca_pml_bfo_recv_frag_callback_put, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + + rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FIN, + mca_pml_bfo_recv_frag_callback_fin, + NULL ); + if(OMPI_SUCCESS != rc) + goto cleanup_and_return; + +/* BFO FAILOVER CODE - begin */ + /* The following four functions are utilized when failover + * support for openib is enabled. 
+ */
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
+                               mca_pml_bfo_recv_frag_callback_rndvrestartnotify,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK,
+                               mca_pml_bfo_recv_frag_callback_rndvrestartack,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK,
+                               mca_pml_bfo_recv_frag_callback_rndvrestartnack,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
+                               mca_pml_bfo_recv_frag_callback_recverrnotify,
+                               NULL );
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+/* BFO FAILOVER CODE - end */
+
+    /* register error handlers */
+    rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler);
+    if(OMPI_SUCCESS != rc)
+        goto cleanup_and_return;
+
+  cleanup_and_return:
+    OBJ_DESTRUCT(&reachable);
+
+    return rc;
+}
+
+/*
+ * iterate through each proc and notify any PTLs associated
+ * with the proc that it is/has gone away
+ */
+
+int mca_pml_bfo_del_procs(ompi_proc_t** procs, size_t nprocs)
+{
+    return mca_bml.bml_del_procs(nprocs, procs);
+}
+
+/*
+ * diagnostics
+ */
+
+int mca_pml_bfo_dump(struct ompi_communicator_t* comm, int verbose)
+{
+    struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
+    int i;
+
+    /* iterate through all procs on communicator */
+    for( i = 0; i < (int)pml_comm->num_procs; i++ ) {
+        mca_pml_bfo_comm_proc_t* proc = &pml_comm->procs[i];
+        mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->ompi_proc->proc_bml;
+        size_t n;
+
+        opal_output(0, "[Rank %d]\n", i);
+        /* dump all receive queues */
+
+        /* dump all btls */
+        for(n=0; n<ep->btl_eager.arr_size; n++) {
+            mca_bml_base_btl_t* bml_btl = &ep->btl_eager.bml_btls[n];
+            bml_btl->btl->btl_dump(bml_btl->btl, bml_btl->btl_endpoint, verbose);
+        }
+    }
+    return OMPI_SUCCESS;
+}
+
+static void mca_pml_bfo_fin_completion( mca_btl_base_module_t* btl,
+                                        struct mca_btl_base_endpoint_t* ep,
+                                        struct mca_btl_base_descriptor_t* des,
+                                        int status )
+{
+/* BFO FAILOVER CODE - begin */
+    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
+        mca_pml_bfo_repost_fin(des);
+        return;
+    }
+/* BFO FAILOVER CODE - end */
+    MCA_PML_BFO_PROGRESS_PENDING(btl);
+}
+
+/**
+ * Send a FIN to the peer. If we fail to send this FIN (no more available
+ * fragments or the send failed), this function automatically adds the FIN
+ * to the list of pending FINs, which guarantees that the FIN will be sent
+ * later.
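+ * (Illustrative note added in review: on OMPI_ERR_OUT_OF_RESOURCE the FIN
+ * is parked on mca_pml_bfo.pckt_pending via MCA_PML_BFO_ADD_FIN_TO_PENDING
+ * and is resent from mca_pml_bfo_process_pending_packets() once the BTL
+ * frees a resource.)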
+ */ +int mca_pml_bfo_send_fin( ompi_proc_t* proc, + mca_bml_base_btl_t* bml_btl, + ompi_ptr_t hdr_des, + uint8_t order, + uint32_t status, +/* BFO FAILOVER CODE - begin */ + uint16_t seq, + uint8_t restartseq, + uint16_t ctx, uint32_t src) +/* BFO FAILOVER CODE - end */ +{ + mca_btl_base_descriptor_t* fin; + mca_pml_bfo_fin_hdr_t* hdr; + int rc; + + mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_bfo_fin_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + + if(NULL == fin) { + MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + return OMPI_ERR_OUT_OF_RESOURCE; + } + fin->des_cbfunc = mca_pml_bfo_fin_completion; + fin->des_cbdata = proc; + + /* fill in header */ + hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_src->seg_addr.pval; + hdr->hdr_match.hdr_common.hdr_flags = 0; + hdr->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; + hdr->hdr_des = hdr_des; + hdr->hdr_fail = status; + hdr->hdr_match.hdr_seq = seq; + hdr->hdr_restartseq = restartseq; + hdr->hdr_match.hdr_ctx = ctx; + hdr->hdr_match.hdr_src = src; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc); + + /* queue request */ + rc = mca_bml_base_send( bml_btl, + fin, + MCA_PML_BFO_HDR_TYPE_FIN ); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + MCA_PML_BFO_PROGRESS_PENDING(bml_btl->btl); + } + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, fin); + MCA_PML_BFO_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status); + return OMPI_ERR_OUT_OF_RESOURCE; +} + +void mca_pml_bfo_process_pending_packets(struct mca_btl_base_module_t* btl) +{ + mca_pml_bfo_pckt_pending_t *pckt; + int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending); + + for(i = 0; i < s; i++) { + mca_bml_base_btl_t *send_dst = NULL; + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + pckt = (mca_pml_bfo_pckt_pending_t*) + opal_list_remove_first(&mca_pml_bfo.pckt_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + if(NULL == pckt) + break; + if(pckt->bml_btl != NULL && + pckt->bml_btl->btl == btl) { + send_dst = pckt->bml_btl; + } else { + send_dst = mca_bml_base_btl_array_find( + &pckt->proc->proc_bml->btl_eager, btl); + } + if(NULL == send_dst) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.pckt_pending, + (opal_list_item_t*)pckt); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + continue; + } + + switch(pckt->hdr.hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_ACK: + rc = mca_pml_bfo_recv_request_ack_send_btl(pckt->proc, + send_dst, + pckt->hdr.hdr_ack.hdr_src_req.lval, + pckt->hdr.hdr_ack.hdr_dst_req.pval, + pckt->hdr.hdr_ack.hdr_send_offset, + pckt->hdr.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA); + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.pckt_pending, + (opal_list_item_t*)pckt); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return; + } + break; + case MCA_PML_BFO_HDR_TYPE_FIN: + rc = mca_pml_bfo_send_fin(pckt->proc, send_dst, + pckt->hdr.hdr_fin.hdr_des, + pckt->order, + pckt->hdr.hdr_fin.hdr_fail, + pckt->hdr.hdr_match.hdr_seq, + pckt->hdr.hdr_fin.hdr_restartseq, + pckt->hdr.hdr_match.hdr_ctx, + pckt->hdr.hdr_match.hdr_src); + if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { + return; + } + break; + default: + opal_output(0, "[%s:%d] wrong header type\n", + __FILE__, __LINE__); + break; + } + /* We're done with this packet, return it back to the free list */ + MCA_PML_BFO_PCKT_PENDING_RETURN(pckt); + } +} + 
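+/*
+ * (Editorial sketch added in review, not part of the original change.)
+ * The pending lists above all follow the same drain pattern: snapshot the
+ * list size, pop one item under the lock, retry it, and re-queue it when
+ * resources are still exhausted.  A minimal form of that loop, assuming a
+ * generic pending list and a retry(item) action, looks like:
+ *
+ *     opal_list_item_t *item;
+ *     int32_t i, s = (int32_t)opal_list_get_size(&pending);
+ *     for (i = 0; i < s; i++) {
+ *         OPAL_THREAD_LOCK(&lock);
+ *         item = opal_list_remove_first(&pending);
+ *         OPAL_THREAD_UNLOCK(&lock);
+ *         if (NULL == item)
+ *             break;
+ *         if (OMPI_ERR_OUT_OF_RESOURCE == retry(item))
+ *             break;   /* the item re-queues itself; try again later */
+ *     }
+ *
+ * Bounding the loop by the initial size s guarantees forward progress even
+ * when every retry fails and puts its item back on the list.
+ */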
+void mca_pml_bfo_process_pending_rdma(void) +{ + mca_pml_bfo_rdma_frag_t* frag; + int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending); + + for(i = 0; i < s; i++) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + frag = (mca_pml_bfo_rdma_frag_t*) + opal_list_remove_first(&mca_pml_bfo.rdma_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + if(NULL == frag) + break; + if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) { + frag->retries++; + rc = mca_pml_bfo_send_request_put_frag(frag); + } else { + rc = mca_pml_bfo_recv_request_get_frag(frag); + } + if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) + break; + } +} + + +void mca_pml_bfo_error_handler( + struct mca_btl_base_module_t* btl, int32_t flags, + ompi_proc_t* errproc, char* btlname ) { +/* BFO FAILOVER CODE - begin */ + /* If we get a non-fatal error, try to failover */ + if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) { + mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlname); +/* BFO FAILOVER CODE - end */ + } else { + orte_errmgr.abort(-1, NULL); + } +} + +#if OPAL_ENABLE_FT_CR == 0 +int mca_pml_bfo_ft_event( int state ) { + return OMPI_SUCCESS; +} +#else +int mca_pml_bfo_ft_event( int state ) +{ + static bool first_continue_pass = false; + ompi_proc_t** procs = NULL; + size_t num_procs; + int ret, p; + + if(OPAL_CRS_CHECKPOINT == state) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1); + orte_grpcomm.barrier(); + } + + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0); + } + else if(OPAL_CRS_CONTINUE == state) { + first_continue_pass = !first_continue_pass; + + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2); + } + + if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Get a list of processes + */ + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Refresh the proc structure, and publish our proc info in the modex. + * NOTE: Do *not* call ompi_proc_finalize as there are many places in + * the code that point to indv. procs in this strucutre. For our + * needs here we only need to fix up the modex, bml and pml + * references. + */ + if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) { + opal_output(0, + "pml:bfo: ft_event(Restart): proc_refresh Failed %d", + ret); + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free (procs); + return ret; + } + } + } + else if(OPAL_CRS_RESTART_PRE == state ) { + /* Nothing here */ + } + else if(OPAL_CRS_RESTART == state ) { + /* + * Get a list of processes + */ + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Clean out the modex information since it is invalid now. + * orte_grpcomm.purge_proc_attrs(); + * This happens at the ORTE level, so doing it again here will cause + * some issues with socket caching. + */ + + + /* + * Refresh the proc structure, and publish our proc info in the modex. + * NOTE: Do *not* call ompi_proc_finalize as there are many places in + * the code that point to indv. procs in this strucutre. For our + * needs here we only need to fix up the modex, bml and pml + * references. 
+ */ + if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) { + opal_output(0, + "pml:bfo: ft_event(Restart): proc_refresh Failed %d", + ret); + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free (procs); + return ret; + } + } + else if(OPAL_CRS_TERM == state ) { + ; + } + else { + ; + } + + /* Call the BML + * BML is expected to call ft_event in + * - BTL(s) + * - MPool(s) + */ + if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) { + opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n", + ret); + } + + if(OPAL_CRS_CHECKPOINT == state) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1); + + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0); + /* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/ + } + } + else if(OPAL_CRS_CONTINUE == state) { + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3); + } + + if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Exchange the modex information once again. + * BTLs will have republished their modex information. + */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + opal_output(0, + "pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d", + ret); + return ret; + } + + /* + * Startup the PML stack now that the modex is running again + * Add the new procs (BTLs redo modex recv's) + */ + if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret); + return ret; + } + + /* Is this barrier necessary ? JJH */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + if( NULL != procs ) { + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free(procs); + procs = NULL; + } + } + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1); + } + } + else if(OPAL_CRS_RESTART_PRE == state ) { + /* Nothing here */ + } + else if(OPAL_CRS_RESTART == state ) { + /* + * Exchange the modex information once again. + * BTLs will have republished their modex information. + */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + opal_output(0, + "pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d", + ret); + return ret; + } + + /* + * Startup the PML stack now that the modex is running again + * Add the new procs (BTLs redo modex recv's) + */ + if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret); + return ret; + } + + /* Is this barrier necessary ? 
JJH */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "pml:bfo: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + if( NULL != procs ) { + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free(procs); + procs = NULL; + } + } + else if(OPAL_CRS_TERM == state ) { + ; + } + else { + ; + } + + return OMPI_SUCCESS; +} +#endif /* OPAL_ENABLE_FT_CR */ + +int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2) +{ + const mca_pml_bfo_com_btl_t *b1 = (const mca_pml_bfo_com_btl_t *) v1; + const mca_pml_bfo_com_btl_t *b2 = (const mca_pml_bfo_com_btl_t *) v2; + + if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight) + return 1; + if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight) + return -1; + + return 0; +} + diff --git a/ompi/mca/pml/bfo/pml_bfo.h b/ompi/mca/pml/bfo/pml_bfo.h new file mode 100644 index 0000000000..7129382787 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo.h @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_H +#define MCA_PML_BFO_H + +#include "ompi_config.h" +#include "ompi/class/ompi_free_list.h" +#include "ompi/request/request.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/pml/base/pml_base_request.h" +#include "ompi/mca/pml/base/pml_base_bsend.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/datatype/ompi_datatype.h" +#include "pml_bfo_hdr.h" +#include "ompi/mca/bml/base/base.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/allocator/base/base.h" + +BEGIN_C_DECLS + +/** + * BFO PML module + */ + +struct mca_pml_bfo_t { + mca_pml_base_module_t super; + + int priority; + int free_list_num; /* initial size of free list */ + int free_list_max; /* maximum size of free list */ + int free_list_inc; /* number of elements to grow free list */ + size_t send_pipeline_depth; + size_t recv_pipeline_depth; + size_t rdma_put_retries_limit; + int max_rdma_per_request; + int max_send_per_range; + bool leave_pinned; + int leave_pinned_pipeline; + + /* lock queue access */ + opal_mutex_t lock; + + /* free lists */ + ompi_free_list_t rdma_frags; + ompi_free_list_t recv_frags; + ompi_free_list_t pending_pckts; + ompi_free_list_t buffers; + ompi_free_list_t send_ranges; + + /* list of pending operations */ + opal_list_t pckt_pending; + opal_list_t send_pending; + opal_list_t recv_pending; + opal_list_t rdma_pending; + /* List of pending fragments without a matching communicator */ + opal_list_t non_existing_communicator_pending; + bool enabled; +/* BFO FAILOVER CODE - begin */ + bool fast_failover; +/* BFO FAILOVER CODE - end */ + char* allocator_name; + mca_allocator_base_module_t* allocator; + uint32_t unexpected_limit; +}; +typedef struct mca_pml_bfo_t mca_pml_bfo_t; + +extern mca_pml_bfo_t mca_pml_bfo; +extern int mca_pml_bfo_output; + +/* + * PML interface functions. 
+ */ + +extern int mca_pml_bfo_add_comm( + struct ompi_communicator_t* comm +); + +extern int mca_pml_bfo_del_comm( + struct ompi_communicator_t* comm +); + +extern int mca_pml_bfo_add_procs( + struct ompi_proc_t **procs, + size_t nprocs +); + +extern int mca_pml_bfo_del_procs( + struct ompi_proc_t **procs, + size_t nprocs +); + +extern int mca_pml_bfo_enable( bool enable ); + +extern int mca_pml_bfo_progress(void); + +extern int mca_pml_bfo_iprobe( int dst, + int tag, + struct ompi_communicator_t* comm, + int *matched, + ompi_status_public_t* status ); + +extern int mca_pml_bfo_probe( int dst, + int tag, + struct ompi_communicator_t* comm, + ompi_status_public_t* status ); + +extern int mca_pml_bfo_isend_init( void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_isend( void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_send( void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm ); + +extern int mca_pml_bfo_irecv_init( void *buf, + size_t count, + ompi_datatype_t *datatype, + int src, + int tag, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_irecv( void *buf, + size_t count, + ompi_datatype_t *datatype, + int src, + int tag, + struct ompi_communicator_t* comm, + struct ompi_request_t **request ); + +extern int mca_pml_bfo_recv( void *buf, + size_t count, + ompi_datatype_t *datatype, + int src, + int tag, + struct ompi_communicator_t* comm, + ompi_status_public_t* status ); + +extern int mca_pml_bfo_dump( struct ompi_communicator_t* comm, + int verbose ); + +extern int mca_pml_bfo_start( size_t count, + ompi_request_t** requests ); + +extern int mca_pml_bfo_ft_event( int state ); + +END_C_DECLS + +struct mca_pml_bfo_pckt_pending_t { + ompi_free_list_item_t super; + ompi_proc_t* proc; + mca_pml_bfo_hdr_t hdr; + struct mca_bml_base_btl_t *bml_btl; + uint8_t order; +}; +typedef struct mca_pml_bfo_pckt_pending_t mca_pml_bfo_pckt_pending_t; +OBJ_CLASS_DECLARATION(mca_pml_bfo_pckt_pending_t); + +#define MCA_PML_BFO_PCKT_PENDING_ALLOC(pckt,rc) \ +do { \ + ompi_free_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_pml_bfo.pending_pckts, item, rc); \ + pckt = (mca_pml_bfo_pckt_pending_t*)item; \ +} while (0) + +#define MCA_PML_BFO_PCKT_PENDING_RETURN(pckt) \ +do { \ + /* return packet */ \ + OMPI_FREE_LIST_RETURN(&mca_pml_bfo.pending_pckts, \ + (ompi_free_list_item_t*)pckt); \ +} while(0) + +#define MCA_PML_BFO_ADD_FIN_TO_PENDING(P, D, B, O, S) \ + do { \ + mca_pml_bfo_pckt_pending_t *_pckt; \ + int _rc; \ + \ + MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt,_rc); \ + _pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; \ + _pckt->hdr.hdr_fin.hdr_des = (D); \ + _pckt->hdr.hdr_fin.hdr_fail = (S); \ + _pckt->proc = (P); \ + _pckt->bml_btl = (B); \ + _pckt->order = (O); \ + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \ + opal_list_append(&mca_pml_bfo.pckt_pending, \ + (opal_list_item_t*)_pckt); \ + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \ + } while(0) + + +int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, + ompi_ptr_t hdr_des, uint8_t order, uint32_t status, + uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src); + +/* This function 
tries to resend FIN/ACK packets from the pckt_pending queue.
+ * Packets are added to the queue when sending a FIN or ACK fails due to
+ * resource unavailability. The bml_btl passed to the function does not
+ * represent the packet's destination; it represents the BTL on which the
+ * resource was freed, so only this BTL should be considered for resending
+ * packets */
+void mca_pml_bfo_process_pending_packets(struct mca_btl_base_module_t* btl);
+
+/* This function retries failed PUT/GET operations on a frag. When an RDMA
+ * operation cannot be accomplished for some reason, the frag is put on the
+ * rdma_pending list. Later the operation is retried. The destination of the
+ * RDMA operation is stored inside the frag structure */
+void mca_pml_bfo_process_pending_rdma(void);
+
+#define MCA_PML_BFO_PROGRESS_PENDING(btl)                            \
+    do {                                                             \
+        if(opal_list_get_size(&mca_pml_bfo.pckt_pending))            \
+            mca_pml_bfo_process_pending_packets(btl);                \
+        if(opal_list_get_size(&mca_pml_bfo.recv_pending))            \
+            mca_pml_bfo_recv_request_process_pending();              \
+        if(opal_list_get_size(&mca_pml_bfo.send_pending))            \
+            mca_pml_bfo_send_request_process_pending(btl);           \
+        if(opal_list_get_size(&mca_pml_bfo.rdma_pending))            \
+            mca_pml_bfo_process_pending_rdma();                      \
+    } while (0)
+
+/*
+ * Compute the total number of bytes on supplied descriptor
+ */
+#define MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH(segments, count, hdrlen, length) \
+do {                                                                        \
+    size_t i;                                                               \
+                                                                            \
+    for( i = 0; i < count; i++ ) {                                          \
+        length += segments[i].seg_len;                                      \
+    }                                                                       \
+    length -= hdrlen;                                                       \
+} while(0)
+
+/* represents the BTL chosen for sending a request */
+struct mca_pml_bfo_com_btl_t {
+    mca_bml_base_btl_t *bml_btl;
+    struct mca_mpool_base_registration_t* btl_reg;
+    size_t length;
+};
+typedef struct mca_pml_bfo_com_btl_t mca_pml_bfo_com_btl_t;
+
+int mca_pml_bfo_com_btl_comp(const void *v1, const void *v2);
+
+/* Calculate what percentage of a message to send through each BTL according to
+ * relative weight */
+static inline void
+mca_pml_bfo_calc_weighted_length( mca_pml_bfo_com_btl_t *btls, int num_btls, size_t size,
+                                  double weight_total )
+{
+    int i;
+    size_t length_left;
+
+    /* shortcut for the common case of only one BTL */
+    if( OPAL_LIKELY(1 == num_btls) ) {
+        btls[0].length = size;
+        return;
+    }
+
+    /* sort BTLs according to their weights so BTLs with a smaller weight will
+     * not hijack all of the traffic */
+    qsort( btls, num_btls, sizeof(mca_pml_bfo_com_btl_t),
+           mca_pml_bfo_com_btl_comp );
+
+    for(length_left = size, i = 0; i < num_btls; i++) {
+        mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
+        size_t length = 0;
+        if( OPAL_UNLIKELY(0 != length_left) ) {
+            length = (length_left > bml_btl->btl->btl_eager_limit)?
+                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
+                length_left;
+
+            if(length > length_left)
+                length = length_left;
+            length_left -= length;
+        }
+        btls[i].length = length;
+    }
+
+    /* account for rounding errors */
+    btls[0].length += length_left;
+}
+
+#endif
diff --git a/ompi/mca/pml/bfo/pml_bfo_comm.c b/ompi/mca/pml/bfo/pml_bfo_comm.c
new file mode 100644
index 0000000000..168eaf7912
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_comm.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include <string.h>
+
+#include "pml_bfo.h"
+#include "pml_bfo_comm.h"
+
+
+
+static void mca_pml_bfo_comm_proc_construct(mca_pml_bfo_comm_proc_t* proc)
+{
+    proc->expected_sequence = 1;
+    proc->ompi_proc = NULL;
+    proc->send_sequence = 0;
+    OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
+    OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
+    OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
+}
+
+
+static void mca_pml_bfo_comm_proc_destruct(mca_pml_bfo_comm_proc_t* proc)
+{
+    OBJ_DESTRUCT(&proc->frags_cant_match);
+    OBJ_DESTRUCT(&proc->specific_receives);
+    OBJ_DESTRUCT(&proc->unexpected_frags);
+}
+
+
+static OBJ_CLASS_INSTANCE(
+    mca_pml_bfo_comm_proc_t,
+    opal_object_t,
+    mca_pml_bfo_comm_proc_construct,
+    mca_pml_bfo_comm_proc_destruct);
+
+
+static void mca_pml_bfo_comm_construct(mca_pml_bfo_comm_t* comm)
+{
+    OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t);
+    OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t);
+    comm->recv_sequence = 0;
+    comm->procs = NULL;
+    comm->num_procs = 0;
+}
+
+
+static void mca_pml_bfo_comm_destruct(mca_pml_bfo_comm_t* comm)
+{
+    size_t i;
+    for(i=0; i<comm->num_procs; i++)
+        OBJ_DESTRUCT((&comm->procs[i]));
+    if(NULL != comm->procs)
+        free(comm->procs);
+    OBJ_DESTRUCT(&comm->wild_receives);
+    OBJ_DESTRUCT(&comm->matching_lock);
+}
+
+
+OBJ_CLASS_INSTANCE(
+    mca_pml_bfo_comm_t,
+    opal_object_t,
+    mca_pml_bfo_comm_construct,
+    mca_pml_bfo_comm_destruct);
+
+
+int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size)
+{
+    size_t i;
+
+    /* send message sequence-number support - sender side */
+    comm->procs = (mca_pml_bfo_comm_proc_t*)malloc(sizeof(mca_pml_bfo_comm_proc_t)*size);
+    if(NULL == comm->procs) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    for(i=0; i<size; i++) {
+        OBJ_CONSTRUCT(comm->procs+i, mca_pml_bfo_comm_proc_t);
+    }
+    comm->num_procs = size;
+    return OMPI_SUCCESS;
+}
+
+
diff --git a/ompi/mca/pml/bfo/pml_bfo_comm.h b/ompi/mca/pml/bfo/pml_bfo_comm.h
new file mode 100644
index 0000000000..c9564480fb
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_comm.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_PML_BFO_COMM_H +#define MCA_PML_BFO_COMM_H + +#include "opal/threads/mutex.h" +#include "opal/class/opal_list.h" +#include "ompi/proc/proc.h" +BEGIN_C_DECLS + + +struct mca_pml_bfo_comm_proc_t { + opal_object_t super; + uint16_t expected_sequence; /**< send message sequence number - receiver side */ + struct ompi_proc_t* ompi_proc; +#if OPAL_HAVE_THREAD_SUPPORT + volatile int32_t send_sequence; /**< send side sequence number */ +#else + int32_t send_sequence; /**< send side sequence number */ +#endif + opal_list_t frags_cant_match; /**< out-of-order fragment queues */ + opal_list_t specific_receives; /**< queues of unmatched specific receives */ + opal_list_t unexpected_frags; /**< unexpected fragment queues */ +}; +typedef struct mca_pml_bfo_comm_proc_t mca_pml_bfo_comm_proc_t; + + +/** + * Cached on ompi_communicator_t to hold queues/state + * used by the PML<->PTL interface for matching logic. + */ +struct mca_pml_comm_t { + opal_object_t super; +#if OPAL_HAVE_THREAD_SUPPORT + volatile uint32_t recv_sequence; /**< recv request sequence number - receiver side */ +#else + uint32_t recv_sequence; /**< recv request sequence number - receiver side */ +#endif + opal_mutex_t matching_lock; /**< matching lock */ + opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */ + mca_pml_bfo_comm_proc_t* procs; + size_t num_procs; +}; +typedef struct mca_pml_comm_t mca_pml_bfo_comm_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_comm_t); + + +/** + * Initialize an instance of mca_pml_bfo_comm_t based on the communicator size. + * + * @param comm Instance of mca_pml_bfo_comm_t + * @param size Size of communicator + * @return OMPI_SUCCESS or error status on failure. + */ + +extern int mca_pml_bfo_comm_init_size(mca_pml_bfo_comm_t* comm, size_t size); + +END_C_DECLS +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_component.c b/ompi/mca/pml/bfo/pml_bfo_component.c new file mode 100644 index 0000000000..01f3391e79 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_component.c @@ -0,0 +1,252 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "opal/event/event.h"
+#include "mpi.h"
+#include "ompi/runtime/params.h"
+#include "ompi/mca/pml/pml.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "ompi/mca/pml/base/pml_base_bsend.h"
+#include "pml_bfo.h"
+#include "pml_bfo_hdr.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_rdmafrag.h"
+#include "pml_bfo_recvfrag.h"
+#include "ompi/mca/bml/base/base.h"
+#include "pml_bfo_component.h"
+#include "ompi/mca/allocator/base/base.h"
+
+OBJ_CLASS_INSTANCE( mca_pml_bfo_pckt_pending_t,
+                    ompi_free_list_item_t,
+                    NULL,
+                    NULL );
+
+static int mca_pml_bfo_component_open(void);
+static int mca_pml_bfo_component_close(void);
+static mca_pml_base_module_t*
+mca_pml_bfo_component_init( int* priority, bool enable_progress_threads,
+                            bool enable_mpi_threads );
+static int mca_pml_bfo_component_fini(void);
+int mca_pml_bfo_output = 0;
+
+mca_pml_base_component_2_0_0_t mca_pml_bfo_component = {
+
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_PML_BASE_VERSION_2_0_0,
+
+        "bfo",                        /* MCA component name */
+        OMPI_MAJOR_VERSION,           /* MCA component major version */
+        OMPI_MINOR_VERSION,           /* MCA component minor version */
+        OMPI_RELEASE_VERSION,         /* MCA component release version */
+        mca_pml_bfo_component_open,   /* component open */
+        mca_pml_bfo_component_close   /* component close */
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    mca_pml_bfo_component_init,       /* component init */
+    mca_pml_bfo_component_fini        /* component finalize */
+
+};
+
+void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool,
+                             size_t* size,
+                             mca_mpool_base_registration_t** registration);
+
+void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool,
+                           void* segment );
+
+static inline int mca_pml_bfo_param_register_int(
+    const char* param_name,
+    int default_value)
+{
+    int id = mca_base_param_register_int("pml","bfo",param_name,NULL,default_value);
+    int param_value = default_value;
+    mca_base_param_lookup_int(id, &param_value);
+    return param_value;
+}
+
+static int mca_pml_bfo_component_open(void)
+{
+    int value;
+    mca_allocator_base_component_t* allocator_component;
+
+    value = mca_pml_bfo_param_register_int("verbose", 0);
+    mca_pml_bfo_output = opal_output_open(NULL);
+    opal_output_set_verbosity(mca_pml_bfo_output, value);
+
+    mca_pml_bfo.free_list_num =
+        mca_pml_bfo_param_register_int("free_list_num", 4);
+    mca_pml_bfo.free_list_max =
+        mca_pml_bfo_param_register_int("free_list_max", -1);
+    mca_pml_bfo.free_list_inc =
+        mca_pml_bfo_param_register_int("free_list_inc", 64);
+    mca_pml_bfo.priority =
+        mca_pml_bfo_param_register_int("priority", 5);
+    mca_pml_bfo.send_pipeline_depth =
+        mca_pml_bfo_param_register_int("send_pipeline_depth", 3);
+    mca_pml_bfo.recv_pipeline_depth =
+        mca_pml_bfo_param_register_int("recv_pipeline_depth", 4);
+    mca_pml_bfo.rdma_put_retries_limit =
+        mca_pml_bfo_param_register_int("rdma_put_retries_limit", 5);
+    mca_pml_bfo.max_rdma_per_request =
+        mca_pml_bfo_param_register_int("max_rdma_per_request", 4);
+    mca_pml_bfo.max_send_per_range =
+        mca_pml_bfo_param_register_int("max_send_per_range", 4);
+
+    mca_pml_bfo.unexpected_limit =
+        mca_pml_bfo_param_register_int("unexpected_limit", 128);
+/* BFO FAILOVER CODE - begin */
+    mca_pml_bfo.fast_failover =
+        mca_pml_bfo_param_register_int("fast_failover", 0);
+/* BFO FAILOVER CODE -
end */ + + mca_base_param_reg_string(&mca_pml_bfo_component.pmlm_version, + "allocator", + "Name of allocator component for unexpected messages", + false, false, + "bucket", + &mca_pml_bfo.allocator_name); + + allocator_component = mca_allocator_component_lookup( mca_pml_bfo.allocator_name ); + if(NULL == allocator_component) { + opal_output(0, "mca_pml_bfo_component_open: can't find allocator: %s\n", mca_pml_bfo.allocator_name); + return OMPI_ERROR; + } + + mca_pml_bfo.allocator = allocator_component->allocator_init(true, + mca_pml_bfo_seg_alloc, + mca_pml_bfo_seg_free, NULL); + if(NULL == mca_pml_bfo.allocator) { + opal_output(0, "mca_pml_bfo_component_open: unable to initialize allocator\n"); + return OMPI_ERROR; + } + + mca_pml_bfo.enabled = false; + return mca_bml_base_open(); +} + + +static int mca_pml_bfo_component_close(void) +{ + int rc; + + if (OMPI_SUCCESS != (rc = mca_bml_base_close())) { + return rc; + } + if (NULL != mca_pml_bfo.allocator_name) { + free(mca_pml_bfo.allocator_name); + } + + return OMPI_SUCCESS; +} + + +static mca_pml_base_module_t* +mca_pml_bfo_component_init( int* priority, + bool enable_progress_threads, + bool enable_mpi_threads ) +{ + opal_output_verbose( 10, mca_pml_bfo_output, + "in bfo, my priority is %d\n", mca_pml_bfo.priority); + + if((*priority) > mca_pml_bfo.priority) { + *priority = mca_pml_bfo.priority; + return NULL; + } + *priority = mca_pml_bfo.priority; + + if(OMPI_SUCCESS != mca_bml_base_init( enable_progress_threads, + enable_mpi_threads)) { + return NULL; + } + + /* Set this here (vs in component_open()) because + ompi_mpi_leave_pinned* may have been set after MCA params were + read (e.g., by the openib btl) */ + mca_pml_bfo.leave_pinned = (1 == ompi_mpi_leave_pinned); + mca_pml_bfo.leave_pinned_pipeline = (int) ompi_mpi_leave_pinned_pipeline; + + return &mca_pml_bfo.super; +} + +int mca_pml_bfo_component_fini(void) +{ + int rc; + + /* Shutdown BML */ + if(OMPI_SUCCESS != (rc = mca_bml.bml_finalize())) + return rc; + + if(!mca_pml_bfo.enabled) + return OMPI_SUCCESS; /* never selected.. return success.. 
*/ + mca_pml_bfo.enabled = false; /* not anymore */ + + OBJ_DESTRUCT(&mca_pml_bfo.rdma_pending); + OBJ_DESTRUCT(&mca_pml_bfo.pckt_pending); + OBJ_DESTRUCT(&mca_pml_bfo.recv_pending); + OBJ_DESTRUCT(&mca_pml_bfo.send_pending); + OBJ_DESTRUCT(&mca_pml_bfo.non_existing_communicator_pending); + OBJ_DESTRUCT(&mca_pml_bfo.buffers); + OBJ_DESTRUCT(&mca_pml_bfo.pending_pckts); + OBJ_DESTRUCT(&mca_pml_bfo.recv_frags); + OBJ_DESTRUCT(&mca_pml_bfo.rdma_frags); + OBJ_DESTRUCT(&mca_pml_bfo.lock); + + if(OMPI_SUCCESS != (rc = mca_pml_bfo.allocator->alc_finalize(mca_pml_bfo.allocator))) { + return rc; + } + +#if 0 + if (mca_pml_base_send_requests.fl_num_allocated != + mca_pml_base_send_requests.super.opal_list_length) { + opal_output(0, "bfo send requests: %d allocated %d returned\n", + mca_pml_base_send_requests.fl_num_allocated, + mca_pml_base_send_requests.super.opal_list_length); + } + if (mca_pml_base_recv_requests.fl_num_allocated != + mca_pml_base_recv_requests.super.opal_list_length) { + opal_output(0, "bfo recv requests: %d allocated %d returned\n", + mca_pml_base_recv_requests.fl_num_allocated, + mca_pml_base_recv_requests.super.opal_list_length); + } +#endif + + return OMPI_SUCCESS; +} + +void *mca_pml_bfo_seg_alloc( struct mca_mpool_base_module_t* mpool, + size_t* size, + mca_mpool_base_registration_t** registration) { + return malloc(*size); +} + +void mca_pml_bfo_seg_free( struct mca_mpool_base_module_t* mpool, + void* segment ) { + free(segment); +} diff --git a/ompi/mca/pml/bfo/pml_bfo_component.h b/ompi/mca/pml/bfo/pml_bfo_component.h new file mode 100644 index 0000000000..2fd08d018e --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_component.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_COMPONENT_H +#define MCA_PML_BFO_COMPONENT_H + +BEGIN_C_DECLS + +/* + * PML module functions. + */ +OMPI_MODULE_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_bfo_component; + +END_C_DECLS + +#endif diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.c b/ompi/mca/pml/bfo/pml_bfo_failover.c new file mode 100644 index 0000000000..d0fbc94362 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_failover.c @@ -0,0 +1,1883 @@ +/* + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * Functions that implement failover capabilities. To utilize the + * failover feature, one needs to configure the library with + * --enable-openib-failover. Then the system that is being used + * must have two or more openib BTLs in use. When an error occurs, + * the BTL will call into this PML to map out the offending BTL and + * continue using the one that is still working. + * Most of the differences between the ob1 PML and the bfo PML are + * contained in this file. 
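+ * (Illustrative usage note added in review: such a configuration would
+ * typically be exercised with something like
+ *     mpirun --mca pml bfo --mca btl openib,self,sm ./app
+ * on a machine with two or more active openib ports, together with the
+ * pml_bfo_fast_failover MCA parameter registered in pml_bfo_component.c.)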
+ */
+
+#include "ompi_config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "opal/class/opal_bitmap.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/mca/pml/base/base.h"
+#include "ompi/mca/btl/base/base.h"
+#include "pml_bfo.h"
+#include "pml_bfo_component.h"
+#include "pml_bfo_comm.h"
+#include "pml_bfo_hdr.h"
+#include "pml_bfo_recvfrag.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_rdmafrag.h"
+#include "pml_bfo_failover.h"
+#include "ompi/mca/bml/base/base.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/util/show_help.h"
+#include "orte/mca/notifier/notifier.h"
+
+#include "ompi/runtime/ompi_cr.h"
+
+static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl,
+                                              mca_bml_base_endpoint_t* ep);
+
+/**
+ * When running with failover enabled, check the PML sequence numbers
+ * to see if we have received a duplicate message. This check is done
+ * for all MATCH fragments. It is also done for RNDV and RGET
+ * fragments that do not have the MCA_PML_BFO_HDR_FLAGS_RESTART flag
+ * set.
+ * We set the window size to half the total range of sequence numbers.
+ * We only enter this code when the seq_num is not the expected one.
+ * A few more notes on the algorithm used here. In normal operation,
+ * the expected value will either be equal to or less than the
+ * sequence number of the header. This is because we are using this
+ * sequence number to detect packets arriving prior to them being
+ * expected. If we determine that expected is less than the header,
+ * then make sure this is not a rollover case. We do that by adding
+ * maxnum to the expected value.
+ * @param proc Pointer to proc from where message came
+ * @param hdr Pointer to header of message
+ */
+bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc,
+                                  mca_pml_bfo_match_hdr_t *hdr)
+{
+    const int window = 32768;
+    const int maxnum = 65536;
+    mca_pml_bfo_recv_frag_t *frag;
+
+#if 0
+    opal_output(0, "checking dup, exp=%d, act=%d, type=%d, cant_match=%d\n",
+                (uint16_t)proc->expected_sequence,
+                hdr->hdr_seq, hdr->hdr_common.hdr_type,
+                opal_list_get_size(&proc->frags_cant_match));
+#endif
+
+    /* There are a few cases near the end of the range where expected may
+     * equal 65535 and an out-of-order fragment shows up with a sequence
+     * number like 1. */
+    if (OPAL_UNLIKELY((uint16_t)proc->expected_sequence > hdr->hdr_seq)) {
+        if (((uint16_t)proc->expected_sequence - hdr->hdr_seq) < window) {
+            opal_output_verbose(20, mca_pml_bfo_output,
+                                "%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n",
+                                __FILE__, __LINE__, (uint16_t)proc->expected_sequence,
+                                hdr->hdr_seq, hdr->hdr_common.hdr_type);
+            return true;
+        }
+    } else {
+        /* This is the normal flow through this code. We also need to
+         * use maxnum to ensure that we handle cases where the
+         * expected number has rolled over but then a duplicate message
+         * shows up that is greater than it. */
+        if ((((uint16_t)proc->expected_sequence + maxnum) - hdr->hdr_seq) < window) {
+            opal_output_verbose(20, mca_pml_bfo_output,
+                                "%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n",
+                                __FILE__, __LINE__, (uint16_t)proc->expected_sequence,
+                                hdr->hdr_seq, hdr->hdr_common.hdr_type);
+            return true;
+        }
+    }
+
+    /* Need to explicitly check against any out-of-order fragments. Unfortunately, we
+     * always have to do this since we can get a duplicate out-of-order fragment.
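+     * (Worked example added in review, with illustrative numbers: after a
+     * rollover the expected value may be 3; a duplicate with hdr_seq=65500
+     * gives (3 + 65536) - 65500 = 39 < 32768 and is dropped above, while a
+     * merely early fragment with hdr_seq=10 gives (3 + 65536) - 10 = 65529,
+     * which is outside the window and so is kept for later matching.)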
+/**
+ * This function checks to see if we have received a duplicate FIN
+ * message.  This is done by first pulling the pointer of the request
+ * that the FIN message is pointing to from the message.  We then
+ * check the various fields in the request against the fields in the
+ * header and make sure they match.  If they do not, then the request
+ * must have been recycled already and this is a duplicate FIN message.
+ * We have to do this check on every FIN message that we receive.
+ */
+bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma,
+                                  mca_btl_base_module_t* btl)
+{
+    mca_pml_base_request_t* basereq;
+    /* When running with failover enabled, need to ensure that this
+     * is not a duplicate FIN message. */
+    if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) {
+        /* The first check is to make sure the descriptor is pointing
+         * to a valid request.  The descriptor may be pointing to NULL
+         * if it was freed and not reused yet. */
+        if (NULL == rdma->des_cbdata) {
+            opal_output_verbose(20, mca_pml_bfo_output,
+                                "FIN: received: dropping because not pointing to valid descriptor "
+                                "PML=%d CTX=%d SRC=%d RQS=%d",
+                                hdr->hdr_match.hdr_seq, hdr->hdr_match.hdr_ctx,
+                                hdr->hdr_match.hdr_src, hdr->hdr_fin.hdr_restartseq);
+            return true;
+        }
+
+        basereq = (mca_pml_base_request_t*)rdma->des_cbdata;
+        /* Now we know the descriptor is pointing to a non-null request.
+         * Does it match what we expect?  To make sure the receive
+         * request matches the FIN message, check the context number,
+         * source of the message, and MPI sequence number.  Then make
+         * sure that it also matches the internal sequencing number of
+         * the requests.  We need to look at the type of request we are
+         * pointing at to figure out what fields to access.
*/ + if (basereq->req_type == MCA_PML_REQUEST_RECV) { + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)basereq; + if ((hdr->hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || + (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on receiver: dropping because no match " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, + recvreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)recvreq); + return true; + } + if (hdr->hdr_fin.hdr_restartseq != recvreq->req_restartseq) { + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on receiver: dropping because old " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, + recvreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)recvreq); + return true; + } + } else if (basereq->req_type == MCA_PML_REQUEST_SEND) { + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)basereq; + if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) || + (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { + uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on sender: dropping because no match " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + seq, hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)sendreq); + return true; + } + if (hdr->hdr_fin.hdr_restartseq != sendreq->req_restartseq) { + uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received on sender: dropping because old " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + seq, hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)sendreq); + return true; + } + } else { + /* We can get here if the descriptor has been reused, but + * not as an RDMA descriptor. In that case, the callback + * function has been set to something else. Clearly the + * descriptor we are interested is gone, so just drop the + * FIN message. 
*/ + opal_output_verbose(5, mca_pml_bfo_output, + "FIN: received: dropping because descriptor has been reused " + "PML=%d CTX=%d SRC=%d RQS=%d rdma->des_flags=%d", + hdr->hdr_match.hdr_seq, hdr->hdr_match.hdr_ctx, + hdr->hdr_match.hdr_src, hdr->hdr_fin.hdr_restartseq, rdma->des_flags); + return true; + } + } + return false; +} + +/** + * Repost a FIN message if we get an error on the completion event. + */ +void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) { + /* In the error case, we will repost the FIN message. I had + * considered restarting the request. The problem is that the + * request may be already complete when we detect that a FIN + * message got an error on its completion event. For example, with + * the PUT protocol, if the RDMA writes succeed and all the data + * has been sent, then the request is marked as complete and can be + * freed. Therefore, an error on the FIN message has no request to + * refer back to. So, we will just repost it. However, we are also + * faced with the case where the FIN message has an error but it + * actually makes it to the other side. In that case we are now + * sending a FIN message to a non-existent request on the receiver + * side. To handle that, we have added the match information to + * the FIN message. That way, we can check on the receiving side + * to ensure that it is pointing to a valid request. */ + mca_pml_bfo_fin_hdr_t* hdr; + mca_bml_base_endpoint_t* bml_endpoint; + ompi_proc_t *proc; + mca_bml_base_btl_t* bml_btl; + + proc = (ompi_proc_t*) des->des_cbdata; + bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + hdr = (mca_pml_bfo_fin_hdr_t*)des->des_src->seg_addr.pval; + + opal_output_verbose(20, mca_pml_bfo_output, + "REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d", + hdr->hdr_match.hdr_seq, ORTE_PROC_MY_NAME->vpid, proc->proc_name.vpid, + hdr->hdr_fail, hdr->hdr_match.hdr_src); + + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + + /* Reconstruct the fin for sending on the other BTL */ + mca_pml_bfo_send_fin(proc, bml_btl, + hdr->hdr_des, MCA_BTL_NO_ORDER, + hdr->hdr_fail, hdr->hdr_match.hdr_seq, hdr->hdr_restartseq, + hdr->hdr_match.hdr_ctx, hdr->hdr_match.hdr_src); + return; +} + +/** + * This function is called when a RNDV or RGET is received with the + * FLAGS_RESTART flag set. This means this message already has a + * receive request already associated with it. + */ +mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr) { + mca_pml_bfo_recv_request_t *match = NULL; + mca_pml_bfo_rendezvous_hdr_t * rhdr = (mca_pml_bfo_rendezvous_hdr_t *) hdr; + match = (mca_pml_bfo_recv_request_t *) rhdr->hdr_dst_req.pval; + + /* Check to see if we have received a duplicate RNDV (or RGET). This can + * occur because we got an error when we reposted the RNDV. Therefore, + * we make sure that the request has not completed from underneath us + * and been recycled. Secondly, make sure we are not getting it a + * second time for the same request. 
*/ + if ((rhdr->hdr_match.hdr_ctx != match->req_recv.req_base.req_comm->c_contextid) || + (rhdr->hdr_match.hdr_src != match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || + (rhdr->hdr_match.hdr_seq != (uint16_t)match->req_msgseq) || + (rhdr->hdr_restartseq == match->req_restartseq)) { + if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) { + opal_output_verbose(20, mca_pml_bfo_output, + "RNDV: received with RESTART flag: duplicate, dropping " + "PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(20, mca_pml_bfo_output, + "RGET: received with RESTART flag: duplicate, dropping " + "PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } + return NULL; + } + + mca_pml_bfo_recv_request_reset(match); + if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: received with RESTART flag: restarting recv, " + "PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "RGET: received with RESTART flag: restarting recv, " + "PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d", + match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, + match->remote_req_send.pval, (void *)match, + match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } + return match; +} + +/** + * Callback for when a RNDVRESTARTNOTIFY message is received. A + * RNDVRESTARTNOTIFY message is sent from the sender to the receiver + * telling the receiver that the message is going to be started over. + * The receiver first makes sure that the request being pointed to is + * still valid. If it is not, that means the receiver must have + * completed the request and therefore we need to send a NACK back to + * the sender. The receiver then makes sure this is not a duplicate + * message. If it is a duplicate, it will just drop it. Otherwise, + * it will then send a RNDVRESTARTACK message if there are no + * outstanding events on the receiver. Otherwise, it will just change + * the state of the request and wait for another event to send the + * RNDVRESTARTACK to the sender. + */ +void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_recv_request_t* recvreq; + ompi_proc_t* ompi_proc; + orte_process_name_t orte_proc; + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY); + recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_restart.hdr_dst_req.pval; + + /* Check to see if the receive request is still valid. If the + * request is recycled, that means the original request must have + * completed and we therefore need to send a NACK back to the sender. 
+ * Note that when the request is gone, we need to pull some information + * off the header so that we can figure out where to send the NACK + * message back to. */ + if ((hdr->hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || + (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { + orte_proc.jobid = hdr->hdr_restart.hdr_jobid; + orte_proc.vpid = hdr->hdr_restart.hdr_vpid; + ompi_proc = ompi_proc_find(&orte_proc); + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: received: does not match request, sending NACK back " + "PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d " + "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, " + "hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, recvreq->req_restartseq, + hdr->hdr_restart.hdr_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid, + ompi_proc->proc_hostname); + mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); + return; + } + + /* We know that we have the correct receive request. Make sure this is not + * a duplicate RNDVRESTARTNOTIFY on this request. */ + if (hdr->hdr_restart.hdr_restartseq == recvreq->req_restartseq) { + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: received duplicate: dropping RNDVRESTARTNOTIFY " + "message PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d " + "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d", + (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, + recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + hdr->hdr_match.hdr_src, recvreq->req_restartseq, + hdr->hdr_restart.hdr_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + return; + } + + /* Increment restart number. */ + recvreq->req_restartseq++; + recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: received: outstanding receive events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, + OMPI_SUCCESS, btl); + } + + return; +} + +/** + * Callback for when a RNDVRESTARTACK message is received. This + * message is sent from the receiver to the sender to acknowledge + * the receipt of the RNDVRESTARTNOTIFY message. At this point, + * the sender can reset the send request and restart the message. 
+ */
+void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t* btl,
+                                                   mca_btl_base_tag_t tag,
+                                                   mca_btl_base_descriptor_t* des,
+                                                   void* cbdata ) {
+    mca_btl_base_segment_t* segments = des->des_dst;
+    mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
+    mca_pml_bfo_send_request_t* sendreq;
+
+    bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK);
+    sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;
+
+    /* Check to see if we have received a duplicate message.  The
+     * first three comparisons make sure that we are not looking at a
+     * recycled request.  The last check makes sure we are not getting
+     * a duplicate message for this specific request.  All of this is
+     * needed because the receiver might get an error and repost the
+     * RNDVRESTARTACK message, even though the original RNDVRESTARTACK
+     * was actually received. */
+    if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
+        (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
+        (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence) ||
+        (hdr->hdr_restart.hdr_restartseq != sendreq->req_restartseq)) {
+        opal_output_verbose(20, mca_pml_bfo_output,
+                            "RNDVRESTARTACK: received: does not match request, dropping "
+                            "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d "
+                            "src_req=%p, dst_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
+                            sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
+                            sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
+                            sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+        return;
+    }
+
+    sendreq->req_restart++;
+    if (2 == sendreq->req_restart) {
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RNDVRESTARTACK: received: restarting send "
+                            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                            hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+        mca_pml_bfo_send_request_restart(sendreq, false, 0);
+    } else {
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RNDVRESTARTACK: received: waiting for RNDVRESTARTNOTIFY completion "
+                            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                            hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+    }
+    return;
+}
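+
+/*
+ * Illustrative sketch only: the restart handshake counts two events
+ * before the send request is reset -- the local completion of the
+ * RNDVRESTARTNOTIFY send and the arrival of the RNDVRESTARTACK --
+ * in whichever order they occur.  The helper name is hypothetical;
+ * the field follows the send request structure used in this file.
+ */
+static inline bool pml_bfo_restart_handshake_done_sketch(mca_pml_bfo_send_request_t* sendreq)
+{
+    /* req_restart is incremented once by
+     * mca_pml_bfo_rndvrestartnotify_completion() and once by
+     * mca_pml_bfo_recv_frag_callback_rndvrestartack(). */
+    return (2 == sendreq->req_restart);
+}
+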
+/**
+ * Callback for when a RECVERRNOTIFY message is received.  This message
+ * is sent from the receiver to the sender and tells the sender that
+ * the receiver has seen an error.  This will trigger the sender
+ * to start the request restart sequence.
+ */
+void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t* btl,
+                                                  mca_btl_base_tag_t tag,
+                                                  mca_btl_base_descriptor_t* des,
+                                                  void* cbdata ) {
+    mca_btl_base_segment_t* segments = des->des_dst;
+    mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
+    mca_pml_bfo_send_request_t* sendreq;
+
+    bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY);
+    sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;
+
+    /* First make sure that this message is pointing to a valid request.
+     * This can be determined if the communicator context, the source of
+     * the message, and the MPI sequence number all match. */
+    if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
+        (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
+        (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) {
+        opal_output_verbose(20, mca_pml_bfo_output,
+                            "RECVERRNOTIFY: received: does not match request, dropping "
+                            "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d RQS:exp=%d,act=%d "
+                            "src_req=%p, dst_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
+                            sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
+                            sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
+                            sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+        return;
+    }
+
+    /* If a good ACK was never received, then the first ACK received
+     * might be a RECVERRNOTIFY message.  In that case, the sendreq does
+     * not have a valid req_recv pointer in it.  Therefore, check for
+     * that case and update the field in the sendreq if necessary. */
+    if (NULL == sendreq->req_recv.pval) {
+        sendreq->req_recv = hdr->hdr_restart.hdr_dst_req;
+    }
+
+    /* Now check to see if a restart needs to be issued.  The request
+     * sequence number in the header is compared against the current
+     * request sequence number in the send request.  If the header
+     * sequence number is greater than or equal to the send request
+     * number, then a rndvrestartnotify is issued.  There are some cases
+     * where a few extra rndvrestartnotifys are issued.  That is OK as
+     * it will all work itself out.  The idea is to prevent many
+     * unnecessary restarts while still allowing multiple restarts to
+     * happen.  It could be that sometime later another error occurs
+     * which initiates a restart.  That is OK as it will have the new
+     * sequence number and all is well. */
+    if (hdr->hdr_restart.hdr_restartseq >= sendreq->req_restartseq) {
+        assert(sendreq->req_send.req_base.req_ompi.req_state == OMPI_REQUEST_ACTIVE);
+        sendreq->req_error++;
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RECVERRNOTIFY: received: sendreq has error, outstanding events=%d, "
+                            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                            sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
+                            sendreq->req_restartseq, (void *)sendreq,
+                            sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+
+        if (0 == sendreq->req_events) {
+            mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
+                                                       MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
+                                                       OMPI_SUCCESS, btl);
+        }
+    } else {
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RECVERRNOTIFY: received: error has already been noted, ignoring "
+                            "PML:exp=%d,act=%d RQS:exp=%d,act=%d src_req=%p, dst_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
+                            sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
+                            (void *)sendreq, sendreq->req_recv.pval,
+                            sendreq->req_send.req_base.req_peer);
+    }
+    return;
+}
+
+/**
+ * Callback for when a RNDVRESTARTNACK message is received.  This message
+ * is sent from the receiver to the sender and tells the sender that
+ * the receiver has already completed the message and there is nothing
+ * else to be done.  The sender should then just make the send request
+ * complete.
+ */ +void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_send_request_t* sendreq; + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK); + sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval; + + /* Not convinced a RNDVRESTARTNACK that does not match a request can + * happen, but have the check in here anyways for now */ + if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) || + (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence) || + (hdr->hdr_restart.hdr_restartseq != sendreq->req_restartseq)) { + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNACK: received: does not match request, dropping " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + return; + } + + opal_output_verbose(20, mca_pml_bfo_output, + "RNDVRESTARTNACK: received: marking send request as complete " + "PML=%d CTX=%d SRC=%d EXP=%d " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_peer, sendreq->req_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + mca_pml_bfo_send_request_rndvrestartnack(sendreq); + return; +} + + +/** + * This function gets called when failover is enabled and an error + * occurs during the rendezvous protocol. A message is sent to the + * receiving side notifying the request that the communication is + * going to be starting over. However, none of the information in the + * send request is reset yet, so that any in flight fragments can + * still find a home. Information in the send request gets reset when + * the completion event for this send occurs AND an ACK has been + * received back from the receiver. + */ +void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq, + bool repost, mca_btl_base_tag_t tag, + int status, mca_btl_base_module_t* btl) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_restart_hdr_t* restart; + int rc; + mca_bml_base_btl_t* bml_btl; + ompi_proc_t* proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc; + mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + + /* If this message is not a repost, then update the sequence number. */ + if (!repost) { + /* Bump up the rendezvous request sequence number. */ + sendreq->req_restartseq++; + } + + assert(0 == sendreq->req_events); + assert(0 != bml_endpoint->btl_eager.arr_size); + + /* In the case that this is started because the receiver has + * sent us a message, then attempt to use a different BTL than the + * error message was received on. This may potentially tickle the + * error sooner if this side has not seen it yet. 
+ */
+    bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
+    if (bml_btl->btl == btl) {
+        /* If there is more than one BTL left, then we will get a
+         * different one.  If there is only one, we will just get
+         * the same one back again.  That is OK. */
+        bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
+    }
+
+    /* allocate descriptor */
+    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
+                       sizeof(mca_pml_bfo_restart_hdr_t),
+                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
+                       MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* fill out header */
+    restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval;
+    restart->hdr_match.hdr_common.hdr_flags = 0;
+    restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY;
+    restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    restart->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    restart->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    restart->hdr_restartseq = sendreq->req_restartseq;
+    restart->hdr_src_req.pval = sendreq;
+    restart->hdr_dst_req = sendreq->req_recv;
+    restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
+    restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid;
+    restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid;
+
+    bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);
+
+    /* initialize descriptor */
+    des->des_cbfunc = mca_pml_bfo_rndvrestartnotify_completion;
+
+    opal_output_verbose(30, mca_pml_bfo_output,
+                        "RNDVRESTARTNOTIFY: sent: PML=%d, RQS(new)=%d, CTX=%d, SRC=%d, "
+                        "src_req=%p, dst_req=%p, peer=%d",
+                        (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
+                        restart->hdr_match.hdr_ctx, restart->hdr_match.hdr_src,
+                        (void *)sendreq, sendreq->req_recv.pval,
+                        sendreq->req_send.req_base.req_peer);
+
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY);
+    if( OPAL_UNLIKELY( rc < 0 ) ) {
+        opal_output(0, "[%s:%d] Cannot send rndvrestartnotify message", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+}
+
+/**
+ * This function is called when a RNDVRESTARTNACK message is received
+ * by the sender.
+ */
+void mca_pml_bfo_send_request_rndvrestartnack(mca_pml_bfo_send_request_t* sendreq)
+{
+    /* A RNDVRESTARTNACK was sent by the receiver.  This means that the
+     * receiver is rejecting the RNDVRESTARTNOTIFY message because the
+     * receiver's request is complete.  Therefore, mark the sender
+     * complete also.  This data exchange is over. */
+    send_request_pml_complete(sendreq);
+}
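+
+/*
+ * Illustrative sketch only: the restart control messages in this file
+ * (RNDVRESTARTNOTIFY, RNDVRESTARTACK, RNDVRESTARTNACK, RECVERRNOTIFY)
+ * all reuse mca_pml_bfo_restart_hdr_t.  This hypothetical helper shows
+ * the common sender-side fill pattern; the real code above also sets
+ * hdr_dst_rank, hdr_jobid, and hdr_vpid, which are needed to route a
+ * NACK when the receiver no longer has a matching request.
+ */
+static void pml_bfo_fill_restart_hdr_sketch(mca_pml_bfo_restart_hdr_t* restart,
+                                            uint8_t type,
+                                            mca_pml_bfo_send_request_t* sendreq)
+{
+    restart->hdr_match.hdr_common.hdr_flags = 0;
+    restart->hdr_match.hdr_common.hdr_type = type;
+    restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
+    restart->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
+    restart->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    restart->hdr_restartseq = sendreq->req_restartseq;
+    restart->hdr_src_req.pval = sendreq;
+    restart->hdr_dst_req = sendreq->req_recv;
+}
+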
+/**
+ * This function restarts a RNDV send request.  When this is called,
+ * all the fields in the send request are reset and the send is
+ * started over.  The sendreq->req_restartseq will be non-zero, which
+ * triggers a special flag in the RNDV header indicating that the match
+ * has already happened on the receiving side.
+ */
+void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq,
+                                      bool repost, mca_btl_base_tag_t tag)
+{
+    size_t offset = 0;
+    opal_list_item_t *first_item;
+    opal_list_item_t *last_item;
+    mca_bml_base_endpoint_t* endpoint;
+    size_t i;
+
+    /* If the tag is something valid, it was a repost.  We could also
+     * check the repost field.  Maybe the repost flag can be dropped
+     * and the tag can double for it. */
+    switch (tag) {
+    case MCA_PML_BFO_HDR_TYPE_RNDV:
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RNDV: completion failed, reset and repost: PML=%d, RQS=%d, "
+                            "CTX=%d, SRC=%d, src_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
+                            sendreq->req_send.req_base.req_comm->c_contextid,
+                            sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq,
+                            sendreq->req_send.req_base.req_peer);
+        break;
+    case MCA_PML_BFO_HDR_TYPE_RGET:
+        opal_output_verbose(30, mca_pml_bfo_output,
+                            "RGET: completion failed, reset and repost: PML=%d, RQS=%d, "
+                            "CTX=%d, SRC=%d, src_req=%p, peer=%d",
+                            (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
+                            sendreq->req_send.req_base.req_comm->c_contextid,
+                            sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq,
+                            sendreq->req_send.req_base.req_peer);
+        break;
+    default:
+        break;
+    }
+
+    /* Return mpool resources; they get reacquired when the request starts over. */
+    mca_pml_bfo_free_rdma_resources(sendreq);
+
+    /* Release any memory in use if this is a buffered send */
+    if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
+        sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
+        mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
+    }
+
+    /* Clear out any unsent send ranges.  This recreates what the
+     * get_send_range and get_next_send_range functions do.  The list
+     * is drained from the back with opal_list_get_last();
+     * opal_list_get_begin() returns the sentinel, so the loop ends
+     * once the list is empty. */
+    OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
+    first_item = opal_list_get_begin(&sendreq->req_send_ranges);
+    last_item = opal_list_get_last(&sendreq->req_send_ranges);
+    while (first_item != last_item) {
+        opal_list_remove_item(&sendreq->req_send_ranges, last_item);
+        OMPI_FREE_LIST_RETURN(&mca_pml_bfo.send_ranges, (ompi_free_list_item_t *)last_item);
+        last_item = opal_list_get_last(&sendreq->req_send_ranges);
+    }
+    OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
+
+    /* Reset the converter to the beginning. */
+    opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
+                                &offset);
+
+    /* Bump up the internal sequence number to handle possible duplicate
+     * RNDV messages.  In the case of reposting a RNDV message, do not
+     * increment the value.  That way, a duplicate message can be
+     * detected. */
+    if (!repost) {
+        sendreq->req_restartseq++;
+    }
+
+    /* This code is essentially the same as mca_pml_bfo_send_request_start(),
+     * but with a few modifications since we are restarting the request,
+     * not starting entirely from scratch. */
+    endpoint = (mca_bml_base_endpoint_t*)sendreq->req_send.req_base.req_proc->proc_bml;
+    sendreq->req_endpoint = endpoint;
+    sendreq->req_state = 0;
+    sendreq->req_lock = 0;
+    sendreq->req_pipeline_depth = 0;
+    sendreq->req_bytes_delivered = 0;
+    sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
+
+    /* Note that we do not reset the following two items.
+     * They stay with their original values.
+     * sendreq->req_send.req_base.req_sequence
+     * sendreq->req_restartseq
+     */
+    sendreq->req_restart = 0;      /* reset in case we restart again */
+    sendreq->req_error = 0;        /* clear error state */
+    sendreq->req_events = 0;       /* clear events, probably 0 anyways */
+    sendreq->req_acked = false;
+
+    MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
+
+    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
+        mca_bml_base_btl_t* bml_btl;
+        int rc;
+
+        /* select a btl */
+        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+        rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
+        if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
+            return;
+    }
+    add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
+}
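+
+/*
+ * Illustrative sketch only: how the restart paths above and below keep
+ * the restart sequence number.  A genuine restart bumps req_restartseq
+ * so stale fragments can be recognized; a repost of the same restart
+ * leaves it unchanged so the receiver can detect the duplicate.  The
+ * helper name is hypothetical and is not used elsewhere.
+ */
+static inline void pml_bfo_note_restart_sketch(mca_pml_bfo_send_request_t* sendreq,
+                                               bool repost)
+{
+    if (!repost) {
+        sendreq->req_restartseq++;    /* new restart attempt */
+    }                                 /* repost: keep the number */
+}
+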
+/**
+ * This function will repost a match fragment.  It has to
+ * handle the case where there may not be a request associated with
+ * the fragment, in which case it just uses the information in the
+ * fragment to repost the send.
+ */
+void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
+{
+    mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+    struct mca_bml_base_endpoint_t* endpoint;
+    int rc;
+    size_t offset = 0;
+
+    /* At this point a determination has to be made whether the
+     * BFO_HDR_TYPE_MATCH fragment was sent via the sendi interface or
+     * via the regular send interface.  This is important because if it
+     * was sent via the sendi interface, then the request associated
+     * with it has already been completed and released.  This can be
+     * determined by looking at the des->des_flags field of the
+     * descriptor.  If the ALWAYS_CALLBACK flag is set, then it is known
+     * that there is a valid send request associated with the fragment
+     * and it can be used to extract information.  If ALWAYS_CALLBACK
+     * is not set, then the endpoint information is in the callback
+     * data field, and where to resend the fragment can be determined
+     * from the fragment.
*/ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + endpoint = sendreq->req_endpoint; + opal_output_verbose(30, mca_pml_bfo_output, + "MATCH: repost: src_req=%p", + (void *)sendreq); + } else { + endpoint = des->des_cbdata; + opal_output_verbose(30, mca_pml_bfo_output, + "MATCH: repost: des=%p (sendi fragment)", + (void *)des); + } + + assert(0 != endpoint->btl_eager.arr_size); + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + /* Reset the converter to the beginning */ + opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, + &offset); + rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); + if (OMPI_SUCCESS == rc) { + return; + } else if (OMPI_ERR_OUT_OF_RESOURCE == rc) { + opal_output_verbose(30, mca_pml_bfo_output, + "Warning: delaying reposting of BFO_HDR_TYPE_MATCH, btls=%d", + (int)sendreq->req_endpoint->btl_eager.arr_size); + add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); + return; + } else { + opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + } else { + /* No send request available so alloc and repost explicitly */ + mca_btl_base_descriptor_t* newdes = NULL; + mca_btl_base_segment_t* oldseg; + mca_btl_base_segment_t* newseg; + + oldseg = des->des_src; + /* The alloc routine must be called with the MCA_BTL_NO_ORDER + * flag so that the allocation routine works. The allocation + * will fill in the order flag in the descriptor. */ + mca_bml_base_alloc( bml_btl, &newdes, + MCA_BTL_NO_ORDER, + oldseg->seg_len, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if (OPAL_UNLIKELY(NULL == newdes)) { + opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + newseg = newdes->des_src; + /* Copy over all the data that is actually sent over the wire */ + memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len); + newseg->seg_len = oldseg->seg_len; + + /* This call will either return OMPI_SUCCESS or OMPI_ERROR. The + * OMPI_SUCCESS only says that the send request can be freed. + * It may be that the message was queued up in the BTL. */ + rc = mca_bml_base_send(bml_btl, newdes, MCA_PML_BFO_HDR_TYPE_MATCH); + + /* Some BTLs will set the CALLBACK flag but we do not want that + * as there is no longer a request associated with this descriptor. + * Therefore, always make sure it is cleared. */ + newdes->des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + + if( OPAL_LIKELY( rc >= 0 )) { + /* Just let the normal flow of data free whatever needs + * to be freed */ + return; + } else { + opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + } + /* No need to free any descriptors. The BTLs take care of it since + * we originally allocated with MCA_BTL_DES_FLAGS_BTL_OWNERSHIP. */ +} + +/** + * Completion callback for rndvrestartnotify completion event. If the + * RNDVRESTARTACK has already been received, then reset and restart. + * Otherwise, just update the state and let the RNDVRESTARTACK trigger + * the reset and restart. 
+ */ +void +mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status) +{ + mca_pml_bfo_restart_hdr_t* restart; + mca_pml_bfo_send_request_t* sendreq; + + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval; + + /* Need to resend this message in the case that it fails */ + if( OPAL_UNLIKELY((OMPI_SUCCESS != status))) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: completion failed: repost " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + /* Repost the message and indicate it is a repost, not a new one. No need + * to check the req_events as this is the only possible outstanding send + * event when we have posted this message. We also know the sendreq is still + * available because nothing can proceed until this completion event happens + * successfully as we track the req_restart value. */ + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, true, + MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, + status, btl); + return; + } + + /* The req_restart value is incremented to indicate completion of + * the RNDVRESTARTNOTIFY message. Then (typically) the arrival of the + * ACK message will cause the request to reset and restart. Need to + * make sure that RNDVRESTARTNOTIFY callback has been called as well as + * the ACK back from the receiver prior to resetting and restarting + * the request. This is needed in case we get an error on the + * RNDVRESTARTNOTIFY message, but it actually makes it over. We want + * to make sure the send request has not restarted yet. So, keep a + * counter that counts to 2. */ + sendreq->req_restart++; + if (2 == sendreq->req_restart) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: completion: restarting request " + "PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_recv.pval, (void *)sendreq, + sendreq->req_send.req_base.req_peer); + mca_pml_bfo_send_request_restart(sendreq, false, 0); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNOTIFY: completion: waiting for ack " + "PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_recv.pval, (void *)sendreq, + sendreq->req_send.req_base.req_peer); + } +} + +/** + * This function is called when an error is detected on a completion + * event on the receiving side. This can come from a ACK, PUT, RDMA + * read (GET) or RECVERRNOTIFY completion event. When this happens, check + * the state of the request and decide if the sender needs be notified + * that a problem was seen. If no RECVERRNOTIFY message has been sent and + * no RNDVRESTARTNOTIFY has been received from the sender, then send a + * message telling the sender an error was seen. 
+ */ +void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_restart_hdr_t* restart; + ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; + mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + mca_bml_base_btl_t* bml_btl; + int rc; + + assert(0 != bml_endpoint->btl_eager.arr_size); + + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_restart_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + if( OPAL_UNLIKELY(NULL == des) ) { + opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + + /* fill out header */ + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + restart->hdr_match.hdr_common.hdr_flags = 0; + restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY; + restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; + restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank; + restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq; + restart->hdr_restartseq = recvreq->req_restartseq; + restart->hdr_src_req = recvreq->remote_req_send; + restart->hdr_dst_req.pval = recvreq; + + bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY, proc); + + /* initialize descriptor */ + des->des_cbfunc = mca_pml_bfo_recv_restart_completion; + + opal_output_verbose(30, mca_pml_bfo_output, + "RECVERRNOTIFY: sending to sender, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d, btl=%p", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, + (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + (void *)bml_btl->btl); + + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY); + if( OPAL_UNLIKELY( rc < 0 ) ) { + opal_output(0, "[%s:%d] Cannot send recverrnotify message", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + /* Prevent future error messages on this request */ + recvreq->req_errstate |= RECVREQ_RECVERRSENT; +} + +/** + * This function is called when it may be time to send a RNDVRESTARTACK + * message back to the sending side. This can happen because we + * received a RNDVRESTARTNOTIFY message from the sender. This can + * also happen if we have noticed that the request has received the + * RNDVRESTARTNOTIFY message, but has not yet sent out the RNDVRESTARTACK + * because there were still some pending receive events on the request. + * That means we can enter this routine from a completion event on a ACK, + * PUT, or RDMA read as well as from the receipt of a RNDVRESTARTNOTIFY + * message. If all is good, we sent the RNDVRESTARTACK message back to + * the sender. Then sometime later a message will arrive telling us + * to reset and restart the receive request. 
+ */ +void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status, + mca_btl_base_module_t* btl) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_restart_hdr_t* restart; + ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; + mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + mca_bml_base_btl_t* bml_btl; + int rc; + + assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) == RECVREQ_RNDVRESTART_RECVED); + assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_ACKED) == 0); + assert(0 != bml_endpoint->btl_eager.arr_size); + + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + + /* Attempt to use a different BTL than the error message was + * received on. This may potentially tickle the error sooner if + * this side has not seen it yet. */ + if (bml_btl->btl == btl) { + /* If there is more than one BTL left, then we will get a + * different one. If there is only one, we will just get + * the same one back again. That is OK. */ + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); + } + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_restart_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | + MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + if( OPAL_UNLIKELY(NULL == des) ) { + opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + + /* fill out header */ + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + restart->hdr_match.hdr_common.hdr_flags = 0; + restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK; + restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; + restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank; + restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq; + restart->hdr_restartseq = recvreq->req_restartseq; + restart->hdr_src_req = recvreq->remote_req_send; + restart->hdr_dst_req.pval = recvreq; + + bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, proc); + + /* initialize descriptor */ + des->des_cbfunc = mca_pml_bfo_recv_restart_completion; + des->des_cbdata = (void *)proc; + + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTACK: due to PML tag=%d completion, sending to " + "sender, PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, " + "peer=%d, btl=%p", + tag, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + (void *)bml_btl->btl); + + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK); + if( OPAL_UNLIKELY( rc < 0 ) ) { + opal_output(0, "[%s:%d] Cannot send rndvrestartack message", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + /* Move to the next state so we do not send anymore ACKs */ + recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_RECVED; + recvreq->req_errstate |= RECVREQ_RNDVRESTART_ACKED; +} + +/** + * Called after the receipt of a RNDVRESTARTNOTIFY message to a request + * that no longer matches. This can happen if the sender detected an + * error, but the receiver actually received all the data. Therefore + * send a NACK back instead of the ACK so that the sender can complete + * its request. This happens very rarely. Note that we need to make + * use of the hdr_dst_rank that we received from the notify message. 
+ * This is so the sending side can make sure the message matches a valid
+ * request on the sending side.
+ */
+void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
+                                              ompi_proc_t* ompi_proc, bool repost)
+{
+    mca_btl_base_segment_t* segments;
+    mca_pml_bfo_restart_hdr_t* hdr;   /* hdr of NOTIFY message */
+    mca_pml_bfo_restart_hdr_t* nack;  /* hdr of NACK message */
+    mca_btl_base_descriptor_t* des;
+    mca_bml_base_endpoint_t* bml_endpoint;
+    mca_bml_base_btl_t* bml_btl;
+    int rc;
+
+    if (repost) {
+        /* In the case where we are reposting the NACK, the information
+         * is in the src area, since we are reposting a send.  In addition,
+         * we get the ompi_proc from the old descriptor. */
+        segments = olddes->des_src;
+        ompi_proc = olddes->des_cbdata;
+    } else {
+        segments = olddes->des_dst;
+    }
+    hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval;
+
+    bml_endpoint = ompi_proc->proc_bml;
+    assert(0 != bml_endpoint->btl_eager.arr_size);
+    bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
+
+    /* allocate descriptor */
+    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
+                       sizeof(mca_pml_bfo_restart_hdr_t),
+                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
+                       MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
+    if( OPAL_UNLIKELY(NULL == des) ) {
+        opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* fill out header */
+    nack = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval;
+    nack->hdr_match.hdr_common.hdr_flags = 0;
+    nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK;
+    nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx;
+    nack->hdr_match.hdr_src = hdr->hdr_dst_rank;   /* Receiver rank */
+    nack->hdr_match.hdr_seq = hdr->hdr_match.hdr_seq;
+    nack->hdr_restartseq = hdr->hdr_restartseq;
+    nack->hdr_src_req = hdr->hdr_src_req;
+    nack->hdr_dst_req.pval = 0;
+
+    bfo_hdr_hton(nack, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK, ompi_proc);
+
+    /* Initialize descriptor.  Save away ompi_proc in case we need
+     * to repost this fragment. */
+    des->des_cbfunc = mca_pml_bfo_recv_restart_completion;
+    des->des_cbdata = ompi_proc;
+
+    opal_output_verbose(30, mca_pml_bfo_output,
+                        "RNDVRESTARTNACK: sending to sender, "
+                        "PML=%d, RQS=%d, CTX=%d, SRC=%d, peer=%d",
+                        nack->hdr_match.hdr_seq, nack->hdr_restartseq,
+                        nack->hdr_match.hdr_ctx, nack->hdr_match.hdr_src,
+                        ompi_proc->proc_name.vpid);
+
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK);
+    if( OPAL_UNLIKELY( rc < 0 ) ) {
+        opal_output(0, "[%s:%d] Cannot send rndvrestartnack message", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+
+
+/**
+ * Reset all the receive request fields to match what a request
+ * looks like when it is first started.  This gets called when
+ * the rendezvous/rget message is being restarted.
+ */
+void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* match) {
+    int i;
+
+    assert(true != match->req_recv.req_base.req_pml_complete);
+
+    /* Free up any resources that were reserved for this receive.  This
+     * was copied from the receive completion code. */
+    for(i = 0; i < (int)match->req_rdma_cnt; i++) {
+        mca_mpool_base_registration_t* btl_reg = match->req_rdma[i].btl_reg;
+        if( NULL != btl_reg && btl_reg->mpool != NULL) {
+            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
+        }
+    }
+    match->req_rdma_cnt = 0;
+
+    /* This code is mostly copied from mca_pml_bfo_recv_req_start.
+     * Note 1: Leave req_bytes_expected as the original value.
No + * need to adjust this as it is set when convertor is created. + * Note 2: Leave req_bytes_delivered as the original value. + * This is created when the convertor is created and represents + * the expected bytes from the user. */ + assert(0 == match->req_events); + match->req_errstate = 0; + match->req_lock = 0; + match->req_pipeline_depth = 0; + match->req_bytes_received = 0; + match->req_rdma_idx = 0; + match->req_rdma_offset = 0; + match->req_send_offset = 0; + match->req_pending = false; + match->req_ack_sent = false; + match->req_restartseq++; + + /* These really should not need to be set, but this matches some + * of the initialization within MCA_PML_BASE_RECV_START. */ + match->req_recv.req_base.req_pml_complete = false; + match->req_recv.req_base.req_ompi.req_complete = false; + match->req_recv.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE; + + /* Reset the convertor */ + opal_convertor_set_position(&match->req_recv.req_base.req_convertor, + &match->req_rdma_offset); + return; +} + +/* + * Completion callback for RNDVRESTARTACK, RNDVRESTARTNACK and RECVERRNOTIFY. + */ +void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + mca_pml_bfo_common_hdr_t* common = des->des_src->seg_addr.pval; + mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */ + mca_pml_bfo_recv_request_t* recvreq; + int peer; + + switch (common->hdr_type) { + case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK: + restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; + recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval; + peer = recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTACK: completion failed: try again " + "PML:req=%d,hdr=%d RQS:req=%d,hdr=%d CTX:req=%d,hdr=%d " + "src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, restart->hdr_match.hdr_seq, + recvreq->req_restartseq, restart->hdr_restartseq, + recvreq->req_recv.req_base.req_comm->c_contextid, + restart->hdr_match.hdr_ctx, + recvreq->remote_req_send.pval, + (void *)recvreq, peer); + + /* Adjust the states back to avoid assert errors */ + recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_ACKED; + recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED; + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, + status, btl); + break; + case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK: + opal_output_verbose(30, mca_pml_bfo_output, + "RNDVRESTARTNACK: completion failed: try again " + "des=%p ", (void *)des); + /* Just blast it again. No request associated with it. */ + mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true); + break; + case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY: + /* With just two BTLs, this should never happen as we are + * typically sending the RECVERRNOTIFY message on the + * working BTL. But, just in case, if we get an error, + * send it again. 
+             */
+            /* The descriptor still points back at the receive request
+             * that sent the RECVERRNOTIFY, so recover it the same way
+             * the RNDVRESTARTACK case does. */
+            restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval;
+            recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
+            opal_output_verbose(30, mca_pml_bfo_output,
+                                "RECVERRNOTIFY: completion failed: try again, "
+                                "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
+                                recvreq->req_msgseq, recvreq->req_restartseq,
+                                recvreq->remote_req_send.pval,
+                                (void *)recvreq,
+                                recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
+            mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
+                                                   status);
+            break;
+        default:
+            opal_output(0, "[%s:%d] Unknown callback error", __FILE__, __LINE__);
+            orte_errmgr.abort(-1, NULL);
+        }
+    }
+}
+
+/*
+ * Remove a btl from future communication on an endpoint.
+ */
+void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
+                             ompi_proc_t *errproc, char *btlname)
+{
+    mca_bml_base_endpoint_t* ep;
+    bool remove = false;
+    int i;
+
+    ep = (mca_bml_base_endpoint_t*)errproc->proc_bml;
+
+    /* The bml_del_proc_btl function does not indicate if it
+     * actually removed a btl, so let me check up front.  This is
+     * done so that we only print out messages when a btl is
+     * actually going to be removed.  These arrays are small, so it
+     * is OK to walk through all of them even though it may be
+     * redundant. */
+    for( i = 0; i < (int)ep->btl_eager.arr_size; i++ ) {
+        if( ep->btl_eager.bml_btls[i].btl == btl ) {
+            remove = true;
+        }
+    }
+    for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
+        if( ep->btl_send.bml_btls[i].btl == btl ) {
+            remove = true;
+        }
+    }
+    for( i = 0; i < (int)ep->btl_rdma.arr_size; i++ ) {
+        if( ep->btl_rdma.bml_btls[i].btl == btl ) {
+            remove = true;
+        }
+    }
+
+    if (true == remove) {
+        mca_bml.bml_del_proc_btl(errproc, btl);
+
+        orte_notifier.log(ORTE_NOTIFIER_ERROR, ORTE_ERR_COMM_FAILURE,
+                          "BTL %s error: rank=%d mapping out %s "
+                          "to rank=%d on node=%s",
+                          btl->btl_component->btl_version.mca_component_name,
+                          ORTE_PROC_MY_NAME->vpid,
+                          btlname, errproc->proc_name.vpid,
+                          errproc->proc_hostname);
+
+        opal_output_verbose(10, mca_pml_bfo_output,
+                            "BTL %s error: rank=%d mapping out %s "
+                            "to rank=%d on node=%s \n",
+                            btl->btl_component->btl_version.mca_component_name,
+                            ORTE_PROC_MY_NAME->vpid,
+                            btlname, errproc->proc_name.vpid,
+                            errproc->proc_hostname);
+
+        /* Need to search for any pending packets associated
+         * with this endpoint and remove them.  We may also
+         * have to restart requests depending on their state. */
+        mca_pml_bfo_error_pending_packets(btl, ep);
+
+        if ((ep->btl_eager.arr_size == 0) &&
+            (ep->btl_send.arr_size == 0) &&
+            (ep->btl_rdma.arr_size == 0)) {
+            opal_output(0, "%s:%d: No more interfaces, aborting",
+                        __FILE__, __LINE__);
+            orte_errmgr.abort(-1, NULL);
+        }
+    }
+}
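+
+/*
+ * A compact sketch of the membership test used in
+ * mca_pml_bfo_map_out_btl() above; purely illustrative and unused.
+ * A BTL is still in use by an endpoint if it appears in any of the
+ * eager, send, or rdma BML arrays.
+ */
+static bool pml_bfo_endpoint_uses_btl_sketch(mca_bml_base_endpoint_t* ep,
+                                             mca_btl_base_module_t* btl)
+{
+    size_t i;
+    for( i = 0; i < ep->btl_eager.arr_size; i++ ) {
+        if( ep->btl_eager.bml_btls[i].btl == btl ) return true;
+    }
+    for( i = 0; i < ep->btl_send.arr_size; i++ ) {
+        if( ep->btl_send.bml_btls[i].btl == btl ) return true;
+    }
+    for( i = 0; i < ep->btl_rdma.arr_size; i++ ) {
+        if( ep->btl_rdma.bml_btls[i].btl == btl ) return true;
+    }
+    return false;
+}
+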
+     */
+    if (NULL == errproc || mca_pml_bfo.fast_failover) {
+        for( p = 0; p < num_procs; p++ ) {
+            mca_pml_bfo_map_out_btl(btl, procs[p], btlname);
+        }
+    } else {
+        mca_pml_bfo_map_out_btl(btl, errproc, btlname);
+    }
+    free(procs);
+}
+
+/**
+ * This function is called when we are mapping out a BTL.  It walks
+ * through the four PML pending lists and dispatches the
+ * fragments/requests on them.  Each of the four lists is handled
+ * slightly differently.  In all cases, we first see if the message is
+ * associated with the endpoint that is being mapped out.  If not, we
+ * leave it alone and put it back on the list.  If it is associated
+ * with the endpoint, then each list handles it slightly differently.
+ * Also, in some cases, we actually adjust the BML pointers in the
+ * messages, as they may have changed when the BTL was mapped out.
+ * That is because this is called after we have mapped out the
+ * offending BTL and adjusted the array of available BMLs.
+ */
+static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl,
+                                              mca_bml_base_endpoint_t* ep) {
+    int32_t i, s;
+
+    /* The pckt_pending list contains both ACK and FIN messages.
+     * ACKs can be sent over any BTL associated with the endpoint.
+     * Therefore, the bml_btl entry for ACKs is NULL and they do
+     * not need to be adjusted.  It is also worth noting that
+     * the ACK will be the only outstanding message associated
+     * with a request, so we can just let nature take its course.
+     *
+     * FIN messages do have a BML associated with them, but they
+     * can also be sent over any BTL.  Therefore, adjust the bml
+     * pointer in the pckt to ensure it points at a valid BML.
+     */
+
+    s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending);
+    for(i = 0; i < s; i++) {
+        mca_pml_bfo_pckt_pending_t *pckt;
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "INFO: pckt_pending list has %d entries", s);
+#if 1
+        /* TODO: Error out until code is tested */
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "%s:%d: Support not implemented, aborting",
+                            __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+#endif
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        pckt = (mca_pml_bfo_pckt_pending_t*)
+            opal_list_remove_first(&mca_pml_bfo.pckt_pending);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+        /* My guess is that this can happen in the threaded
+         * case where the other thread removed some packets
+         * after we determined the size of the list. */
+        if(NULL == pckt)
+            break;
+
+        /* If there is no bml stored on the packet, then just
+         * put it back on the list as there is nothing to adjust.
+         * This appears to be true with ACK packets. */
+        if (NULL == pckt->bml_btl) {
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.pckt_pending,
+                             (opal_list_item_t*)pckt);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        /* Now see if this endpoint matches the one we are mapping
+         * out.  If so, adjust the bml entry to ensure it is not
+         * pointing at a stale bml.  We do not really care which
+         * BML it is pointing at as long as it is valid.  In either
+         * case, put the entry back on the list.
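+         * The eager array is used below since these control packets
+         * can travel over any remaining BTL;
+         * mca_bml_base_btl_array_get_next() just hands back the next
+         * valid entry.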
+         */
+        if (pckt->proc->proc_bml == ep) {
+            opal_output_verbose(15, mca_pml_bfo_output,
+                                "INFO: Found matching pckt on pckt_pending list, adjusting bml");
+            pckt->bml_btl = mca_bml_base_btl_array_get_next(&ep->btl_eager);
+        }
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        opal_list_append(&mca_pml_bfo.pckt_pending,
+                         (opal_list_item_t*)pckt);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+    }
+
+    /* This next list holds rdma fragments.  We need to walk through
+     * the list and see if any are associated with the endpoint
+     * we are mapping out.  If not, then just put them back on the
+     * list.  If they are, then we need to error them out.  One issue
+     * is that we need to deal with the case where there may be more
+     * than one pending rdma fragment for a request. */
+    s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending);
+    for(i = 0; i < s; i++) {
+        mca_pml_bfo_rdma_frag_t* frag;
+        mca_pml_bfo_send_request_t* sendreq;
+        mca_pml_bfo_recv_request_t* recvreq;
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "INFO: rdma_pending list has %d entries", s);
+#if 1
+        /* TODO: Error out until code is tested */
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "%s:%d: Support not implemented, aborting",
+                            __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+#endif
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        frag = (mca_pml_bfo_rdma_frag_t*)
+            opal_list_remove_first(&mca_pml_bfo.rdma_pending);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+        /* My guess is that this can happen in the threaded
+         * case where the other thread removed some packets
+         * after we determined the size of the list. */
+        if(NULL == frag)
+            break;
+
+        /* Check to see if it matches our endpoint.  If it does,
+         * then check if it matches the BTL that is being mapped
+         * out.  If it does not, then just readjust the BML pointer.
+         * If it does, then we need to do something with it. */
+        if (frag->rdma_ep != ep) {
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.rdma_pending,
+                             (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        /* If we are here, then we know we are working on the same
+         * endpoint.  Now check the BTL. */
+        if (frag->rdma_btl != btl) {
+            opal_output_verbose(15, mca_pml_bfo_output,
+                                "INFO: Found matching frag on rdma_pending list, adjusting bml");
+            /* The BTL this RDMA is associated with is not the
+             * one that is getting mapped out, so just adjust the
+             * BML pointer and put it back on the list. */
+            frag->rdma_bml = mca_bml_base_btl_array_find(&ep->btl_rdma, frag->rdma_btl);
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.rdma_pending,
+                             (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        /* Now we call the restart routine.  This is just like if we got
+         * a completion event after calling an RDMA write.  This will
+         * take care of figuring out if we need to restart the request
+         * or wait for any outstanding events to complete.
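+         * PUT fragments are restarted from the send side via
+         * RNDVRESTARTNOTIFY, while RGET fragments are reported back
+         * by the receiver via RECVERRNOTIFY, mirroring the respective
+         * completion handlers.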
*/ + if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) { + opal_output_verbose(15, mca_pml_bfo_output, + "INFO: Found matching PUT frag on rdma_pending list, restarting"); + sendreq = frag->rdma_req; + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_PUT, 2, btl); + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + } else { + opal_output_verbose(15, mca_pml_bfo_output, + "INFO: Found matching RGET frag on rdma_pending list, sending reqerror"); + /* This is just like what we do on an rget completion event */ + recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, 2); + + /* See if the request has received a RNDVRESTARTNOTIFY */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, + MCA_PML_BFO_HDR_TYPE_RGET, + 2, btl); + } + } + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + } + } + + s = opal_list_get_size(&mca_pml_bfo.send_pending); + /* Look for pending events on our endpoint */ + for(i = 0; i < s; i++) { + mca_pml_bfo_send_request_t* sendreq; + ompi_proc_t* proc; + mca_bml_base_endpoint_t* bml_endpoint; + opal_output_verbose(0, mca_pml_bfo_output, + "INFO: send_pending list has %d entries", s); +#if 1 + /* TODO: Error out until code is tested */ + opal_output_verbose(0, mca_pml_bfo_output, + "%s:%d: Support not implemented, aborting", + __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); +#endif + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + sendreq = (mca_pml_bfo_send_request_t*) + opal_list_remove_first(&mca_pml_bfo.send_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + + /* My guess is that this can happen in the threaded + * case where the other thread removed some packets + * after we determined the size of the list. */ + if(NULL == sendreq) + break; + + proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc; + bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; + + /* Check to see if it matches our endpoint. If it does not, + * then just put it back on the list as there is nothing + * we need to do with it. */ + if (bml_endpoint != ep) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.send_pending, + (opal_list_item_t*)sendreq); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + continue; + } + + switch(sendreq->req_pending) { + case MCA_PML_BFO_SEND_PENDING_SCHEDULE: + /* If this send request is using the endpoint that received + * the error, then let us error it out. In the case + * where there is only one fragment left to be scheduled + * and it would have gone over the good BTL, this is + * not necessary. But, we will use simplicity here + * and assume that some of the fragments are still + * scheduled to go over the broken BTL. */ + sendreq->req_error++; + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_FRAG, 2, btl); + break; + case MCA_PML_BFO_SEND_PENDING_START: + /* If the request has not even started, then just put it back + * on the list. Nothing else to do with it. 
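+             * mca_pml_bfo_progress() will eventually pick it up again
+             * and try to start it over whatever BTLs remain.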
+             */
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.send_pending,
+                             (opal_list_item_t*)sendreq);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            break;
+        default:
+            opal_output(0, "[%s:%d] wrong send request type\n",
+                        __FILE__, __LINE__);
+            break;
+        }
+    }
+
+    s = (int)opal_list_get_size(&mca_pml_bfo.recv_pending);
+    for(i = 0; i < s; i++) {
+        mca_pml_bfo_recv_request_t* recvreq;
+        ompi_proc_t* proc;
+        mca_bml_base_endpoint_t* bml_endpoint;
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "INFO: recv_pending list has %d entries", s);
+#if 1
+        /* TODO: Error out until code is tested */
+        opal_output_verbose(0, mca_pml_bfo_output,
+                            "%s:%d: Support not implemented, aborting",
+                            __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+#endif
+        OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+        recvreq = (mca_pml_bfo_recv_request_t*)
+            opal_list_remove_first(&mca_pml_bfo.recv_pending);
+        OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+
+        /* My guess is that this can happen in the threaded
+         * case where the other thread removed some packets
+         * after we determined the size of the list. */
+        if(NULL == recvreq)
+            break;
+
+        proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
+        bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
+
+        if (bml_endpoint != ep) {
+            OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
+            opal_list_append(&mca_pml_bfo.recv_pending,
+                             (opal_list_item_t*)recvreq);
+            OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
+            continue;
+        }
+
+        mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, 2);
+    }
+}
+
+/**
+ * Called each time we get a completion event on an ACK or PUT
+ * message; these are receive-side control messages.  This function
+ * is only called if the underlying BTL supports failover; otherwise
+ * there is no need for these checks.
+ */
+void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
+                                                  struct mca_btl_base_descriptor_t* des,
+                                                  int status)
+{
+    mca_pml_bfo_common_hdr_t * common = des->des_src->seg_addr.pval;
+    mca_pml_bfo_ack_hdr_t* ack;  /* ACK header */
+    mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */
+    struct mca_btl_base_descriptor_t* rdma_des;
+    mca_pml_bfo_recv_request_t* recvreq;
+
+    if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
+        switch (common->hdr_type) {
+        case MCA_PML_BFO_HDR_TYPE_ACK:
+            ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval;
+            recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval;
+
+            /* Record the error.  Send RECVERRNOTIFY if necessary.
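+             * If an error state was already recorded on the request,
+             * the error has already been handled, so that path only
+             * logs.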
*/ + if (recvreq->req_errstate) { + opal_output_verbose(30, mca_pml_bfo_output, + "ACK: completion failed, error already seen, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "ACK: completion failed, sending RECVERRNOTIFY to sender, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_ACK, status); + } + break; + + case MCA_PML_BFO_HDR_TYPE_PUT: + hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_src->seg_addr.pval; + rdma_des = hdr->hdr_des.pval; + recvreq = des->des_cbdata; + if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) { + /* We now record the error, send the RECVERRNOTIFY if + * necessary, and free the descriptor. Prior to this, + * we want to ensure that we have not reached the case + * where the PUT message actually made it over and we + * have already received a FIN back. We first check to + * see if the RDMA descriptor cbdata is pointing to + * NULL. If it is, this means that the PUT message must + * have made it over and a corresponding FIN already + * made it back and freed the RDMA descriptor. Second, + * if it is non-null, we make sure that it is pointing + * to the same request as the PUT descriptor is. If + * it is not, again we assume that the FIN came back + * and freed it. And we can count on the fact that the + * recvreq has not been freed or reused as it is held + * until this very completion event occurs. */ + if (recvreq->req_errstate) { + opal_output_verbose(30, mca_pml_bfo_output, + "PUT: completion failed, error already seen, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "PUT: completion failed, sending RECVERRNOTIFY to sender, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, status); + } +#if 0 + /* TODO: Add descriptor to receive request so it can + * be freed only when receive request is freed and + * only if needed. 
*/ + btl->btl_free(btl, rdma_des); +#endif + } + break; + default: + orte_errmgr.abort(-1, NULL); + } + } + + switch (common->hdr_type) { + case MCA_PML_BFO_HDR_TYPE_ACK: + ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval; + recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval; + recvreq->req_events--; + assert(recvreq->req_events >= 0); + if(OPAL_UNLIKELY (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) { + opal_output_verbose(30, mca_pml_bfo_output, + "ACK: completion: recvreq in error, outstanding events=%d " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_ACK, + status, btl); + } + return; + } + recv_request_pml_complete_check(recvreq); + break; + case MCA_PML_BFO_HDR_TYPE_PUT: + recvreq = des->des_cbdata; + recvreq->req_events--; + assert(recvreq->req_events >= 0); + if(OPAL_UNLIKELY(recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) { + opal_output_verbose(30, mca_pml_bfo_output, + "PUT: completion: recvreq in error, outstanding events=%d " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->remote_req_send.pval, (void *)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, + status, btl); + } + return; + } + recv_request_pml_complete_check(recvreq); + break; + } +} diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.h b/ompi/mca/pml/bfo/pml_bfo_failover.h new file mode 100644 index 0000000000..2aea218ef8 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_failover.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * Functions that implement failover capabilities. 
+ */ + +#ifndef MCA_PML_BFO_FAILOVER_H +#define MCA_PML_BFO_FAILOVER_H + +#include "ompi/mca/btl/btl.h" +#include "pml_bfo_hdr.h" + +BEGIN_C_DECLS + +bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc, + mca_pml_bfo_match_hdr_t *hdr); +bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma, + mca_btl_base_module_t* btl); + +mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr); + +void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq, + bool repost, mca_btl_base_tag_t tag); +void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq, + bool repost, mca_btl_base_tag_t tag, int status, + mca_btl_base_module_t* btl); +void mca_pml_bfo_send_request_rndvrestartnack(mca_pml_bfo_send_request_t* sendreq); + +void +mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status); +void +mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl, + struct mca_btl_base_descriptor_t* des, + int status); + +/* Reset a receive request to the beginning */ +void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* recvreq); +/* Notify sender that receiver detected an error */ +void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status); +/* Ack the RNDVRESTARTNOTIFY message */ +void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_tag_t tag, int status, + mca_btl_base_module_t* btl); +/* Nack the RNDVRESTARTNOTIFY message */ +void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes, + ompi_proc_t* ompi_proc, bool repost); + +void mca_pml_bfo_recv_restart_completion(mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status); +void mca_pml_bfo_failover_error_handler(struct mca_btl_base_module_t* btl, + int32_t flags, ompi_proc_t *errproc, char *btlname); +void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des); +void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des); + +void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl, + ompi_proc_t *errproc, char *btlname); + +extern void mca_pml_bfo_map_out( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + + + + +/** + * Four new callbacks for the four new message types. 
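+ * There is one callback per new header type: RNDVRESTARTNOTIFY,
+ * RNDVRESTARTACK, RNDVRESTARTNACK and RECVERRNOTIFY.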
+ */
+extern void mca_pml_bfo_recv_frag_callback_rndvrestartnotify( mca_btl_base_module_t *btl,
+                                                              mca_btl_base_tag_t tag,
+                                                              mca_btl_base_descriptor_t* descriptor,
+                                                              void* cbdata );
+
+extern void mca_pml_bfo_recv_frag_callback_rndvrestartack( mca_btl_base_module_t *btl,
+                                                           mca_btl_base_tag_t tag,
+                                                           mca_btl_base_descriptor_t* descriptor,
+                                                           void* cbdata );
+
+extern void mca_pml_bfo_recv_frag_callback_rndvrestartnack( mca_btl_base_module_t *btl,
+                                                            mca_btl_base_tag_t tag,
+                                                            mca_btl_base_descriptor_t* descriptor,
+                                                            void* cbdata );
+
+extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t *btl,
+                                                          mca_btl_base_tag_t tag,
+                                                          mca_btl_base_descriptor_t* descriptor,
+                                                          void* cbdata );
+
+END_C_DECLS
+
+#endif
diff --git a/ompi/mca/pml/bfo/pml_bfo_hdr.h b/ompi/mca/pml/bfo/pml_bfo_hdr.h
new file mode 100644
index 0000000000..6e9e63aba0
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_hdr.h
@@ -0,0 +1,516 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2009      IBM Corporation. All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+#ifndef MCA_PML_BFO_HEADER_H
+#define MCA_PML_BFO_HEADER_H
+
+#include "ompi_config.h"
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+#include "opal/types.h"
+#include "opal/util/arch.h"
+#include "ompi/mca/btl/btl.h"
+#include "ompi/proc/proc.h"
+
+#define MCA_PML_BFO_HDR_TYPE_MATCH (MCA_BTL_TAG_PML + 1)
+#define MCA_PML_BFO_HDR_TYPE_RNDV  (MCA_BTL_TAG_PML + 2)
+#define MCA_PML_BFO_HDR_TYPE_RGET  (MCA_BTL_TAG_PML + 3)
+#define MCA_PML_BFO_HDR_TYPE_ACK   (MCA_BTL_TAG_PML + 4)
+#define MCA_PML_BFO_HDR_TYPE_NACK  (MCA_BTL_TAG_PML + 5)
+#define MCA_PML_BFO_HDR_TYPE_FRAG  (MCA_BTL_TAG_PML + 6)
+#define MCA_PML_BFO_HDR_TYPE_GET   (MCA_BTL_TAG_PML + 7)
+#define MCA_PML_BFO_HDR_TYPE_PUT   (MCA_BTL_TAG_PML + 8)
+#define MCA_PML_BFO_HDR_TYPE_FIN   (MCA_BTL_TAG_PML + 9)
+/* BFO FAILOVER CODE - begin */
+#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY (MCA_BTL_TAG_PML + 10)
+#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK    (MCA_BTL_TAG_PML + 11)
+#define MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK   (MCA_BTL_TAG_PML + 12)
+#define MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY     (MCA_BTL_TAG_PML + 13)
+/* BFO FAILOVER CODE - end */
+
+#define MCA_PML_BFO_HDR_FLAGS_ACK     1  /* is an ack required */
+#define MCA_PML_BFO_HDR_FLAGS_NBO     2  /* is the hdr in network byte order */
+#define MCA_PML_BFO_HDR_FLAGS_PIN     4  /* is user buffer pinned */
+#define MCA_PML_BFO_HDR_FLAGS_CONTIG  8  /* is user buffer contiguous */
+#define MCA_PML_BFO_HDR_FLAGS_NORDMA 16  /* rest will be sent by copy-in/out */
+/* BFO FAILOVER CODE - begin */
+#define MCA_PML_BFO_HDR_FLAGS_RESTART 32 /* restart RNDV because of error */
+/* BFO FAILOVER CODE - end */
+
+/**
+ * Common hdr attributes - must be first element in each hdr type
+ */
+struct mca_pml_bfo_common_hdr_t {
+    uint8_t hdr_type;  /**< type of envelope */
+    uint8_t hdr_flags; /**< flags indicating how fragment should be processed
*/ +}; +typedef struct mca_pml_bfo_common_hdr_t mca_pml_bfo_common_hdr_t; + +#define MCA_PML_BFO_COMMON_HDR_NTOH(h) +#define MCA_PML_BFO_COMMON_HDR_HTON(h) + +/** + * Header definition for the first fragment, contains the + * attributes required to match the corresponding posted receive. + */ +struct mca_pml_bfo_match_hdr_t { + mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */ + uint16_t hdr_ctx; /**< communicator index */ + int32_t hdr_src; /**< source rank */ + int32_t hdr_tag; /**< user tag */ + uint16_t hdr_seq; /**< message sequence number */ +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t hdr_padding[2]; /**< explicitly pad to 16 bytes. Compilers seem to already prefer to do this, but make it explicit just in case */ +#endif +}; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT +#define OMPI_PML_BFO_MATCH_HDR_LEN 16 +#else +#define OMPI_PML_BFO_MATCH_HDR_LEN 14 +#endif + +typedef struct mca_pml_bfo_match_hdr_t mca_pml_bfo_match_hdr_t; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG +#define MCA_PML_BFO_MATCH_HDR_FILL(h) \ +do { \ + (h).hdr_padding[0] = 0; \ + (h).hdr_padding[1] = 0; \ +} while(0) +#else +#define MCA_PML_BFO_MATCH_HDR_FILL(h) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + +#define MCA_PML_BFO_MATCH_HDR_NTOH(h) \ +do { \ + MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \ + (h).hdr_ctx = ntohs((h).hdr_ctx); \ + (h).hdr_src = ntohl((h).hdr_src); \ + (h).hdr_tag = ntohl((h).hdr_tag); \ + (h).hdr_seq = ntohs((h).hdr_seq); \ +} while (0) + +#define MCA_PML_BFO_MATCH_HDR_HTON(h) \ +do { \ + MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \ + MCA_PML_BFO_MATCH_HDR_FILL(h); \ + (h).hdr_ctx = htons((h).hdr_ctx); \ + (h).hdr_src = htonl((h).hdr_src); \ + (h).hdr_tag = htonl((h).hdr_tag); \ + (h).hdr_seq = htons((h).hdr_seq); \ +} while (0) + +/** + * Header definition for the first fragment when an acknowledgment + * is required. This could be the first fragment of a large message + * or a short message that requires an ack (synchronous). 
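+ * The failover-only fields below allow a restarted rendezvous to be
+ * tied back to the original requests on both sides.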
+ */ +struct mca_pml_bfo_rendezvous_hdr_t { + mca_pml_bfo_match_hdr_t hdr_match; + uint64_t hdr_msg_length; /**< message length */ + ompi_ptr_t hdr_src_req; /**< pointer to source request - returned in ack */ +/* BFO FAILOVER CODE - begin */ + ompi_ptr_t hdr_dst_req; /**< pointer to dst req - failover use only */ + uint8_t hdr_restartseq; /**< restart sequence - failover use only */ +/* BFO FAILOVER CODE - end */ +}; +typedef struct mca_pml_bfo_rendezvous_hdr_t mca_pml_bfo_rendezvous_hdr_t; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG +#define MCA_PML_BFO_RNDV_HDR_FILL(h) \ + MCA_PML_BFO_MATCH_HDR_FILL((h).hdr_match) +#else +#define MCA_PML_BFO_RNDV_HDR_FILL(h) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + +/* Note that hdr_src_req is not put in network byte order because it + is never processed by the receiver, other than being copied into + the ack header */ +#define MCA_PML_BFO_RNDV_HDR_NTOH(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \ + (h).hdr_msg_length = ntoh64((h).hdr_msg_length); \ + } while (0) + +#define MCA_PML_BFO_RNDV_HDR_HTON(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \ + MCA_PML_BFO_RNDV_HDR_FILL(h); \ + (h).hdr_msg_length = hton64((h).hdr_msg_length); \ + } while (0) + +/** + * Header definition for a combined rdma rendezvous/get + */ +struct mca_pml_bfo_rget_hdr_t { + mca_pml_bfo_rendezvous_hdr_t hdr_rndv; + uint32_t hdr_seg_cnt; /**< number of segments for rdma */ +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + uint8_t hdr_padding[4]; +#endif + ompi_ptr_t hdr_des; /**< source descriptor */ + mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */ +}; +typedef struct mca_pml_bfo_rget_hdr_t mca_pml_bfo_rget_hdr_t; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG +#define MCA_PML_BFO_RGET_HDR_FILL(h) \ +do { \ + MCA_PML_BFO_RNDV_HDR_FILL((h).hdr_rndv); \ + (h).hdr_padding[0] = 0; \ + (h).hdr_padding[1] = 0; \ + (h).hdr_padding[2] = 0; \ + (h).hdr_padding[3] = 0; \ +} while(0) +#else +#define MCA_PML_BFO_RGET_HDR_FILL(h) +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */ + +#define MCA_PML_BFO_RGET_HDR_NTOH(h) \ + do { \ + MCA_PML_BFO_RNDV_HDR_NTOH((h).hdr_rndv); \ + (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \ + } while (0) + +#define MCA_PML_BFO_RGET_HDR_HTON(h) \ + do { \ + MCA_PML_BFO_RNDV_HDR_HTON((h).hdr_rndv); \ + MCA_PML_BFO_RGET_HDR_FILL(h); \ + (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \ + } while (0) + +/** + * Header for subsequent fragments. 
+ */
+struct mca_pml_bfo_frag_hdr_t {
+    mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[6];
+#endif
+    uint64_t hdr_frag_offset;            /**< offset into message */
+    ompi_ptr_t hdr_src_req;              /**< pointer to source request */
+    ompi_ptr_t hdr_dst_req;              /**< pointer to matched receive */
+};
+typedef struct mca_pml_bfo_frag_hdr_t mca_pml_bfo_frag_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+#define MCA_PML_BFO_FRAG_HDR_FILL(h) \
+do {                                 \
+    (h).hdr_padding[0] = 0;          \
+    (h).hdr_padding[1] = 0;          \
+    (h).hdr_padding[2] = 0;          \
+    (h).hdr_padding[3] = 0;          \
+    (h).hdr_padding[4] = 0;          \
+    (h).hdr_padding[5] = 0;          \
+} while(0)
+#else
+#define MCA_PML_BFO_FRAG_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+#define MCA_PML_BFO_FRAG_HDR_NTOH(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+        (h).hdr_frag_offset = ntoh64((h).hdr_frag_offset);      \
+    } while (0)
+
+#define MCA_PML_BFO_FRAG_HDR_HTON(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_FRAG_HDR_FILL(h);                           \
+        (h).hdr_frag_offset = hton64((h).hdr_frag_offset);      \
+    } while (0)
+
+/**
+ * Header used to acknowledge outstanding fragment(s).
+ */
+
+struct mca_pml_bfo_ack_hdr_t {
+    mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[6];
+#endif
+    ompi_ptr_t hdr_src_req;              /**< source request */
+    ompi_ptr_t hdr_dst_req;              /**< matched receive request */
+    uint64_t hdr_send_offset;            /**< starting point of copy in/out */
+};
+typedef struct mca_pml_bfo_ack_hdr_t mca_pml_bfo_ack_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+#define MCA_PML_BFO_ACK_HDR_FILL(h) \
+do {                                \
+    (h).hdr_padding[0] = 0;         \
+    (h).hdr_padding[1] = 0;         \
+    (h).hdr_padding[2] = 0;         \
+    (h).hdr_padding[3] = 0;         \
+    (h).hdr_padding[4] = 0;         \
+    (h).hdr_padding[5] = 0;         \
+} while (0)
+#else
+#define MCA_PML_BFO_ACK_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+/* Note that the request headers are not put in NBO because the
+   src_req is already in receiver's byte order and the dst_req is not
+   used by the receiver for anything other than backpointers in return
+   headers */
+#define MCA_PML_BFO_ACK_HDR_NTOH(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+        (h).hdr_send_offset = ntoh64((h).hdr_send_offset);      \
+    } while (0)
+
+#define MCA_PML_BFO_ACK_HDR_HTON(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_ACK_HDR_FILL(h);                            \
+        (h).hdr_send_offset = hton64((h).hdr_send_offset);      \
+    } while (0)
+
+/**
+ * Header used to initiate an RDMA operation.
+ */
+
+struct mca_pml_bfo_rdma_hdr_t {
+    mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[2];              /**< two bytes of padding to bring the hdr to 4-byte alignment;
+                                          *   hdr_req will then be 8-byte aligned after the 4-byte hdr_seg_cnt */
+#endif
+    uint32_t hdr_seg_cnt;                /**< number of segments for rdma */
+    ompi_ptr_t hdr_req;                  /**< destination request */
+/* BFO FAILOVER CODE - begin */
+    ompi_ptr_t hdr_dst_req;              /**< pointer to destination request */
+/* BFO FAILOVER CODE - end */
+    ompi_ptr_t hdr_des;                  /**< source descriptor */
+    uint64_t hdr_rdma_offset;            /**< current offset into user buffer */
+    mca_btl_base_segment_t hdr_segs[1];  /**< list of segments for rdma */
+};
+typedef struct mca_pml_bfo_rdma_hdr_t mca_pml_bfo_rdma_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+#define MCA_PML_BFO_RDMA_HDR_FILL(h) \
+do {                                 \
+    (h).hdr_padding[0] = 0;          \
+    (h).hdr_padding[1] = 0;          \
+} while(0)
+#else
+#define MCA_PML_BFO_RDMA_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+#define MCA_PML_BFO_RDMA_HDR_NTOH(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt);               \
+        (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset);      \
+    } while (0)
+
+#define MCA_PML_BFO_RDMA_HDR_HTON(h)                            \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_RDMA_HDR_FILL(h);                           \
+        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt);               \
+        (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset);      \
+    } while (0)
+
+/**
+ * Header used to complete an RDMA operation.
+ */
+
+struct mca_pml_bfo_fin_hdr_t {
+/* BFO FAILOVER CODE - begin */
+    mca_pml_bfo_match_hdr_t hdr_match; /**< match info - needed for failover */
+    uint8_t hdr_restartseq;            /**< restart sequence - failover use only */
+/* BFO FAILOVER CODE - end */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[7];
+#endif
+    uint32_t hdr_fail;                 /**< RDMA operation failed */
+    ompi_ptr_t hdr_des;                /**< completed descriptor */
+};
+typedef struct mca_pml_bfo_fin_hdr_t mca_pml_bfo_fin_hdr_t;
+
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
+/* Zero all seven padding bytes, matching the other FILL macros. */
+#define MCA_PML_BFO_FIN_HDR_FILL(h) \
+do {                                \
+    (h).hdr_padding[0] = 0;         \
+    (h).hdr_padding[1] = 0;         \
+    (h).hdr_padding[2] = 0;         \
+    (h).hdr_padding[3] = 0;         \
+    (h).hdr_padding[4] = 0;         \
+    (h).hdr_padding[5] = 0;         \
+    (h).hdr_padding[6] = 0;         \
+} while (0)
+#else
+#define MCA_PML_BFO_FIN_HDR_FILL(h)
+#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+
+#define MCA_PML_BFO_FIN_HDR_NTOH(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common);            \
+    } while (0)
+
+#define MCA_PML_BFO_FIN_HDR_HTON(h)                             \
+    do {                                                        \
+        MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common);            \
+        MCA_PML_BFO_FIN_HDR_FILL(h);                            \
+    } while (0)
+
+/* BFO FAILOVER CODE - begin */
+/**
+ * Header used to restart a rendezvous request.
+ */
+struct mca_pml_bfo_restart_hdr_t {
+    mca_pml_bfo_match_hdr_t hdr_match; /**< needed to avoid duplicate messages */
+    uint8_t hdr_restartseq;            /**< restart sequence */
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+    uint8_t hdr_padding[3];
+#endif
+    ompi_ptr_t hdr_src_req;            /**< source request */
+    ompi_ptr_t hdr_dst_req;            /**< matched receive request */
+    int32_t hdr_dst_rank;              /**< needed to send NACK */
+    uint32_t hdr_jobid;                /**< needed to send NACK */
+    uint32_t hdr_vpid;                 /**< needed to send NACK */
+};
+typedef struct mca_pml_bfo_restart_hdr_t mca_pml_bfo_restart_hdr_t;
+
+/* Only need to put parts of the restart header in NBO.  No need
+   to do hdr_src_req and hdr_dst_req, as they are only used
+   by the process that originated them.
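+   The rank, jobid and vpid fields are converted, however, since the
+   NACK path needs to read them on the remote side.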
*/ +#define MCA_PML_BFO_RESTART_HDR_NTOH(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \ + (h).hdr_dst_rank = ntohl((h).hdr_dst_rank); \ + (h).hdr_jobid = ntohl((h).hdr_jobid); \ + (h).hdr_vpid = ntohl((h).hdr_vpid); \ + } while (0) + +#define MCA_PML_BFO_RESTART_HDR_HTON(h) \ + do { \ + MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \ + (h).hdr_dst_rank = htonl((h).hdr_dst_rank); \ + (h).hdr_jobid = htonl((h).hdr_jobid); \ + (h).hdr_vpid = htonl((h).hdr_vpid); \ + } while (0) +/* BFO FAILOVER CODE - end */ + +/** + * Union of defined hdr types. + */ +union mca_pml_bfo_hdr_t { + mca_pml_bfo_common_hdr_t hdr_common; + mca_pml_bfo_match_hdr_t hdr_match; + mca_pml_bfo_rendezvous_hdr_t hdr_rndv; + mca_pml_bfo_rget_hdr_t hdr_rget; + mca_pml_bfo_frag_hdr_t hdr_frag; + mca_pml_bfo_ack_hdr_t hdr_ack; + mca_pml_bfo_rdma_hdr_t hdr_rdma; + mca_pml_bfo_fin_hdr_t hdr_fin; +/* BFO FAILOVER CODE - begin */ + mca_pml_bfo_restart_hdr_t hdr_restart; +/* BFO FAILOVER CODE - end */ +}; +typedef union mca_pml_bfo_hdr_t mca_pml_bfo_hdr_t; + +#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT +static inline __opal_attribute_always_inline__ void +bfo_hdr_ntoh(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type) +{ + if(!(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NBO)) + return; + + switch(hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + MCA_PML_BFO_MATCH_HDR_NTOH(hdr->hdr_match); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + MCA_PML_BFO_RNDV_HDR_NTOH(hdr->hdr_rndv); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + MCA_PML_BFO_RGET_HDR_NTOH(hdr->hdr_rget); + break; + case MCA_PML_BFO_HDR_TYPE_ACK: + MCA_PML_BFO_ACK_HDR_NTOH(hdr->hdr_ack); + break; + case MCA_PML_BFO_HDR_TYPE_FRAG: + MCA_PML_BFO_FRAG_HDR_NTOH(hdr->hdr_frag); + break; + case MCA_PML_BFO_HDR_TYPE_PUT: + MCA_PML_BFO_RDMA_HDR_NTOH(hdr->hdr_rdma); + break; + case MCA_PML_BFO_HDR_TYPE_FIN: + MCA_PML_BFO_FIN_HDR_NTOH(hdr->hdr_fin); + break; + default: + assert(0); + break; + } +} +#else +#define bfo_hdr_ntoh(h, t) do{}while(0) +#endif + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT +#define bfo_hdr_hton(h, t, p) \ + bfo_hdr_hton_intr((mca_pml_bfo_hdr_t*)h, t, p) +static inline __opal_attribute_always_inline__ void +bfo_hdr_hton_intr(mca_pml_bfo_hdr_t *hdr, const uint8_t hdr_type, + const ompi_proc_t *proc) +{ +#ifdef WORDS_BIGENDIAN + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO; +#else + + if(!(proc->proc_arch & OPAL_ARCH_ISBIGENDIAN)) + return; + + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_NBO; + switch(hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + MCA_PML_BFO_MATCH_HDR_HTON(hdr->hdr_match); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + MCA_PML_BFO_RNDV_HDR_HTON(hdr->hdr_rndv); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + MCA_PML_BFO_RGET_HDR_HTON(hdr->hdr_rget); + break; + case MCA_PML_BFO_HDR_TYPE_ACK: + MCA_PML_BFO_ACK_HDR_HTON(hdr->hdr_ack); + break; + case MCA_PML_BFO_HDR_TYPE_FRAG: + MCA_PML_BFO_FRAG_HDR_HTON(hdr->hdr_frag); + break; + case MCA_PML_BFO_HDR_TYPE_PUT: + MCA_PML_BFO_RDMA_HDR_HTON(hdr->hdr_rdma); + break; + case MCA_PML_BFO_HDR_TYPE_FIN: + MCA_PML_BFO_FIN_HDR_HTON(hdr->hdr_fin); + break; + default: + assert(0); + break; + } +#endif +} +#else +#define bfo_hdr_hton(h, t, p) do{}while(0) +#endif +#endif diff --git a/ompi/mca/pml/bfo/pml_bfo_iprobe.c b/ompi/mca/pml/bfo/pml_bfo_iprobe.c new file mode 100644 index 0000000000..70a931927e --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_iprobe.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University 
Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/request/request.h" +#include "pml_bfo_recvreq.h" + + +int mca_pml_bfo_iprobe(int src, + int tag, + struct ompi_communicator_t *comm, + int *matched, ompi_status_public_t * status) +{ + int rc = OMPI_SUCCESS; + mca_pml_bfo_recv_request_t recvreq; + + OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t ); + recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; + recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_IPROBE; + + MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); + MCA_PML_BFO_RECV_REQUEST_START(&recvreq); + + if( recvreq.req_recv.req_base.req_ompi.req_complete == true ) { + if( NULL != status ) { + *status = recvreq.req_recv.req_base.req_ompi.req_status; + } + *matched = 1; + } else { + *matched = 0; + opal_progress(); + } + MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); + return rc; +} + + +int mca_pml_bfo_probe(int src, + int tag, + struct ompi_communicator_t *comm, + ompi_status_public_t * status) +{ + mca_pml_bfo_recv_request_t recvreq; + + OBJ_CONSTRUCT( &recvreq, mca_pml_bfo_recv_request_t ); + recvreq.req_recv.req_base.req_ompi.req_type = OMPI_REQUEST_PML; + recvreq.req_recv.req_base.req_type = MCA_PML_REQUEST_PROBE; + + MCA_PML_BFO_RECV_REQUEST_INIT(&recvreq, NULL, 0, &ompi_mpi_char.dt, src, tag, comm, true); + MCA_PML_BFO_RECV_REQUEST_START(&recvreq); + + ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi); + + if (NULL != status) { + *status = recvreq.req_recv.req_base.req_ompi.req_status; + } + MCA_PML_BASE_RECV_REQUEST_FINI( &recvreq.req_recv ); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/bfo/pml_bfo_irecv.c b/ompi/mca/pml/bfo/pml_bfo_irecv.c new file mode 100644 index 0000000000..62bdf78794 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_irecv.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/request/request.h" +#include "pml_bfo_recvreq.h" +#include "ompi/peruse/peruse-internal.h" + +int mca_pml_bfo_irecv_init(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + int rc; + mca_pml_bfo_recv_request_t *recvreq; + MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, true); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + *request = (ompi_request_t *) recvreq; + return OMPI_SUCCESS; +} + +int mca_pml_bfo_irecv(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + int rc; + + mca_pml_bfo_recv_request_t *recvreq; + MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + MCA_PML_BFO_RECV_REQUEST_START(recvreq); + *request = (ompi_request_t *) recvreq; + return OMPI_SUCCESS; +} + + +int mca_pml_bfo_recv(void *addr, + size_t count, + ompi_datatype_t * datatype, + int src, + int tag, + struct ompi_communicator_t *comm, + ompi_status_public_t * status) +{ + int rc; + mca_pml_bfo_recv_request_t *recvreq; + MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + if (NULL == recvreq) + return rc; + + MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, + addr, + count, datatype, src, tag, comm, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &((recvreq)->req_recv.req_base), + PERUSE_RECV); + + MCA_PML_BFO_RECV_REQUEST_START(recvreq); + ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi); + + if (NULL != status) { /* return status */ + *status = recvreq->req_recv.req_base.req_ompi.req_status; + } + rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR; + ompi_request_free( (ompi_request_t**)&recvreq ); + return rc; +} diff --git a/ompi/mca/pml/bfo/pml_bfo_isend.c b/ompi/mca/pml/bfo/pml_bfo_isend.c new file mode 100644 index 0000000000..d2cf02c705 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_isend.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_bfo.h" +#include "pml_bfo_sendreq.h" +#include "pml_bfo_recvreq.h" +#include "ompi/peruse/peruse-internal.h" + +int mca_pml_bfo_isend_init(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm, + ompi_request_t ** request) +{ + int rc; + + mca_pml_bfo_send_request_t *sendreq = NULL; + MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_BFO_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, true); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + *request = (ompi_request_t *) sendreq; + return OMPI_SUCCESS; +} + + +int mca_pml_bfo_isend(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm, + ompi_request_t ** request) +{ + int rc; + mca_pml_bfo_send_request_t *sendreq = NULL; + + MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_BFO_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc); + *request = (ompi_request_t *) sendreq; + return rc; +} + + +int mca_pml_bfo_send(void *buf, + size_t count, + ompi_datatype_t * datatype, + int dst, + int tag, + mca_pml_base_send_mode_t sendmode, + ompi_communicator_t * comm) +{ + int rc; + mca_pml_bfo_send_request_t *sendreq; + + MCA_PML_BFO_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); + if (rc != OMPI_SUCCESS) + return rc; + + MCA_PML_BFO_SEND_REQUEST_INIT(sendreq, + buf, + count, + datatype, + dst, tag, + comm, sendmode, false); + + PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE, + &(sendreq)->req_send.req_base, + PERUSE_SEND); + + MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc); + if (rc != OMPI_SUCCESS) { + MCA_PML_BFO_SEND_REQUEST_RETURN( sendreq ); + return rc; + } + + ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi); + + rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR; + ompi_request_free( (ompi_request_t**)&sendreq ); + return rc; +} diff --git a/ompi/mca/pml/bfo/pml_bfo_progress.c b/ompi/mca/pml/bfo/pml_bfo_progress.c new file mode 100644 index 0000000000..07c92125d0 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_progress.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "pml_bfo.h"
+#include "pml_bfo_sendreq.h"
+#include "ompi/mca/bml/base/base.h"
+
+int mca_pml_bfo_progress(void)
+{
+    int i, queue_length = opal_list_get_size(&mca_pml_bfo.send_pending);
+    int j, completed_requests = 0;
+    bool send_succeeded;
+
+    if( OPAL_LIKELY(0 == queue_length) )
+        return 0;
+
+    for( i = 0; i < queue_length; i++ ) {
+        mca_pml_bfo_send_pending_t pending_type = MCA_PML_BFO_SEND_PENDING_NONE;
+        mca_pml_bfo_send_request_t* sendreq;
+        mca_bml_base_endpoint_t* endpoint;
+
+        sendreq = get_request_from_send_pending(&pending_type);
+        if(OPAL_UNLIKELY(NULL == sendreq))
+            break;
+
+        switch(pending_type) {
+        case MCA_PML_BFO_SEND_PENDING_NONE:
+            assert(0);
+            return 0;
+        case MCA_PML_BFO_SEND_PENDING_SCHEDULE:
+            if( mca_pml_bfo_send_request_schedule_exclusive(sendreq) ==
+                OMPI_ERR_OUT_OF_RESOURCE ) {
+                return 0;
+            }
+            completed_requests++;
+            break;
+        case MCA_PML_BFO_SEND_PENDING_START:
+            endpoint = sendreq->req_endpoint;
+            send_succeeded = false;
+            for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
+                mca_bml_base_btl_t* bml_btl;
+                int rc;
+
+                /* select a btl */
+                bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+                rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
+                if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
+                    send_succeeded = true;
+                    completed_requests++;
+                    break;
+                }
+            }
+            if( false == send_succeeded ) {
+                add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
+            }
+            break;
+        }
+    }
+    return completed_requests;
+}
+
diff --git a/ompi/mca/pml/bfo/pml_bfo_rdma.c b/ompi/mca/pml/bfo/pml_bfo_rdma.c
new file mode 100644
index 0000000000..129f68059d
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_rdma.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+#include "ompi_config.h"
+#include "ompi/constants.h"
+#include "ompi/mca/pml/pml.h"
+#include "ompi/mca/bml/bml.h"
+#include "ompi/mca/mpool/mpool.h"
+#include "pml_bfo.h"
+#include "pml_bfo_rdma.h"
+
+/* Use this registration, rather than NULL, when no registration is
+ * needed for a BTL.  This helps other code distinguish the case where
+ * memory is not registered from the case where registration is not
+ * needed. */
+static mca_mpool_base_registration_t pml_bfo_dummy_reg;
+
+/*
+ * Check to see if memory is registered or can be registered.  Build a
+ * set of registrations on the request.
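+ * Returns the number of rdma_btls entries that were filled in with a
+ * usable registration, or 0 to force the pipeline protocol.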
+ */
+
+size_t mca_pml_bfo_rdma_btls(
+    mca_bml_base_endpoint_t* bml_endpoint,
+    unsigned char* base,
+    size_t size,
+    mca_pml_bfo_com_btl_t* rdma_btls)
+{
+    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+    int num_btls_used = 0, n;
+
+    /* shortcut when there are no rdma capable btls */
+    if(num_btls == 0) {
+        return 0;
+    }
+
+    /* check to see if memory is registered */
+    for(n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request;
+            n++) {
+        mca_bml_base_btl_t* bml_btl =
+            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
+                    (bml_endpoint->btl_rdma_index + n) % num_btls);
+        mca_mpool_base_registration_t* reg = &pml_bfo_dummy_reg;
+        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
+
+        if( NULL != btl_mpool ) {
+            if(!mca_pml_bfo.leave_pinned) {
+                /* look through existing registrations */
+                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
+            } else {
+                /* register the memory */
+                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
+            }
+
+            if(NULL == reg)
+                continue;
+        }
+
+        rdma_btls[num_btls_used].bml_btl = bml_btl;
+        rdma_btls[num_btls_used].btl_reg = reg;
+        weight_total += bml_btl->btl_weight;
+        num_btls_used++;
+    }
+
+    /* If we don't use leave_pinned, and the BTLs that already have this
+     * memory registered amount to less than half of the available
+     * bandwidth, fall back to the pipeline protocol. */
+    if(0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
+        return 0;
+
+    mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size,
+                                     weight_total);
+
+    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
+    return num_btls_used;
+}
+
+size_t mca_pml_bfo_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
+                                       size_t size,
+                                       mca_pml_bfo_com_btl_t* rdma_btls )
+{
+    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+
+    for(i = 0; i < num_btls && i < mca_pml_bfo.max_rdma_per_request; i++) {
+        rdma_btls[i].bml_btl =
+            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
+        if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
+            rdma_btls[i].btl_reg = NULL;
+        else
+            rdma_btls[i].btl_reg = &pml_bfo_dummy_reg;
+
+        weight_total += rdma_btls[i].bml_btl->btl_weight;
+    }
+
+    mca_pml_bfo_calc_weighted_length(rdma_btls, i, size, weight_total);
+
+    return i;
+}
diff --git a/ompi/mca/pml/bfo/pml_bfo_rdma.h b/ompi/mca/pml/bfo/pml_bfo_rdma.h
new file mode 100644
index 0000000000..8572682d36
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_rdma.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_RDMA_H +#define MCA_PML_BFO_RDMA_H + +struct mca_bml_base_endpoint_t; + +/* + * Of the set of available btls that support RDMA, + * find those that already have registrations - or + * register if required (for leave_pinned option) + */ +size_t mca_pml_bfo_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, + unsigned char* base, size_t size, struct mca_pml_bfo_com_btl_t* btls); + +/* Choose RDMA BTLs to use for sending of a request by pipeline protocol. + * Calculate number of bytes to send through each BTL according to available + * bandwidth */ +size_t mca_pml_bfo_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint, + size_t size, mca_pml_bfo_com_btl_t* rdma_btls); +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_rdmafrag.c b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.c new file mode 100644 index 0000000000..b99e30a8de --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_bfo.h" +#include "pml_bfo_rdmafrag.h" + + +OBJ_CLASS_INSTANCE( + mca_pml_bfo_rdma_frag_t, + ompi_free_list_item_t, + NULL, + NULL); diff --git a/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h new file mode 100644 index 0000000000..51dc4727b2 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ */
+
+#ifndef MCA_PML_BFO_RDMAFRAG_H
+#define MCA_PML_BFO_RDMAFRAG_H
+
+#include "ompi/mca/btl/btl.h"
+#include "pml_bfo_hdr.h"
+
+BEGIN_C_DECLS
+
+typedef enum {
+    MCA_PML_BFO_RDMA_PUT,
+    MCA_PML_BFO_RDMA_GET
+} mca_pml_bfo_rdma_state_t;
+
+struct mca_pml_bfo_rdma_frag_t {
+    ompi_free_list_item_t super;
+    mca_bml_base_btl_t* rdma_bml;
+    mca_btl_base_module_t* rdma_btl;
+    mca_pml_bfo_hdr_t rdma_hdr;
+    mca_pml_bfo_rdma_state_t rdma_state;
+    size_t rdma_length;
+    mca_btl_base_segment_t rdma_segs[MCA_BTL_DES_MAX_SEGMENTS];
+    void *rdma_req;
+    struct mca_bml_base_endpoint_t* rdma_ep;
+    opal_convertor_t convertor;
+    mca_mpool_base_registration_t* reg;
+    uint32_t retries;
+};
+typedef struct mca_pml_bfo_rdma_frag_t mca_pml_bfo_rdma_frag_t;
+
+OBJ_CLASS_DECLARATION(mca_pml_bfo_rdma_frag_t);
+
+
+#define MCA_PML_BFO_RDMA_FRAG_ALLOC(frag,rc)                    \
+do {                                                            \
+    ompi_free_list_item_t* item;                                \
+    OMPI_FREE_LIST_WAIT(&mca_pml_bfo.rdma_frags, item, rc);     \
+    frag = (mca_pml_bfo_rdma_frag_t*)item;                      \
+} while(0)
+
+#define MCA_PML_BFO_RDMA_FRAG_RETURN(frag)                      \
+do {                                                            \
+    /* return fragment */                                       \
+    OMPI_FREE_LIST_RETURN(&mca_pml_bfo.rdma_frags,              \
+                          (ompi_free_list_item_t*)frag);        \
+} while(0)
+
+
+END_C_DECLS
+#endif
+
diff --git a/ompi/mca/pml/bfo/pml_bfo_recvfrag.c b/ompi/mca/pml/bfo/pml_bfo_recvfrag.c
new file mode 100644
index 0000000000..b940815ae6
--- /dev/null
+++ b/ompi/mca/pml/bfo/pml_bfo_recvfrag.c
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2009 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
+ * Copyright (c) 2006-2008 University of Houston. All rights reserved.
+ * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ */
+
+#include "ompi_config.h"
+
+#include "opal/class/opal_list.h"
+#include "opal/threads/mutex.h"
+#include "opal/prefetch.h"
+#include "ompi/constants.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/mca/pml/pml.h"
+#include "pml_bfo.h"
+#include "pml_bfo_comm.h"
+#include "pml_bfo_recvfrag.h"
+#include "pml_bfo_recvreq.h"
+#include "pml_bfo_sendreq.h"
+#include "pml_bfo_hdr.h"
+/* BFO FAILOVER CODE - begin */
+#include "pml_bfo_failover.h"
+/* BFO FAILOVER CODE - end */
+#include "ompi/peruse/peruse-internal.h"
+#include "ompi/memchecker.h"
+
+
+OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t,
+                    ompi_free_list_item_t,
+                    NULL,
+                    NULL );
+
+OBJ_CLASS_INSTANCE( mca_pml_bfo_recv_frag_t,
+                    opal_list_item_t,
+                    NULL,
+                    NULL );
+
+/**
+ * Static functions.
+ */
+
+/**
+ * Append an unexpected descriptor to a queue.  This function will
+ * allocate and initialize the fragment (if necessary) and then add it
+ * to the specified queue.  The allocated fragment is not returned to
+ * the caller.
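+ * A NULL frag means the descriptor was just received and a fragment
+ * still needs to be allocated for it; a non-NULL frag is simply
+ * re-queued as is.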
+ */
+static void
+append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl,
+                    mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
+                    size_t num_segments, mca_pml_bfo_recv_frag_t* frag)
+{
+    int rc;
+
+    if(NULL == frag) {
+        MCA_PML_BFO_RECV_FRAG_ALLOC(frag, rc);
+        MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
+    }
+    opal_list_append(queue, (opal_list_item_t*)frag);
+}
+
+/**
+ * Match incoming recv_frags against posted receives.
+ * Supports out-of-order delivery.
+ *
+ * @param frag_header (IN)  Header of received recv_frag.
+ * @param frag_desc (IN)    Received recv_frag descriptor.
+ * @param match_made (OUT)  Flag indicating whether a match was made.
+ * @param additional_matches (OUT) List of additional matches.
+ * @return OMPI_SUCCESS or error status on failure.
+ */
+static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
+                                        mca_pml_bfo_match_hdr_t *hdr,
+                                        mca_btl_base_segment_t* segments,
+                                        size_t num_segments,
+                                        int type);
+
+static mca_pml_bfo_recv_request_t*
+match_one(mca_btl_base_module_t *btl,
+          mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
+          size_t num_segments, ompi_communicator_t *comm_ptr,
+          mca_pml_bfo_comm_proc_t *proc,
+          mca_pml_bfo_recv_frag_t* frag);
+
+void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
+                                          mca_btl_base_tag_t tag,
+                                          mca_btl_base_descriptor_t* des,
+                                          void* cbdata )
+{
+    mca_btl_base_segment_t* segments = des->des_dst;
+    mca_pml_bfo_match_hdr_t* hdr = (mca_pml_bfo_match_hdr_t*)segments->seg_addr.pval;
+    ompi_communicator_t *comm_ptr;
+    mca_pml_bfo_recv_request_t *match = NULL;
+    mca_pml_bfo_comm_t *comm;
+    mca_pml_bfo_comm_proc_t *proc;
+    size_t num_segments = des->des_dst_cnt;
+    size_t bytes_received = 0;
+
+    if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_BFO_MATCH_HDR_LEN) ) {
+        return;
+    }
+    bfo_hdr_ntoh(((mca_pml_bfo_hdr_t*) hdr), MCA_PML_BFO_HDR_TYPE_MATCH);
+
+    /* communicator pointer */
+    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
+    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
+        /* This is a special case. A message for a communicator that does
+         * not yet exist can happen. Instead of doing the matching, we
+         * temporarily add it to a pending queue in the PML. Later on,
+         * when the communicator is completely instantiated, this pending
+         * queue will be searched and all matching fragments moved to the
+         * right communicator.
+         */
+        append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
+                             btl, hdr, segments, num_segments, NULL );
+        return;
+    }
+    comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
+
+    /* source sequence number */
+    proc = &comm->procs[hdr->hdr_src];
+
+    /* We generate the MSG_ARRIVED event as soon as the PML is aware
+     * of a matching fragment arrival, independent of whether it is
+     * received in the correct order or not. This allows the tools to
+     * figure out if messages are not received in the correct order
+     * (e.g. with multiple network interfaces).
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    /* Get the next expected message sequence number. In a threaded
+     * run, lock to make sure that if another thread is processing a
+     * frag from the same message, a match is made only once. This
+     * also prevents other posted receives (for a pair of endpoints)
+     * from being processed, and potentially "losing" the fragment.
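+     *
+     * A condensed sketch of the fast path that this lock protects
+     * (field names as used in the code below; illustration only):
+     *
+     *   if ((uint16_t)hdr->hdr_seq == (uint16_t)proc->expected_sequence &&
+     *       0 == opal_list_get_size(&proc->frags_cant_match)) {
+     *       proc->expected_sequence++;   -- in order: match immediately
+     *   } else {
+     *       goto slow_path;              -- out of order, or a backlog exists
+     *   }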
+     */
+    OPAL_THREAD_LOCK(&comm->matching_lock);
+
+    /* get sequence number of next message that can be processed */
+    if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
+                     (opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
+        goto slow_path;
+    }
+
+    /* This is the sequence number we were expecting, so we can try
+     * matching it to already posted receives.
+     */
+
+    /* We're now expecting the next sequence number. */
+    proc->expected_sequence++;
+
+    /* We generate the SEARCH_POSTED_QUEUE only when the message is
+     * received in the correct sequence. Otherwise, we delay the event
+     * generation until we reach the correct sequence number.
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, NULL);
+
+    /* The match is over. We generate the SEARCH_POSTED_Q_END here,
+     * before going into mca_pml_bfo_check_cantmatch_for_match, so that
+     * the posted-queue search time can be measured separately for all
+     * messages.
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    /* release matching lock before processing fragment */
+    OPAL_THREAD_UNLOCK(&comm->matching_lock);
+
+    if(OPAL_LIKELY(match)) {
+        bytes_received = segments->seg_len - OMPI_PML_BFO_MATCH_HDR_LEN;
+        match->req_recv.req_bytes_packed = bytes_received;
+
+        MCA_PML_BFO_RECV_REQUEST_MATCHED(match, hdr);
+        if(match->req_bytes_expected > 0) {
+            struct iovec iov[2];
+            uint32_t iov_count = 1;
+
+            /*
+             * Make the user buffer accessible (defined) before unpacking.
+             */
+            MEMCHECKER(
+                memchecker_call(&opal_memchecker_base_mem_defined,
+                                match->req_recv.req_base.req_addr,
+                                match->req_recv.req_base.req_count,
+                                match->req_recv.req_base.req_datatype);
+            );
+
+            iov[0].iov_len = bytes_received;
+            iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
+                                              OMPI_PML_BFO_MATCH_HDR_LEN);
+            while (iov_count < num_segments) {
+                bytes_received += segments[iov_count].seg_len;
+                iov[iov_count].iov_len = segments[iov_count].seg_len;
+                iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
+                iov_count++;
+            }
+            opal_convertor_unpack( &match->req_recv.req_base.req_convertor,
+                                   iov,
+                                   &iov_count,
+                                   &bytes_received );
+            match->req_bytes_received = bytes_received;
+            /*
+             * Unpacking finished; make the user buffer inaccessible again.
+             */
+            MEMCHECKER(
+                memchecker_call(&opal_memchecker_base_mem_noaccess,
+                                match->req_recv.req_base.req_addr,
+                                match->req_recv.req_base.req_count,
+                                match->req_recv.req_base.req_datatype);
+            );
+        }
+
+        /* No need to check whether we are complete; we know we are. */
+        /* No rmb is needed either, as that is only required for checking. */
+        recv_request_pml_complete(match);
+    }
+    return;
+
+ slow_path:
+    OPAL_THREAD_UNLOCK(&comm->matching_lock);
+/* BFO FAILOVER CODE - begin */
+    /* Check for duplicate messages. If the message is a duplicate, just
+     * return; that essentially drops the message.
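+     * (A duplicate can occur when, after a failover, the sender
+     * retransmits a message that was already delivered; its sequence
+     * number is behind proc->expected_sequence, so it must not be
+     * matched a second time.)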
*/ + if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { + return; + } +/* BFO FAILOVER CODE - end */ + mca_pml_bfo_recv_frag_match(btl, hdr, segments, + num_segments, MCA_PML_BFO_HDR_TYPE_MATCH); +} + + +void mca_pml_bfo_recv_frag_callback_rndv(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDV); + mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments, + des->des_dst_cnt, MCA_PML_BFO_HDR_TYPE_RNDV); + return; +} + +void mca_pml_bfo_recv_frag_callback_rget(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RGET); + mca_pml_bfo_recv_frag_match(btl, &hdr->hdr_match, segments, + des->des_dst_cnt, MCA_PML_BFO_HDR_TYPE_RGET); + return; +} + + + +void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) +{ + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_send_request_t* sendreq; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK); + sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval; + sendreq->req_recv = hdr->hdr_ack.hdr_dst_req; +/* BFO FAILOVER CODE - begin */ + /* Drop any fragments if request is in error state. Do not want + * to initiate any more activity. */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(20, mca_pml_bfo_output, + "ACK: received: dropping because request in error, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + return; + } +/* BFO FAILOVER CODE - end */ + + /* if the request should be delivered entirely by copy in/out + * then throttle sends */ + if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_NORDMA) + sendreq->req_throttle_sends = true; + + mca_pml_bfo_send_request_copy_in_out(sendreq, + hdr->hdr_ack.hdr_send_offset, + sendreq->req_send.req_bytes_packed - + hdr->hdr_ack.hdr_send_offset); + + if (sendreq->req_state != 0) { + /* Typical receipt of an ACK message causes req_state to be + * decremented. However, a send request that started as an + * RGET request can become a RNDV. For example, when the + * receiver determines that its receive buffer is not + * contiguous and therefore cannot support the RGET + * protocol. A send request that started with the RGET + * protocol has req_state == 0 and as such should not be + * decremented. 
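+ *
+ * In short: req_state == 0 at this point identifies a request that began
+ * life as an RGET, so the ACK that merely signals the fallback to the
+ * RNDV protocol must not decrement it.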
+ */ + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + } +/* BFO FAILOVER CODE - begin */ + sendreq->req_acked = true; +/* BFO FAILOVER CODE - end */ + + if(send_request_pml_complete_check(sendreq) == false) + mca_pml_bfo_send_request_schedule(sendreq); + + return; +} + +void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_recv_request_t* recvreq; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG); + recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; +/* BFO FAILOVER CODE - begin */ + /* Drop any fragments if request is in error state. Do not want + * to initiate any more activity. */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + opal_output_verbose(20, mca_pml_bfo_output, + "FRAG: received: dropping because request in error, " + "PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d", + (uint16_t)recvreq->req_msgseq, + recvreq->remote_req_send.pval, + (void *)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, + (int)hdr->hdr_frag.hdr_frag_offset); + return; + } +/* BFO FAILOVER CODE - end */ + mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt); + + return; +} + + +void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_pml_bfo_send_request_t* sendreq; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT); + sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval; +/* BFO FAILOVER CODE - begin */ + /* Drop any fragments if request is in error state. Do not want + * to initiate any more activity. */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(20, mca_pml_bfo_output, + "PUT: received: dropping because request in error, " + "PML=%d, src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + return; + } +/* BFO FAILOVER CODE - end */ + mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma); + + return; +} + + +void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, + void* cbdata ) { + mca_btl_base_segment_t* segments = des->des_dst; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + mca_btl_base_descriptor_t* rdma; + + if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) { + return; + } + + bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN); + rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval; +/* BFO FAILOVER CODE - begin */ + if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) { + return; + } +/* BFO FAILOVER CODE - end */ + rdma->des_cbfunc(btl, NULL, rdma, + hdr->hdr_fin.hdr_fail ? 
OMPI_ERROR : OMPI_SUCCESS); + + return; +} + + + +#define PML_MAX_SEQ ~((mca_pml_sequence_t)0); + +static inline mca_pml_bfo_recv_request_t* get_posted_recv(opal_list_t *queue) +{ + if(opal_list_get_size(queue) == 0) + return NULL; + + return (mca_pml_bfo_recv_request_t*)opal_list_get_first(queue); +} + +static inline mca_pml_bfo_recv_request_t* get_next_posted_recv( + opal_list_t *queue, + mca_pml_bfo_recv_request_t* req) +{ + opal_list_item_t *i = opal_list_get_next((opal_list_item_t*)req); + + if(opal_list_get_end(queue) == i) + return NULL; + + return (mca_pml_bfo_recv_request_t*)i; +} + +static mca_pml_bfo_recv_request_t *match_incomming( + mca_pml_bfo_match_hdr_t *hdr, mca_pml_bfo_comm_t *comm, + mca_pml_bfo_comm_proc_t *proc) +{ + mca_pml_bfo_recv_request_t *specific_recv, *wild_recv; + mca_pml_sequence_t wild_recv_seq, specific_recv_seq; + int tag = hdr->hdr_tag; + + specific_recv = get_posted_recv(&proc->specific_receives); + wild_recv = get_posted_recv(&comm->wild_receives); + + wild_recv_seq = wild_recv ? + wild_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ; + specific_recv_seq = specific_recv ? + specific_recv->req_recv.req_base.req_sequence : PML_MAX_SEQ; + + /* they are equal only if both are PML_MAX_SEQ */ + while(wild_recv_seq != specific_recv_seq) { + mca_pml_bfo_recv_request_t **match; + opal_list_t *queue; + int req_tag; + mca_pml_sequence_t *seq; + + if (OPAL_UNLIKELY(wild_recv_seq < specific_recv_seq)) { + match = &wild_recv; + queue = &comm->wild_receives; + seq = &wild_recv_seq; + } else { + match = &specific_recv; + queue = &proc->specific_receives; + seq = &specific_recv_seq; + } + + req_tag = (*match)->req_recv.req_base.req_tag; + if(req_tag == tag || (req_tag == OMPI_ANY_TAG && tag >= 0)) { + opal_list_remove_item(queue, (opal_list_item_t*)(*match)); + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q, + &((*match)->req_recv.req_base), PERUSE_RECV); + return *match; + } + + *match = get_next_posted_recv(queue, *match); + *seq = (*match) ? 
            (*match)->req_recv.req_base.req_sequence : PML_MAX_SEQ;
+    }
+
+    return NULL;
+}
+
+static mca_pml_bfo_recv_request_t*
+match_one(mca_btl_base_module_t *btl,
+          mca_pml_bfo_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
+          size_t num_segments, ompi_communicator_t *comm_ptr,
+          mca_pml_bfo_comm_proc_t *proc,
+          mca_pml_bfo_recv_frag_t* frag)
+{
+    mca_pml_bfo_recv_request_t *match;
+    mca_pml_bfo_comm_t *comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
+
+    do {
+        match = match_incomming(hdr, comm, proc);
+
+        /* if match found, process data */
+        if(OPAL_LIKELY(NULL != match)) {
+            match->req_recv.req_base.req_proc = proc->ompi_proc;
+
+            if(OPAL_UNLIKELY(MCA_PML_REQUEST_PROBE == match->req_recv.req_base.req_type)) {
+                /* complete the probe */
+                mca_pml_bfo_recv_request_matched_probe(match, btl, segments,
+                                                       num_segments);
+                /* attempt to match actual request */
+                continue;
+            }
+
+            PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_MSG_MATCH_POSTED_REQ,
+                                    &(match->req_recv.req_base), PERUSE_RECV);
+            return match;
+        }
+
+        /* if no match found, place on unexpected queue */
+        append_frag_to_list(&proc->unexpected_frags, btl, hdr, segments,
+                            num_segments, frag);
+        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm_ptr,
+                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+        return NULL;
+    } while(true);
+}
+
+static mca_pml_bfo_recv_frag_t* check_cantmatch_for_match(mca_pml_bfo_comm_proc_t *proc)
+{
+    mca_pml_bfo_recv_frag_t *frag;
+
+    /* search the list for a fragment from the send with sequence
+     * number next_msg_seq_expected
+     */
+    for(frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match);
+        frag != (mca_pml_bfo_recv_frag_t*)opal_list_get_end(&proc->frags_cant_match);
+        frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_next(frag))
+    {
+        mca_pml_bfo_match_hdr_t* hdr = &frag->hdr.hdr_match;
+        /*
+         * If the message has the next expected seq from that proc...
+         */
+        if(hdr->hdr_seq != proc->expected_sequence)
+            continue;
+
+        opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag);
+        return frag;
+    }
+
+    return NULL;
+}
+
+/**
+ * RTS/CTS receive-side matching
+ *
+ * @param hdr list of parameters needed for matching
+ *            This list is also embedded in frag,
+ *            but this allows us to save a memory copy when
+ *            a match is made in this routine. (IN)
+ * @param frag pointer to receive fragment which we want
+ *             to match (IN/OUT). If a match is not made,
+ *             hdr is copied to frag.
+ * @param match_made parameter indicating if we matched frag/
+ *                   hdr (OUT)
+ * @param additional_matches if a match is made with frag, we
+ *                           may be able to match fragments that previously
+ *                           arrived out of order. If this is the
+ *                           case, the associated fragment descriptors are
+ *                           put on this list for further processing. (OUT)
+ *
+ * @return OMPI error code
+ *
+ * This routine is used to try to match a newly arrived message fragment
+ * to pre-posted receives. The following assumptions are made:
+ * - fragments may be received out of order
+ * - for long messages, i.e. more than one fragment, an RTS/CTS algorithm
+ *   is used.
+ * - 2nd and greater fragments include a receive descriptor pointer
+ * - fragments may be dropped
+ * - fragments may be corrupt
+ * - this routine may be called simultaneously by more than one thread
+ */
+static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
+                                        mca_pml_bfo_match_hdr_t *hdr,
+                                        mca_btl_base_segment_t* segments,
+                                        size_t num_segments,
+                                        int type)
+{
+    /* local variables */
+    uint16_t next_msg_seq_expected, frag_msg_seq;
+    ompi_communicator_t *comm_ptr;
+    mca_pml_bfo_recv_request_t *match = NULL;
+    mca_pml_bfo_comm_t *comm;
+    mca_pml_bfo_comm_proc_t *proc;
+    mca_pml_bfo_recv_frag_t* frag = NULL;
+
+    /* communicator pointer */
+    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
+    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
+        /* This is a special case. A message for a communicator that does
+         * not yet exist can happen. Instead of doing the matching, we
+         * temporarily add it to a pending queue in the PML. Later on,
+         * when the communicator is completely instantiated, this pending
+         * queue will be searched and all matching fragments moved to the
+         * right communicator.
+         */
+        append_frag_to_list( &mca_pml_bfo.non_existing_communicator_pending,
+                             btl, hdr, segments, num_segments, NULL );
+        return OMPI_SUCCESS;
+    }
+    comm = (mca_pml_bfo_comm_t *)comm_ptr->c_pml_comm;
+
+    /* source sequence number */
+    frag_msg_seq = hdr->hdr_seq;
+    proc = &comm->procs[hdr->hdr_src];
+
+    /**
+     * We generate the MSG_ARRIVED event as soon as the PML is aware of a
+     * matching fragment arrival, independent of whether it is received in
+     * the correct order or not. This allows the tools to figure out if
+     * messages are not received in the correct order (e.g. with multiple
+     * network interfaces).
+     */
+    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
+                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+    /* Get the next expected message sequence number. In a threaded run,
+     * lock to make sure that if another thread is processing a frag from
+     * the same message, a match is made only once. This also prevents
+     * other posted receives (for a pair of endpoints) from being
+     * processed, and potentially "losing" the fragment.
+     */
+    OPAL_THREAD_LOCK(&comm->matching_lock);
+/* BFO FAILOVER CODE - begin */
+    /* In case of network failover, we may get a message telling us to
+     * restart. In that case, we already have a pointer to the receive
+     * request in the header itself. */
+    if(OPAL_UNLIKELY(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_RESTART)) {
+        match = mca_pml_bfo_get_request(hdr);
+        if (NULL == match) {
+            return OMPI_SUCCESS;
+        }
+/* BFO FAILOVER CODE - end */
+    } else {
+
+        /* get sequence number of next message that can be processed */
+        next_msg_seq_expected = (uint16_t)proc->expected_sequence;
+        if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
+            goto wrong_seq;
+
+        /*
+         * This is the sequence number we were expecting,
+         * so we can try matching it to already posted
+         * receives.
+         */
+
+out_of_order_match:
+        /* We're now expecting the next sequence number. */
+        proc->expected_sequence++;
+
+        /**
+         * We generate the SEARCH_POSTED_QUEUE only when the message is received
+         * in the correct sequence. Otherwise, we delay the event generation until
+         * we reach the correct sequence number.
+         */
+        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
+                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
+
+        match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
+
+        /**
+         * The match is over.
We generate the SEARCH_POSTED_Q_END here, before going + * into the mca_pml_bfo_check_cantmatch_for_match so we can make a difference + * for the searching time for all messages. + */ + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr, + hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); + + /* release matching lock before processing fragment */ + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } + + if(OPAL_LIKELY(match)) { + switch(type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + mca_pml_bfo_recv_request_progress_match(match, btl, segments, num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + mca_pml_bfo_recv_request_progress_rndv(match, btl, segments, num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + mca_pml_bfo_recv_request_progress_rget(match, btl, segments, num_segments); + break; + } + + if(OPAL_UNLIKELY(frag)) + MCA_PML_BFO_RECV_FRAG_RETURN(frag); + } + + /* + * Now that new message has arrived, check to see if + * any fragments on the c_c_frags_cant_match list + * may now be used to form new matchs + */ + if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) { + OPAL_THREAD_LOCK(&comm->matching_lock); + if((frag = check_cantmatch_for_match(proc))) { + hdr = &frag->hdr.hdr_match; + segments = frag->segments; + num_segments = frag->num_segments; + btl = frag->btl; + type = hdr->hdr_common.hdr_type; + goto out_of_order_match; + } + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } + + return OMPI_SUCCESS; +wrong_seq: + /* + * This message comes after the next expected, so it + * is ahead of sequence. Save it for later. + */ +/* BFO FAILOVER CODE - begin */ + /* Check for duplicate messages. If message is duplicate, then just + * return as that essentially drops the message. */ + if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { + return OMPI_SUCCESS; + } +/* BFO FAILOVER CODE - end */ + + append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments, + num_segments, NULL); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/pml/bfo/pml_bfo_recvfrag.h b/ompi/mca/pml/bfo/pml_bfo_recvfrag.h new file mode 100644 index 0000000000..fc94975d7b --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_recvfrag.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ + +#ifndef MCA_PML_BFO_RECVFRAG_H +#define MCA_PML_BFO_RECVFRAG_H + +#include "ompi/mca/btl/btl.h" +#include "pml_bfo_hdr.h" + +BEGIN_C_DECLS + +struct mca_pml_bfo_buffer_t { + size_t len; + void * addr; +}; +typedef struct mca_pml_bfo_buffer_t mca_pml_bfo_buffer_t; + + +struct mca_pml_bfo_recv_frag_t { + ompi_free_list_item_t super; + mca_pml_bfo_hdr_t hdr; + size_t num_segments; + mca_btl_base_module_t* btl; + mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS]; + mca_pml_bfo_buffer_t buffers[MCA_BTL_DES_MAX_SEGMENTS]; + unsigned char addr[1]; +}; +typedef struct mca_pml_bfo_recv_frag_t mca_pml_bfo_recv_frag_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_frag_t); + + +#define MCA_PML_BFO_RECV_FRAG_ALLOC(frag,rc) \ +do { \ + ompi_free_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_pml_bfo.recv_frags, item, rc); \ + frag = (mca_pml_bfo_recv_frag_t*)item; \ +} while(0) + + +#define MCA_PML_BFO_RECV_FRAG_INIT(frag, hdr, segs, cnt, btl ) \ +do { \ + size_t i, _size; \ + mca_btl_base_segment_t* macro_segments = frag->segments; \ + mca_pml_bfo_buffer_t* buffers = frag->buffers; \ + unsigned char* _ptr = (unsigned char*)frag->addr; \ + /* init recv_frag */ \ + frag->btl = btl; \ + frag->hdr = *(mca_pml_bfo_hdr_t*)hdr; \ + frag->num_segments = 1; \ + _size = segs[0].seg_len; \ + for( i = 1; i < cnt; i++ ) { \ + _size += segs[i].seg_len; \ + } \ + /* copy over data */ \ + if(_size <= mca_pml_bfo.unexpected_limit ) { \ + macro_segments[0].seg_addr.pval = frag->addr; \ + } else { \ + buffers[0].len = _size; \ + buffers[0].addr = (char*) \ + mca_pml_bfo.allocator->alc_alloc( mca_pml_bfo.allocator, \ + buffers[0].len, \ + 0, NULL); \ + _ptr = (unsigned char*)(buffers[0].addr); \ + macro_segments[0].seg_addr.pval = buffers[0].addr; \ + } \ + macro_segments[0].seg_len = _size; \ + for( i = 0; i < cnt; i++ ) { \ + memcpy( _ptr, segs[i].seg_addr.pval, segs[i].seg_len); \ + _ptr += segs[i].seg_len; \ + } \ + } while(0) + + +#define MCA_PML_BFO_RECV_FRAG_RETURN(frag) \ +do { \ + if( frag->segments[0].seg_len > mca_pml_bfo.unexpected_limit ) { \ + /* return buffers */ \ + mca_pml_bfo.allocator->alc_free( mca_pml_bfo.allocator, \ + frag->buffers[0].addr ); \ + } \ + frag->num_segments = 0; \ + \ + /* return recv_frag */ \ + OMPI_FREE_LIST_RETURN(&mca_pml_bfo.recv_frags, \ + (ompi_free_list_item_t*)frag); \ + } while(0) + + +/** + * Callback from BTL on receipt of a recv_frag (match). + */ + +extern void mca_pml_bfo_recv_frag_callback_match( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + +/** + * Callback from BTL on receipt of a recv_frag (rndv). + */ + +extern void mca_pml_bfo_recv_frag_callback_rndv( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (rget). + */ + +extern void mca_pml_bfo_recv_frag_callback_rget( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + +/** + * Callback from BTL on receipt of a recv_frag (ack). + */ + +extern void mca_pml_bfo_recv_frag_callback_ack( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (frag). 
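+ *
+ * All of these callbacks are presumably attached to their header types
+ * when the PML is enabled, along the lines of (sketch only, assuming the
+ * usual bml_register() interface):
+ *
+ *   mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FRAG,
+ *                         mca_pml_bfo_recv_frag_callback_frag, NULL );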
+ */ + +extern void mca_pml_bfo_recv_frag_callback_frag( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (put). + */ + +extern void mca_pml_bfo_recv_frag_callback_put( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); +/** + * Callback from BTL on receipt of a recv_frag (fin). + */ + +extern void mca_pml_bfo_recv_frag_callback_fin( mca_btl_base_module_t *btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata ); + + +END_C_DECLS +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.c b/ompi/mca/pml/bfo/pml_bfo_recvreq.c new file mode 100644 index 0000000000..bdc247f495 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/bml/bml.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/mpool/mpool.h" +#include "pml_bfo_comm.h" +#include "pml_bfo_recvreq.h" +#include "pml_bfo_recvfrag.h" +#include "pml_bfo_sendreq.h" +#include "pml_bfo_rdmafrag.h" +#include "ompi/mca/bml/base/base.h" +#include "orte/mca/errmgr/errmgr.h" +#include "opal/util/arch.h" +#include "ompi/memchecker.h" +/* BFO FAILOVER CODE - begin */ +#include "pml_bfo_failover.h" +/* BFO FAILOVER CODE - end */ + +void mca_pml_bfo_recv_request_process_pending(void) +{ + mca_pml_bfo_recv_request_t* recvreq; + int i, s = (int)opal_list_get_size(&mca_pml_bfo.recv_pending); + + for(i = 0; i < s; i++) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + recvreq = (mca_pml_bfo_recv_request_t*) + opal_list_remove_first(&mca_pml_bfo.recv_pending); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + if( OPAL_UNLIKELY(NULL == recvreq) ) + break; + recvreq->req_pending = false; + if(OPAL_SOS_GET_ERROR_CODE(mca_pml_bfo_recv_request_schedule_exclusive(recvreq, NULL)) == + OMPI_ERR_OUT_OF_RESOURCE) + break; + } +} + +static int mca_pml_bfo_recv_request_free(struct ompi_request_t** request) +{ + mca_pml_bfo_recv_request_t* recvreq = *(mca_pml_bfo_recv_request_t**)request; + + assert( false == recvreq->req_recv.req_base.req_free_called ); + + OPAL_THREAD_LOCK(&ompi_request_lock); + recvreq->req_recv.req_base.req_free_called = true; + + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY, + &(recvreq->req_recv.req_base), PERUSE_RECV ); + + if( true == recvreq->req_recv.req_base.req_pml_complete ) { + /* make buffer defined when the request is compeleted, + and before releasing the objects. 
+ */
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            recvreq->req_recv.req_base.req_addr,
+                            recvreq->req_recv.req_base.req_count,
+                            recvreq->req_recv.req_base.req_datatype);
+        );
+
+        MCA_PML_BFO_RECV_REQUEST_RETURN( recvreq );
+    }
+
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+    *request = MPI_REQUEST_NULL;
+    return OMPI_SUCCESS;
+}
+
+static int mca_pml_bfo_recv_request_cancel(struct ompi_request_t* ompi_request, int complete)
+{
+    mca_pml_bfo_recv_request_t* request = (mca_pml_bfo_recv_request_t*)ompi_request;
+    mca_pml_bfo_comm_t* comm = request->req_recv.req_base.req_comm->c_pml_comm;
+
+    if( true == ompi_request->req_complete ) { /* way too late to cancel this one */
+        /*
+         * Receive request completed; make the user buffer accessible.
+         */
+        MEMCHECKER(
+            memchecker_call(&opal_memchecker_base_mem_defined,
+                            request->req_recv.req_base.req_addr,
+                            request->req_recv.req_base.req_count,
+                            request->req_recv.req_base.req_datatype);
+        );
+        return OMPI_SUCCESS;
+    }
+
+    /* The rest should be protected behind the match logic lock */
+    OPAL_THREAD_LOCK(&comm->matching_lock);
+    if( OMPI_ANY_TAG == ompi_request->req_status.MPI_TAG ) { /* the match has not already been done */
+        if( request->req_recv.req_base.req_peer == OMPI_ANY_SOURCE ) {
+            opal_list_remove_item( &comm->wild_receives, (opal_list_item_t*)request );
+        } else {
+            mca_pml_bfo_comm_proc_t* proc = comm->procs + request->req_recv.req_base.req_peer;
+            opal_list_remove_item(&proc->specific_receives, (opal_list_item_t*)request);
+        }
+        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_REMOVE_FROM_POSTED_Q,
+                                 &(request->req_recv.req_base), PERUSE_RECV );
+        /**
+         * Now that the PML is done with this request, we have to force
+         * pml_complete to true. Otherwise, the request will never be freed.
+         */
+        request->req_recv.req_base.req_pml_complete = true;
+    }
+    OPAL_THREAD_UNLOCK(&comm->matching_lock);
+
+    OPAL_THREAD_LOCK(&ompi_request_lock);
+    ompi_request->req_status._cancelled = true;
+    /* This macro will set req_complete to true so the MPI Test/Wait*
+     * functions on this request will be able to complete. As the status
+     * is marked as cancelled, the cancel state will be detected.
+     */
+    MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE(request);
+    OPAL_THREAD_UNLOCK(&ompi_request_lock);
+    /*
+     * Receive request cancelled; make the user buffer accessible.
+     */
+    MEMCHECKER(
+        memchecker_call(&opal_memchecker_base_mem_defined,
+                        request->req_recv.req_base.req_addr,
+                        request->req_recv.req_base.req_count,
+                        request->req_recv.req_base.req_datatype);
+    );
+    return OMPI_SUCCESS;
+}
+
+static void mca_pml_bfo_recv_request_construct(mca_pml_bfo_recv_request_t* request)
+{
+    request->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
+    request->req_recv.req_base.req_ompi.req_free = mca_pml_bfo_recv_request_free;
+    request->req_recv.req_base.req_ompi.req_cancel = mca_pml_bfo_recv_request_cancel;
+    request->req_rdma_cnt = 0;
+    OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
+}
+
+OBJ_CLASS_INSTANCE(
+    mca_pml_bfo_recv_request_t,
+    mca_pml_base_recv_request_t,
+    mca_pml_bfo_recv_request_construct,
+    NULL);
+
+
+/*
+ * Release resources.
+ */ + +static void mca_pml_bfo_recv_ctl_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ +/* BFO FAILOVER CODE - begin */ + if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { + mca_pml_bfo_check_recv_ctl_completion_status(btl, des, status); + } +/* BFO FAILOVER CODE - end */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/* + * Put operation has completed remotely - update request status + */ + +static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)des->des_cbdata; + size_t bytes_received = 0; + + if( OPAL_LIKELY(status == OMPI_SUCCESS) ) { + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_dst, des->des_dst_cnt, + 0, bytes_received ); + } + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); + + btl->btl_free(btl, des); +/* BFO FAILOVER CODE - begin */ + /* This can happen if a FIN message arrives after the request was + * marked in error. So, just drop the message. Note that the + * status field is not being checked. That is because the status + * field is the value returned in the FIN hdr.hdr_fail field and + * may be used for other things. Note that we allow the various + * fields to be updated in case this actually completes the + * request and the sending side thinks it is done. */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + opal_output_verbose(20, mca_pml_bfo_output, + "FIN: received on broken request, skipping, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + /* Even though in error, it still might complete. */ + recv_request_pml_complete_check(recvreq); + return; + } +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_bfo_recv_request_schedule(recvreq, bml_btl); + } + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/* + * + */ + +int mca_pml_bfo_recv_request_ack_send_btl( + ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, + uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, + bool nordma) +{ + mca_btl_base_descriptor_t* des; + mca_pml_bfo_ack_hdr_t* ack; + int rc; + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_ack_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* fill out header */ + ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval; + ack->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK; + ack->hdr_common.hdr_flags = nordma ? 
        MCA_PML_BFO_HDR_FLAGS_NORDMA : 0;
+    ack->hdr_src_req.lval = hdr_src_req;
+    ack->hdr_dst_req.pval = hdr_dst_req;
+    ack->hdr_send_offset = hdr_send_offset;
+
+    bfo_hdr_hton(ack, MCA_PML_BFO_HDR_TYPE_ACK, proc);
+
+    /* initialize descriptor */
+    des->des_cbfunc = mca_pml_bfo_recv_ctl_completion;
+/* BFO FAILOVER CODE - begin */
+    des->des_cbdata = (void *)proc;
+/* BFO FAILOVER CODE - end */
+
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_ACK);
+    if( OPAL_LIKELY( rc >= 0 ) ) {
+/* BFO FAILOVER CODE - begin */
+        if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
+            (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
+            ((mca_pml_bfo_recv_request_t *)hdr_dst_req)->req_events++;
+        }
+/* BFO FAILOVER CODE - end */
+        return OMPI_SUCCESS;
+    }
+    mca_bml_base_free(bml_btl, des);
+    return OMPI_ERR_OUT_OF_RESOURCE;
+}
+
+static int mca_pml_bfo_recv_request_ack(
+    mca_pml_bfo_recv_request_t* recvreq,
+    mca_pml_bfo_rendezvous_hdr_t* hdr,
+    size_t bytes_received)
+{
+    ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
+    mca_bml_base_endpoint_t* bml_endpoint = NULL;
+
+    bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
+
+    /* by default copy everything */
+    recvreq->req_send_offset = bytes_received;
+    if(hdr->hdr_msg_length > bytes_received) {
+        size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+        /*
+         * lookup request buffer to determine if memory is already
+         * registered.
+         */
+
+        if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == 0 &&
+           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_CONTIG &&
+           rdma_num != 0) {
+            unsigned char *base;
+            opal_convertor_get_current_pointer( &recvreq->req_recv.req_base.req_convertor, (void**)&(base) );
+
+            if(hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_BFO_HDR_FLAGS_PIN)
+                recvreq->req_rdma_cnt = mca_pml_bfo_rdma_btls(bml_endpoint,
+                        base, recvreq->req_recv.req_bytes_packed,
+                        recvreq->req_rdma );
+            else
+                recvreq->req_rdma_cnt = 0;
+
+            /* memory is already registered on both sides */
+            if (recvreq->req_rdma_cnt != 0) {
+                recvreq->req_send_offset = hdr->hdr_msg_length;
+                /* are rdma devices available for the long rdma protocol? */
+            } else if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length) {
+                /* use the convertor to figure out the rdma offset for this request */
+                recvreq->req_send_offset = hdr->hdr_msg_length -
+                    bml_endpoint->btl_pipeline_send_length;
+
+                if(recvreq->req_send_offset < bytes_received)
+                    recvreq->req_send_offset = bytes_received;
+
+                /* use the convertor to figure out the rdma offset for this
+                 * request */
+                opal_convertor_set_position(&recvreq->req_recv.req_base.req_convertor,
+                                            &recvreq->req_send_offset);
+
+                recvreq->req_rdma_cnt =
+                    mca_pml_bfo_rdma_pipeline_btls(bml_endpoint,
+                                                   recvreq->req_send_offset - bytes_received,
+                                                   recvreq->req_rdma);
+            }
+        }
+        /* nothing to send by copy in/out - no need to ack */
+        if(recvreq->req_send_offset == hdr->hdr_msg_length)
+            return OMPI_SUCCESS;
+    }
+    /* let the schedule function know there is no need to set the ACK flag */
+    recvreq->req_ack_sent = true;
+    return mca_pml_bfo_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
+                                             recvreq, recvreq->req_send_offset,
+                                             recvreq->req_send_offset == bytes_received);
+}
+
+
+/**
+ * Return the resources used by the RDMA operation.
+ */
+
+static void mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
+                                         struct mca_btl_base_endpoint_t* ep,
+                                         struct mca_btl_base_descriptor_t* des,
+                                         int status )
+{
+    mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata;
+
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl; + mca_bml_base_endpoint_t* bml_endpoint; + +/* BFO FAILOVER CODE - begin */ + if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { + recvreq->req_events--; + assert(recvreq->req_events >= 0); + } +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + /* Record the error and send RECVERRNOTIFY if necessary. */ + if (recvreq->req_errstate) { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA read: completion failed, error already seen, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + return; + } else { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA read: completion failed, sending RECVERRNOTIFY to sender, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status); + } + } +/* BFO FAILOVER CODE - end */ +/* BFO FAILOVER CODE - begin */ + /* See if the request has received a RNDVRESTARTNOTIFY */ + if( OPAL_UNLIKELY(recvreq->req_errstate)) { + if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA read: completion: recvreq has error, outstanding events=%d " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", + recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + if (0 == recvreq->req_events) { + mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, + status, btl); + } + } + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + return; + } +/* BFO FAILOVER CODE - end */ +/* BFO FAILOVER CODE - begin */ + /* Find back the bml_btl that this btl belongs to. If we cannot + * find it, then it may have been removed from underneath us, so + * find the next available one to send the FIN message on. 
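+ *
+ * Sketch of the lookup-with-fallback performed below:
+ *
+ *   bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
+ *   if (NULL == bml_btl)
+ *       bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);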
*/ + bml_endpoint = recvreq->req_recv.req_base.req_proc->proc_bml; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); + if( OPAL_UNLIKELY(NULL == bml_btl) ) { + opal_output_verbose(20, mca_pml_bfo_output, + "RDMA write completion: BML was removed from underneath us, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", + recvreq->req_msgseq, recvreq->req_restartseq, + (unsigned long)recvreq->remote_req_send.pval, + (unsigned long)recvreq, status, + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); + } +/* BFO FAILOVER CODE - end */ + + mca_pml_bfo_send_fin(recvreq->req_recv.req_base.req_proc, + bml_btl, + frag->rdma_hdr.hdr_rget.hdr_des, + des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->req_recv.req_base.req_comm->c_contextid, + recvreq->req_recv.req_base.req_comm->c_my_rank); + + /* is receive request complete */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); + recv_request_pml_complete_check(recvreq); + + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + + +/* + * + */ +int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag ) +{ + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl = frag->rdma_bml; + mca_btl_base_descriptor_t* descriptor; + size_t save_size = frag->rdma_length; + int rc; + + /* prepare descriptor */ + mca_bml_base_prepare_dst( bml_btl, + NULL, + &recvreq->req_recv.req_base.req_convertor, + MCA_BTL_NO_ORDER, + 0, + &frag->rdma_length, + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, + &descriptor ); + if( OPAL_UNLIKELY(NULL == descriptor) ) { + frag->rdma_length = save_size; + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + descriptor->des_src = frag->rdma_segs; + descriptor->des_src_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; + descriptor->des_cbfunc = mca_pml_bfo_rget_completion; + descriptor->des_cbdata = frag; + + PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), + frag->rdma_length, PERUSE_RECV); + + /* queue up get request */ + rc = mca_bml_base_get(bml_btl,descriptor); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + mca_bml_base_free(bml_btl, descriptor); + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, + (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } else { + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + } +/* BFO FAILOVER CODE - begin */ + if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && + (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { + recvreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + + return OMPI_SUCCESS; +} + + + + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. 
+ */ + +void mca_pml_bfo_recv_request_progress_frag( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_BFO_RECV_REQUEST_UNPACK */ + size_t data_offset = 0; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + bytes_received -= sizeof(mca_pml_bfo_frag_hdr_t); + data_offset = hdr->hdr_frag.hdr_frag_offset; + /* + * Make user buffer accessable(defined) before unpacking. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_BFO_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + sizeof(mca_pml_bfo_frag_hdr_t), + data_offset, + bytes_received, + bytes_delivered ); + /* + * Unpacking finished, make the user buffer unaccessable again. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + /* check completion status */ + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_bfo_recv_request_schedule(recvreq, NULL); + } +} + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. + */ + +void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0; + mca_pml_bfo_rget_hdr_t* hdr = (mca_pml_bfo_rget_hdr_t*)segments->seg_addr.pval; + mca_bml_base_endpoint_t* bml_endpoint = NULL; + mca_pml_bfo_rdma_frag_t* frag; + size_t i, size = 0; + int rc; + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; + +/* BFO FAILOVER CODE - begin */ + recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; +/* BFO FAILOVER CODE - end */ + MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); + + /* if receive buffer is not contiguous we can't just RDMA read into it, so + * fall back to copy in/out protocol. It is a pity because buffer on the + * sender side is already registered. 
 We need to be smarter here, perhaps
+     * do a couple of RDMA reads. */
+    if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) {
+        mca_pml_bfo_recv_request_ack(recvreq, &hdr->hdr_rndv, 0);
+        return;
+    }
+
+    MCA_PML_BFO_RDMA_FRAG_ALLOC(frag,rc);
+    if( OPAL_UNLIKELY(NULL == frag) ) {
+        /* GLB - FIX */
+        ORTE_ERROR_LOG(rc);
+        orte_errmgr.abort(-1, NULL);
+    }
+
+    /* lookup bml datastructures */
+    bml_endpoint = (mca_bml_base_endpoint_t*)recvreq->req_recv.req_base.req_proc->proc_bml;
+
+    /* allocate/initialize a fragment */
+    for(i = 0; i < hdr->hdr_seg_cnt; i++) {
+        frag->rdma_segs[i] = hdr->hdr_segs[i];
+#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
+        if ((recvreq->req_recv.req_base.req_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
+            (ompi_proc_local()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
+            size += opal_swap_bytes4(hdr->hdr_segs[i].seg_len);
+        } else
+#endif
+        {
+            size += hdr->hdr_segs[i].seg_len;
+        }
+    }
+/* BFO FAILOVER CODE - begin */
+    frag->rdma_btl = btl;
+/* BFO FAILOVER CODE - end */
+    frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
+    if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
+        opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+    frag->rdma_hdr.hdr_rget = *hdr;
+    frag->rdma_req = recvreq;
+    frag->rdma_ep = bml_endpoint;
+    frag->rdma_length = size;
+    frag->rdma_state = MCA_PML_BFO_RDMA_GET;
+    frag->reg = NULL;
+
+    mca_pml_bfo_recv_request_get_frag(frag);
+    return;
+}
+
+/*
+ * Update the recv request status to reflect the number of bytes
+ * received and actually delivered to the application.
+ */
+
+void mca_pml_bfo_recv_request_progress_rndv( mca_pml_bfo_recv_request_t* recvreq,
+                                             mca_btl_base_module_t* btl,
+                                             mca_btl_base_segment_t* segments,
+                                             size_t num_segments )
+{
+    size_t bytes_received = 0;
+    size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_BFO_RECV_REQUEST_UNPACK */
+    size_t data_offset = 0;
+    mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
+
+    MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments,
+                                        0, bytes_received );
+
+    bytes_received -= sizeof(mca_pml_bfo_rendezvous_hdr_t);
+    recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
+    recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
+    recvreq->req_rdma_offset = bytes_received;
+    MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match);
+    mca_pml_bfo_recv_request_ack(recvreq, &hdr->hdr_rndv, bytes_received);
+    /**
+     * The PUT protocol does not attach any data to the original request.
+     * Therefore, we might want to avoid unpacking if there is nothing to
+     * unpack.
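+     * In other words, bytes_received may legitimately be 0 at this point,
+     * in which case the unpack below is skipped entirely.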
+ */ + if( 0 < bytes_received ) { + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_BFO_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + data_offset, + bytes_received, + bytes_delivered ); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + } + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + /* check completion status */ + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_bfo_recv_request_schedule(recvreq, NULL); + } +} + +/* + * Update the recv request status to reflect the number of bytes + * received and actually delivered to the application. + */ +void mca_pml_bfo_recv_request_progress_match( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_received = 0, data_offset = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_BFO_RECV_REQUEST_UNPACK */ + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + 0, bytes_received ); + bytes_received -= OMPI_PML_BFO_MATCH_HDR_LEN; + recvreq->req_recv.req_bytes_packed = bytes_received; + + MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match); + /* + * Make user buffer accessable(defined) before unpacking. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + MCA_PML_BFO_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + OMPI_PML_BFO_MATCH_HDR_LEN, + data_offset, + bytes_received, + bytes_delivered); + /* + * Unpacking finished, make the user buffer unaccessable again. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + recvreq->req_recv.req_base.req_addr, + recvreq->req_recv.req_base.req_count, + recvreq->req_recv.req_base.req_datatype); + ); + + /* + * No need for atomic here, as we know there is only one fragment + * for this request. 
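+ *
+ * (Hence the plain "req_bytes_received += bytes_received" below, where the
+ * multi-fragment paths above had to use OPAL_THREAD_ADD_SIZE_T().)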
+ */ + recvreq->req_bytes_received += bytes_received; + recv_request_pml_complete(recvreq); +} + + +/** + * Handle completion of a probe request + */ + +void mca_pml_bfo_recv_request_matched_probe( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments ) +{ + size_t bytes_packed = 0; + mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; + + switch(hdr->hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( segments, num_segments, + OMPI_PML_BFO_MATCH_HDR_LEN, + bytes_packed ); + break; + + case MCA_PML_BFO_HDR_TYPE_RNDV: + case MCA_PML_BFO_HDR_TYPE_RGET: + + bytes_packed = hdr->hdr_rndv.hdr_msg_length; + break; + } + + /* set completion status */ + recvreq->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_match.hdr_tag; + recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_match.hdr_src; + recvreq->req_bytes_received = bytes_packed; + recvreq->req_bytes_expected = bytes_packed; + recv_request_pml_complete(recvreq); +} + + +/* + * Schedule RDMA protocol. + * +*/ + +int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq, + mca_btl_base_module_t *start_btl ) +{ + mca_bml_base_btl_t* bml_btl; + int num_tries = recvreq->req_rdma_cnt, num_fail = 0; + size_t i, prev_bytes_remaining = 0; + size_t bytes_remaining = recvreq->req_send_offset - + recvreq->req_rdma_offset; + + /* if starting bml_btl is provided schedule next fragment on it first */ + if(start_btl != NULL) { + for(i = 0; i < recvreq->req_rdma_cnt; i++) { + if(recvreq->req_rdma[i].bml_btl->btl != start_btl) + continue; + /* something left to be send? */ + if( OPAL_LIKELY(recvreq->req_rdma[i].length) ) + recvreq->req_rdma_idx = i; + break; + } + } + + while(bytes_remaining > 0 && + recvreq->req_pipeline_depth < mca_pml_bfo.recv_pipeline_depth) { + size_t hdr_size; + size_t size; + mca_pml_bfo_rdma_hdr_t* hdr; + mca_btl_base_descriptor_t* dst; + mca_btl_base_descriptor_t* ctl; + mca_mpool_base_registration_t * reg = NULL; + mca_btl_base_module_t* btl; + int rc, rdma_idx; + + if(prev_bytes_remaining == bytes_remaining) { + if(++num_fail == num_tries) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + if(false == recvreq->req_pending) { + opal_list_append(&mca_pml_bfo.recv_pending, + (opal_list_item_t*)recvreq); + recvreq->req_pending = true; + } + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + } else { + num_fail = 0; + prev_bytes_remaining = bytes_remaining; + } + + do { + rdma_idx = recvreq->req_rdma_idx; + bml_btl = recvreq->req_rdma[rdma_idx].bml_btl; + reg = recvreq->req_rdma[rdma_idx].btl_reg; + size = recvreq->req_rdma[rdma_idx].length; + if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) + recvreq->req_rdma_idx = 0; + } while(!size); + btl = bml_btl->btl; + + /* makes sure that we don't exceed BTL max rdma size + * if memory is not pinned already */ + if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) && + (size > btl->btl_rdma_pipeline_frag_size)) { + size = btl->btl_rdma_pipeline_frag_size; + } + + /* take lock to protect converter against concurrent access + * from unpack */ + OPAL_THREAD_LOCK(&recvreq->lock); + opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, + &recvreq->req_rdma_offset ); + + /* prepare a descriptor for RDMA */ + mca_bml_base_prepare_dst(bml_btl, reg, + &recvreq->req_recv.req_base.req_convertor, + MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &dst); + 
OPAL_THREAD_UNLOCK(&recvreq->lock); + + if(OPAL_UNLIKELY(dst == NULL)) { + continue; + } + + dst->des_cbfunc = mca_pml_bfo_put_completion; + dst->des_cbdata = recvreq; + + /* prepare a descriptor for rdma control message */ + hdr_size = sizeof(mca_pml_bfo_rdma_hdr_t); + if(dst->des_dst_cnt > 1) { + hdr_size += (sizeof(mca_btl_base_segment_t) * + (dst->des_dst_cnt-1)); + } + + mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); + + if( OPAL_UNLIKELY(NULL == ctl) ) { + mca_bml_base_free(bml_btl,dst); + continue; + } + ctl->des_cbfunc = mca_pml_bfo_recv_ctl_completion; +/* BFO FAILOVER CODE - begin */ + ctl->des_cbdata = recvreq; +/* BFO FAILOVER CODE - end */ + + /* fill in rdma header */ + hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_src->seg_addr.pval; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_PUT; + hdr->hdr_common.hdr_flags = + (!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0; + hdr->hdr_req = recvreq->remote_req_send; +/* BFO FAILOVER CODE - begin */ + hdr->hdr_dst_req.pval = recvreq; /* only needed in the first put message */ +/* BFO FAILOVER CODE - end */ + hdr->hdr_des.pval = dst; + hdr->hdr_rdma_offset = recvreq->req_rdma_offset; + hdr->hdr_seg_cnt = dst->des_dst_cnt; + + for( i = 0; i < dst->des_dst_cnt; i++ ) { + hdr->hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval); + hdr->hdr_segs[i].seg_len = dst->des_dst[i].seg_len; + hdr->hdr_segs[i].seg_key.key64 = dst->des_dst[i].seg_key.key64; + } + + if(!recvreq->req_ack_sent) + recvreq->req_ack_sent = true; + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc); + + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), size, + PERUSE_RECV); + + /* send rdma request to peer */ + rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_BFO_HDR_TYPE_PUT); + if( OPAL_LIKELY( rc >= 0 ) ) { +/* BFO FAILOVER CODE - begin */ + if ((btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && + (ctl->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { + recvreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + /* update request state */ + recvreq->req_rdma_offset += size; + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); + recvreq->req_rdma[rdma_idx].length -= size; + bytes_remaining -= size; + } else { + mca_bml_base_free(bml_btl,ctl); + mca_bml_base_free(bml_btl,dst); + } + } + + return OMPI_SUCCESS; +} + +#define IS_PROB_REQ(R) \ + ((MCA_PML_REQUEST_IPROBE == (R)->req_recv.req_base.req_type) || \ + (MCA_PML_REQUEST_PROBE == (R)->req_recv.req_base.req_type)) + +static inline void append_recv_req_to_queue(opal_list_t *queue, + mca_pml_bfo_recv_request_t *req) +{ + if(OPAL_UNLIKELY(req->req_recv.req_base.req_type == MCA_PML_REQUEST_IPROBE)) + return; + + opal_list_append(queue, (opal_list_item_t*)req); + + /** + * We don't want to generate this kind of event for MPI_Probe. Hopefully, + * the compiler will optimize out the empty if loop in the case where PERUSE + * support is not required by the user. + */ + if(req->req_recv.req_base.req_type != MCA_PML_REQUEST_PROBE) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_INSERT_IN_POSTED_Q, + &(req->req_recv.req_base), PERUSE_RECV); + } +} + +/* + * this routine tries to match a posted receive. If a match is found, + * it places the request in the appropriate matched receive list. This + * function has to be called with the communicator matching lock held. 
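+ * For example, the caller-side pattern (as used by + * mca_pml_bfo_recv_req_start() below) is: + * + *   OPAL_THREAD_LOCK(&comm->matching_lock); + *   frag = recv_req_match_specific_proc(req, proc); + *   ... queue the request on proc->specific_receives if frag is NULL ... + *   OPAL_THREAD_UNLOCK(&comm->matching_lock);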
+*/ +static mca_pml_bfo_recv_frag_t* +recv_req_match_specific_proc( const mca_pml_bfo_recv_request_t *req, + mca_pml_bfo_comm_proc_t *proc ) +{ + opal_list_t* unexpected_frags = &proc->unexpected_frags; + opal_list_item_t *i; + mca_pml_bfo_recv_frag_t* frag; + int tag = req->req_recv.req_base.req_tag; + + if(opal_list_get_size(unexpected_frags) == 0) + return NULL; + + if( OMPI_ANY_TAG == tag ) { + for (i = opal_list_get_first(unexpected_frags); + i != opal_list_get_end(unexpected_frags); + i = opal_list_get_next(i)) { + frag = (mca_pml_bfo_recv_frag_t*)i; + + if( frag->hdr.hdr_match.hdr_tag >= 0 ) + return frag; + } + } else { + for (i = opal_list_get_first(unexpected_frags); + i != opal_list_get_end(unexpected_frags); + i = opal_list_get_next(i)) { + frag = (mca_pml_bfo_recv_frag_t*)i; + + if( frag->hdr.hdr_match.hdr_tag == tag ) + return frag; + } + } + return NULL; +} + +/* + * this routine is used to try to match a wild posted receive - where + * wild is determined by the value assigned to the source process +*/ +static mca_pml_bfo_recv_frag_t* +recv_req_match_wild( mca_pml_bfo_recv_request_t* req, + mca_pml_bfo_comm_proc_t **p) +{ + mca_pml_bfo_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + mca_pml_bfo_comm_proc_t* proc = comm->procs; + size_t proc_count = comm->num_procs, i; + + /* + * Loop over all the outstanding messages to find one that matches. + * There is an outer loop over lists of messages from each + * process, then an inner loop over the messages from the + * process. + */ + for (i = 0; i < proc_count; i++) { + mca_pml_bfo_recv_frag_t* frag; + + /* loop over messages from the current proc */ + if((frag = recv_req_match_specific_proc(req, &proc[i]))) { + *p = &proc[i]; + req->req_recv.req_base.req_proc = proc[i].ompi_proc; + prepare_recv_req_converter(req); + return frag; /* match found */ + } + } + + *p = NULL; + return NULL; +} + + +void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req) +{ + mca_pml_bfo_comm_t* comm = req->req_recv.req_base.req_comm->c_pml_comm; + mca_pml_bfo_comm_proc_t* proc; + mca_pml_bfo_recv_frag_t* frag; + opal_list_t *queue; + mca_pml_bfo_hdr_t* hdr; + + /* init/re-init the request */ + req->req_lock = 0; + req->req_pipeline_depth = 0; + req->req_bytes_received = 0; + req->req_bytes_expected = 0; + /* What about req_rdma_cnt ? */ +/* BFO FAILOVER CODE - begin */ + req->req_rdma_cnt = 0; + req->req_events = 0; + req->req_restartseq = 0; + req->req_errstate = 0; +/* BFO FAILOVER CODE - end */ + req->req_rdma_idx = 0; + req->req_pending = false; + req->req_ack_sent = false; + + MCA_PML_BASE_RECV_START(&req->req_recv.req_base); + + OPAL_THREAD_LOCK(&comm->matching_lock); + /** + * The lapse of time between the ACTIVATE event and the SEARCH_UNEX one + * includes the cost of the request lock. + */ + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_BEGIN, + &(req->req_recv.req_base), PERUSE_RECV); + + /* assign sequence number */ + req->req_recv.req_base.req_sequence = comm->recv_sequence++; + + /* attempt to match posted recv */ + if(req->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) { + frag = recv_req_match_wild(req, &proc); + queue = &comm->wild_receives; +#if !OPAL_ENABLE_HETEROGENEOUS_SUPPORT + /* As we are in a homogeneous environment we know that all remote + * architectures are exactly the same as the local one. Therefore, + * we can safely construct the convertor based on the proc + * information of rank 0.
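+ * (In a heterogeneous build this shortcut is skipped and the convertor + * is instead prepared in recv_req_matched() once the matching fragment + * identifies the actual sender.)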
+ */ + if( NULL == frag ) { + req->req_recv.req_base.req_proc = ompi_proc_local_proc; + prepare_recv_req_converter(req); + } +#endif /* !OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + } else { + proc = &comm->procs[req->req_recv.req_base.req_peer]; + req->req_recv.req_base.req_proc = proc->ompi_proc; + frag = recv_req_match_specific_proc(req, proc); + queue = &proc->specific_receives; + /* wildcard recv will be prepared on match */ + prepare_recv_req_converter(req); + } + + if(OPAL_UNLIKELY(NULL == frag)) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + /* We didn't find any matches. Record this irecv so we can match + it when the message comes in. */ + append_recv_req_to_queue(queue, req); + req->req_match_received = false; + OPAL_THREAD_UNLOCK(&comm->matching_lock); + } else { + if(OPAL_LIKELY(!IS_PROB_REQ(req))) { + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_MATCH_UNEX, + &(req->req_recv.req_base), PERUSE_RECV); + + hdr = (mca_pml_bfo_hdr_t*)frag->segments->seg_addr.pval; + PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_REMOVE_FROM_UNEX_Q, + req->req_recv.req_base.req_comm, + hdr->hdr_match.hdr_src, + hdr->hdr_match.hdr_tag, + PERUSE_RECV); + + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_SEARCH_UNEX_Q_END, + &(req->req_recv.req_base), PERUSE_RECV); + + opal_list_remove_item(&proc->unexpected_frags, + (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&comm->matching_lock); + + switch(hdr->hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_MATCH: + mca_pml_bfo_recv_request_progress_match(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RNDV: + mca_pml_bfo_recv_request_progress_rndv(req, frag->btl, frag->segments, + frag->num_segments); + break; + case MCA_PML_BFO_HDR_TYPE_RGET: + mca_pml_bfo_recv_request_progress_rget(req, frag->btl, frag->segments, + frag->num_segments); + break; + default: + assert(0); + } + + MCA_PML_BFO_RECV_FRAG_RETURN(frag); + + } else { + OPAL_THREAD_UNLOCK(&comm->matching_lock); + mca_pml_bfo_recv_request_matched_probe(req, frag->btl, + frag->segments, frag->num_segments); + } + } +} diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.h b/ompi/mca/pml/bfo/pml_bfo_recvreq.h new file mode 100644 index 0000000000..4e942f3a0c --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.h @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef OMPI_PML_BFO_RECV_REQUEST_H +#define OMPI_PML_BFO_RECV_REQUEST_H + +#include "pml_bfo.h" +#include "pml_bfo_rdma.h" +#include "pml_bfo_rdmafrag.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/pml/bfo/pml_bfo_comm.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/pml/base/pml_base_recvreq.h" + +/* BFO FAILOVER CODE - begin */ +#define RECVREQ_RECVERRSENT 0x01 +#define RECVREQ_RNDVRESTART_RECVED 0x02 +#define RECVREQ_RNDVRESTART_ACKED 0x04 +/* BFO FAILOVER CODE - end */ + +BEGIN_C_DECLS + +struct mca_pml_bfo_recv_request_t { + mca_pml_base_recv_request_t req_recv; + ompi_ptr_t remote_req_send; +/* BFO FAILOVER CODE - begin */ + int32_t req_msgseq; /* PML sequence number */ + int32_t req_events; /* number of outstanding events on request */ + int32_t req_restartseq; /* sequence number of restarted request */ + int32_t req_errstate; /* state of request if in error */ +/* BFO FAILOVER CODE - end */ + int32_t req_lock; + size_t req_pipeline_depth; + size_t req_bytes_received; /**< amount of data transferred into the user buffer */ + size_t req_bytes_expected; /**< local size of the data as suggested by the user */ + size_t req_rdma_offset; + size_t req_send_offset; + uint32_t req_rdma_cnt; + uint32_t req_rdma_idx; + bool req_pending; + bool req_ack_sent; /**< whether ack was sent to the sender */ + bool req_match_received; /**< Prevent the request from being completed prematurely */ + opal_mutex_t lock; + mca_pml_bfo_com_btl_t req_rdma[1]; +}; +typedef struct mca_pml_bfo_recv_request_t mca_pml_bfo_recv_request_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_recv_request_t); + +static inline bool lock_recv_request(mca_pml_bfo_recv_request_t *recvreq) +{ + return OPAL_THREAD_ADD32(&recvreq->req_lock, 1) == 1; +} + +static inline bool unlock_recv_request(mca_pml_bfo_recv_request_t *recvreq) +{ + return OPAL_THREAD_ADD32(&recvreq->req_lock, -1) == 0; +} + +/** + * Allocate a recv request from the module's free list. + * + * @param rc (OUT) OMPI_SUCCESS or error status on failure. + * @return Receive request. + */ +#define MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc) \ +do { \ + ompi_free_list_item_t* item; \ + rc = OMPI_SUCCESS; \ + OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc); \ + recvreq = (mca_pml_bfo_recv_request_t*)item; \ +} while(0) + + +/** + * Initialize a receive request with call parameters. + * + * @param request (IN) Receive request. + * @param addr (IN) User buffer. + * @param count (IN) Number of elements of indicated datatype. + * @param datatype (IN) User defined datatype. + * @param src (IN) Source rank w/in the communicator. + * @param tag (IN) User defined tag. + * @param comm (IN) Communicator. + * @param persistent (IN) Is this a persistent request. + */ +#define MCA_PML_BFO_RECV_REQUEST_INIT( request, \ + addr, \ + count, \ + datatype, \ + src, \ + tag, \ + comm, \ + persistent) \ +do { \ + MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv, \ + addr, \ + count, \ + datatype, \ + src, \ + tag, \ + comm, \ + persistent); \ +} while(0) + +/** + * Mark the request as completed at MPI level for internal purposes. + * + * @param recvreq (IN) Receive request.
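+ * + * For orientation, a minimal sketch of how the macros in this header + * combine in a caller such as the irecv path (hypothetical snippet; + * error handling elided; MCA_PML_BFO_RECV_REQUEST_START is defined + * further below): + * + *   mca_pml_bfo_recv_request_t *recvreq; + *   int rc; + *   MCA_PML_BFO_RECV_REQUEST_ALLOC(recvreq, rc); + *   if (OPAL_UNLIKELY(NULL == recvreq)) return rc; + *   MCA_PML_BFO_RECV_REQUEST_INIT(recvreq, buf, count, datatype, + *                                 src, tag, comm, false); + *   MCA_PML_BFO_RECV_REQUEST_START(recvreq);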
+ */ +#define MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE( recvreq ) \ + do { \ + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ + &(recvreq->req_recv.req_base), PERUSE_RECV ); \ + ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \ + } while (0) + +/* + * Free the PML receive request + */ +#define MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq) \ + { \ + MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \ + OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests, \ + (ompi_free_list_item_t*)(recvreq)); \ + } + +/** + * Complete a receive request. The request structure cannot be accessed + * any more after calling this function. + * + * @param recvreq (IN) Receive request. + */ +static inline void +recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq) +{ + size_t i; + + assert(false == recvreq->req_recv.req_base.req_pml_complete); + + if(recvreq->req_recv.req_bytes_packed > 0) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END, + &recvreq->req_recv.req_base, PERUSE_RECV ); + } + + for(i = 0; i < recvreq->req_rdma_cnt; i++) { + mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg; + if( NULL != btl_reg && btl_reg->mpool != NULL) { + btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); + } + } + recvreq->req_rdma_cnt = 0; +/* BFO FAILOVER CODE - begin */ + /* Initialize to a value that indicates it is invalid */ + recvreq->req_msgseq = 42; +/* BFO FAILOVER CODE - end */ + + OPAL_THREAD_LOCK(&ompi_request_lock); + if(true == recvreq->req_recv.req_base.req_free_called) { + MCA_PML_BFO_RECV_REQUEST_RETURN(recvreq); + } else { + /* initialize request status */ + recvreq->req_recv.req_base.req_pml_complete = true; + recvreq->req_recv.req_base.req_ompi.req_status._count = + (int)recvreq->req_bytes_received; + if (recvreq->req_recv.req_bytes_packed > recvreq->req_bytes_expected) { + recvreq->req_recv.req_base.req_ompi.req_status._count = + (int)recvreq->req_recv.req_bytes_packed; + recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR = + MPI_ERR_TRUNCATE; + } + MCA_PML_BFO_RECV_REQUEST_MPI_COMPLETE(recvreq); + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); +} + +static inline bool +recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq) +{ +#if OPAL_HAVE_THREAD_SUPPORT + opal_atomic_rmb(); +#endif + if(recvreq->req_match_received && + recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed && + (0 == recvreq->req_events) && lock_recv_request(recvreq)) { + recv_request_pml_complete(recvreq); + return true; + } + + return false; +} + +extern void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req); +#define MCA_PML_BFO_RECV_REQUEST_START(r) mca_pml_bfo_recv_req_start(r) + +static inline void prepare_recv_req_converter(mca_pml_bfo_recv_request_t *req) +{ + if( req->req_recv.req_base.req_datatype->super.size | req->req_recv.req_base.req_count ) { + opal_convertor_copy_and_prepare_for_recv( + req->req_recv.req_base.req_proc->proc_convertor, + &(req->req_recv.req_base.req_datatype->super), + req->req_recv.req_base.req_count, + req->req_recv.req_base.req_addr, + 0, + &req->req_recv.req_base.req_convertor); + opal_convertor_get_unpacked_size(&req->req_recv.req_base.req_convertor, + &req->req_bytes_expected); + } +} + +#define MCA_PML_BFO_RECV_REQUEST_MATCHED(request, hdr) \ + recv_req_matched(request, hdr) + +static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req, + mca_pml_bfo_match_hdr_t *hdr) +{ + req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src; + 
req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag; + req->req_match_received = true; +/* BFO FAILOVER CODE - begin */ + req->req_msgseq = hdr->hdr_seq; +/* BFO FAILOVER CODE - end */ +#if OPAL_HAVE_THREAD_SUPPORT + opal_atomic_wmb(); +#endif + if(req->req_recv.req_bytes_packed > 0) { +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if(MPI_ANY_SOURCE == req->req_recv.req_base.req_peer) { + /* non-wildcard requests were prepared at post time; prepare the + * wildcard convertor now that the source is known */ + prepare_recv_req_converter(req); + } +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_XFER_BEGIN, + &req->req_recv.req_base, PERUSE_RECV); + } +} + + +/** + * Unpack the received segments into the user buffer, starting at + * seg_offset within the segment list and data_offset within the message. + */ + +#define MCA_PML_BFO_RECV_REQUEST_UNPACK( request, \ + segments, \ + num_segments, \ + seg_offset, \ + data_offset, \ + bytes_received, \ + bytes_delivered) \ +do { \ + bytes_delivered = 0; \ + if(request->req_recv.req_bytes_packed > 0) { \ + struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS]; \ + uint32_t iov_count = 0; \ + size_t max_data = bytes_received; \ + size_t n, offset = seg_offset; \ + mca_btl_base_segment_t* segment = segments; \ + \ + OPAL_THREAD_LOCK(&request->lock); \ + for( n = 0; n < num_segments; n++, segment++ ) { \ + if(offset >= segment->seg_len) { \ + offset -= segment->seg_len; \ + } else { \ + iov[iov_count].iov_len = segment->seg_len - offset; \ + iov[iov_count].iov_base = (IOVBASE_TYPE*) \ + ((unsigned char*)segment->seg_addr.pval + offset); \ + iov_count++; \ + offset = 0; \ + } \ + } \ + PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE, \ + &(request->req_recv.req_base), max_data, \ + PERUSE_RECV); \ + opal_convertor_set_position( &(request->req_recv.req_base.req_convertor), \ + &data_offset ); \ + opal_convertor_unpack( &(request)->req_recv.req_base.req_convertor, \ + iov, \ + &iov_count, \ + &max_data ); \ + bytes_delivered = max_data; \ + OPAL_THREAD_UNLOCK(&request->lock); \ + } \ +} while (0) + + +/** + * Progress a MATCH fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_match( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Progress a FRAG fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_frag( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Progress a RNDV fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_rndv( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Progress a RGET fragment for this request. + */ + +void mca_pml_bfo_recv_request_progress_rget( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Handle completion of a matched probe. + */ + +void mca_pml_bfo_recv_request_matched_probe( + mca_pml_bfo_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments); + +/** + * Run one pass of RDMA scheduling for this request. + */ + +int mca_pml_bfo_recv_request_schedule_once( + mca_pml_bfo_recv_request_t* req, mca_btl_base_module_t* start_btl); + +static inline int mca_pml_bfo_recv_request_schedule_exclusive( + mca_pml_bfo_recv_request_t* req, + mca_bml_base_btl_t* start_bml_btl) +{ + int rc; + + do { + rc = mca_pml_bfo_recv_request_schedule_once(req, start_bml_btl ? 
start_bml_btl->btl : NULL); + if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + break; + } while(!unlock_recv_request(req)); + + if(OMPI_SUCCESS == rc) + recv_request_pml_complete_check(req); + + return rc; +} + +static inline void mca_pml_bfo_recv_request_schedule( + mca_pml_bfo_recv_request_t* req, + mca_bml_base_btl_t* start_bml_btl) +{ + if(!lock_recv_request(req)) + return; + + (void)mca_pml_bfo_recv_request_schedule_exclusive(req, start_bml_btl); +} + +#define MCA_PML_BFO_ADD_ACK_TO_PENDING(P, S, D, O) \ + do { \ + mca_pml_bfo_pckt_pending_t *_pckt; \ + int _rc; \ + \ + MCA_PML_BFO_PCKT_PENDING_ALLOC(_pckt,_rc); \ + _pckt->hdr.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_ACK; \ + _pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \ + _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \ + _pckt->hdr.hdr_ack.hdr_send_offset = (O); \ + _pckt->proc = (P); \ + _pckt->bml_btl = NULL; \ + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); \ + opal_list_append(&mca_pml_bfo.pckt_pending, \ + (opal_list_item_t*)_pckt); \ + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); \ + } while(0) + +int mca_pml_bfo_recv_request_ack_send_btl(ompi_proc_t* proc, + mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, + uint64_t hdr_rdma_offset, bool nordma); + +static inline int mca_pml_bfo_recv_request_ack_send(ompi_proc_t* proc, + uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, + bool nordma) +{ + size_t i; + mca_bml_base_btl_t* bml_btl; + mca_bml_base_endpoint_t* endpoint = + (mca_bml_base_endpoint_t*)proc->proc_bml; + + for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + if(mca_pml_bfo_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, + hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS) + return OMPI_SUCCESS; + } + + MCA_PML_BFO_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req, + hdr_send_offset); + + return OMPI_ERR_OUT_OF_RESOURCE; +} + +int mca_pml_bfo_recv_request_get_frag(mca_pml_bfo_rdma_frag_t* frag); + +/* This function tries to continue a recvreq that is stuck due to resource + * unavailability. The recvreq is added to the recv_pending list if scheduling + * of the put operation cannot be accomplished for some reason. */ +void mca_pml_bfo_recv_request_process_pending(void); + +END_C_DECLS + +#endif + diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.c b/ompi/mca/pml/bfo/pml_bfo_sendreq.c new file mode 100644 index 0000000000..7c676966ba --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.c @@ -0,0 +1,1595 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "ompi_config.h" +#include "opal/prefetch.h" +#include "ompi/constants.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "orte/mca/errmgr/errmgr.h" +#include "ompi/mca/mpool/mpool.h" +#include "pml_bfo.h" +#include "pml_bfo_hdr.h" +#include "pml_bfo_sendreq.h" +#include "pml_bfo_rdmafrag.h" +#include "pml_bfo_recvreq.h" +/* BFO FAILOVER CODE - begin */ +#include "pml_bfo_failover.h" +/* BFO FAILOVER CODE - end */ +#include "ompi/mca/bml/base/base.h" +#include "ompi/memchecker.h" + +OBJ_CLASS_INSTANCE(mca_pml_bfo_send_range_t, ompi_free_list_item_t, + NULL, NULL); + +void mca_pml_bfo_send_request_process_pending(struct mca_btl_base_module_t *btl) +{ + int i, s = opal_list_get_size(&mca_pml_bfo.send_pending); + + /* advance pending requests */ + for(i = 0; i < s; i++) { + mca_pml_bfo_send_pending_t pending_type = MCA_PML_BFO_SEND_PENDING_NONE; + mca_pml_bfo_send_request_t* sendreq; + mca_bml_base_btl_t *send_dst; + + sendreq = get_request_from_send_pending(&pending_type); + if(OPAL_UNLIKELY(NULL == sendreq)) + break; + + switch(pending_type) { + case MCA_PML_BFO_SEND_PENDING_SCHEDULE: + if(OPAL_SOS_GET_ERROR_CODE(mca_pml_bfo_send_request_schedule_exclusive(sendreq)) == + OMPI_ERR_OUT_OF_RESOURCE) { + return; + } + break; + case MCA_PML_BFO_SEND_PENDING_START: + send_dst = mca_bml_base_btl_array_find( + &sendreq->req_endpoint->btl_eager, btl); + if( (NULL == send_dst) || + (OPAL_SOS_GET_ERROR_CODE(mca_pml_bfo_send_request_start_btl(sendreq, send_dst)) == + OMPI_ERR_OUT_OF_RESOURCE) ) { + /* prepend to the pending list to minimize reordering in case + * send_dst != 0 */ + add_request_to_send_pending(sendreq, + MCA_PML_BFO_SEND_PENDING_START, NULL == send_dst); + /* if no destination try next request otherwise give up, + * no more resources on this btl */ + if(send_dst != NULL) + return; + } + break; + default: + opal_output(0, "[%s:%d] wrong send request type\n", + __FILE__, __LINE__); + break; + } + } +} + +/* + * The free call marks the final stage in a request life-cycle. Starting from this + * point the request is completed at both PML and user level, and can be used + * for other p2p communications. Therefore, in the case of the BFO PML it should + * be added to the free request list. + */ +static int mca_pml_bfo_send_request_free(struct ompi_request_t** request) +{ + mca_pml_bfo_send_request_t* sendreq = *(mca_pml_bfo_send_request_t**)request; + + assert( false == sendreq->req_send.req_base.req_free_called ); + + OPAL_THREAD_LOCK(&ompi_request_lock); + sendreq->req_send.req_base.req_free_called = true; + + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_NOTIFY, + &(sendreq->req_send.req_base), PERUSE_SEND ); + + if( true == sendreq->req_send.req_base.req_pml_complete ) { + /* make buffer defined when the request is completed, + and before releasing the objects.
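+ (the buffer was set to noaccess while the send was in flight, so it must be made defined again before ownership of it returns to the user)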
*/ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + + MCA_PML_BFO_SEND_REQUEST_RETURN( sendreq ); + } + + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + *request = MPI_REQUEST_NULL; + return OMPI_SUCCESS; +} + +static int mca_pml_bfo_send_request_cancel(struct ompi_request_t* request, int complete) +{ + /* we don't cancel send requests for now */ + return OMPI_SUCCESS; +} + +static void mca_pml_bfo_send_request_construct(mca_pml_bfo_send_request_t* req) +{ + req->req_send.req_base.req_type = MCA_PML_REQUEST_SEND; + req->req_send.req_base.req_ompi.req_free = mca_pml_bfo_send_request_free; + req->req_send.req_base.req_ompi.req_cancel = mca_pml_bfo_send_request_cancel; + req->req_rdma_cnt = 0; + req->req_throttle_sends = false; + OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t); + OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t); +} + +static void mca_pml_bfo_send_request_destruct(mca_pml_bfo_send_request_t* req) +{ + OBJ_DESTRUCT(&req->req_send_ranges); + OBJ_DESTRUCT(&req->req_send_range_lock); +} + +OBJ_CLASS_INSTANCE( mca_pml_bfo_send_request_t, + mca_pml_base_send_request_t, + mca_pml_bfo_send_request_construct, + mca_pml_bfo_send_request_destruct ); + +/** + * Completion of a short message - nothing left to schedule. + */ + +static inline void +mca_pml_bfo_match_completion_free_request( struct mca_btl_base_module_t* btl, + mca_pml_bfo_send_request_t* sendreq ) +{ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); + } + + /* signal request completion */ + send_request_pml_complete(sendreq); + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +static void +mca_pml_bfo_match_completion_free( struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + mca_pml_bfo_repost_match_fragment(des); + return; +/* BFO FAILOVER CODE - end */ + } + mca_pml_bfo_match_completion_free_request( btl, sendreq ); +} + +static inline void +mca_pml_bfo_rndv_completion_request( struct mca_btl_base_module_t* btl, + mca_pml_bfo_send_request_t* sendreq, + size_t req_bytes_delivered ) +{ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); + } + + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + + /* advance the request */ + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + + send_request_pml_complete_check(sendreq); + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/* + * Completion of the first fragment of a long message that + * requires an acknowledgement + */ +static void +mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + size_t req_bytes_delivered = 0; + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + /* The completion event for the RNDV message has returned with
+ * an error. We know that the send request we are looking at is + * valid because it cannot be completed until the sendreq->req_state + * value reaches 0. And for the sendreq->req_state to reach 0, + * the completion event on the RNDV message must occur. So, we + * do not bother checking whether the send request is valid, + * because we know it is, but we put a few asserts in for good + * measure. We then check a few fields in the request to decide what + * to do. If the sendreq->req_error is set, that means that something + * has already happened to the request and we do not want to restart + * it. Presumably, we may have received a RECVERRNOTIFY + * message from the receiver. We also check the sendreq->req_acked + * field to see if it has been acked. If it has, then again we + * do not restart everything because obviously the RNDV message + * has made it to the other side. */ + assert(((mca_pml_bfo_hdr_t*)(des->des_src->seg_addr.pval))->hdr_match.hdr_ctx == + sendreq->req_send.req_base.req_comm->c_contextid); + assert(((mca_pml_bfo_hdr_t*)(des->des_src->seg_addr.pval))->hdr_match.hdr_src == + sendreq->req_send.req_base.req_comm->c_my_rank); + assert(((mca_pml_bfo_hdr_t*)(des->des_src->seg_addr.pval))->hdr_match.hdr_seq == + (uint16_t)sendreq->req_send.req_base.req_sequence); + + if ((!sendreq->req_error) && (!sendreq->req_acked)) { + sendreq->req_events--; + /* Assume RNDV did not make it, so restart from the beginning. */ + mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV); + return; + } +/* BFO FAILOVER CODE - end */ + } +/* BFO FAILOVER CODE - begin */ + sendreq->req_events--; + + /* Now check the error state. This request can be in error if the + * RNDV message made it over, but the receiver got an error trying + * to send the ACK back and therefore sent a RECVERRNOTIFY message. + * In that case, we want to start the restart dance as the receiver + * has matched this message already. Only restart if there are no + * outstanding events on the send request. */ + if (sendreq->req_error) { + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: completion: sendreq has error, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (unsigned long)sendreq, + (unsigned long)sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_RNDV, + status, btl); + } + return; + } +/* BFO FAILOVER CODE - end */ + + /* count bytes of user data actually delivered. As the rndv completion only + * happens in one thread, the increase of the req_bytes_delivered does not + * have to be atomic. + */ + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, + des->des_src_cnt, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + req_bytes_delivered ); + + mca_pml_bfo_rndv_completion_request( btl, sendreq, req_bytes_delivered ); +} + + +/** + * Completion of a get request. + */ + +static void +mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + size_t req_bytes_delivered = 0; +/* BFO FAILOVER CODE - begin */ + /* This can happen if a FIN message arrives after the request was + * marked in error. So, just drop the message.
Note that the + * status field is not checked here. That is because that is the + * value returned in the FIN hdr.hdr_fail field and may be used for + * other things. */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(30, mca_pml_bfo_output, + "FIN: received on broken request, skipping, " + "PML=%d, src_req=%lx, dst_req=%lx, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + (unsigned long)sendreq, (unsigned long)sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + btl->btl_free(btl, des); + return; + } +/* BFO FAILOVER CODE - end */ + + /* count bytes of user data actually delivered and check for request completion */ + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt, + 0, req_bytes_delivered ); + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + + send_request_pml_complete_check(sendreq); + /* free the descriptor */ + btl->btl_free(btl, des); + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + + +/** + * Completion of a control message - return resources. + */ + +static void +mca_pml_bfo_send_ctl_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ +/* BFO FAILOVER CODE - begin */ + if(OPAL_LIKELY(OMPI_SUCCESS == status)) { + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); + } else { + mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval; + /* If we get an error on the RGET message, then first make + * sure that header matches the send request that we are + * pointing to. This is necessary, because even though the + * sending side got an error, the RGET may have made it to the + * receiving side and the message transfer may have completed. + * This would then mean the send request has been completed and + * perhaps in use by another communication. So there is no need + * to restart this request. Therefore, ensure that we are + * looking at the same request that the header thinks we are + * looking at. If not, then there is nothing else to be done. */ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + + switch (hdr->hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_RGET: + if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) || + (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { + opal_output_verbose(20, mca_pml_bfo_output, + "RGET: completion event: dropping because no valid request " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + (uint16_t)sendreq->req_send.req_base.req_sequence, + hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_comm->c_my_rank, + hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq, + (void *)sendreq); + return; + } + mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET); + return; + default: + opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)", + __FILE__, __LINE__, hdr->hdr_common.hdr_type); + orte_errmgr.abort(-1, NULL); + } + } +/* BFO FAILOVER CODE - end */ +} + +/** + * Completion of additional fragments of a large message - may need + * to schedule additional fragments. 
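+ * + * In outline, the flow implemented below (a sketch, not extra logic) is: + * + *   sendreq->req_events--; + *   req_pipeline_depth--;  req_bytes_delivered += n; + *   if (error) restart via rndvrestartnotify once req_events == 0; + *   else if (!send_request_pml_complete_check(sendreq)) + *       mca_pml_bfo_send_request_schedule(sendreq);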
+ */ + +static void +mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + size_t req_bytes_delivered = 0; +/* BFO FAILOVER CODE - begin */ + sendreq->req_events--; +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + sendreq->req_error++; +/* BFO FAILOVER CODE - end */ + } + + /* count bytes of user data actually delivered */ + MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, + des->des_src_cnt, + sizeof(mca_pml_bfo_frag_hdr_t), + req_bytes_delivered ); + + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + +/* BFO FAILOVER CODE - begin */ + /* note we check error after bytes delivered computation in case frag made it */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(30, mca_pml_bfo_output, + "FRAG: completion: sendreq has error, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_FRAG, + status, btl); + } + return; + } +/* BFO FAILOVER CODE - end */ + if(send_request_pml_complete_check(sendreq) == false) { + mca_pml_bfo_send_request_schedule(sendreq); +/* BFO FAILOVER CODE - begin */ + if( OPAL_UNLIKELY(sendreq->req_error)) { + /* This situation can happen if the scheduling function + * determined that a BTL was removed from underneath us + * and therefore marked the request in error. In that + * case, the scheduling of fragments can no longer proceed + * properly. Therefore, if no outstanding events, initiate + * the restart dance. */ + opal_output_verbose(30, mca_pml_bfo_output, + "FRAG: completion: BTL has been removed, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_FRAG, + status, btl); + } + } +/* BFO FAILOVER CODE - end */ + } + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +/** + * Buffer the entire message and mark as complete. 
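+ * + * Protocol sketch (as implemented below): the eager part of the message + * is packed behind a rendezvous header into a BTL-supplied buffer, the + * remainder is copied into a bsend-style buffer, the convertor is + * re-initialized over the packed copy, and the request is completed at + * the MPI level before the network transfer finishes.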
+ */ + +int mca_pml_bfo_send_request_start_buffered( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size) +{ + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + struct iovec iov; + unsigned int iov_count; + size_t max_data, req_bytes_delivered; + int rc; + + /* allocate descriptor */ + mca_bml_base_alloc(bml_btl, &des, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t) + size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* pack the data into the BTL supplied buffer */ + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + + sizeof(mca_pml_bfo_rendezvous_hdr_t)); + iov.iov_len = size; + iov_count = 1; + max_data = size; + if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, + &iov, + &iov_count, + &max_data)) < 0) { + mca_bml_base_free(bml_btl, des); + return rc; + } + req_bytes_delivered = max_data; + + /* build rendezvous header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDV; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV(buffered): restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq, + sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, + sendreq->req_send.req_base.req_proc); + + /* update lengths */ + segment->seg_len = sizeof(mca_pml_bfo_rendezvous_hdr_t) + max_data; + + des->des_cbfunc = mca_pml_bfo_rndv_completion; + des->des_cbdata = sendreq; + + /* buffer the remainder of the message */ + rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_bml_base_free(bml_btl, des); + return rc; + } + + iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)sendreq->req_send.req_addr) + max_data); + iov.iov_len = max_data = sendreq->req_send.req_bytes_packed - max_data; + + if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, + &iov, + &iov_count, + &max_data)) < 0) { + mca_bml_base_free(bml_btl, des); + return rc; + } + + /* re-init convertor for packed data */ + opal_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor, + &(ompi_mpi_byte.dt.super), + sendreq->req_send.req_bytes_packed, + sendreq->req_send.req_addr ); + + /* wait for ack and completion */ + sendreq->req_state = 2; + + /* request is complete at mpi level */ + OPAL_THREAD_LOCK(&ompi_request_lock); + MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, 
true); + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + /* send */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDV); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_rndv_completion_request( bml_btl->btl, sendreq, req_bytes_delivered); + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des ); + return rc; +} + + +/** + * We work on a buffered request with a size smaller than the eager size, + * or the BTL is not able to send the data IN_PLACE. Request a segment + * that is used for the initial hdr and any eager data. This is used only + * from the _START macro. + */ +int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size ) +{ + mca_btl_base_descriptor_t* des = NULL; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + struct iovec iov; + unsigned int iov_count; + size_t max_data = size; + int rc; + + if(NULL != bml_btl->btl->btl_sendi) { + mca_pml_bfo_match_hdr_t match; + match.hdr_common.hdr_flags = 0; + match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_MATCH; + match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + match.hdr_tag = sendreq->req_send.req_base.req_tag; + match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + + bfo_hdr_hton(&match, MCA_PML_BFO_HDR_TYPE_MATCH, + sendreq->req_send.req_base.req_proc); + + /* try to send immediately */ + rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor, + &match, OMPI_PML_BFO_MATCH_HDR_LEN, + size, MCA_BTL_NO_ORDER, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + MCA_PML_BFO_HDR_TYPE_MATCH, + &des); + if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) { +/* BFO FAILOVER CODE - begin */ + /* Needed for failover */ + if (NULL != des) { + des->des_cbfunc = mca_pml_bfo_match_completion_free; + des->des_cbdata = sendreq->req_endpoint; + } +/* BFO FAILOVER CODE - end */ + + /* signal request completion */ + send_request_pml_complete(sendreq); + + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(bml_btl->btl); + return OMPI_SUCCESS; + } + } else { + /* allocate descriptor */ + mca_bml_base_alloc( bml_btl, &des, + MCA_BTL_NO_ORDER, + OMPI_PML_BFO_MATCH_HDR_LEN + size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + } + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + segment = des->des_src; + + if(size > 0) { + /* pack the data into the supplied buffer */ + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + + OMPI_PML_BFO_MATCH_HDR_LEN); + iov.iov_len = size; + iov_count = 1; + /* + * Before copying the user buffer, make the target part + * accessible. + */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + (void)opal_convertor_pack( &sendreq->req_send.req_base.req_convertor, + &iov, &iov_count, &max_data ); + /* + * Packing finished, make the user buffer inaccessible.
+ */ + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + } + + + /* build match header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_MATCH; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_MATCH, + sendreq->req_send.req_base.req_proc); + + /* update lengths */ + segment->seg_len = OMPI_PML_BFO_MATCH_HDR_LEN + max_data; + + /* short message */ + des->des_cbdata = sendreq; + des->des_cbfunc = mca_pml_bfo_match_completion_free; + + /* send */ + rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_BFO_HDR_TYPE_MATCH); + if( OPAL_LIKELY( rc >= OMPI_SUCCESS ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_match_completion_free_request( bml_btl->btl, sendreq ); + } + return OMPI_SUCCESS; + } + switch(OPAL_SOS_GET_ERROR_CODE(rc)) { + case OMPI_ERR_RESOURCE_BUSY: + /* No more resources. Allow the upper level to queue the send */ + rc = OMPI_ERR_OUT_OF_RESOURCE; + break; + default: + mca_bml_base_free(bml_btl, des); + break; + } + return rc; +} + +/** + * The BTL can send directly from the user buffer, so allow the BTL + * to prepare the segment list. Start sending a small message. + */ + +int mca_pml_bfo_send_request_start_prepare( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size ) +{ + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + int rc; + + /* prepare descriptor */ + mca_bml_base_prepare_src( bml_btl, + NULL, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + OMPI_PML_BFO_MATCH_HDR_LEN, + &size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &des ); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build match header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_MATCH; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_MATCH, + sendreq->req_send.req_base.req_proc); + + /* short message */ + des->des_cbfunc = mca_pml_bfo_match_completion_free; + des->des_cbdata = sendreq; + + /* send */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_MATCH); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_match_completion_free_request( bml_btl->btl, sendreq ); + } + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des ); + return rc; +} + + +/** + * We have contiguous data that is registered - schedule across + * available nics.
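+ * + * A sketch of the protocol selection performed below: + * + *   if (exactly one RDMA btl && (btl_flags & MCA_BTL_FLAGS_GET)) + *       prepare the source segments and send an RGET header, and + *       the receiver pulls the data with btl_get(); + *   else + *       send a data-less rendezvous header and let the receiver + *       schedule rdma put(s) of the entire message;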
+ */ + +int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size ) +{ + /* + * When the req_rdma array is constructed, the first element of the array is + * always assigned a different btl in round-robin fashion (if there is more + * than one RDMA-capable BTL). This way a round-robin distribution of RDMA + * operations is achieved. + */ + + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + bool need_local_cb = false; + int rc; + + bml_btl = sendreq->req_rdma[0].bml_btl; + if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & MCA_BTL_FLAGS_GET)) { + mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg; + mca_btl_base_descriptor_t* src; + size_t i; + size_t old_position = sendreq->req_send.req_base.req_convertor.bConverted; + + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + /* prepare source descriptor/segment(s) */ + /* PML owns this descriptor and will free it in */ + /* get_completion */ + mca_bml_base_prepare_src( bml_btl, + reg, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + 0, + &size, + 0, + &src ); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + if( OPAL_UNLIKELY(NULL == src) ) { + opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, + &old_position); + return OMPI_ERR_OUT_OF_RESOURCE; + } + src->des_cbfunc = mca_pml_bfo_rget_completion; + src->des_cbdata = sendreq; + + /* allocate space for get hdr + segment list */ + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rget_hdr_t) + + (sizeof(mca_btl_base_segment_t) * (src->des_src_cnt-1)), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des) ) { + opal_convertor_set_position( &sendreq->req_send.req_base.req_convertor, + &old_position ); + mca_bml_base_free(bml_btl, src); + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build match header */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = MCA_PML_BFO_HDR_FLAGS_CONTIG|MCA_PML_BFO_HDR_FLAGS_PIN; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RGET; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RGET: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + 
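/* record the prepared source descriptor and its segments in the RGET header so the receiver can drive btl_get() against them */ +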
hdr->hdr_rget.hdr_des.pval = src; + hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RGET, + sendreq->req_send.req_base.req_proc); + + for( i = 0; i < src->des_src_cnt; i++ ) { + hdr->hdr_rget.hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(src->des_src[i].seg_addr.pval); + hdr->hdr_rget.hdr_segs[i].seg_len = src->des_src[i].seg_len; + hdr->hdr_rget.hdr_segs[i].seg_key.key64 = src->des_src[i].seg_key.key64; + } + + des->des_cbfunc = mca_pml_bfo_send_ctl_completion; + + /** + * Well, it's a get, so we will not know when the peer gets the data anyway. + * If we generate the PERUSE event here, at least we will know when we + * sent the GET message ... + */ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); + } + + } else { + + /* allocate a rendezvous header - don't eager send any data; + * the receiver will schedule rdma put(s) of the entire message + */ + + mca_bml_base_alloc(bml_btl, &des, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build hdr */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = MCA_PML_BFO_HDR_FLAGS_CONTIG|MCA_PML_BFO_HDR_FLAGS_PIN; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDV; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, + sendreq->req_send.req_base.req_proc); + + /* update lengths with number of bytes actually packed */ + segment->seg_len = sizeof(mca_pml_bfo_rendezvous_hdr_t); + + /* first fragment of a long message */ + des->des_cbfunc = mca_pml_bfo_rndv_completion; + need_local_cb = true; + + /* wait for ack and completion */ + sendreq->req_state = 2; + } + + des->des_cbdata = sendreq; + + /* send */ + rc = mca_bml_base_send(bml_btl, des, hdr->hdr_common.hdr_type); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) && (true == need_local_cb)) { + mca_pml_bfo_rndv_completion_request( bml_btl->btl, sendreq, 0 ); + } +/* BFO FAILOVER CODE - begin */ + if ((des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) && + (MCA_PML_BFO_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type)) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des); + return rc; +} + + +/** + * Rendezvous is required.
Not doing rdma, so eager send up to + * the btl's eager limit. + */ + +int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size, + int flags ) +{ + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_pml_bfo_hdr_t* hdr; + int rc; + + /* prepare descriptor */ + if(size == 0) { + mca_bml_base_alloc( bml_btl, + &des, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ); + } else { + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + mca_bml_base_prepare_src( bml_btl, + NULL, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_rendezvous_hdr_t), + &size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &des ); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + } + + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = des->des_src; + + /* build hdr */ + hdr = (mca_pml_bfo_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = flags; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDV; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; +/* BFO FAILOVER CODE - begin */ + if (0 < sendreq->req_restartseq) { + hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; + hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; + hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; + opal_output_verbose(30, mca_pml_bfo_output, + "RNDV: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " + "src_req=%p, dst_req=%p, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + (void *)sendreq, sendreq->req_recv.pval, + sendreq->req_send.req_base.req_peer); + } +/* BFO FAILOVER CODE - end */ + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, + sendreq->req_send.req_base.req_proc); + + /* first fragment of a long message */ + des->des_cbdata = sendreq; + des->des_cbfunc = mca_pml_bfo_rndv_completion; + + /* wait for ack and completion */ + sendreq->req_state = 2; + + /* send */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDV); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_bfo_rndv_completion_request( bml_btl->btl, sendreq, size ); + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + return OMPI_SUCCESS; + } + mca_bml_base_free(bml_btl, des ); + return rc; +} + +void mca_pml_bfo_send_request_copy_in_out( mca_pml_bfo_send_request_t *sendreq, + uint64_t send_offset, + uint64_t send_length ) +{ + mca_pml_bfo_send_range_t *sr; + ompi_free_list_item_t *i; + mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; + int num_btls = 
void mca_pml_bfo_send_request_copy_in_out( mca_pml_bfo_send_request_t *sendreq, + uint64_t send_offset, + uint64_t send_length ) +{ + mca_pml_bfo_send_range_t *sr; + ompi_free_list_item_t *i; + mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + int rc = OMPI_SUCCESS, n; + double weight_total = 0; + + if( OPAL_UNLIKELY(0 == send_length) ) + return; + + OMPI_FREE_LIST_WAIT(&mca_pml_bfo.send_ranges, i, rc); + + sr = (mca_pml_bfo_send_range_t*)i; + + sr->range_send_offset = send_offset; + sr->range_send_length = send_length; + sr->range_btl_idx = 0; + + for(n = 0; n < num_btls && n < mca_pml_bfo.max_send_per_range; n++) { + sr->range_btls[n].bml_btl = + mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + weight_total += sr->range_btls[n].bml_btl->btl_weight; + } + + sr->range_btl_cnt = n; + mca_pml_bfo_calc_weighted_length(sr->range_btls, n, send_length, + weight_total); + + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); + opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr); + OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); +} + +static inline mca_pml_bfo_send_range_t * +get_send_range_nolock(mca_pml_bfo_send_request_t* sendreq) +{ + opal_list_item_t *item; + + item = opal_list_get_first(&sendreq->req_send_ranges); + + if(opal_list_get_end(&sendreq->req_send_ranges) == item) + return NULL; + + return (mca_pml_bfo_send_range_t*)item; +} + +static inline mca_pml_bfo_send_range_t * +get_send_range(mca_pml_bfo_send_request_t* sendreq) +{ + mca_pml_bfo_send_range_t *range; + + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); + range = get_send_range_nolock(sendreq); + OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); + + return range; +} + +static inline mca_pml_bfo_send_range_t * +get_next_send_range(mca_pml_bfo_send_request_t* sendreq, + mca_pml_bfo_send_range_t *range) +{ + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); + opal_list_remove_item(&sendreq->req_send_ranges, (opal_list_item_t *)range); + OMPI_FREE_LIST_RETURN(&mca_pml_bfo.send_ranges, &range->base); + range = get_send_range_nolock(sendreq); + OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); + + return range; +} + +/** + * Schedule a pipeline of send descriptors for the given request, + * up to the rdma threshold. If this is a send-based protocol, + * the rdma threshold is the end of the message. Otherwise, schedule + * fragments up to the threshold to overlap the initial registration/setup + * costs of the rdma. Only one thread can be inside this function. + */ + +int +mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq) +{ + size_t prev_bytes_remaining = 0; + mca_pml_bfo_send_range_t *range; + int num_fail = 0; + + /* check pipeline_depth here before attempting to get any locks */ + if(true == sendreq->req_throttle_sends && + sendreq->req_pipeline_depth >= mca_pml_bfo.send_pipeline_depth) + return OMPI_SUCCESS; + + range = get_send_range(sendreq); + + while(range && (false == sendreq->req_throttle_sends || + sendreq->req_pipeline_depth < mca_pml_bfo.send_pipeline_depth)) { + mca_pml_bfo_frag_hdr_t* hdr; + mca_btl_base_descriptor_t* des; + int rc, btl_idx; + size_t size, offset, data_remaining = 0; + mca_bml_base_btl_t* bml_btl; + + assert(range->range_send_length != 0); +/* BFO FAILOVER CODE - begin */ + /* Failover code. If this condition is true, the request thinks we + * have more BTLs than there really are. This can happen because + * a BTL was removed from the available list. In this case, we + * want to start over. + */
if ((int)mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->btl_send) + != range->range_btl_cnt) { + sendreq->req_error++; + return OMPI_ERROR; + } +/* BFO FAILOVER CODE - end */ + + if(prev_bytes_remaining == range->range_send_length) + num_fail++; + else + num_fail = 0; + + prev_bytes_remaining = range->range_send_length; + + if( OPAL_UNLIKELY(num_fail == range->range_btl_cnt) ) { + assert(sendreq->req_pending == MCA_PML_BFO_SEND_PENDING_NONE); + add_request_to_send_pending(sendreq, + MCA_PML_BFO_SEND_PENDING_SCHEDULE, true); + /* Note that the request remains locked. send_request_process_pending() + * will call schedule_exclusive() directly without taking + * the lock */ + return OMPI_ERR_OUT_OF_RESOURCE; + } + +cannot_pack: + do { + btl_idx = range->range_btl_idx; + if(++range->range_btl_idx == range->range_btl_cnt) + range->range_btl_idx = 0; + } while(!range->range_btls[btl_idx].length); + + bml_btl = range->range_btls[btl_idx].bml_btl; + /* If there is remaining data from another BTL that was too small + * for the converter to pack, then send it through this BTL */ + range->range_btls[btl_idx].length += data_remaining; + size = range->range_btls[btl_idx].length; + + /* make sure that we don't exceed the BTL max send size */ + if(bml_btl->btl->btl_max_send_size != 0) { + size_t max_send_size = bml_btl->btl->btl_max_send_size - + sizeof(mca_pml_bfo_frag_hdr_t); + + if (size > max_send_size) { + size = max_send_size; + } + } + + /* pack into a descriptor */ + offset = (size_t)range->range_send_offset; + opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, + &offset); + range->range_send_offset = (uint64_t)offset; + + data_remaining = size; + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_defined, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + mca_bml_base_prepare_src(bml_btl, NULL, + &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, + sizeof(mca_pml_bfo_frag_hdr_t), + &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des); + MEMCHECKER( + memchecker_call(&opal_memchecker_base_mem_noaccess, + sendreq->req_send.req_base.req_addr, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_datatype); + ); + + if( OPAL_UNLIKELY(des == NULL || size == 0) ) { + if(des) { + /* The converter can't pack this chunk.
Append it to a chunk + * from another BTL */ + mca_bml_base_free(bml_btl, des); + range->range_btls[btl_idx].length -= data_remaining; + goto cannot_pack; + } + continue; + } + + des->des_cbfunc = mca_pml_bfo_frag_completion; + des->des_cbdata = sendreq; + + /* setup header */ + hdr = (mca_pml_bfo_frag_hdr_t*)des->des_src->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FRAG; + hdr->hdr_frag_offset = range->range_send_offset; + hdr->hdr_src_req.pval = sendreq; + hdr->hdr_dst_req = sendreq->req_recv; + + bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FRAG, + sendreq->req_send.req_base.req_proc); + +#if OMPI_WANT_PERUSE + PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, + &(sendreq->req_send.req_base), size, PERUSE_SEND); +#endif /* OMPI_WANT_PERUSE */ + + /* initiate send - note that this may complete before the call returns */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_FRAG); + if( OPAL_LIKELY(rc >= 0) ) { + /* update state */ + range->range_btls[btl_idx].length -= size; + range->range_send_length -= size; + range->range_send_offset += size; + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1); + if(range->range_send_length == 0) { + range = get_next_send_range(sendreq, range); + prev_bytes_remaining = 0; + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + } else { + mca_bml_base_free(bml_btl,des); + } + } + + return OMPI_SUCCESS; +} + + +/** + * An RDMA put operation has completed: + * (1) Update request status and if required set completed + * (2) Send FIN control message to the destination + */ + +static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata; + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)frag->rdma_req; + mca_bml_base_btl_t* bml_btl; +/* BFO FAILOVER CODE - begin */ + sendreq->req_events--; +/* BFO FAILOVER CODE - end */ + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +/* BFO FAILOVER CODE - begin */ + sendreq->req_error++; +/* BFO FAILOVER CODE - end */ + } + +/* BFO FAILOVER CODE - begin */ + if ( OPAL_UNLIKELY(sendreq->req_error)) { + opal_output_verbose(30, mca_pml_bfo_output, + "RDMA write: completion: sendreq has error, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_PUT, + status, btl); + } + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + return; + } +/* BFO FAILOVER CODE - end */ + +
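/* Success path from here on: locate a bml_btl for this btl and send the FIN so the receiver can release its registration and complete its side of the transfer. */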
/* BFO FAILOVER CODE - begin */ + /* Find the bml_btl that this btl belongs to. If we cannot + * find it, then it may have been removed from underneath us, so + * find the next available one to send the FIN message on. + */ + bml_btl = mca_bml_base_btl_array_find(&sendreq->req_endpoint->btl_rdma, btl); + if( OPAL_UNLIKELY(NULL == bml_btl) ) { + opal_output_verbose(20, mca_pml_bfo_output, + "RDMA write completion: BML was removed from underneath us, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->btl_rdma); + } +/* BFO FAILOVER CODE - end */ + + mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, + bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_des, + des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank); + + /* check for request completion */ + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); + + send_request_pml_complete_check(sendreq); + + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + + MCA_PML_BFO_PROGRESS_PENDING(btl); +} + +int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag ) +{ + mca_mpool_base_registration_t* reg = NULL; + mca_bml_base_btl_t* bml_btl = frag->rdma_bml; + mca_btl_base_descriptor_t* des; + size_t save_size = frag->rdma_length; + int rc; + + /* setup descriptor */ + mca_bml_base_prepare_src( bml_btl, + reg, + &frag->convertor, + MCA_BTL_NO_ORDER, + 0, + &frag->rdma_length, + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + &des ); + + if( OPAL_UNLIKELY(NULL == des) ) { + if(frag->retries < mca_pml_bfo.rdma_put_retries_limit) { + size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset; + frag->rdma_length = save_size; + opal_convertor_set_position(&frag->convertor, &offset); + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + } else { + mca_pml_bfo_send_request_t *sendreq = + (mca_pml_bfo_send_request_t*)frag->rdma_req; + + /* tell receiver to unregister memory */ + mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, + bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, + MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank); + + /* send fragment by copy in/out */ + mca_pml_bfo_send_request_copy_in_out(sendreq, + frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length); + /* if a pointer to a receive request is not set, it means that the + * ACK was not yet received.
Don't schedule sends before ACK */ + if(NULL != sendreq->req_recv.pval) + mca_pml_bfo_send_request_schedule(sendreq); + } + return OMPI_ERR_OUT_OF_RESOURCE; + } + + des->des_dst = frag->rdma_segs; + des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; + des->des_cbfunc = mca_pml_bfo_put_completion; + des->des_cbdata = frag; + + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(((mca_pml_bfo_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND ); + + rc = mca_bml_base_put(bml_btl, des); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_bml_base_free(bml_btl, des); + frag->rdma_length = save_size; + if(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) { + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } else { + /* TSW - FIX */ + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + } +/* BFO FAILOVER CODE - begin */ + if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + mca_pml_bfo_send_request_t *sendreq = + (mca_pml_bfo_send_request_t*)frag->rdma_req; + sendreq->req_events++; + } +/* BFO FAILOVER CODE - end */ + + return OMPI_SUCCESS; +} + +/** + * Receiver has scheduled an RDMA operation: + * (1) Allocate an RDMA fragment to maintain the state of the operation + * (2) Call BTL prepare_src to pin/prepare source buffers + * (3) Queue the RDMA put + */ + +void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq, + mca_btl_base_module_t* btl, + mca_pml_bfo_rdma_hdr_t* hdr ) +{ + mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint; + mca_pml_bfo_rdma_frag_t* frag; + int rc; + size_t i, size = 0; + + if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_TYPE_ACK) { +/* BFO FAILOVER CODE - begin */ + /* Handle the failover case where a RNDV request may + * have turned into a RGET and therefore the state + * is not being tracked. 
*/ + if (sendreq->req_state != 0) { + OPAL_THREAD_ADD32(&sendreq->req_state, -1); + } +/* BFO FAILOVER CODE - end */ + } +/* BFO FAILOVER CODE - begin */ + sendreq->req_recv = hdr->hdr_dst_req; /* only needed once, but it is OK */ + sendreq->req_acked = true; /* only needed once, but it is OK */ +/* BFO FAILOVER CODE - end */ + + MCA_PML_BFO_RDMA_FRAG_ALLOC(frag, rc); + + if( OPAL_UNLIKELY(NULL == frag) ) { + /* TSW - FIX */ + ORTE_ERROR_LOG(rc); + orte_errmgr.abort(-1, NULL); + } + + /* setup fragment */ + for( i = 0; i < hdr->hdr_seg_cnt; i++ ) { + frag->rdma_segs[i].seg_addr.lval = hdr->hdr_segs[i].seg_addr.lval; + frag->rdma_segs[i].seg_len = hdr->hdr_segs[i].seg_len; + frag->rdma_segs[i].seg_key.key64 = hdr->hdr_segs[i].seg_key.key64; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if ((sendreq->req_send.req_base.req_proc->proc_arch & OPAL_ARCH_ISBIGENDIAN) != + (ompi_proc_local()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + size += opal_swap_bytes4(frag->rdma_segs[i].seg_len); + } else +#endif + { + size += frag->rdma_segs[i].seg_len; + } + } + + frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); +/* BFO FAILOVER CODE - begin */ + frag->rdma_btl = btl; + if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { + opal_output(0, "[%s:%d] invalid bml for rdma put", __FILE__, __LINE__); + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); + sendreq->req_error++; + if (0 == sendreq->req_events) { + opal_output(0, "[%s:%d] Issuing rndvrestartnotify", __FILE__, __LINE__); + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + MCA_PML_BFO_HDR_TYPE_PUT, + OMPI_ERROR, btl); + } + return; + } +/* BFO FAILOVER CODE - end */ + frag->rdma_hdr.hdr_rdma = *hdr; + frag->rdma_req = sendreq; + frag->rdma_ep = bml_endpoint; + frag->rdma_length = size; + frag->rdma_state = MCA_PML_BFO_RDMA_PUT; + frag->reg = NULL; + frag->retries = 0; + + /* lookup the corresponding registration */ + for(i = 0; i < sendreq->req_rdma_cnt; i++) { + if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) { + frag->reg = sendreq->req_rdma[i].btl_reg; + break; + } + } + + /* RDMA writes may proceed in parallel to send and to each other, so + * create a clone of the convertor for each RDMA fragment + */ + size = hdr->hdr_rdma_offset; + opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor, + &frag->convertor, 0, &size); + + mca_pml_bfo_send_request_put_frag(frag); +} + diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.h b/ompi/mca/pml/bfo/pml_bfo_sendreq.h new file mode 100644 index 0000000000..510c54e2f3 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.h @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_PML_BFO_SEND_REQUEST_H +#define OMPI_PML_BFO_SEND_REQUEST_H + +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "ompi/mca/mpool/base/base.h" +#include "pml_bfo_comm.h" +#include "pml_bfo_hdr.h" +#include "pml_bfo_rdma.h" +#include "pml_bfo_rdmafrag.h" +#include "opal/datatype/opal_convertor.h" +#include "ompi/mca/bml/bml.h" + +BEGIN_C_DECLS + +typedef enum { + MCA_PML_BFO_SEND_PENDING_NONE, + MCA_PML_BFO_SEND_PENDING_SCHEDULE, + MCA_PML_BFO_SEND_PENDING_START +} mca_pml_bfo_send_pending_t; + +struct mca_pml_bfo_send_request_t { + mca_pml_base_send_request_t req_send; + mca_bml_base_endpoint_t* req_endpoint; + ompi_ptr_t req_recv; +/* BFO FAILOVER CODE - begin */ + int32_t req_events; /* number of outstanding events on request */ + int32_t req_restartseq; /* sequence number of restarted request */ + int32_t req_restart; /* state of restarted request */ + int32_t req_error; /* non-zero when error has occurred on request */ + bool req_acked; /* indicates request has been acked */ +/* BFO FAILOVER CODE - end */ + int32_t req_state; + int32_t req_lock; + bool req_throttle_sends; + size_t req_pipeline_depth; + size_t req_bytes_delivered; + uint32_t req_rdma_cnt; + mca_pml_bfo_send_pending_t req_pending; + opal_mutex_t req_send_range_lock; + opal_list_t req_send_ranges; + mca_pml_bfo_com_btl_t req_rdma[1]; +}; +typedef struct mca_pml_bfo_send_request_t mca_pml_bfo_send_request_t; + +OBJ_CLASS_DECLARATION(mca_pml_bfo_send_request_t); + +struct mca_pml_bfo_send_range_t { + ompi_free_list_item_t base; + uint64_t range_send_offset; + uint64_t range_send_length; + int range_btl_idx; + int range_btl_cnt; + mca_pml_bfo_com_btl_t range_btls[1]; +}; +typedef struct mca_pml_bfo_send_range_t mca_pml_bfo_send_range_t; +OBJ_CLASS_DECLARATION(mca_pml_bfo_send_range_t); + +static inline bool lock_send_request(mca_pml_bfo_send_request_t *sendreq) +{ + return OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1; +} + +static inline bool unlock_send_request(mca_pml_bfo_send_request_t *sendreq) +{ + return OPAL_THREAD_ADD32(&sendreq->req_lock, -1) == 0; +} + +static inline void +add_request_to_send_pending(mca_pml_bfo_send_request_t* sendreq, + const mca_pml_bfo_send_pending_t type, + const bool append) +{ + opal_list_item_t *item = (opal_list_item_t*)sendreq; + + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + sendreq->req_pending = type; + if(append) + opal_list_append(&mca_pml_bfo.send_pending, item); + else + opal_list_prepend(&mca_pml_bfo.send_pending, item); + + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); +} + +static inline mca_pml_bfo_send_request_t* +get_request_from_send_pending(mca_pml_bfo_send_pending_t *type) +{ + mca_pml_bfo_send_request_t *sendreq; + + OPAL_THREAD_LOCK(&mca_pml_bfo.lock); + sendreq = (mca_pml_bfo_send_request_t*) + opal_list_remove_first(&mca_pml_bfo.send_pending); + if(sendreq) { + *type = sendreq->req_pending; + sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE; + } + OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); + + return sendreq; +} + +#define MCA_PML_BFO_SEND_REQUEST_ALLOC( comm, \ + dst, \ + sendreq, \ + rc) \ + { \ + ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst ); \ + ompi_free_list_item_t* item; \ + \ + rc = OMPI_ERR_OUT_OF_RESOURCE; \ + if( OPAL_LIKELY(NULL != proc) ) { \ + rc = OMPI_SUCCESS; \ + OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \ + sendreq = (mca_pml_bfo_send_request_t*)item; \ + sendreq->req_send.req_base.req_proc = proc; \ + } \ 
+ } + + +#define MCA_PML_BFO_SEND_REQUEST_INIT( sendreq, \ + buf, \ + count, \ + datatype, \ + dst, \ + tag, \ + comm, \ + sendmode, \ + persistent) \ + { \ + MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \ + buf, \ + count, \ + datatype, \ + dst, \ + tag, \ + comm, \ + sendmode, \ + persistent, \ + 0); /* convertor_flags */ \ + (sendreq)->req_recv.pval = NULL; \ + } + + +static inline void mca_pml_bfo_free_rdma_resources(mca_pml_bfo_send_request_t* sendreq) +{ + size_t r; + + /* return mpool resources */ + for(r = 0; r < sendreq->req_rdma_cnt; r++) { + mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; + if( NULL != reg && reg->mpool != NULL ) { + reg->mpool->mpool_deregister(reg->mpool, reg); + } + } + sendreq->req_rdma_cnt = 0; +} + + +/** + * Start a send request. + */ + +#define MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc) \ + do { \ + rc = mca_pml_bfo_send_request_start(sendreq); \ + } while (0) + + +/* + * Mark a send request as completed at the MPI level. + */ + +#define MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal) \ +do { \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \ + (sendreq)->req_send.req_base.req_comm->c_my_rank; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \ + (sendreq)->req_send.req_base.req_tag; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ + (sendreq)->req_send.req_base.req_ompi.req_status._count = \ + (int)(sendreq)->req_send.req_bytes_packed; \ + ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \ + \ + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ + &(sendreq->req_send.req_base), PERUSE_SEND); \ +} while(0) + +/* + * Release resources associated with a request + */ + +#define MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq) \ + do { \ + /* Let the base handle the reference counts */ \ + MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ + OMPI_FREE_LIST_RETURN( &mca_pml_base_send_requests, \ + (ompi_free_list_item_t*)sendreq); \ + } while(0) + + +/* + * The PML has completed a send request. Note that this request + * may have been orphaned by the user or have already completed + * at the MPI level. + * This function will never be called directly from the upper level, as it + * should only be an internal call to the PML. + * + */ +static inline void +send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq) +{ + assert(false == sendreq->req_send.req_base.req_pml_complete); + + if(sendreq->req_send.req_bytes_packed > 0) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END, + &(sendreq->req_send.req_base), PERUSE_SEND); + } + + /* return mpool resources */ + mca_pml_bfo_free_rdma_resources(sendreq); + + if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && + sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) { + mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq); + } + + OPAL_THREAD_LOCK(&ompi_request_lock); + if(false == sendreq->req_send.req_base.req_ompi.req_complete) { + /* Should only be called for long messages (maybe synchronous) */ + MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true); + } + sendreq->req_send.req_base.req_pml_complete = true; +/* BFO FAILOVER CODE - begin */ + assert(0 == sendreq->req_events); + sendreq->req_restartseq = 0; + /* Since sequence numbers increase monotonically and + * roll over, initialize it to a value far away from + * what it was. I cannot set it to something like -1 + * as that is not within the valid range. 
*/ + sendreq->req_send.req_base.req_sequence = + sendreq->req_send.req_base.req_sequence - 10; +/* BFO FAILOVER CODE - end */ + + if(sendreq->req_send.req_base.req_free_called) { + MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq); + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); +} + +/* returns true if the request was completed at the PML level */ +static inline bool +send_request_pml_complete_check(mca_pml_bfo_send_request_t *sendreq) +{ +#if OPAL_HAVE_THREAD_SUPPORT + opal_atomic_rmb(); +#endif + /* if no more events are expected for the request, the whole message is + * already sent, and send fragment scheduling isn't running in another + * thread, then complete the request at the PML level. From now on, if the + * user called free on this request, the request structure can be reused + * for another request, or if the request is persistent it can be restarted */ + if(sendreq->req_state == 0 && + sendreq->req_bytes_delivered >= sendreq->req_send.req_bytes_packed + && lock_send_request(sendreq)) { + send_request_pml_complete(sendreq); + return true; + } + + return false; +} + +/** + * Schedule additional fragments + */ +int +mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t*); + +static inline int +mca_pml_bfo_send_request_schedule_exclusive(mca_pml_bfo_send_request_t* sendreq) +{ + int rc; + do { + rc = mca_pml_bfo_send_request_schedule_once(sendreq); + if(OPAL_SOS_GET_ERROR_CODE(rc) == OMPI_ERR_OUT_OF_RESOURCE) + break; + } while(!unlock_send_request(sendreq)); + + if(OMPI_SUCCESS == rc) + send_request_pml_complete_check(sendreq); + + return rc; +} + +static inline void +mca_pml_bfo_send_request_schedule(mca_pml_bfo_send_request_t* sendreq) +{ + /* + * Only allow one thread in this routine for a given request. + * However, we cannot block callers on a mutex, so simply keep track + * of the number of times the routine has been called and run through + * the scheduling logic once for every call.
+ */ + + if(!lock_send_request(sendreq)) + return; + + mca_pml_bfo_send_request_schedule_exclusive(sendreq); +} + +/** + * Start the specified request + */ + +int mca_pml_bfo_send_request_start_buffered( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_copy( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_prepare( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_rdma( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size); + +int mca_pml_bfo_send_request_start_rndv( + mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl, + size_t size, + int flags); + +static inline int +mca_pml_bfo_send_request_start_btl( mca_pml_bfo_send_request_t* sendreq, + mca_bml_base_btl_t* bml_btl ) +{ + size_t size = sendreq->req_send.req_bytes_packed; + mca_btl_base_module_t* btl = bml_btl->btl; + size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_bfo_hdr_t); + int rc; + + assert(btl->btl_eager_limit >= sizeof(mca_pml_bfo_hdr_t)); + if( OPAL_LIKELY(size <= eager_limit) ) { + switch(sendreq->req_send.req_send_mode) { + case MCA_PML_BASE_SEND_SYNCHRONOUS: + rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0); + break; + case MCA_PML_BASE_SEND_BUFFERED: + rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size); + break; + case MCA_PML_BASE_SEND_COMPLETE: + rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size); + break; + default: + if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) { + rc = mca_pml_bfo_send_request_start_prepare(sendreq, bml_btl, size); + } else { + rc = mca_pml_bfo_send_request_start_copy(sendreq, bml_btl, size); + } + break; + } + } else { + size = eager_limit; + if(OPAL_UNLIKELY(btl->btl_rndv_eager_limit < eager_limit)) + size = btl->btl_rndv_eager_limit; + if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { + rc = mca_pml_bfo_send_request_start_buffered(sendreq, bml_btl, size); + } else if + (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { + unsigned char *base; + opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); + + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_bfo_rdma_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + rc = mca_pml_bfo_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_bfo_free_rdma_resources(sendreq); + } + } else { + rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, + MCA_PML_BFO_HDR_FLAGS_CONTIG); + } + } else { + rc = mca_pml_bfo_send_request_start_rndv(sendreq, bml_btl, size, 0); + } + } + + return rc; +} + +static inline int +mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq ) +{ + mca_pml_bfo_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; + mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*) + sendreq->req_send.req_base.req_proc->proc_bml; + size_t i; + + if( OPAL_UNLIKELY(endpoint == NULL) ) { + return OMPI_ERR_UNREACH; + } + + sendreq->req_endpoint = endpoint; + sendreq->req_state = 0; + sendreq->req_lock = 0; + sendreq->req_pipeline_depth = 0; + sendreq->req_bytes_delivered = 0; + sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE; + 
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32( + &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1); +/* BFO FAILOVER CODE - begin */ + sendreq->req_restartseq = 0; /* counts up restarts */ + sendreq->req_restart = 0; /* reset in case we restart again */ + sendreq->req_error = 0; /* clear error state */ + sendreq->req_events = 0; /* clear events, probably 0 anyways */ + sendreq->req_acked = false; +/* BFO FAILOVER CODE - end */ + + MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base ); + + for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { + mca_bml_base_btl_t* bml_btl; + int rc; + + /* select a btl */ + bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); + rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl); + if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != OPAL_SOS_GET_ERROR_CODE(rc)) ) + return rc; + } + add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true); + + return OMPI_SUCCESS; +} + +/** + * Initiate a put scheduled by the receiver. + */ + +void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq, + mca_btl_base_module_t* btl, + mca_pml_bfo_rdma_hdr_t* hdr ); + +int mca_pml_bfo_send_request_put_frag(mca_pml_bfo_rdma_frag_t* frag); + +/* This function tries to continue a sendreq that was stuck because of resource + * unavailability. A sendreq may be added to the send_pending list if there is no + * resource to send the initial packet or no resource to schedule data + * for sending. The reason the sendreq was added to the list is stored inside + * the sendreq struct, and the appropriate operation is retried when a resource + * becomes available. The bml_btl passed to the function doesn't represent the + * sendreq destination; it represents the BTL on which a resource was freed, so + * only this BTL should be considered for sending packets */ +void mca_pml_bfo_send_request_process_pending(mca_btl_base_module_t *btl); + +void mca_pml_bfo_send_request_copy_in_out(mca_pml_bfo_send_request_t *sendreq, + uint64_t send_offset, uint64_t send_length); + +END_C_DECLS + +#endif /* OMPI_PML_BFO_SEND_REQUEST_H */ diff --git a/ompi/mca/pml/bfo/pml_bfo_start.c b/ompi/mca/pml/bfo/pml_bfo_start.c new file mode 100644 index 0000000000..46ccf429d0 --- /dev/null +++ b/ompi/mca/pml/bfo/pml_bfo_start.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "pml_bfo.h" +#include "pml_bfo_recvreq.h" +#include "pml_bfo_sendreq.h" +#include "ompi/memchecker.h" + + +int mca_pml_bfo_start(size_t count, ompi_request_t** requests) +{ + int rc; + size_t i; + bool reuse_old_request = true; + + for(i=0; i<count; i++) { + mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i]; + if(NULL == pml_request) { + continue; + } + if (OMPI_REQUEST_PML != requests[i]->req_type) { + continue; + } + + /* If the persistent request is currently active, obtain the + * request lock and verify the status is incomplete. If the PML + * layer has not completed the request, mark the request as + * "free called", so that it will be freed when the request + * completes, and create a new request. + */
if the + * pml layer has not completed the request - mark the request + * as free called - so that it will be freed when the request + * completes - and create a new request. + */ + + reuse_old_request = true; + switch(pml_request->req_ompi.req_state) { + case OMPI_REQUEST_INACTIVE: + if(pml_request->req_pml_complete == true) + break; + /* otherwise fall through */ + case OMPI_REQUEST_ACTIVE: { + + ompi_request_t *request; + OPAL_THREAD_LOCK(&ompi_request_lock); + if (pml_request->req_pml_complete == false) { + /* free request after it completes */ + pml_request->req_free_called = true; + } else { + /* can reuse the existing request */ + OPAL_THREAD_UNLOCK(&ompi_request_lock); + break; + } + + reuse_old_request = false; + /* allocate a new request */ + switch(pml_request->req_type) { + case MCA_PML_REQUEST_SEND: { + mca_pml_base_send_mode_t sendmode = + ((mca_pml_base_send_request_t*)pml_request)->req_send_mode; + rc = mca_pml_bfo_isend_init( + pml_request->req_addr, + pml_request->req_count, + pml_request->req_datatype, + pml_request->req_peer, + pml_request->req_tag, + sendmode, + pml_request->req_comm, + &request); + break; + } + case MCA_PML_REQUEST_RECV: + rc = mca_pml_bfo_irecv_init( + pml_request->req_addr, + pml_request->req_count, + pml_request->req_datatype, + pml_request->req_peer, + pml_request->req_tag, + pml_request->req_comm, + &request); + break; + default: + rc = OMPI_ERR_REQUEST; + break; + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); + if(OMPI_SUCCESS != rc) + return rc; + pml_request = (mca_pml_base_request_t*)request; + requests[i] = request; + break; + } + default: + return OMPI_ERR_REQUEST; + } + + /* start the request */ + switch(pml_request->req_type) { + case MCA_PML_REQUEST_SEND: + { + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)pml_request; + if( reuse_old_request && (sendreq->req_send.req_bytes_packed != 0) ) { + size_t offset = 0; + /** + * Reset the convertor in case we're dealing with the original + * request, which when completed do not reset the convertor. + */ + opal_convertor_set_position( &sendreq->req_send.req_base.req_convertor, + &offset ); + } + MCA_PML_BFO_SEND_REQUEST_START(sendreq, rc); + if(rc != OMPI_SUCCESS) + return rc; + break; + } + case MCA_PML_REQUEST_RECV: + { + mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)pml_request; + MCA_PML_BFO_RECV_REQUEST_START(recvreq); + break; + } + default: + return OMPI_ERR_REQUEST; + } + } + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/pml/bfo/post_configure.sh b/ompi/mca/pml/bfo/post_configure.sh new file mode 100644 index 0000000000..77a7d52608 --- /dev/null +++ b/ompi/mca/pml/bfo/post_configure.sh @@ -0,0 +1 @@ +DIRECT_CALL_HEADER="ompi/mca/pml/bfo/pml_bfo.h"