Improve CUDA GPU transfers over the openib BTL by using asynchronous copies.
This implements the RFC that was submitted in July and December of 2012. This commit was SVN r27862.
Parent: 92e297d1fa
Commit: f63c88701f
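At a high level, the change works like this: when the PML detects a GPU buffer and a BTL that advertises MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND (or _RECV), it points the convertor's copy function at a stream-based cuMemcpyAsync; the code that initiated the copy records a CUDA event behind it on the same stream, and the progress loop later polls those events, firing the deferred callback once a copy has completed. The sketch below shows that record-and-poll pattern with the raw CUDA driver API. It is a minimal illustration, not the mca_common_cuda_* implementation from the diff; the check() helper, the buffer size, and the use of device 0 are assumptions made here.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

static void check(CUresult res, const char *what)
{
    if (CUDA_SUCCESS != res) {
        fprintf(stderr, "%s failed: res=%d\n", what, res);
        exit(1);
    }
}

int main(void)
{
    CUdevice dev;
    CUcontext ctx;
    CUstream dtoh_stream;
    CUevent evt;
    CUdeviceptr gpu_buf;
    static char host_buf[1024 * 1024];

    check(cuInit(0), "cuInit");
    check(cuDeviceGet(&dev, 0), "cuDeviceGet");
    check(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate");
    check(cuStreamCreate(&dtoh_stream, 0), "cuStreamCreate");
    check(cuEventCreate(&evt, CU_EVENT_DISABLE_TIMING), "cuEventCreate");
    check(cuMemAlloc(&gpu_buf, sizeof(host_buf)), "cuMemAlloc");

    /* Start the device-to-host copy on a stream and record an event
     * behind it, instead of blocking in a synchronous cuMemcpy. */
    check(cuMemcpyDtoHAsync(host_buf, gpu_buf, sizeof(host_buf), dtoh_stream),
          "cuMemcpyDtoHAsync");
    check(cuEventRecord(evt, dtoh_stream), "cuEventRecord");

    /* A progress loop polls the event; CUDA_ERROR_NOT_READY means the
     * copy is still in flight, so other work can proceed meanwhile. */
    for (;;) {
        CUresult res = cuEventQuery(evt);
        if (CUDA_SUCCESS == res) {
            break;              /* copy complete: fire the deferred callback */
        }
        if (CUDA_ERROR_NOT_READY != res) {
            check(res, "cuEventQuery");
        }
        /* ... progress other outstanding work here ... */
    }

    check(cuMemFree(gpu_buf), "cuMemFree");
    check(cuEventDestroy(evt), "cuEventDestroy");
    check(cuStreamDestroy(dtoh_stream), "cuStreamDestroy");
    check(cuCtxDestroy(ctx), "cuCtxDestroy");
    return 0;
}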
ompi/mca/btl/btl.h
@@ -199,6 +199,8 @@ typedef uint8_t mca_btl_base_tag_t;
 #define MCA_BTL_FLAGS_CUDA_PUT 0x0400
 #define MCA_BTL_FLAGS_CUDA_GET 0x0800
 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000
 
 /* Default exclusivity levels */
 #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
@@ -298,6 +300,10 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
 */
 #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004
 
+/* Tell the PML that the copy is being done asynchronously
+ */
+#define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC 0x0008
+
 /* Type of transfer that will be done with this frag.
 */
 #define MCA_BTL_DES_FLAGS_PUT 0x0010
ompi/mca/btl/openib/btl_openib.c
@@ -55,6 +55,10 @@
 #include "ompi/mca/mpool/base/base.h"
 #include "ompi/mca/mpool/mpool.h"
 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 #include "orte/util/proc_info.h"
 #include <errno.h>
 #include <sys/types.h>
@@ -1287,6 +1291,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
     iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
     rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    /* If the convertor is copying the data asynchronously, then record an event
+     * that will trigger the callback when it completes.  Mark descriptor as async. */
+    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+        mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag);
+        to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     *size = max_data;
 
     /* not all upper layer users set this */
ompi/mca/btl/openib/btl_openib.h
@@ -308,6 +308,10 @@ struct mca_btl_openib_component_t {
     size_t memalign_threshold;
     void* (*previous_malloc_hook)(size_t __size, const void*);
 #endif
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    int cuda_async_send;
+    int cuda_async_recv;
+#endif /* OMPI_CUDA_SUPPORT */
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
 
 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
ompi/mca/btl/openib/btl_openib_component.c
@@ -103,6 +103,12 @@ static int btl_openib_component_open(void);
 static int btl_openib_component_close(void);
 static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
 static int btl_openib_component_progress(void);
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_openib_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status);
+#endif /* OMPI_CUDA_SUPPORT */
 /*
  * Local variables
  */
@@ -3060,8 +3066,24 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
         /* call registered callback */
         mca_btl_active_message_callback_t* reg;
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        /* The COPY_ASYNC flag should not be set */
+        assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC));
+#endif /* OMPI_CUDA_SUPPORT */
         reg = mca_btl_base_active_message_trigger + hdr->tag;
         reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            /* Since ASYNC flag is set, we know this descriptor is being used
+             * for asynchronous copy and cannot be freed yet.  Therefore, set
+             * up callback for PML to call when complete, add argument into
+             * descriptor and return. */
+            des->des_cbfunc = btl_openib_handle_incoming_completion;
+            des->des_cbdata = (void *)ep;
+            return OMPI_SUCCESS;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
         if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
             cqp = (hdr->credits >> 11) & 0x0f;
             hdr->credits &= 0x87ff;
@@ -3152,6 +3174,85 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     return OMPI_SUCCESS;
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * Called by the PML when the copying of the data out of the fragment
+ * is complete.
+ */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_base_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status)
+{
+    mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des;
+    mca_btl_openib_header_t *hdr = frag->hdr;
+    int rqp = to_base_frag(frag)->base.order, cqp;
+    uint16_t rcredits = 0, credits;
+    bool is_credit_msg;
+
+    OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des));
+
+    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
+        cqp = (hdr->credits >> 11) & 0x0f;
+        hdr->credits &= 0x87ff;
+    } else {
+        cqp = rqp;
+    }
+    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
+        rcredits = BTL_OPENIB_CREDITS(hdr->credits);
+        hdr->credits = 0;
+    }
+
+    credits = hdr->credits;
+
+    if(hdr->cm_seen)
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);
+
+    /* We should not be here with eager or control messages */
+    assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA);
+    assert(0 == is_cts_message(frag));
+    /* HACK - clear out flags.  Must be better way */
+    des->des_flags = 0;
+    /* Otherwise, FRAG_RETURN it and repost if necessary */
+    MCA_BTL_IB_FRAG_RETURN(frag);
+    if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
+        if (OPAL_UNLIKELY(is_credit_msg)) {
+            OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
+        } else {
+            OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
+        }
+        mca_btl_openib_endpoint_post_rr(ep, cqp);
+    } else {
+        mca_btl_openib_module_t *btl = ep->endpoint_btl;
+        OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
+        mca_btl_openib_post_srr(btl, rqp);
+    }
+
+    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);
+
+    /* If we got any credits (RDMA or send), then try to progress all
+       the no_credits_pending_frags lists */
+    if (rcredits > 0) {
+        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
+    }
+    if (credits > 0) {
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
+    }
+    if (rcredits + credits > 0) {
+        int rc;
+
+        if (OMPI_SUCCESS !=
+            (rc = progress_no_credits_pending_frags(ep))) {
+            return rc;
+        }
+    }
+
+    send_credits(ep, cqp);
+
+    return OMPI_SUCCESS;
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
 {
     switch(status) {
@@ -3632,6 +3733,27 @@ static int btl_openib_component_progress(void)
         count += progress_one_device(device);
     }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    /* Check to see if there are any outstanding dtoh CUDA events that
+     * have completed.  If so, issue the PML callbacks on the fragments.
+     * The only thing that gets completed here are asynchronous copies
+     * so there is no need to free anything.
+     */
+    {
+        int local_count = 0;
+        mca_btl_base_descriptor_t *frag;
+        while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) {
+            opal_output(-1, "btl_openib: event completed on frag=%p", (void *)frag);
+            frag->des_cbfunc(NULL, NULL, frag, OMPI_SUCCESS);
+            local_count++;
+        }
+        count += local_count;
+    }
+    if (count > 0) {
+        opal_output(-1, "btl_openib: DONE with openib progress, count=%d", count);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     return count;
 
 #if OPAL_HAVE_THREADS
ompi/mca/btl/openib/btl_openib_mca.c
@@ -566,6 +566,31 @@ int btl_openib_register_mca_params(void)
                   &mca_btl_openib_component.super.btl_version,
                   &mca_btl_openib_module.super));
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* Default is enabling CUDA asynchronous send copies */
+    CHECK(reg_int("cuda_async_send", NULL,
+                  "Enable or disable CUDA async send copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_send = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_send) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
+    }
+    /* Default is enabling CUDA asynchronous receive copies */
+    CHECK(reg_int("cuda_async_recv", NULL,
+                  "Enable or disable CUDA async recv copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_recv = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_recv) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
+    }
+    /* Also make the max send size larger for better GPU buffer performance */
+    mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
+    /* Turn off message coalescing - not sure if it works with GPU buffers */
+    mca_btl_openib_component.use_message_coalescing = 0;
+#endif /* OMPI_CUDA_SUPPORT */
+
     /* setup all the qp stuff */
     /* round mid_qp_size to smallest power of two */
     mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1;
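Assuming the standard MCA prefixing for openib BTL parameters, the two knobs registered above should surface as btl_openib_cuda_async_send and btl_openib_cuda_async_recv, both defaulting to on when CUDA support is compiled in; a run such as `mpirun --mca btl_openib_cuda_async_recv 0 ...` would then presumably force receive-side copies back to synchronous while leaving the send side asynchronous.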
ompi/mca/pml/ob1/pml_ob1_cuda.c
@@ -73,7 +73,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
             rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
                                                      MCA_PML_OB1_HDR_FLAGS_CONTIG);
         } else {
-            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
+            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
         }
     }
 } else {
ompi/mca/pml/ob1/pml_ob1_progress.c
@@ -22,6 +22,11 @@
 #include "pml_ob1.h"
 #include "pml_ob1_sendreq.h"
 #include "ompi/mca/bml/base/base.h"
+#if OMPI_CUDA_SUPPORT
+#include "ompi/mca/common/cuda/common_cuda.h"
+#include "pml_ob1_recvreq.h"
+static void mca_pml_ob1_process_pending_cuda_async_copies(void);
+#endif /* OMPI_CUDA_SUPPORT */
 
 int mca_pml_ob1_progress(void)
 {
@@ -29,6 +34,10 @@ int mca_pml_ob1_progress(void)
     int j, completed_requests = 0;
     bool send_succedded;
 
+#if OMPI_CUDA_SUPPORT
+    mca_pml_ob1_process_pending_cuda_async_copies();
+#endif /* OMPI_CUDA_SUPPORT */
+
     if( OPAL_LIKELY(0 == queue_length) )
         return 0;
 
@@ -77,3 +86,20 @@ int mca_pml_ob1_progress(void)
     return completed_requests;
 }
 
+#if OMPI_CUDA_SUPPORT
+static void mca_pml_ob1_process_pending_cuda_async_copies(void)
+{
+    mca_btl_base_descriptor_t *frag;
+    int progress;
+
+    do {
+        progress = progress_one_cuda_htod_event(&frag);
+        if (1 == progress) {
+            /* Call the finish function to make progress. */
+            mca_pml_ob1_recv_request_frag_copy_finished(NULL, NULL, frag, 0);
+        }
+    } while (progress > 0);
+    /* Consider progressing dtoh events here in future */
+
+}
+#endif /* OMPI_CUDA_SUPPORT */
ompi/mca/pml/ob1/pml_ob1_recvfrag.c
@@ -44,6 +44,10 @@
 #include "pml_ob1_recvreq.h"
 #include "pml_ob1_sendreq.h"
 #include "pml_ob1_hdr.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t,
                     ompi_free_list_item_t,
@@ -332,6 +336,17 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
         OPAL_THREAD_ADD32(&sendreq->req_state, -1);
     }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND)) {
+        /* The user's buffer is GPU and this BTL can support asynchronous copies,
+         * so adjust the convertor accordingly.  All the subsequent fragments will
+         * use the asynchronous copy. */
+        void *strm = mca_common_cuda_get_dtoh_stream();
+        opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor, strm);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     if(send_request_pml_complete_check(sendreq) == false)
         mca_pml_ob1_send_request_schedule(sendreq);
 
@@ -351,6 +366,22 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
     recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If data is destined for GPU buffer and convertor was set up for asynchronous
+     * copies, then start the copy and return.  The copy completion will trigger
+     * the next phase. */
+    if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA_ASYNC) {
+        assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
+
+        /* This will trigger the opal_convertor_pack to start asynchronous copy. */
+        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_dst_cnt,des);
+
+        /* Let BTL know that it CANNOT free the frag */
+        des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+
+        return;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
     mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
 
     return;
ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -38,6 +38,10 @@
 #include "orte/mca/errmgr/errmgr.h"
 #include "opal/util/arch.h"
 #include "ompi/memchecker.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 #if OMPI_CUDA_SUPPORT
 int mca_pml_ob1_cuda_need_buffers(mca_pml_ob1_recv_request_t* recvreq,
@@ -527,6 +531,85 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq,
     }
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * This function is basically the first half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function.  This fires off
+ * the asynchronous copy and returns.  Unused fields in the descriptor
+ * are used to pass extra information for when the asynchronous copy
+ * completes.  No memchecker support in this function as copies are
+ * happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvreq,
+                                               mca_btl_base_module_t* btl,
+                                               mca_btl_base_segment_t* segments,
+                                               size_t num_segments,
+                                               mca_btl_base_descriptor_t* des)
+{
+    int result;
+    size_t bytes_received = 0, data_offset = 0;
+    size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */
+    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
+
+    OPAL_OUTPUT((-1, "start_frag_copy frag=%p", (void *)des));
+
+    bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
+                                                              sizeof(mca_pml_ob1_frag_hdr_t));
+    data_offset = hdr->hdr_frag.hdr_frag_offset;
+
+    MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq,
+                                     segments,
+                                     num_segments,
+                                     sizeof(mca_pml_ob1_frag_hdr_t),
+                                     data_offset,
+                                     bytes_received,
+                                     bytes_delivered );
+    /* Store the receive request in unused context pointer. */
+    des->des_context = (void *)recvreq;
+    /* Store the amount of bytes in unused src count value */
+    des->des_src_cnt = bytes_delivered;
+    /* Then record an event that will get triggered by a PML progress call which
+     * checks the stream events.  If we get an error, abort.  Should get message
+     * from CUDA code about what went wrong. */
+    result = mca_common_cuda_record_htod_event("pml", des);
+    if (OMPI_SUCCESS != result) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+
+/**
+ * This function is basically the second half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function.  The number of
+ * bytes delivered is updated.  Then a call is made into the BTL so it
+ * can free the fragment that held that data.  This is currently
+ * called directly by the common CUDA code.  No memchecker support
+ * in this function as copies are happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
+                                                  struct mca_btl_base_endpoint_t* ep,
+                                                  struct mca_btl_base_descriptor_t* des,
+                                                  int status )
+{
+    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context;
+    size_t bytes_received = des->des_src_cnt;
+
+    OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des));
+    /* Call into the BTL so it can free the descriptor.  At this point, it is
+     * known that the data has been copied out of the descriptor. */
+    des->des_cbfunc(NULL, (struct mca_btl_base_endpoint_t *)des->des_cbdata, des, 0);
+
+    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
+
+    /* check completion status */
+    if(recv_request_pml_complete_check(recvreq) == false &&
+       recvreq->req_rdma_offset < recvreq->req_send_offset) {
+        /* schedule additional rdma operations */
+        mca_pml_ob1_recv_request_schedule(recvreq, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /*
  * Update the recv request status to reflect the number of bytes
  * received and actually delivered to the application.
@@ -701,6 +784,17 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq,
         /* schedule additional rdma operations */
         mca_pml_ob1_recv_request_schedule(recvreq, NULL);
     }
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If BTL supports it and this is a CUDA buffer being received into,
+     * have all subsequent FRAGS copied in asynchronously. */
+    if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV)) {
+        void *strm = mca_common_cuda_get_htod_stream();
+        opal_cuda_set_copy_function_async(&recvreq->req_recv.req_base.req_convertor, strm);
+    }
+#endif
+
 }
 
 /*
ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -305,6 +305,19 @@ void mca_pml_ob1_recv_request_progress_frag(
     mca_btl_base_segment_t* segments,
     size_t num_segments);
 
+#if OMPI_CUDA_SUPPORT
+void mca_pml_ob1_recv_request_frag_copy_start(
+    mca_pml_ob1_recv_request_t* req,
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_segment_t* segments,
+    size_t num_segments,
+    mca_btl_base_descriptor_t* des);
+
+void mca_pml_ob1_recv_request_frag_copy_finished(struct mca_btl_base_module_t* btl,
+                                                 struct mca_btl_base_endpoint_t* ep,
+                                                 struct mca_btl_base_descriptor_t* des,
+                                                 int status );
+#endif /* OMPI_CUDA_SUPPORT */
 /**
  *
  */
ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -332,6 +332,39 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+/**
+ * This function is called when the copy of the frag from the GPU buffer
+ * to the internal buffer is complete.  Used to support asynchronous
+ * copies from GPU to host buffers.  Now the data can be sent.
+ */
+static void
+mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl,
+                                  struct mca_btl_base_endpoint_t* ep,
+                                  struct mca_btl_base_descriptor_t* des,
+                                  int status )
+{
+    int rc;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    des->des_cbfunc = mca_pml_ob1_frag_completion;
+    /* Reset the BTL ownership flag as the BTL can free it after completion. */
+    des->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+    opal_output(-1, "copy_frag_completion FRAG frag=%p", (void *)des);
+    /* Currently, we cannot support a failure in the send.  In the blocking
+     * case, the counters tracking the fragments being sent are not adjusted
+     * until the function returns success, so it handles the error by leaving
+     * all the buffer counters intact.  In this case, it is too late so
+     * we just abort.  In theory, a new queue could be created to hold this
+     * fragment and then attempt to send it out on another BTL. */
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
+    if(OPAL_UNLIKELY(rc < 0)) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /**
  * Buffer the entire message and mark as complete.
  */
@@ -1030,6 +1063,32 @@ cannot_pack:
                           &(sendreq->req_send.req_base), size, PERUSE_SEND);
 #endif /* OMPI_WANT_PERUSE */
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+        /* At this point, check to see if the BTL is doing an asynchronous
+         * copy.  This would have been initiated in the mca_bml_base_prepare_src
+         * called above.  The flag is checked here as we let the hdr be
+         * set up prior to checking.
+         */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            opal_output(-1, "Initiating async copy on FRAG frag=%p", (void *)des);
+            /* Need to make sure BTL does not free frag after completion
+             * of asynchronous copy as we still need to send the fragment. */
+            des->des_flags &= ~MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+            /* Unclear that this flag needs to be set but to be sure, set it */
+            des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+            des->des_cbfunc = mca_pml_ob1_copy_frag_completion;
+            range->range_btls[btl_idx].length -= size;
+            range->range_send_length -= size;
+            range->range_send_offset += size;
+            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            if(range->range_send_length == 0) {
+                range = get_next_send_range(sendreq, range);
+                prev_bytes_remaining = 0;
+            }
+            continue;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
+
         /* initiate send - note that this may complete before the call returns */
         rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
         if( OPAL_LIKELY(rc >= 0) ) {
opal/datatype/opal_convertor.c
@@ -41,7 +41,7 @@
 #if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_datatype_cuda.h"
 #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor,
@@ -55,7 +55,7 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
     convertor->remoteArch = opal_local_arch;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
 #if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
+    convertor->cbmemcpy = &opal_cuda_memcpy;
 #endif
 }
 
opal/datatype/opal_convertor.h
@@ -55,6 +55,7 @@ BEGIN_C_DECLS
 #define CONVERTOR_NO_OP 0x00100000
 #define CONVERTOR_WITH_CHECKSUM 0x00200000
 #define CONVERTOR_CUDA 0x00400000
+#define CONVERTOR_CUDA_ASYNC 0x00800000
 #define CONVERTOR_TYPE_MASK 0x00FF0000
 #define CONVERTOR_STATE_START 0x01000000
 #define CONVERTOR_STATE_COMPLETE 0x02000000
@@ -69,7 +70,7 @@ typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor,
                                             uint32_t* out_size,
                                             size_t* max_data );
 typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata );
-typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n );
+typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor );
 
 /* The master convertor struct (defined in convertor_internal.h) */
 struct opal_convertor_master_t;
@@ -116,6 +117,7 @@ struct opal_convertor_t {
 
 #if OPAL_CUDA_SUPPORT
     memcpy_fct_t cbmemcpy;   /**< memcpy or cuMemcpy */
+    void * stream;           /**< CUstream for async copy */
 #endif
     /* size: 248, cachelines: 4, members: 20 */
    /* last cacheline: 56 bytes */
@@ -164,9 +166,6 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor )
         convertor->pStack = convertor->static_stack;
         convertor->stack_size = DT_STATIC_STACK_SIZE;
     }
-#if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
-#endif
     convertor->pDesc = NULL;
     convertor->stack_pos = 0;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
opal/datatype/opal_datatype_copy.c
@@ -78,7 +78,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
 #undef MEM_OP_NAME
 #define MEM_OP_NAME non_overlap_cuda
 #undef MEM_OP
-#define MEM_OP opal_cuda_memcpy
+#define MEM_OP opal_cuda_memcpy_sync
 #include "opal_datatype_copy.h"
 
 #undef MEM_OP_NAME
opal/datatype/opal_datatype_cuda.c
@@ -86,11 +86,41 @@ bool opal_cuda_check_bufs(char *dest, char *src)
 }
 
 /*
- * Need intermediate cuMemcpy function so we can check the return code
- * of the call.  If we see an error, abort as there is no recovery at
- * this point.
+ * With CUDA enabled, all contiguous copies will pass through this function.
+ * Therefore, the first check is to see if the convertor is a GPU buffer.
+ * Note that if there is an error with any of the CUDA calls, the program
+ * aborts as there is no recovering.
 */
-void *opal_cuda_memcpy(void *dest, void *src, size_t size)
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
 {
     int res;
 
+    if (!(convertor->flags & CONVERTOR_CUDA)) {
+        return memcpy(dest, src, size);
+    }
+
+    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+        res = cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
+                            (CUstream)convertor->stream);
+    } else {
+        res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
+    }
+
+    if (res != CUDA_SUCCESS) {
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
+                    res, dest, src, (int)size);
+        abort();
+    } else {
+        return dest;
+    }
+}
+
+/*
+ * This function is needed in cases where we do not have contiguous
+ * datatypes.  The current code has macros that cannot handle a convertor
+ * argument to the memcpy call.
+ */
+void *opal_cuda_memcpy_sync(void *dest, void *src, size_t size)
+{
+    int res;
     res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
@@ -170,3 +200,13 @@ static void opal_cuda_support_init(void)
 
     initialized = true;
 }
+
+/**
+ * Tell the convertor that copies will be asynchronous CUDA copies.  The
+ * flags are cleared when the convertor is reinitialized.
+ */
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
+{
+    convertor->flags |= CONVERTOR_CUDA_ASYNC;
+    convertor->stream = stream;
+}
opal/datatype/opal_datatype_cuda.h
@@ -12,8 +12,10 @@
 
 void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
 bool opal_cuda_check_bufs(char *dest, char *src);
-void* opal_cuda_memcpy(void * dest, void * src, size_t size);
+void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
+void* opal_cuda_memcpy_sync(void * dest, void * src, size_t size);
 void* opal_cuda_memmove(void * dest, void * src, size_t size);
 void opal_cuda_add_initialization_function(int (*fptr)(void));
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
 
 #endif
opal/datatype/opal_datatype_pack.h
@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
opal/datatype/opal_datatype_unpack.c
@@ -324,7 +324,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem,
      * memory, need to use the special host to device memory copy.
      * Note this code path was only seen on large receives of
      * noncontiguous data via buffered sends. */
-    pConvertor->cbmemcpy(saved_data, real_data, data_length );
+    pConvertor->cbmemcpy(saved_data, real_data, data_length, pConvertor );
 #else
     /* Save the content of the user memory */
     MEMCPY( saved_data, real_data, data_length );
@@ -347,10 +347,10 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem,
      * data via buffered sends. */
     {
         char resaved_data[16];
-        pConvertor->cbmemcpy(resaved_data, real_data, data_length );
+        pConvertor->cbmemcpy(resaved_data, real_data, data_length, pConvertor );
         for( i = 0; i < data_length; i++ ) {
             if( unused_byte == resaved_data[i] )
-                pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1);
+                pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1, pConvertor);
         }
     }
 #else
opal/datatype/opal_datatype_unpack.h
@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 #include "opal/datatype/opal_convertor.h"