
Improve CUDA GPU transfers over the openib BTL. Use asynchronous copies.

This is the RFC that was submitted in July and December of 2012.

This commit was SVN r27862.
Rolf vandeVaart, 2013-01-17 22:34:43 +00:00
parent 92e297d1fa
commit f63c88701f
19 changed files with 452 additions and 18 deletions
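In outline, the send side now works as follows. This is a condensed sketch assembled from the diffs below, using only names this commit introduces; error handling and locking are omitted, and the receive side mirrors it with htod events.

/* PML (recvfrag ACK path): the send buffer is on the GPU and the BTL
 * advertises MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND, so switch the convertor
 * to asynchronous device-to-host copies on the shared dtoh stream. */
opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor,
                                  mca_common_cuda_get_dtoh_stream());

/* BTL (openib prepare_src): opal_convertor_pack() now queues a cuMemcpyAsync;
 * record an event behind the copy and mark the descriptor as asynchronous. */
mca_common_cuda_record_dtoh_event("btl_openib", des);
des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;

/* BTL progress: when the event completes, fire the PML callback
 * (mca_pml_ob1_copy_frag_completion), which finally posts the send. */
while (1 == progress_one_cuda_dtoh_event(&des)) {
    des->des_cbfunc(NULL, NULL, des, OMPI_SUCCESS);
}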

View file

@@ -199,6 +199,8 @@ typedef uint8_t mca_btl_base_tag_t;
 #define MCA_BTL_FLAGS_CUDA_PUT 0x0400
 #define MCA_BTL_FLAGS_CUDA_GET 0x0800
 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000
 /* Default exclusivity levels */
 #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
@@ -298,6 +300,10 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
  */
 #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004
+/* Tell the PML that the copy is being done asynchronously
+ */
+#define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC 0x0008
 /* Type of transfer that will be done with this frag.
  */
 #define MCA_BTL_DES_FLAGS_PUT 0x0010

View file

@@ -55,6 +55,10 @@
 #include "ompi/mca/mpool/base/base.h"
 #include "ompi/mca/mpool/mpool.h"
 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 #include "orte/util/proc_info.h"
 #include <errno.h>
 #include <sys/types.h>
@@ -1287,6 +1291,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
         iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
         rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+        /* If the convertor is copying the data asynchronously, then record an event
+         * that will trigger the callback when it completes. Mark descriptor as async. */
+        if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+            mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag);
+            to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
         *size = max_data;
         /* not all upper layer users set this */

View file

@@ -308,6 +308,10 @@ struct mca_btl_openib_component_t {
     size_t memalign_threshold;
     void* (*previous_malloc_hook)(size_t __size, const void*);
 #endif
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    int cuda_async_send;
+    int cuda_async_recv;
+#endif /* OMPI_CUDA_SUPPORT */
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;

View file

@@ -103,6 +103,12 @@ static int btl_openib_component_open(void);
 static int btl_openib_component_close(void);
 static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
 static int btl_openib_component_progress(void);
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_openib_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status);
+#endif /* OMPI_CUDA_SUPPORT */
 /*
  * Local variables
  */
@@ -3060,8 +3066,24 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
         /* call registered callback */
         mca_btl_active_message_callback_t* reg;
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        /* The COPY_ASYNC flag should not be set */
+        assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC));
+#endif /* OMPI_CUDA_SUPPORT */
         reg = mca_btl_base_active_message_trigger + hdr->tag;
         reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            /* Since ASYNC flag is set, we know this descriptor is being used
+             * for asynchronous copy and cannot be freed yet. Therefore, set
+             * up callback for PML to call when complete, add argument into
+             * descriptor and return. */
+            des->des_cbfunc = btl_openib_handle_incoming_completion;
+            des->des_cbdata = (void *)ep;
+            return OMPI_SUCCESS;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
         if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
             cqp = (hdr->credits >> 11) & 0x0f;
             hdr->credits &= 0x87ff;
@@ -3152,6 +3174,85 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     return OMPI_SUCCESS;
 }
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * Called by the PML when the copying of the data out of the fragment
+ * is complete.
+ */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_base_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status)
+{
+    mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des;
+    mca_btl_openib_header_t *hdr = frag->hdr;
+    int rqp = to_base_frag(frag)->base.order, cqp;
+    uint16_t rcredits = 0, credits;
+    bool is_credit_msg;
+
+    OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des));
+
+    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
+        cqp = (hdr->credits >> 11) & 0x0f;
+        hdr->credits &= 0x87ff;
+    } else {
+        cqp = rqp;
+    }
+    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
+        rcredits = BTL_OPENIB_CREDITS(hdr->credits);
+        hdr->credits = 0;
+    }
+    credits = hdr->credits;
+
+    if(hdr->cm_seen)
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);
+
+    /* We should not be here with eager or control messages */
+    assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA);
+    assert(0 == is_cts_message(frag));
+    /* HACK - clear out flags. Must be better way */
+    des->des_flags = 0;
+    /* Otherwise, FRAG_RETURN it and repost if necessary */
+    MCA_BTL_IB_FRAG_RETURN(frag);
+    if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
+        if (OPAL_UNLIKELY(is_credit_msg)) {
+            OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
+        } else {
+            OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
+        }
+        mca_btl_openib_endpoint_post_rr(ep, cqp);
+    } else {
+        mca_btl_openib_module_t *btl = ep->endpoint_btl;
+        OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
+        mca_btl_openib_post_srr(btl, rqp);
+    }
+
+    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);
+
+    /* If we got any credits (RDMA or send), then try to progress all
+       the no_credits_pending_frags lists */
+    if (rcredits > 0) {
+        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
+    }
+    if (credits > 0) {
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
+    }
+    if (rcredits + credits > 0) {
+        int rc;
+
+        if (OMPI_SUCCESS !=
+            (rc = progress_no_credits_pending_frags(ep))) {
+            return rc;
+        }
+    }
+
+    send_credits(ep, cqp);
+
+    return OMPI_SUCCESS;
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
 {
     switch(status) {
@@ -3632,6 +3733,27 @@ static int btl_openib_component_progress(void)
         count += progress_one_device(device);
     }
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    /* Check to see if there are any outstanding dtoh CUDA events that
+     * have completed. If so, issue the PML callbacks on the fragments.
+     * The only thing that gets completed here are asynchronous copies
+     * so there is no need to free anything.
+     */
+    {
+        int local_count = 0;
+        mca_btl_base_descriptor_t *frag;
+        while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) {
+            opal_output(-1, "btl_openib: event completed on frag=%p", (void *)frag);
+            frag->des_cbfunc(NULL, NULL, frag, OMPI_SUCCESS);
+            local_count++;
+        }
+        count += local_count;
+    }
+    if (count > 0) {
+        opal_output(-1, "btl_openib: DONE with openib progress, count=%d", count);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     return count;
 
 #if OPAL_HAVE_THREADS

View file

@@ -566,6 +566,31 @@ int btl_openib_register_mca_params(void)
                            &mca_btl_openib_component.super.btl_version,
                            &mca_btl_openib_module.super));
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* Default is enabling CUDA asynchronous send copies */
+    CHECK(reg_int("cuda_async_send", NULL,
+                  "Enable or disable CUDA async send copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_send = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_send) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
+    }
+
+    /* Default is enabling CUDA asynchronous receive copies */
+    CHECK(reg_int("cuda_async_recv", NULL,
+                  "Enable or disable CUDA async recv copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_recv = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_recv) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
+    }
+
+    /* Also make the max send size larger for better GPU buffer performance */
+    mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
+    /* Turn off message coalescing - not sure if it works with GPU buffers */
+    mca_btl_openib_component.use_message_coalescing = 0;
+#endif /* OMPI_CUDA_SUPPORT */
+
     /* setup all the qp stuff */
     /* round mid_qp_size to smallest power of two */
     mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1;
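With these parameters registered, the asynchronous copies can be switched off at run time. Assuming the component's usual btl_openib_ prefix for its MCA parameters, something like

    mpirun --mca btl_openib_cuda_async_send 0 --mca btl_openib_cuda_async_recv 0 ./app

leaves the corresponding MCA_BTL_FLAGS_CUDA_COPY_ASYNC_* flags unset and falls back to the synchronous cuMemcpy path on both sides.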

View file

@@ -73,7 +73,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
             rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
                                                      MCA_PML_OB1_HDR_FLAGS_CONTIG);
         } else {
-            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
+            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
         }
     }
 } else {

View file

@@ -22,6 +22,11 @@
 #include "pml_ob1.h"
 #include "pml_ob1_sendreq.h"
 #include "ompi/mca/bml/base/base.h"
+#if OMPI_CUDA_SUPPORT
+#include "ompi/mca/common/cuda/common_cuda.h"
+#include "pml_ob1_recvreq.h"
+static void mca_pml_ob1_process_pending_cuda_async_copies(void);
+#endif /* OMPI_CUDA_SUPPORT */
 
 int mca_pml_ob1_progress(void)
 {
@@ -29,6 +34,10 @@ int mca_pml_ob1_progress(void)
     int j, completed_requests = 0;
     bool send_succedded;
 
+#if OMPI_CUDA_SUPPORT
+    mca_pml_ob1_process_pending_cuda_async_copies();
+#endif /* OMPI_CUDA_SUPPORT */
+
     if( OPAL_LIKELY(0 == queue_length) )
         return 0;
@@ -77,3 +86,20 @@ int mca_pml_ob1_progress(void)
     return completed_requests;
 }
+
+#if OMPI_CUDA_SUPPORT
+static void mca_pml_ob1_process_pending_cuda_async_copies(void)
+{
+    mca_btl_base_descriptor_t *frag;
+    int progress;
+
+    do {
+        progress = progress_one_cuda_htod_event(&frag);
+        if (1 == progress) {
+            /* Call the finish function to make progress. */
+            mca_pml_ob1_recv_request_frag_copy_finished(NULL, NULL, frag, 0);
+        }
+    } while (progress > 0);
+
+    /* Consider progressing dtoh events here in future */
+}
+#endif /* OMPI_CUDA_SUPPORT */

View file

@@ -44,6 +44,10 @@
 #include "pml_ob1_recvreq.h"
 #include "pml_ob1_sendreq.h"
 #include "pml_ob1_hdr.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t,
                     ompi_free_list_item_t,
@@ -332,6 +336,17 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
         OPAL_THREAD_ADD32(&sendreq->req_state, -1);
     }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND)) {
+        /* The user's buffer is GPU and this BTL can support asynchronous copies,
+         * so adjust the convertor accordingly. All the subsequent fragments will
+         * use the asynchronous copy. */
+        void *strm = mca_common_cuda_get_dtoh_stream();
+        opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor, strm);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     if(send_request_pml_complete_check(sendreq) == false)
         mca_pml_ob1_send_request_schedule(sendreq);
 
@@ -351,6 +366,22 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
     recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If data is destined for GPU buffer and convertor was set up for asynchronous
+     * copies, then start the copy and return. The copy completion will trigger
+     * the next phase. */
+    if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA_ASYNC) {
+        assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
+        /* This will trigger the opal_convertor_pack to start asynchronous copy. */
+        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_dst_cnt,des);
+        /* Let BTL know that it CANNOT free the frag */
+        des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+        return;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
     mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
 
     return;

View file

@@ -38,6 +38,10 @@
 #include "orte/mca/errmgr/errmgr.h"
 #include "opal/util/arch.h"
 #include "ompi/memchecker.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 #if OMPI_CUDA_SUPPORT
 int mca_pml_ob1_cuda_need_buffers(mca_pml_ob1_recv_request_t* recvreq,
@@ -527,6 +531,85 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
     }
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * This function is basically the first half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function. This fires off
+ * the asynchronous copy and returns. Unused fields in the descriptor
+ * are used to pass extra information for when the asynchronous copy
+ * completes. No memchecker support in this function as copies are
+ * happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvreq,
+                                               mca_btl_base_module_t* btl,
+                                               mca_btl_base_segment_t* segments,
+                                               size_t num_segments,
+                                               mca_btl_base_descriptor_t* des)
+{
+    int result;
+    size_t bytes_received = 0, data_offset = 0;
+    size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */
+    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
+
+    OPAL_OUTPUT((-1, "start_frag_copy frag=%p", (void *)des));
+
+    bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
+                                                              sizeof(mca_pml_ob1_frag_hdr_t));
+    data_offset = hdr->hdr_frag.hdr_frag_offset;
+
+    MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq,
+                                     segments,
+                                     num_segments,
+                                     sizeof(mca_pml_ob1_frag_hdr_t),
+                                     data_offset,
+                                     bytes_received,
+                                     bytes_delivered );
+    /* Store the receive request in unused context pointer. */
+    des->des_context = (void *)recvreq;
+    /* Store the amount of bytes in unused src count value */
+    des->des_src_cnt = bytes_delivered;
+    /* Then record an event that will get triggered by a PML progress call which
+     * checks the stream events. If we get an error, abort. Should get message
+     * from CUDA code about what went wrong. */
+    result = mca_common_cuda_record_htod_event("pml", des);
+    if (OMPI_SUCCESS != result) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+
+/**
+ * This function is basically the second half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function. The number of
+ * bytes delivered is updated. Then a call is made into the BTL so it
+ * can free the fragment that held that data. This is currently
+ * called directly by the common CUDA code. No memchecker support
+ * in this function as copies are happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
+                                                  struct mca_btl_base_endpoint_t* ep,
+                                                  struct mca_btl_base_descriptor_t* des,
+                                                  int status )
+{
+    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context;
+    size_t bytes_received = des->des_src_cnt;
+
+    OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des));
+    /* Call into the BTL so it can free the descriptor. At this point, it is
+     * known that the data has been copied out of the descriptor. */
+    des->des_cbfunc(NULL, (struct mca_btl_base_endpoint_t *)des->des_cbdata, des, 0);
+
+    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
+
+    /* check completion status */
+    if(recv_request_pml_complete_check(recvreq) == false &&
+       recvreq->req_rdma_offset < recvreq->req_send_offset) {
+        /* schedule additional rdma operations */
+        mca_pml_ob1_recv_request_schedule(recvreq, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /*
  * Update the recv request status to reflect the number of bytes
  * received and actually delivered to the application.
@@ -701,6 +784,17 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq
         /* schedule additional rdma operations */
         mca_pml_ob1_recv_request_schedule(recvreq, NULL);
     }
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If BTL supports it and this is a CUDA buffer being received into,
+     * have all subsequent FRAGS copied in asynchronously. */
+    if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV)) {
+        void *strm = mca_common_cuda_get_htod_stream();
+        opal_cuda_set_copy_function_async(&recvreq->req_recv.req_base.req_convertor, strm);
+    }
+#endif
 }
 
 /*
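Taken together with the recvfrag change above, the receive side of a FRAG now splits into two halves around the asynchronous copy. A condensed sketch, using only names added by this commit (the real progress loop lives in pml_ob1_progress.c):

/* First half: unpack through the convertor, which queues a cuMemcpyAsync on
 * the htod stream, then record an event and stash the request in the
 * descriptor's unused fields. */
mca_pml_ob1_recv_request_frag_copy_start(recvreq, btl, segments, num_segments, des);

/* PML progress later polls the htod event queue; each completed event resumes
 * the second half, which credits the bytes and lets the BTL free the frag. */
while (1 == progress_one_cuda_htod_event(&des)) {
    mca_pml_ob1_recv_request_frag_copy_finished(NULL, NULL, des, 0);
}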

View file

@@ -305,6 +305,19 @@ void mca_pml_ob1_recv_request_progress_frag(
     mca_btl_base_segment_t* segments,
     size_t num_segments);
 
+#if OMPI_CUDA_SUPPORT
+void mca_pml_ob1_recv_request_frag_copy_start(
+    mca_pml_ob1_recv_request_t* req,
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_segment_t* segments,
+    size_t num_segments,
+    mca_btl_base_descriptor_t* des);
+
+void mca_pml_ob1_recv_request_frag_copy_finished(struct mca_btl_base_module_t* btl,
+                                                 struct mca_btl_base_endpoint_t* ep,
+                                                 struct mca_btl_base_descriptor_t* des,
+                                                 int status );
+#endif /* OMPI_CUDA_SUPPORT */
+
 /**
  *
  */

View file

@@ -332,6 +332,39 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+/**
+ * This function is called when the copy of the frag from the GPU buffer
+ * to the internal buffer is complete. Used to support asynchronous
+ * copies from GPU to host buffers. Now the data can be sent.
+ */
+static void
+mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl,
+                                  struct mca_btl_base_endpoint_t* ep,
+                                  struct mca_btl_base_descriptor_t* des,
+                                  int status )
+{
+    int rc;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    des->des_cbfunc = mca_pml_ob1_frag_completion;
+    /* Reset the BTL ownership flag as the BTL can free it after completion. */
+    des->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+    opal_output(-1, "copy_frag_completion FRAG frag=%p", (void *)des);
+    /* Currently, we cannot support a failure in the send. In the blocking
+     * case, the counters tracking the fragments being sent are not adjusted
+     * until the function returns success, so it handles the error by leaving
+     * all the buffer counters intact. In this case, it is too late so
+     * we just abort. In theory, a new queue could be created to hold this
+     * fragment and then attempt to send it out on another BTL. */
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
+    if(OPAL_UNLIKELY(rc < 0)) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /**
  * Buffer the entire message and mark as complete.
  */
@@ -1030,6 +1063,32 @@ cannot_pack:
                                       &(sendreq->req_send.req_base), size, PERUSE_SEND);
 #endif /* OMPI_WANT_PERUSE */
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+        /* At this point, check to see if the BTL is doing an asynchronous
+         * copy. This would have been initiated in the mca_bml_base_prepare_src
+         * called above. The flag is checked here as we let the hdr be
+         * set up prior to checking.
+         */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            opal_output(-1, "Initiating async copy on FRAG frag=%p", (void *)des);
+            /* Need to make sure BTL does not free frag after completion
+             * of asynchronous copy as we still need to send the fragment. */
+            des->des_flags &= ~MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+            /* Unclear that this flag needs to be set but to be sure, set it */
+            des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+            des->des_cbfunc = mca_pml_ob1_copy_frag_completion;
+            range->range_btls[btl_idx].length -= size;
+            range->range_send_length -= size;
+            range->range_send_offset += size;
+            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            if(range->range_send_length == 0) {
+                range = get_next_send_range(sendreq, range);
+                prev_bytes_remaining = 0;
+            }
+            continue;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
+
         /* initiate send - note that this may complete before the call returns */
         rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
         if( OPAL_LIKELY(rc >= 0) ) {

View file

@@ -41,7 +41,7 @@
 #if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_datatype_cuda.h"
 #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor,
@@ -55,7 +55,7 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
     convertor->remoteArch = opal_local_arch;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
 #if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
+    convertor->cbmemcpy = &opal_cuda_memcpy;
 #endif
 }

View file

@@ -55,6 +55,7 @@ BEGIN_C_DECLS
 #define CONVERTOR_NO_OP 0x00100000
 #define CONVERTOR_WITH_CHECKSUM 0x00200000
 #define CONVERTOR_CUDA 0x00400000
+#define CONVERTOR_CUDA_ASYNC 0x00800000
 #define CONVERTOR_TYPE_MASK 0x00FF0000
 #define CONVERTOR_STATE_START 0x01000000
 #define CONVERTOR_STATE_COMPLETE 0x02000000
@@ -69,7 +70,7 @@ typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor,
                                             uint32_t* out_size,
                                             size_t* max_data );
 typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata );
-typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n );
+typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor );
 
 /* The master convertor struct (defined in convertor_internal.h) */
 struct opal_convertor_master_t;
@@ -116,6 +117,7 @@ struct opal_convertor_t {
 #if OPAL_CUDA_SUPPORT
     memcpy_fct_t cbmemcpy;   /**< memcpy or cuMemcpy */
+    void * stream;           /**< CUstream for async copy */
 #endif
     /* size: 248, cachelines: 4, members: 20 */
     /* last cacheline: 56 bytes */
@@ -164,9 +166,6 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor )
         convertor->pStack = convertor->static_stack;
         convertor->stack_size = DT_STATIC_STACK_SIZE;
     }
-#if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
-#endif
     convertor->pDesc = NULL;
     convertor->stack_pos = 0;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;

View file

@@ -78,7 +78,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
 #undef MEM_OP_NAME
 #define MEM_OP_NAME non_overlap_cuda
 #undef MEM_OP
-#define MEM_OP opal_cuda_memcpy
+#define MEM_OP opal_cuda_memcpy_sync
 #include "opal_datatype_copy.h"
 #undef MEM_OP_NAME

View file

@@ -86,11 +86,41 @@ bool opal_cuda_check_bufs(char *dest, char *src)
 }
 
 /*
- * Need intermediate cuMemcpy function so we can check the return code
- * of the call. If we see an error, abort as there is no recovery at
- * this point.
+ * With CUDA enabled, all contiguous copies will pass through this function.
+ * Therefore, the first check is to see if the convertor is a GPU buffer.
+ * Note that if there is an error with any of the CUDA calls, the program
+ * aborts as there is no recovering.
  */
-void *opal_cuda_memcpy(void *dest, void *src, size_t size)
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
+{
+    int res;
+
+    if (!(convertor->flags & CONVERTOR_CUDA)) {
+        return memcpy(dest, src, size);
+    }
+
+    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+        res = cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
+                            (CUstream)convertor->stream);
+    } else {
+        res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
+    }
+
+    if (res != CUDA_SUCCESS) {
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
+                    res, dest, src, (int)size);
+        abort();
+    } else {
+        return dest;
+    }
+}
+
+/*
+ * This function is needed in cases where we do not have contiguous
+ * datatypes. The current code has macros that cannot handle a convertor
+ * argument to the memcpy call.
+ */
+void *opal_cuda_memcpy_sync(void *dest, void *src, size_t size)
 {
     int res;
     res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
@@ -170,3 +200,13 @@ static void opal_cuda_support_init(void)
 
     initialized = true;
 }
+
+/**
+ * Tell the convertor that copies will be asynchronous CUDA copies. The
+ * flags are cleared when the convertor is reinitialized.
+ */
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
+{
+    convertor->flags |= CONVERTOR_CUDA_ASYNC;
+    convertor->stream = stream;
+}
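The convertor-side contract is small: once opal_cuda_set_copy_function_async() has been called, every contiguous copy routed through cbmemcpy on a GPU buffer becomes a stream-ordered cuMemcpyAsync instead of a blocking cuMemcpy. A minimal sketch of the intended use (the stream here is created only for illustration; the PML actually passes the shared dtoh/htod streams from common_cuda):

CUstream stream;
cuStreamCreate(&stream, 0);
opal_cuda_set_copy_function_async(convertor, (void *)stream);  /* sets CONVERTOR_CUDA_ASYNC */

/* later, inside pack/unpack of a GPU buffer: */
convertor->cbmemcpy(dst, src, len, convertor);                  /* -> cuMemcpyAsync(..., stream) */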

View file

@@ -12,8 +12,10 @@
 void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
 bool opal_cuda_check_bufs(char *dest, char *src);
-void* opal_cuda_memcpy(void * dest, void * src, size_t size);
+void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
+void* opal_cuda_memcpy_sync(void * dest, void * src, size_t size);
 void* opal_cuda_memmove(void * dest, void * src, size_t size);
 void opal_cuda_add_initialization_function(int (*fptr)(void));
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
 
 #endif

View file

@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,

View file

@@ -324,7 +324,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
              * memory, need to use the special host to device memory copy.
              * Note this code path was only seen on large receives of
              * noncontiguous data via buffered sends. */
-            pConvertor->cbmemcpy(saved_data, real_data, data_length );
+            pConvertor->cbmemcpy(saved_data, real_data, data_length, pConvertor );
 #else
             /* Save the content of the user memory */
             MEMCPY( saved_data, real_data, data_length );
@@ -347,10 +347,10 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
              * data via buffered sends. */
            {
                char resaved_data[16];
-               pConvertor->cbmemcpy(resaved_data, real_data, data_length );
+               pConvertor->cbmemcpy(resaved_data, real_data, data_length, pConvertor );
                for( i = 0; i < data_length; i++ ) {
                    if( unused_byte == resaved_data[i] )
-                       pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1);
+                       pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1, pConvertor);
                }
            }
 #else

View file

@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 #include "opal/datatype/opal_convertor.h"