diff --git a/ompi/mca/btl/btl.h b/ompi/mca/btl/btl.h index ef3240aa37..bde52a1d29 100644 --- a/ompi/mca/btl/btl.h +++ b/ompi/mca/btl/btl.h @@ -199,6 +199,8 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_FLAGS_CUDA_PUT 0x0400 #define MCA_BTL_FLAGS_CUDA_GET 0x0800 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT) +#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000 +#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000 /* Default exclusivity levels */ #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */ @@ -298,6 +300,10 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t); */ #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004 +/* Tell the PML that the copy is being done asynchronously + */ +#define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC 0x0008 + /* Type of transfer that will be done with this frag. */ #define MCA_BTL_DES_FLAGS_PUT 0x0010 diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 99df0c1d40..1858fa4227 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -55,6 +55,10 @@ #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/mpool.h" #include "ompi/mca/mpool/grdma/mpool_grdma.h" +#if OMPI_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#include "ompi/mca/common/cuda/common_cuda.h" +#endif /* OMPI_CUDA_SUPPORT */ #include "orte/util/proc_info.h" #include #include @@ -1287,6 +1291,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve ); rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data); +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ + /* If the convertor is copying the data asynchronously, then record an event + * that will trigger the callback when it completes. 
Mark descriptor as async.*/ + if (convertor->flags & CONVERTOR_CUDA_ASYNC) { + mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag); + to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC; + } +#endif /* OMPI_CUDA_SUPPORT */ + *size = max_data; /* not all upper layer users set this */ diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 9161e0006b..a8132891c8 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -308,6 +308,10 @@ struct mca_btl_openib_component_t { size_t memalign_threshold; void* (*previous_malloc_hook)(size_t __size, const void*); #endif +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ + int cuda_async_send; + int cuda_async_recv; +#endif /* OMPI_CUDA_SUPPORT */ }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t; OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component; diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index d3206b066d..b1d959f190 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -103,6 +103,12 @@ static int btl_openib_component_open(void); static int btl_openib_component_close(void); static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool); static int btl_openib_component_progress(void); +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ +static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl, + mca_btl_openib_endpoint_t *ep, + mca_btl_base_descriptor_t* des, + int status); +#endif /* OMPI_CUDA_SUPPORT */ /* * Local variables */ @@ -3060,8 +3066,24 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) { /* call registered callback */ mca_btl_active_message_callback_t* reg; + +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ + /* The COPY_ASYNC flag should not be set */ + assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC)); +#endif /* OMPI_CUDA_SUPPORT */ reg = mca_btl_base_active_message_trigger + hdr->tag; reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata ); +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ + if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) { + /* Since ASYNC flag is set, we know this descriptor is being used + * for asynchronous copy and cannot be freed yet. Therefore, set + * up callback for PML to call when complete, add argument into + * descriptor and return. */ + des->des_cbfunc = btl_openib_handle_incoming_completion; + des->des_cbdata = (void *)ep; + return OMPI_SUCCESS; + } +#endif /* OMPI_CUDA_SUPPORT */ if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) { cqp = (hdr->credits >> 11) & 0x0f; hdr->credits &= 0x87ff; @@ -3152,6 +3174,85 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, return OMPI_SUCCESS; } +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ +/** + * Called by the PML when the copying of the data out of the fragment + * is complete. 
+ */ +static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl, + mca_btl_base_endpoint_t *ep, + mca_btl_base_descriptor_t* des, + int status) +{ + mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des; + mca_btl_openib_header_t *hdr = frag->hdr; + int rqp = to_base_frag(frag)->base.order, cqp; + uint16_t rcredits = 0, credits; + bool is_credit_msg = false; /* credit messages never take the async path */ + + OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des)); + + if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) { + cqp = (hdr->credits >> 11) & 0x0f; + hdr->credits &= 0x87ff; + } else { + cqp = rqp; + } + if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) { + rcredits = BTL_OPENIB_CREDITS(hdr->credits); + hdr->credits = 0; + } + + credits = hdr->credits; + + if(hdr->cm_seen) + OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen); + + /* We should not be here with eager or control messages */ + assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA); + assert(0 == is_cts_message(frag)); + /* HACK - clear out flags. Must be better way */ + des->des_flags = 0; + /* FRAG_RETURN the fragment and repost if necessary */ + MCA_BTL_IB_FRAG_RETURN(frag); + if (BTL_OPENIB_QP_TYPE_PP(rqp)) { + if (OPAL_UNLIKELY(is_credit_msg)) { + OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1); + } else { + OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1); + } + mca_btl_openib_endpoint_post_rr(ep, cqp); + } else { + mca_btl_openib_module_t *btl = ep->endpoint_btl; + OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1); + mca_btl_openib_post_srr(btl, rqp); + } + + assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits); + + /* If we got any credits (RDMA or send), then try to progress all + the no_credits_pending_frags lists */ + if (rcredits > 0) { + OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits); + } + if (credits > 0) { + OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits); + } + if (rcredits + credits > 0) { + int rc; + + if (OMPI_SUCCESS != + (rc = progress_no_credits_pending_frags(ep))) { + return rc; + } + } + + send_credits(ep, cqp); + + return OMPI_SUCCESS; +} +#endif /* OMPI_CUDA_SUPPORT */ + static char* btl_openib_component_status_to_string(enum ibv_wc_status status) { switch(status) { @@ -3632,6 +3733,27 @@ static int btl_openib_component_progress(void) count += progress_one_device(device); } +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ + /* Check to see if there are any outstanding dtoh CUDA events that + * have completed. If so, issue the PML callbacks on the fragments. + * The only things that get completed here are asynchronous copies + * so there is no need to free anything. 
+ */ + { + int local_count = 0; + mca_btl_base_descriptor_t *frag; + while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) { + opal_output(-1, "btl_openib: event completed on frag=%p", (void *)frag); + frag->des_cbfunc(NULL, NULL, frag, OMPI_SUCCESS); + local_count++; + } + count += local_count; + } + if (count > 0) { + opal_output(-1, "btl_openib: DONE with openib progress, count=%d", count); + } +#endif /* OMPI_CUDA_SUPPORT */ + return count; #if OPAL_HAVE_THREADS diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index a4f09cebcb..8efac47258 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -566,6 +566,31 @@ int btl_openib_register_mca_params(void) &mca_btl_openib_component.super.btl_version, &mca_btl_openib_module.super)); +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ + /* Default is enabling CUDA asynchronous send copies */ + CHECK(reg_int("cuda_async_send", NULL, + "Enable or disable CUDA async send copies " + "(1 = async; 0 = sync)", + 1, &ival, 0)); + mca_btl_openib_component.cuda_async_send = (0 != ival); + if (mca_btl_openib_component.cuda_async_send) { + mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND; + } + /* Default is enabling CUDA asynchronous receive copies */ + CHECK(reg_int("cuda_async_recv", NULL, + "Enable or disable CUDA async recv copies " + "(1 = async; 0 = sync)", + 1, &ival, 0)); + mca_btl_openib_component.cuda_async_recv = (0 != ival); + if (mca_btl_openib_component.cuda_async_recv) { + mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV; + } + /* Also make the max send size larger for better GPU buffer performance */ + mca_btl_openib_module.super.btl_max_send_size = 128 * 1024; + /* Turn off message coalescing - not sure if it works with GPU buffers */ + mca_btl_openib_component.use_message_coalescing = 0; +#endif /* OMPI_CUDA_SUPPORT */ + /* setup all the qp stuff */ /* round mid_qp_size to smallest power of two */ mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1; diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index bdc7f6b7ae..19cdcf050a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -73,7 +73,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, MCA_PML_OB1_HDR_FLAGS_CONTIG); } else { - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0); + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } } } else { diff --git a/ompi/mca/pml/ob1/pml_ob1_progress.c b/ompi/mca/pml/ob1/pml_ob1_progress.c index bbd505d2fe..0689e03d81 100644 --- a/ompi/mca/pml/ob1/pml_ob1_progress.c +++ b/ompi/mca/pml/ob1/pml_ob1_progress.c @@ -22,6 +22,11 @@ #include "pml_ob1.h" #include "pml_ob1_sendreq.h" #include "ompi/mca/bml/base/base.h" +#if OMPI_CUDA_SUPPORT +#include "ompi/mca/common/cuda/common_cuda.h" +#include "pml_ob1_recvreq.h" +static void mca_pml_ob1_process_pending_cuda_async_copies(void); +#endif /* OMPI_CUDA_SUPPORT */ int mca_pml_ob1_progress(void) { @@ -29,6 +34,10 @@ int mca_pml_ob1_progress(void) int j, completed_requests = 0; bool send_succedded; +#if OMPI_CUDA_SUPPORT + mca_pml_ob1_process_pending_cuda_async_copies(); +#endif /* OMPI_CUDA_SUPPORT */ + if( OPAL_LIKELY(0 == queue_length) ) return 0; @@ -77,3 +86,20 @@ int mca_pml_ob1_progress(void) return completed_requests; } +#if 
OMPI_CUDA_SUPPORT +static void mca_pml_ob1_process_pending_cuda_async_copies(void) +{ + mca_btl_base_descriptor_t *frag; + int progress; + + do { + progress = progress_one_cuda_htod_event(&frag); + if (1 == progress) { + /* Call the finish function to make progress. */ + mca_pml_ob1_recv_request_frag_copy_finished(NULL, NULL, frag, 0); + } + } while (progress > 0); + /* Consider progressing dtoh events here in future */ + +} +#endif /* OMPI_CUDA_SUPPORT */ diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 3c8e9132e1..5b2e9f2a2e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -44,6 +44,10 @@ #include "pml_ob1_recvreq.h" #include "pml_ob1_sendreq.h" #include "pml_ob1_hdr.h" +#if OMPI_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#include "ompi/mca/common/cuda/common_cuda.h" +#endif /* OMPI_CUDA_SUPPORT */ OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t, ompi_free_list_item_t, @@ -332,6 +336,17 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl, OPAL_THREAD_ADD32(&sendreq->req_state, -1); } +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ + if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && + (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND)) { + /* The user's buffer is GPU and this BTL can support asynchronous copies, + * so adjust the convertor accordingly. All the subsequent fragments will + * use the asynchronous copy. */ + void *strm = mca_common_cuda_get_dtoh_stream(); + opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor, strm); + } +#endif /* OMPI_CUDA_SUPPORT */ + if(send_request_pml_complete_check(sendreq) == false) mca_pml_ob1_send_request_schedule(sendreq); @@ -351,6 +366,22 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl, } ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG); recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ + /* If data is destined for GPU buffer and convertor was set up for asynchronous + * copies, then start the copy and return. The copy completion will trigger + * the next phase. */ + if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA_ASYNC) { + assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV); + + /* This will trigger the opal_convertor_pack to start asynchronous copy. 
*/ + mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_dst_cnt,des); + + /* Let BTL know that it CANNOT free the frag */ + des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC; + + return; + } +#endif /* OMPI_CUDA_SUPPORT */ mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt); return; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 6bbe4c80a4..0cff1633c2 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -38,6 +38,10 @@ #include "orte/mca/errmgr/errmgr.h" #include "opal/util/arch.h" #include "ompi/memchecker.h" +#if OMPI_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#include "ompi/mca/common/cuda/common_cuda.h" +#endif /* OMPI_CUDA_SUPPORT */ #if OMPI_CUDA_SUPPORT int mca_pml_ob1_cuda_need_buffers(mca_pml_ob1_recv_request_t* recvreq, @@ -527,6 +531,85 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq } } +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ +/** + * This function is basically the first half of the code in the + * mca_pml_ob1_recv_request_progress_frag function. This fires off + * the asynchronous copy and returns. Unused fields in the descriptor + * are used to pass extra information for when the asynchronous copy + * completes. No memchecker support in this function as copies are + * happening asynchronously. + */ +void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments, + mca_btl_base_descriptor_t* des) +{ + int result; + size_t bytes_received = 0, data_offset = 0; + size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */ + mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; + + OPAL_OUTPUT((-1, "start_frag_copy frag=%p", (void *)des)); + + bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments, + sizeof(mca_pml_ob1_frag_hdr_t)); + data_offset = hdr->hdr_frag.hdr_frag_offset; + + MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq, + segments, + num_segments, + sizeof(mca_pml_ob1_frag_hdr_t), + data_offset, + bytes_received, + bytes_delivered ); + /* Store the receive request in unused context pointer. */ + des->des_context = (void *)recvreq; + /* Store the amount of bytes in unused src count value */ + des->des_src_cnt = bytes_delivered; + /* Then record an event that will get triggered by a PML progress call which + * checks the stream events. If we get an error, abort. Should get message + * from CUDA code about what went wrong. */ + result = mca_common_cuda_record_htod_event("pml", des); + if (OMPI_SUCCESS != result) { + opal_output(0, "%s:%d FATAL", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } +} + +/** + * This function is basically the second half of the code in the + * mca_pml_ob1_recv_request_progress_frag function. The number of + * bytes delivered is updated. Then a call is made into the BTL so it + * can free the fragment that held that data. This is currently + * called directly by the common CUDA code. No memchecker support + * in this function as copies are happening asynchronously. 
+ */ +void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context; + size_t bytes_received = des->des_src_cnt; + + OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des)); + /* Call into the BTL so it can free the descriptor. At this point, it is + * known that the data has been copied out of the descriptor. */ + des->des_cbfunc(NULL, (struct mca_btl_base_endpoint_t *)des->des_cbdata, des, 0); + + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); + + /* check completion status */ + if(recv_request_pml_complete_check(recvreq) == false && + recvreq->req_rdma_offset < recvreq->req_send_offset) { + /* schedule additional rdma operations */ + mca_pml_ob1_recv_request_schedule(recvreq, NULL); + } +} +#endif /* OMPI_CUDA_SUPPORT */ + /* * Update the recv request status to reflect the number of bytes * received and actually delivered to the application. @@ -701,6 +784,17 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq /* schedule additional rdma operations */ mca_pml_ob1_recv_request_schedule(recvreq, NULL); } + +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ + /* If BTL supports it and this is a CUDA buffer being received into, + * have all subsequent FRAGS copied in asynchronously. */ + if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) && + (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV)) { + void *strm = mca_common_cuda_get_htod_stream(); + opal_cuda_set_copy_function_async(&recvreq->req_recv.req_base.req_convertor, strm); + } +#endif + } /* diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 7558c7b4c2..c259712abd 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -305,6 +305,19 @@ void mca_pml_ob1_recv_request_progress_frag( mca_btl_base_segment_t* segments, size_t num_segments); +#if OMPI_CUDA_SUPPORT +void mca_pml_ob1_recv_request_frag_copy_start( + mca_pml_ob1_recv_request_t* req, + struct mca_btl_base_module_t* btl, + mca_btl_base_segment_t* segments, + size_t num_segments, + mca_btl_base_descriptor_t* des); + +void mca_pml_ob1_recv_request_frag_copy_finished(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ); +#endif /* OMPI_CUDA_SUPPORT */ /** * */ diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 1c6a476b88..7d40e93a46 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -332,6 +332,39 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl, MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ +/** + * This function is called when the copy of the frag from the GPU buffer + * to the internal buffer is complete. Used to support asynchronous + * copies from GPU to host buffers. Now the data can be sent. + */ +static void +mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + int rc; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + + des->des_cbfunc = mca_pml_ob1_frag_completion; + /* Reset the BTL ownership flag as the BTL can free it after completion. 
*/ + des->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + opal_output(-1, "copy_frag_completion FRAG frag=%p", (void *)des); + /* Currently, we cannot support a failure in the send. In the blocking + * case, the counters tracking the fragments being sent are not adjusted + * until the function returns success, so it handles the error by leaving + * all the buffer counters intact. In this case, it is too late so + * we just abort. In theory, a new queue could be created to hold this + * fragment and then attempt to send it out on another BTL. */ + rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG); + if(OPAL_UNLIKELY(rc < 0)) { + opal_output(0, "%s:%d FATAL", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } +} +#endif /* OMPI_CUDA_SUPPORT */ + /** * Buffer the entire message and mark as complete. */ @@ -1030,6 +1063,32 @@ cannot_pack: &(sendreq->req_send.req_base), size, PERUSE_SEND); #endif /* OMPI_WANT_PERUSE */ +#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ + /* At this point, check to see if the BTL is doing an asynchronous + * copy. This would have been initiated in the mca_bml_base_prepare_src + * called above. The flag is checked here as we let the hdr be + * set up prior to checking. + */ + if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) { + opal_output(-1, "Initiating async copy on FRAG frag=%p", (void *)des); + /* Need to make sure BTL does not free frag after completion + * of asynchronous copy as we still need to send the fragment. */ + des->des_flags &= ~MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + /* Unclear that this flag needs to be set but to be sure, set it */ + des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + des->des_cbfunc = mca_pml_ob1_copy_frag_completion; + range->range_btls[btl_idx].length -= size; + range->range_send_length -= size; + range->range_send_offset += size; + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1); + if(range->range_send_length == 0) { + range = get_next_send_range(sendreq, range); + prev_bytes_remaining = 0; + } + continue; + } +#endif /* OMPI_CUDA_SUPPORT */ + /* initiate send - note that this may complete before the call returns */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG); if( OPAL_LIKELY(rc >= 0) ) { diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 61d3342985..3c300570cf 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -41,7 +41,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_datatype_cuda.h" #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \ - CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) ) + CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor, @@ -55,7 +55,7 @@ static void opal_convertor_construct( opal_convertor_t* convertor ) convertor->remoteArch = opal_local_arch; convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED; #if OPAL_CUDA_SUPPORT - convertor->cbmemcpy = &memcpy; + convertor->cbmemcpy = &opal_cuda_memcpy; #endif } diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index ca212bc738..dd3e560daa 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -55,6 +55,7 @@ BEGIN_C_DECLS #define CONVERTOR_NO_OP 0x00100000 #define CONVERTOR_WITH_CHECKSUM 0x00200000 #define CONVERTOR_CUDA 0x00400000 +#define CONVERTOR_CUDA_ASYNC 0x00800000 #define CONVERTOR_TYPE_MASK 0x00FF0000 #define CONVERTOR_STATE_START 0x01000000 #define 
CONVERTOR_STATE_COMPLETE 0x02000000 @@ -69,7 +70,7 @@ typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, uint32_t* out_size, size_t* max_data ); typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); -typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n ); +typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); /* The master convertor struct (defined in convertor_internal.h) */ struct opal_convertor_master_t; @@ -116,6 +117,7 @@ struct opal_convertor_t { #if OPAL_CUDA_SUPPORT memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ + void * stream; /**< CUstream for async copy */ #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ @@ -164,9 +166,6 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor ) convertor->pStack = convertor->static_stack; convertor->stack_size = DT_STATIC_STACK_SIZE; } -#if OPAL_CUDA_SUPPORT - convertor->cbmemcpy = &memcpy; -#endif convertor->pDesc = NULL; convertor->stack_pos = 0; convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED; diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index 9f03e0c306..8bf0a5c54d 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -78,7 +78,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024; #undef MEM_OP_NAME #define MEM_OP_NAME non_overlap_cuda #undef MEM_OP -#define MEM_OP opal_cuda_memcpy +#define MEM_OP opal_cuda_memcpy_sync #include "opal_datatype_copy.h" #undef MEM_OP_NAME diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 95df915c7f..ea11c8fc7b 100755 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -86,11 +86,41 @@ bool opal_cuda_check_bufs(char *dest, char *src) } /* - * Need intermediate cuMemcpy function so we can check the return code - * of the call. If we see an error, abort as there is no recovery at - * this point. + * With CUDA enabled, all contiguous copies will pass through this function. + * Therefore, the first check is to see if the convertor is a GPU buffer. + * Note that if there is an error with any of the CUDA calls, the program + * aborts as there is no recovering. */ -void *opal_cuda_memcpy(void *dest, void *src, size_t size) +void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor) +{ + int res; + + if (!(convertor->flags & CONVERTOR_CUDA)) { + return memcpy(dest, src, size); + } + + if (convertor->flags & CONVERTOR_CUDA_ASYNC) { + res = cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size, + (CUstream)convertor->stream); + } else { + res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); + } + + if (res != CUDA_SUCCESS) { + opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", + res, dest, src, (int)size); + abort(); + } else { + return dest; + } +} + +/* + * This function is needed in cases where we do not have contiguous + * datatypes. The current code has macros that cannot handle a convertor + * argument to the memcpy call. + */ +void *opal_cuda_memcpy_sync(void *dest, void *src, size_t size) { int res; res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); @@ -170,3 +200,13 @@ static void opal_cuda_support_init(void) initialized = true; } + +/** + * Tell the convertor that copies will be asynchronous CUDA copies. The + * flags are cleared when the convertor is reinitialized. 
+ */ +void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream) +{ + convertor->flags |= CONVERTOR_CUDA_ASYNC; + convertor->stream = stream; +} diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 9118368210..c3f89af4e8 100755 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -12,8 +12,10 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); bool opal_cuda_check_bufs(char *dest, char *src); -void* opal_cuda_memcpy(void * dest, void * src, size_t size); +void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); +void* opal_cuda_memcpy_sync(void * dest, void * src, size_t size); void* opal_cuda_memmove(void * dest, void * src, size_t size); void opal_cuda_add_initialization_function(int (*fptr)(void)); +void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); #endif diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index 75324c6a36..1de7ddef33 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -26,7 +26,7 @@ /* Make use of existing macro to do CUDA style memcpy */ #undef MEMCPY_CSUM #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ - CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) ) + CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index b07fbb0d55..9e61fb3da5 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -324,7 +324,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle * memory, need to use the special host to device memory copy. * Note this code path was only seen on large receives of * noncontiguous data via buffered sends. */ - pConvertor->cbmemcpy(saved_data, real_data, data_length ); + pConvertor->cbmemcpy(saved_data, real_data, data_length, pConvertor ); #else /* Save the content of the user memory */ MEMCPY( saved_data, real_data, data_length ); @@ -347,10 +347,10 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle * data via buffered sends. */ { char resaved_data[16]; - pConvertor->cbmemcpy(resaved_data, real_data, data_length ); + pConvertor->cbmemcpy(resaved_data, real_data, data_length, pConvertor ); for( i = 0; i < data_length; i++ ) { if( unused_byte == resaved_data[i] ) - pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1); + pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1, pConvertor); } } #else diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index ee6dd5d0f1..972f53b172 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -26,7 +26,7 @@ /* Make use of existing macro to do CUDA style memcpy */ #undef MEMCPY_CSUM #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \ - CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) ) + CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif #include "opal/datatype/opal_convertor.h"
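
The send-side changes above follow a record-and-poll pattern: opal_cuda_memcpy issues cuMemcpyAsync on a dedicated stream, mca_common_cuda_record_dtoh_event records an event behind the copy, and btl_openib_component_progress later polls progress_one_cuda_dtoh_event and fires the PML callback once the copy has drained. The standalone sketch below shows the same pattern using only the raw CUDA driver API. It is not part of the patch and does not use the Open MPI internals; names such as dtoh_stream and the 1 MB buffer size are purely illustrative.

/* Sketch of the record-and-poll completion pattern used by the patch:
 * start an asynchronous device-to-host copy, record an event behind it,
 * and poll the event from a progress loop instead of blocking. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define CHECK_CU(call)                                              \
    do {                                                            \
        CUresult _r = (call);                                       \
        if (CUDA_SUCCESS != _r) {                                   \
            fprintf(stderr, "%s failed: %d\n", #call, (int)_r);     \
            exit(1);                                                \
        }                                                           \
    } while (0)

int main(void)
{
    const size_t size = 1 << 20;   /* illustrative 1 MB fragment */
    CUdevice dev;
    CUcontext ctx;
    CUstream dtoh_stream;          /* stands in for the BTL's dtoh stream */
    CUevent evt;
    CUdeviceptr dbuf;
    void *hbuf;

    CHECK_CU(cuInit(0));
    CHECK_CU(cuDeviceGet(&dev, 0));
    CHECK_CU(cuCtxCreate(&ctx, 0, dev));
    CHECK_CU(cuStreamCreate(&dtoh_stream, CU_STREAM_NON_BLOCKING));
    CHECK_CU(cuEventCreate(&evt, CU_EVENT_DISABLE_TIMING));
    CHECK_CU(cuMemAlloc(&dbuf, size));
    CHECK_CU(cuMemAllocHost(&hbuf, size));

    /* Start the device-to-host copy asynchronously, then record an event
     * that completes only after the copy has drained from the stream. */
    CHECK_CU(cuMemcpyDtoHAsync(hbuf, dbuf, size, dtoh_stream));
    CHECK_CU(cuEventRecord(evt, dtoh_stream));

    /* Progress loop: poll the event rather than synchronizing, the way
     * btl_openib_component_progress() polls progress_one_cuda_dtoh_event(). */
    for (;;) {
        CUresult r = cuEventQuery(evt);
        if (CUDA_SUCCESS == r) {
            printf("async copy complete; fragment could now be sent\n");
            break;
        } else if (CUDA_ERROR_NOT_READY != r) {
            fprintf(stderr, "cuEventQuery failed: %d\n", (int)r);
            exit(1);
        }
        /* ... make progress on other work here ... */
    }

    cuMemFreeHost(hbuf);
    cuMemFree(dbuf);
    cuEventDestroy(evt);
    cuStreamDestroy(dtoh_stream);
    cuCtxDestroy(ctx);
    return 0;
}

Polling the event from the existing progress loop keeps the copy off the critical path without blocking the sending or receiving thread, which is the point of the MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC handshake between the BTL and the PML in the patch above.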