Improve CUDA GPU transfers over the openib BTL by using asynchronous copies.
This implements the RFC that was submitted in July and December of 2012. This commit was SVN r27862.
Parent: 92e297d1fa
Commit: f63c88701f
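At a high level, the change works like this: when the PML detects a GPU buffer and a BTL that advertises MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND (or _RECV), it points the convertor's copy function at a stream-based cuMemcpyAsync; the code that initiated the copy records a CUDA event behind it on the same stream, and the progress loop later polls those events, firing the deferred callback once a copy has completed. The sketch below shows that record-and-poll pattern with the raw CUDA driver API. It is a minimal illustration, not the mca_common_cuda_* implementation from the diff; the check() helper, the buffer size, and the use of device 0 are assumptions made here.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

static void check(CUresult res, const char *what)
{
    if (CUDA_SUCCESS != res) {
        fprintf(stderr, "%s failed: res=%d\n", what, res);
        exit(1);
    }
}

int main(void)
{
    CUdevice dev;
    CUcontext ctx;
    CUstream dtoh_stream;
    CUevent evt;
    CUdeviceptr gpu_buf;
    static char host_buf[1024 * 1024];

    check(cuInit(0), "cuInit");
    check(cuDeviceGet(&dev, 0), "cuDeviceGet");
    check(cuCtxCreate(&ctx, 0, dev), "cuCtxCreate");
    check(cuStreamCreate(&dtoh_stream, 0), "cuStreamCreate");
    check(cuEventCreate(&evt, CU_EVENT_DISABLE_TIMING), "cuEventCreate");
    check(cuMemAlloc(&gpu_buf, sizeof(host_buf)), "cuMemAlloc");

    /* Start the device-to-host copy on a stream and record an event
     * behind it, instead of blocking in a synchronous cuMemcpy. */
    check(cuMemcpyDtoHAsync(host_buf, gpu_buf, sizeof(host_buf), dtoh_stream),
          "cuMemcpyDtoHAsync");
    check(cuEventRecord(evt, dtoh_stream), "cuEventRecord");

    /* A progress loop polls the event; CUDA_ERROR_NOT_READY means the
     * copy is still in flight, so other work can proceed meanwhile. */
    for (;;) {
        CUresult res = cuEventQuery(evt);
        if (CUDA_SUCCESS == res) {
            break;              /* copy complete: fire the deferred callback */
        }
        if (CUDA_ERROR_NOT_READY != res) {
            check(res, "cuEventQuery");
        }
        /* ... progress other outstanding work here ... */
    }

    check(cuMemFree(gpu_buf), "cuMemFree");
    check(cuEventDestroy(evt), "cuEventDestroy");
    check(cuStreamDestroy(dtoh_stream), "cuStreamDestroy");
    check(cuCtxDestroy(ctx), "cuCtxDestroy");
    return 0;
}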
ompi/mca/btl/btl.h
@@ -199,6 +199,8 @@ typedef uint8_t mca_btl_base_tag_t;
 #define MCA_BTL_FLAGS_CUDA_PUT 0x0400
 #define MCA_BTL_FLAGS_CUDA_GET 0x0800
 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000
 
 /* Default exclusivity levels */
 #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
@@ -298,6 +300,10 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
 */
 #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004
 
+/* Tell the PML that the copy is being done asynchronously
+ */
+#define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC 0x0008
+
 /* Type of transfer that will be done with this frag.
 */
 #define MCA_BTL_DES_FLAGS_PUT 0x0010
ompi/mca/btl/openib/btl_openib.c
@@ -55,6 +55,10 @@
 #include "ompi/mca/mpool/base/base.h"
 #include "ompi/mca/mpool/mpool.h"
 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 #include "orte/util/proc_info.h"
 #include <errno.h>
 #include <sys/types.h>
@@ -1287,6 +1291,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
     iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
     rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    /* If the convertor is copying the data asynchronously, then record an event
+     * that will trigger the callback when it completes.  Mark descriptor as async. */
+    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+        mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag);
+        to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     *size = max_data;
 
     /* not all upper layer users set this */
ompi/mca/btl/openib/btl_openib.h
@@ -308,6 +308,10 @@ struct mca_btl_openib_component_t {
     size_t memalign_threshold;
     void* (*previous_malloc_hook)(size_t __size, const void*);
 #endif
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    int cuda_async_send;
+    int cuda_async_recv;
+#endif /* OMPI_CUDA_SUPPORT */
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
 
 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
ompi/mca/btl/openib/btl_openib_component.c
@@ -103,6 +103,12 @@ static int btl_openib_component_open(void);
 static int btl_openib_component_close(void);
 static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
 static int btl_openib_component_progress(void);
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_openib_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status);
+#endif /* OMPI_CUDA_SUPPORT */
 /*
  * Local variables
  */
@@ -3060,8 +3066,24 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
         /* call registered callback */
         mca_btl_active_message_callback_t* reg;
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        /* The COPY_ASYNC flag should not be set */
+        assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC));
+#endif /* OMPI_CUDA_SUPPORT */
         reg = mca_btl_base_active_message_trigger + hdr->tag;
         reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            /* Since ASYNC flag is set, we know this descriptor is being used
+             * for asynchronous copy and cannot be freed yet.  Therefore, set
+             * up callback for PML to call when complete, add argument into
+             * descriptor and return. */
+            des->des_cbfunc = btl_openib_handle_incoming_completion;
+            des->des_cbdata = (void *)ep;
+            return OMPI_SUCCESS;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
         if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
             cqp = (hdr->credits >> 11) & 0x0f;
             hdr->credits &= 0x87ff;
@@ -3152,6 +3174,85 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     return OMPI_SUCCESS;
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * Called by the PML when the copying of the data out of the fragment
+ * is complete.
+ */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_base_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status)
+{
+    mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des;
+    mca_btl_openib_header_t *hdr = frag->hdr;
+    int rqp = to_base_frag(frag)->base.order, cqp;
+    uint16_t rcredits = 0, credits;
+    bool is_credit_msg;
+
+    OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des));
+
+    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
+        cqp = (hdr->credits >> 11) & 0x0f;
+        hdr->credits &= 0x87ff;
+    } else {
+        cqp = rqp;
+    }
+    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
+        rcredits = BTL_OPENIB_CREDITS(hdr->credits);
+        hdr->credits = 0;
+    }
+
+    credits = hdr->credits;
+
+    if(hdr->cm_seen)
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);
+
+    /* We should not be here with eager or control messages */
+    assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA);
+    assert(0 == is_cts_message(frag));
+    /* HACK - clear out flags.  Must be better way */
+    des->des_flags = 0;
+    /* Otherwise, FRAG_RETURN it and repost if necessary */
+    MCA_BTL_IB_FRAG_RETURN(frag);
+    if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
+        if (OPAL_UNLIKELY(is_credit_msg)) {
+            OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
+        } else {
+            OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
+        }
+        mca_btl_openib_endpoint_post_rr(ep, cqp);
+    } else {
+        mca_btl_openib_module_t *btl = ep->endpoint_btl;
+        OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
+        mca_btl_openib_post_srr(btl, rqp);
+    }
+
+    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);
+
+    /* If we got any credits (RDMA or send), then try to progress all
+       the no_credits_pending_frags lists */
+    if (rcredits > 0) {
+        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
+    }
+    if (credits > 0) {
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
+    }
+    if (rcredits + credits > 0) {
+        int rc;
+
+        if (OMPI_SUCCESS !=
+            (rc = progress_no_credits_pending_frags(ep))) {
+            return rc;
+        }
+    }
+
+    send_credits(ep, cqp);
+
+    return OMPI_SUCCESS;
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
 {
     switch(status) {
@@ -3632,6 +3733,27 @@ static int btl_openib_component_progress(void)
         count += progress_one_device(device);
     }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    /* Check to see if there are any outstanding dtoh CUDA events that
+     * have completed.  If so, issue the PML callbacks on the fragments.
+     * The only thing that gets completed here are asynchronous copies
+     * so there is no need to free anything.
+     */
+    {
+        int local_count = 0;
+        mca_btl_base_descriptor_t *frag;
+        while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) {
+            opal_output(-1, "btl_openib: event completed on frag=%p", (void *)frag);
+            frag->des_cbfunc(NULL, NULL, frag, OMPI_SUCCESS);
+            local_count++;
+        }
+        count += local_count;
+    }
+    if (count > 0) {
+        opal_output(-1, "btl_openib: DONE with openib progress, count=%d", count);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     return count;
 
 #if OPAL_HAVE_THREADS
ompi/mca/btl/openib/btl_openib_mca.c
@@ -566,6 +566,31 @@ int btl_openib_register_mca_params(void)
                   &mca_btl_openib_component.super.btl_version,
                   &mca_btl_openib_module.super));
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* Default is enabling CUDA asynchronous send copies */
+    CHECK(reg_int("cuda_async_send", NULL,
+                  "Enable or disable CUDA async send copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_send = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_send) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
+    }
+    /* Default is enabling CUDA asynchronous receive copies */
+    CHECK(reg_int("cuda_async_recv", NULL,
+                  "Enable or disable CUDA async recv copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_recv = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_recv) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
+    }
+    /* Also make the max send size larger for better GPU buffer performance */
+    mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
+    /* Turn off message coalescing - not sure if it works with GPU buffers */
+    mca_btl_openib_component.use_message_coalescing = 0;
+#endif /* OMPI_CUDA_SUPPORT */
+
     /* setup all the qp stuff */
     /* round mid_qp_size to smallest power of two */
     mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1;
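Assuming the standard MCA prefixing for openib BTL parameters, the two knobs registered above should surface as btl_openib_cuda_async_send and btl_openib_cuda_async_recv, both defaulting to on when CUDA support is compiled in; a run such as `mpirun --mca btl_openib_cuda_async_recv 0 ...` would then presumably force receive-side copies back to synchronous while leaving the send side asynchronous.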
ompi/mca/pml/ob1/pml_ob1_cuda.c
@@ -73,7 +73,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
             rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
                                                      MCA_PML_OB1_HDR_FLAGS_CONTIG);
         } else {
-            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
+            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
         }
     }
 } else {
ompi/mca/pml/ob1/pml_ob1_progress.c
@@ -22,6 +22,11 @@
 #include "pml_ob1.h"
 #include "pml_ob1_sendreq.h"
 #include "ompi/mca/bml/base/base.h"
+#if OMPI_CUDA_SUPPORT
+#include "ompi/mca/common/cuda/common_cuda.h"
+#include "pml_ob1_recvreq.h"
+static void mca_pml_ob1_process_pending_cuda_async_copies(void);
+#endif /* OMPI_CUDA_SUPPORT */
 
 int mca_pml_ob1_progress(void)
 {
@@ -29,6 +34,10 @@ int mca_pml_ob1_progress(void)
     int j, completed_requests = 0;
     bool send_succedded;
 
+#if OMPI_CUDA_SUPPORT
+    mca_pml_ob1_process_pending_cuda_async_copies();
+#endif /* OMPI_CUDA_SUPPORT */
+
     if( OPAL_LIKELY(0 == queue_length) )
         return 0;
 
@@ -77,3 +86,20 @@ int mca_pml_ob1_progress(void)
     return completed_requests;
 }
 
+#if OMPI_CUDA_SUPPORT
+static void mca_pml_ob1_process_pending_cuda_async_copies(void)
+{
+    mca_btl_base_descriptor_t *frag;
+    int progress;
+
+    do {
+        progress = progress_one_cuda_htod_event(&frag);
+        if (1 == progress) {
+            /* Call the finish function to make progress. */
+            mca_pml_ob1_recv_request_frag_copy_finished(NULL, NULL, frag, 0);
+        }
+    } while (progress > 0);
+    /* Consider progressing dtoh events here in future */
+
+}
+#endif /* OMPI_CUDA_SUPPORT */
ompi/mca/pml/ob1/pml_ob1_recvfrag.c
@@ -44,6 +44,10 @@
 #include "pml_ob1_recvreq.h"
 #include "pml_ob1_sendreq.h"
 #include "pml_ob1_hdr.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t,
                     ompi_free_list_item_t,
@@ -332,6 +336,17 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
         OPAL_THREAD_ADD32(&sendreq->req_state, -1);
     }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND)) {
+        /* The user's buffer is GPU and this BTL can support asynchronous copies,
+         * so adjust the convertor accordingly.  All the subsequent fragments will
+         * use the asynchronous copy. */
+        void *strm = mca_common_cuda_get_dtoh_stream();
+        opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor, strm);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     if(send_request_pml_complete_check(sendreq) == false)
         mca_pml_ob1_send_request_schedule(sendreq);
 
@@ -351,6 +366,22 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
     recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If data is destined for GPU buffer and convertor was set up for asynchronous
+     * copies, then start the copy and return.  The copy completion will trigger
+     * the next phase. */
+    if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA_ASYNC) {
+        assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
+
+        /* This will trigger the opal_convertor_pack to start asynchronous copy. */
+        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_dst_cnt,des);
+
+        /* Let BTL know that it CANNOT free the frag */
+        des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+
+        return;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
     mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
 
     return;
ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -38,6 +38,10 @@
 #include "orte/mca/errmgr/errmgr.h"
 #include "opal/util/arch.h"
 #include "ompi/memchecker.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 #if OMPI_CUDA_SUPPORT
 int mca_pml_ob1_cuda_need_buffers(mca_pml_ob1_recv_request_t* recvreq,
@@ -527,6 +531,85 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq,
     }
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * This function is basically the first half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function.  This fires off
+ * the asynchronous copy and returns.  Unused fields in the descriptor
+ * are used to pass extra information for when the asynchronous copy
+ * completes.  No memchecker support in this function as copies are
+ * happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvreq,
+                                               mca_btl_base_module_t* btl,
+                                               mca_btl_base_segment_t* segments,
+                                               size_t num_segments,
+                                               mca_btl_base_descriptor_t* des)
+{
+    int result;
+    size_t bytes_received = 0, data_offset = 0;
+    size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */
+    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
+
+    OPAL_OUTPUT((-1, "start_frag_copy frag=%p", (void *)des));
+
+    bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
+                                                              sizeof(mca_pml_ob1_frag_hdr_t));
+    data_offset = hdr->hdr_frag.hdr_frag_offset;
+
+    MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq,
+                                     segments,
+                                     num_segments,
+                                     sizeof(mca_pml_ob1_frag_hdr_t),
+                                     data_offset,
+                                     bytes_received,
+                                     bytes_delivered );
+    /* Store the receive request in unused context pointer. */
+    des->des_context = (void *)recvreq;
+    /* Store the amount of bytes in unused src count value */
+    des->des_src_cnt = bytes_delivered;
+    /* Then record an event that will get triggered by a PML progress call which
+     * checks the stream events.  If we get an error, abort.  Should get message
+     * from CUDA code about what went wrong. */
+    result = mca_common_cuda_record_htod_event("pml", des);
+    if (OMPI_SUCCESS != result) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+
+/**
+ * This function is basically the second half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function.  The number of
+ * bytes delivered is updated.  Then a call is made into the BTL so it
+ * can free the fragment that held that data.  This is currently
+ * called directly by the common CUDA code.  No memchecker support
+ * in this function as copies are happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
+                                                  struct mca_btl_base_endpoint_t* ep,
+                                                  struct mca_btl_base_descriptor_t* des,
+                                                  int status )
+{
+    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context;
+    size_t bytes_received = des->des_src_cnt;
+
+    OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des));
+    /* Call into the BTL so it can free the descriptor.  At this point, it is
+     * known that the data has been copied out of the descriptor. */
+    des->des_cbfunc(NULL, (struct mca_btl_base_endpoint_t *)des->des_cbdata, des, 0);
+
+    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
+
+    /* check completion status */
+    if(recv_request_pml_complete_check(recvreq) == false &&
+       recvreq->req_rdma_offset < recvreq->req_send_offset) {
+        /* schedule additional rdma operations */
+        mca_pml_ob1_recv_request_schedule(recvreq, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /*
  * Update the recv request status to reflect the number of bytes
  * received and actually delivered to the application.
@@ -701,6 +784,17 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq,
         /* schedule additional rdma operations */
         mca_pml_ob1_recv_request_schedule(recvreq, NULL);
     }
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If BTL supports it and this is a CUDA buffer being received into,
+     * have all subsequent FRAGS copied in asynchronously. */
+    if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV)) {
+        void *strm = mca_common_cuda_get_htod_stream();
+        opal_cuda_set_copy_function_async(&recvreq->req_recv.req_base.req_convertor, strm);
+    }
+#endif
+
 }
 
 /*
ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -305,6 +305,19 @@ void mca_pml_ob1_recv_request_progress_frag(
     mca_btl_base_segment_t* segments,
     size_t num_segments);
 
+#if OMPI_CUDA_SUPPORT
+void mca_pml_ob1_recv_request_frag_copy_start(
+    mca_pml_ob1_recv_request_t* req,
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_segment_t* segments,
+    size_t num_segments,
+    mca_btl_base_descriptor_t* des);
+
+void mca_pml_ob1_recv_request_frag_copy_finished(struct mca_btl_base_module_t* btl,
+                                                 struct mca_btl_base_endpoint_t* ep,
+                                                 struct mca_btl_base_descriptor_t* des,
+                                                 int status );
+#endif /* OMPI_CUDA_SUPPORT */
 /**
  *
  */
ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -332,6 +332,39 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+/**
+ * This function is called when the copy of the frag from the GPU buffer
+ * to the internal buffer is complete.  Used to support asynchronous
+ * copies from GPU to host buffers.  Now the data can be sent.
+ */
+static void
+mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl,
+                                  struct mca_btl_base_endpoint_t* ep,
+                                  struct mca_btl_base_descriptor_t* des,
+                                  int status )
+{
+    int rc;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    des->des_cbfunc = mca_pml_ob1_frag_completion;
+    /* Reset the BTL ownership flag as the BTL can free it after completion. */
+    des->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+    opal_output(-1, "copy_frag_completion FRAG frag=%p", (void *)des);
+    /* Currently, we cannot support a failure in the send.  In the blocking
+     * case, the counters tracking the fragments being sent are not adjusted
+     * until the function returns success, so it handles the error by leaving
+     * all the buffer counters intact.  In this case, it is too late so
+     * we just abort.  In theory, a new queue could be created to hold this
+     * fragment and then attempt to send it out on another BTL. */
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
+    if(OPAL_UNLIKELY(rc < 0)) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /**
  * Buffer the entire message and mark as complete.
  */
@@ -1030,6 +1063,32 @@ cannot_pack:
                           &(sendreq->req_send.req_base), size, PERUSE_SEND);
 #endif /* OMPI_WANT_PERUSE */
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+        /* At this point, check to see if the BTL is doing an asynchronous
+         * copy.  This would have been initiated in the mca_bml_base_prepare_src
+         * called above.  The flag is checked here as we let the hdr be
+         * set up prior to checking.
+         */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            opal_output(-1, "Initiating async copy on FRAG frag=%p", (void *)des);
+            /* Need to make sure BTL does not free frag after completion
+             * of asynchronous copy as we still need to send the fragment. */
+            des->des_flags &= ~MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+            /* Unclear that this flag needs to be set but to be sure, set it */
+            des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+            des->des_cbfunc = mca_pml_ob1_copy_frag_completion;
+            range->range_btls[btl_idx].length -= size;
+            range->range_send_length -= size;
+            range->range_send_offset += size;
+            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            if(range->range_send_length == 0) {
+                range = get_next_send_range(sendreq, range);
+                prev_bytes_remaining = 0;
+            }
+            continue;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
+
         /* initiate send - note that this may complete before the call returns */
         rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
         if( OPAL_LIKELY(rc >= 0) ) {
opal/datatype/opal_convertor.c
@@ -41,7 +41,7 @@
 #if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_datatype_cuda.h"
 #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor,
@@ -55,7 +55,7 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
     convertor->remoteArch = opal_local_arch;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
 #if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
+    convertor->cbmemcpy = &opal_cuda_memcpy;
 #endif
 }
 
opal/datatype/opal_convertor.h
@@ -55,6 +55,7 @@ BEGIN_C_DECLS
 #define CONVERTOR_NO_OP 0x00100000
 #define CONVERTOR_WITH_CHECKSUM 0x00200000
 #define CONVERTOR_CUDA 0x00400000
+#define CONVERTOR_CUDA_ASYNC 0x00800000
 #define CONVERTOR_TYPE_MASK 0x00FF0000
 #define CONVERTOR_STATE_START 0x01000000
 #define CONVERTOR_STATE_COMPLETE 0x02000000
@@ -69,7 +70,7 @@ typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor,
                                             uint32_t* out_size,
                                             size_t* max_data );
 typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata );
-typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n );
+typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor );
 
 /* The master convertor struct (defined in convertor_internal.h) */
 struct opal_convertor_master_t;
@@ -116,6 +117,7 @@ struct opal_convertor_t {
 
 #if OPAL_CUDA_SUPPORT
     memcpy_fct_t cbmemcpy;   /**< memcpy or cuMemcpy */
+    void * stream;           /**< CUstream for async copy */
 #endif
     /* size: 248, cachelines: 4, members: 20 */
    /* last cacheline: 56 bytes */
@@ -164,9 +166,6 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor )
         convertor->pStack = convertor->static_stack;
         convertor->stack_size = DT_STATIC_STACK_SIZE;
     }
-#if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
-#endif
     convertor->pDesc = NULL;
     convertor->stack_pos = 0;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
opal/datatype/opal_datatype_copy.c
@@ -78,7 +78,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
 #undef MEM_OP_NAME
 #define MEM_OP_NAME non_overlap_cuda
 #undef MEM_OP
-#define MEM_OP opal_cuda_memcpy
+#define MEM_OP opal_cuda_memcpy_sync
 #include "opal_datatype_copy.h"
 
 #undef MEM_OP_NAME
opal/datatype/opal_datatype_cuda.c
@@ -86,11 +86,41 @@ bool opal_cuda_check_bufs(char *dest, char *src)
 }
 
 /*
- * Need intermediate cuMemcpy function so we can check the return code
- * of the call.  If we see an error, abort as there is no recovery at
- * this point.
+ * With CUDA enabled, all contiguous copies will pass through this function.
+ * Therefore, the first check is to see if the convertor is a GPU buffer.
+ * Note that if there is an error with any of the CUDA calls, the program
+ * aborts as there is no recovering.
 */
-void *opal_cuda_memcpy(void *dest, void *src, size_t size)
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
 {
     int res;
 
+    if (!(convertor->flags & CONVERTOR_CUDA)) {
+        return memcpy(dest, src, size);
+    }
+
+    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+        res = cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
+                            (CUstream)convertor->stream);
+    } else {
+        res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
+    }
+
+    if (res != CUDA_SUCCESS) {
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
+                    res, dest, src, (int)size);
+        abort();
+    } else {
+        return dest;
+    }
+}
+
+/*
+ * This function is needed in cases where we do not have contiguous
+ * datatypes.  The current code has macros that cannot handle a convertor
+ * argument to the memcpy call.
+ */
+void *opal_cuda_memcpy_sync(void *dest, void *src, size_t size)
+{
+    int res;
     res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
@@ -170,3 +200,13 @@ static void opal_cuda_support_init(void)
 
     initialized = true;
 }
+
+/**
+ * Tell the convertor that copies will be asynchronous CUDA copies.  The
+ * flags are cleared when the convertor is reinitialized.
+ */
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
+{
+    convertor->flags |= CONVERTOR_CUDA_ASYNC;
+    convertor->stream = stream;
+}
opal/datatype/opal_datatype_cuda.h
@@ -12,8 +12,10 @@
 
 void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
 bool opal_cuda_check_bufs(char *dest, char *src);
-void* opal_cuda_memcpy(void * dest, void * src, size_t size);
+void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
+void* opal_cuda_memcpy_sync(void * dest, void * src, size_t size);
 void* opal_cuda_memmove(void * dest, void * src, size_t size);
 void opal_cuda_add_initialization_function(int (*fptr)(void));
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
 
 #endif
opal/datatype/opal_datatype_pack.h
@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
opal/datatype/opal_datatype_unpack.c
@@ -324,7 +324,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem,
      * memory, need to use the special host to device memory copy.
      * Note this code path was only seen on large receives of
      * noncontiguous data via buffered sends. */
-    pConvertor->cbmemcpy(saved_data, real_data, data_length );
+    pConvertor->cbmemcpy(saved_data, real_data, data_length, pConvertor );
 #else
     /* Save the content of the user memory */
     MEMCPY( saved_data, real_data, data_length );
@@ -347,10 +347,10 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem,
      * data via buffered sends. */
     {
         char resaved_data[16];
-        pConvertor->cbmemcpy(resaved_data, real_data, data_length );
+        pConvertor->cbmemcpy(resaved_data, real_data, data_length, pConvertor );
         for( i = 0; i < data_length; i++ ) {
             if( unused_byte == resaved_data[i] )
-                pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1);
+                pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1, pConvertor);
         }
     }
 #else
opal/datatype/opal_datatype_unpack.h
@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 #include "opal/datatype/opal_convertor.h"