
Improve CUDA GPU transfers over the openib BTL. Use asynchronous copies.

This is the RFC that was submitted in July and December of 2012.

This commit was SVN r27862.
Rolf vandeVaart, 2013-01-17 22:34:43 +00:00
parent 92e297d1fa
commit f63c88701f
19 changed files with 452 additions and 18 deletions
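In outline, the send side now works as follows. This is a condensed sketch assembled from the diffs below, using only names this commit introduces; error handling and locking are omitted, and the receive side mirrors it with htod events.

/* PML (recvfrag ACK path): the send buffer is on the GPU and the BTL
 * advertises MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND, so switch the convertor
 * to asynchronous device-to-host copies on the shared dtoh stream. */
opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor,
                                  mca_common_cuda_get_dtoh_stream());

/* BTL (openib prepare_src): opal_convertor_pack() now queues a cuMemcpyAsync;
 * record an event behind the copy and mark the descriptor as asynchronous. */
mca_common_cuda_record_dtoh_event("btl_openib", des);
des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;

/* BTL progress: when the event completes, fire the PML callback
 * (mca_pml_ob1_copy_frag_completion), which finally posts the send. */
while (1 == progress_one_cuda_dtoh_event(&des)) {
    des->des_cbfunc(NULL, NULL, des, OMPI_SUCCESS);
}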

View file

@@ -199,6 +199,8 @@ typedef uint8_t mca_btl_base_tag_t;
 #define MCA_BTL_FLAGS_CUDA_PUT 0x0400
 #define MCA_BTL_FLAGS_CUDA_GET 0x0800
 #define MCA_BTL_FLAGS_CUDA_RDMA (MCA_BTL_FLAGS_CUDA_GET|MCA_BTL_FLAGS_CUDA_PUT)
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND 0x1000
+#define MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV 0x2000
 /* Default exclusivity levels */
 #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */
@@ -298,6 +300,10 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t);
  */
 #define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004
+/* Tell the PML that the copy is being done asynchronously
+ */
+#define MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC 0x0008
 /* Type of transfer that will be done with this frag.
  */
 #define MCA_BTL_DES_FLAGS_PUT 0x0010

View file

@@ -55,6 +55,10 @@
 #include "ompi/mca/mpool/base/base.h"
 #include "ompi/mca/mpool/mpool.h"
 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 #include "orte/util/proc_info.h"
 #include <errno.h>
 #include <sys/types.h>
@@ -1287,6 +1291,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
         iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve );
         rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+        /* If the convertor is copying the data asynchronously, then record an event
+         * that will trigger the callback when it completes. Mark descriptor as async. */
+        if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+            mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag);
+            to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
         *size = max_data;
         /* not all upper layer users set this */

View file

@@ -308,6 +308,10 @@ struct mca_btl_openib_component_t {
     size_t memalign_threshold;
     void* (*previous_malloc_hook)(size_t __size, const void*);
 #endif
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    int cuda_async_send;
+    int cuda_async_recv;
+#endif /* OMPI_CUDA_SUPPORT */
 }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
 OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;

View file

@@ -103,6 +103,12 @@ static int btl_openib_component_open(void);
 static int btl_openib_component_close(void);
 static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
 static int btl_openib_component_progress(void);
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_openib_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status);
+#endif /* OMPI_CUDA_SUPPORT */
 /*
  * Local variables
  */
@@ -3060,8 +3066,24 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
         /* call registered callback */
         mca_btl_active_message_callback_t* reg;
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        /* The COPY_ASYNC flag should not be set */
+        assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC));
+#endif /* OMPI_CUDA_SUPPORT */
         reg = mca_btl_base_active_message_trigger + hdr->tag;
         reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            /* Since ASYNC flag is set, we know this descriptor is being used
+             * for asynchronous copy and cannot be freed yet. Therefore, set
+             * up callback for PML to call when complete, add argument into
+             * descriptor and return. */
+            des->des_cbfunc = btl_openib_handle_incoming_completion;
+            des->des_cbdata = (void *)ep;
+            return OMPI_SUCCESS;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
         if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
             cqp = (hdr->credits >> 11) & 0x0f;
             hdr->credits &= 0x87ff;
@@ -3152,6 +3174,85 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
     return OMPI_SUCCESS;
 }
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * Called by the PML when the copying of the data out of the fragment
+ * is complete.
+ */
+static int btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
+                                                 mca_btl_base_endpoint_t *ep,
+                                                 mca_btl_base_descriptor_t* des,
+                                                 int status)
+{
+    mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des;
+    mca_btl_openib_header_t *hdr = frag->hdr;
+    int rqp = to_base_frag(frag)->base.order, cqp;
+    uint16_t rcredits = 0, credits;
+    bool is_credit_msg;
+
+    OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des));
+
+    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
+        cqp = (hdr->credits >> 11) & 0x0f;
+        hdr->credits &= 0x87ff;
+    } else {
+        cqp = rqp;
+    }
+    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
+        rcredits = BTL_OPENIB_CREDITS(hdr->credits);
+        hdr->credits = 0;
+    }
+    credits = hdr->credits;
+
+    if(hdr->cm_seen)
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);
+
+    /* We should not be here with eager or control messages */
+    assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA);
+    assert(0 == is_cts_message(frag));
+    /* HACK - clear out flags. Must be better way */
+    des->des_flags = 0;
+    /* Otherwise, FRAG_RETURN it and repost if necessary */
+    MCA_BTL_IB_FRAG_RETURN(frag);
+    if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
+        if (OPAL_UNLIKELY(is_credit_msg)) {
+            OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
+        } else {
+            OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
+        }
+        mca_btl_openib_endpoint_post_rr(ep, cqp);
+    } else {
+        mca_btl_openib_module_t *btl = ep->endpoint_btl;
+        OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
+        mca_btl_openib_post_srr(btl, rqp);
+    }
+
+    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);
+
+    /* If we got any credits (RDMA or send), then try to progress all
+       the no_credits_pending_frags lists */
+    if (rcredits > 0) {
+        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
+    }
+    if (credits > 0) {
+        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
+    }
+    if (rcredits + credits > 0) {
+        int rc;
+
+        if (OMPI_SUCCESS !=
+            (rc = progress_no_credits_pending_frags(ep))) {
+            return rc;
+        }
+    }
+
+    send_credits(ep, cqp);
+
+    return OMPI_SUCCESS;
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
 {
     switch(status) {
@@ -3632,6 +3733,27 @@ static int btl_openib_component_progress(void)
         count += progress_one_device(device);
     }
+
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    /* Check to see if there are any outstanding dtoh CUDA events that
+     * have completed. If so, issue the PML callbacks on the fragments.
+     * The only thing that gets completed here are asynchronous copies
+     * so there is no need to free anything.
+     */
+    {
+        int local_count = 0;
+        mca_btl_base_descriptor_t *frag;
+        while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) {
+            opal_output(-1, "btl_openib: event completed on frag=%p", (void *)frag);
+            frag->des_cbfunc(NULL, NULL, frag, OMPI_SUCCESS);
+            local_count++;
+        }
+        count += local_count;
+    }
+    if (count > 0) {
+        opal_output(-1, "btl_openib: DONE with openib progress, count=%d", count);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     return count;
 
 #if OPAL_HAVE_THREADS

View file

@@ -566,6 +566,31 @@ int btl_openib_register_mca_params(void)
                            &mca_btl_openib_component.super.btl_version,
                            &mca_btl_openib_module.super));
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* Default is enabling CUDA asynchronous send copies */
+    CHECK(reg_int("cuda_async_send", NULL,
+                  "Enable or disable CUDA async send copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_send = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_send) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND;
+    }
+
+    /* Default is enabling CUDA asynchronous receive copies */
+    CHECK(reg_int("cuda_async_recv", NULL,
+                  "Enable or disable CUDA async recv copies "
+                  "(1 = async; 0 = sync)",
+                  1, &ival, 0));
+    mca_btl_openib_component.cuda_async_recv = (0 != ival);
+    if (mca_btl_openib_component.cuda_async_recv) {
+        mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV;
+    }
+
+    /* Also make the max send size larger for better GPU buffer performance */
+    mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
+    /* Turn off message coalescing - not sure if it works with GPU buffers */
+    mca_btl_openib_component.use_message_coalescing = 0;
+#endif /* OMPI_CUDA_SUPPORT */
+
     /* setup all the qp stuff */
     /* round mid_qp_size to smallest power of two */
     mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1;
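With these parameters registered, the asynchronous copies can be switched off at run time. Assuming the component's usual btl_openib_ prefix for its MCA parameters, something like

    mpirun --mca btl_openib_cuda_async_send 0 --mca btl_openib_cuda_async_recv 0 ./app

leaves the corresponding MCA_BTL_FLAGS_CUDA_COPY_ASYNC_* flags unset and falls back to the synchronous cuMemcpy path on both sides.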

View file

@@ -73,7 +73,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
             rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
                                                      MCA_PML_OB1_HDR_FLAGS_CONTIG);
         } else {
-            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
+            rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
         }
     }
 } else {

View file

@@ -22,6 +22,11 @@
 #include "pml_ob1.h"
 #include "pml_ob1_sendreq.h"
 #include "ompi/mca/bml/base/base.h"
+#if OMPI_CUDA_SUPPORT
+#include "ompi/mca/common/cuda/common_cuda.h"
+#include "pml_ob1_recvreq.h"
+static void mca_pml_ob1_process_pending_cuda_async_copies(void);
+#endif /* OMPI_CUDA_SUPPORT */
 
 int mca_pml_ob1_progress(void)
 {
@@ -29,6 +34,10 @@ int mca_pml_ob1_progress(void)
     int j, completed_requests = 0;
     bool send_succedded;
 
+#if OMPI_CUDA_SUPPORT
+    mca_pml_ob1_process_pending_cuda_async_copies();
+#endif /* OMPI_CUDA_SUPPORT */
+
     if( OPAL_LIKELY(0 == queue_length) )
         return 0;
@@ -77,3 +86,20 @@ int mca_pml_ob1_progress(void)
     return completed_requests;
 }
+
+#if OMPI_CUDA_SUPPORT
+static void mca_pml_ob1_process_pending_cuda_async_copies(void)
+{
+    mca_btl_base_descriptor_t *frag;
+    int progress;
+
+    do {
+        progress = progress_one_cuda_htod_event(&frag);
+        if (1 == progress) {
+            /* Call the finish function to make progress. */
+            mca_pml_ob1_recv_request_frag_copy_finished(NULL, NULL, frag, 0);
+        }
+    } while (progress > 0);
+
+    /* Consider progressing dtoh events here in future */
+}
+#endif /* OMPI_CUDA_SUPPORT */

View file

@@ -44,6 +44,10 @@
 #include "pml_ob1_recvreq.h"
 #include "pml_ob1_sendreq.h"
 #include "pml_ob1_hdr.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t,
                     ompi_free_list_item_t,
@@ -332,6 +336,17 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
         OPAL_THREAD_ADD32(&sendreq->req_state, -1);
     }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+    if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND)) {
+        /* The user's buffer is GPU and this BTL can support asynchronous copies,
+         * so adjust the convertor accordingly. All the subsequent fragments will
+         * use the asynchronous copy. */
+        void *strm = mca_common_cuda_get_dtoh_stream();
+        opal_cuda_set_copy_function_async(&sendreq->req_send.req_base.req_convertor, strm);
+    }
+#endif /* OMPI_CUDA_SUPPORT */
+
     if(send_request_pml_complete_check(sendreq) == false)
         mca_pml_ob1_send_request_schedule(sendreq);
 
@@ -351,6 +366,22 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
     recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If data is destined for GPU buffer and convertor was set up for asynchronous
+     * copies, then start the copy and return. The copy completion will trigger
+     * the next phase. */
+    if (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA_ASYNC) {
+        assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
+        /* This will trigger the opal_convertor_pack to start asynchronous copy. */
+        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_dst_cnt,des);
+        /* Let BTL know that it CANNOT free the frag */
+        des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
+        return;
+    }
+#endif /* OMPI_CUDA_SUPPORT */
     mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
 
     return;

View file

@@ -38,6 +38,10 @@
 #include "orte/mca/errmgr/errmgr.h"
 #include "opal/util/arch.h"
 #include "ompi/memchecker.h"
+#if OMPI_CUDA_SUPPORT
+#include "opal/datatype/opal_datatype_cuda.h"
+#include "ompi/mca/common/cuda/common_cuda.h"
+#endif /* OMPI_CUDA_SUPPORT */
 
 #if OMPI_CUDA_SUPPORT
 int mca_pml_ob1_cuda_need_buffers(mca_pml_ob1_recv_request_t* recvreq,
@@ -527,6 +531,85 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
     }
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+/**
+ * This function is basically the first half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function. This fires off
+ * the asynchronous copy and returns. Unused fields in the descriptor
+ * are used to pass extra information for when the asynchronous copy
+ * completes. No memchecker support in this function as copies are
+ * happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvreq,
+                                               mca_btl_base_module_t* btl,
+                                               mca_btl_base_segment_t* segments,
+                                               size_t num_segments,
+                                               mca_btl_base_descriptor_t* des)
+{
+    int result;
+    size_t bytes_received = 0, data_offset = 0;
+    size_t bytes_delivered __opal_attribute_unused__; /* is being set to zero in MCA_PML_OB1_RECV_REQUEST_UNPACK */
+    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
+
+    OPAL_OUTPUT((-1, "start_frag_copy frag=%p", (void *)des));
+
+    bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
+                                                              sizeof(mca_pml_ob1_frag_hdr_t));
+    data_offset = hdr->hdr_frag.hdr_frag_offset;
+
+    MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq,
+                                     segments,
+                                     num_segments,
+                                     sizeof(mca_pml_ob1_frag_hdr_t),
+                                     data_offset,
+                                     bytes_received,
+                                     bytes_delivered );
+    /* Store the receive request in unused context pointer. */
+    des->des_context = (void *)recvreq;
+    /* Store the amount of bytes in unused src count value */
+    des->des_src_cnt = bytes_delivered;
+    /* Then record an event that will get triggered by a PML progress call which
+     * checks the stream events. If we get an error, abort. Should get message
+     * from CUDA code about what went wrong. */
+    result = mca_common_cuda_record_htod_event("pml", des);
+    if (OMPI_SUCCESS != result) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+
+/**
+ * This function is basically the second half of the code in the
+ * mca_pml_ob1_recv_request_progress_frag function. The number of
+ * bytes delivered is updated. Then a call is made into the BTL so it
+ * can free the fragment that held that data. This is currently
+ * called directly by the common CUDA code. No memchecker support
+ * in this function as copies are happening asynchronously.
+ */
+void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
+                                                  struct mca_btl_base_endpoint_t* ep,
+                                                  struct mca_btl_base_descriptor_t* des,
+                                                  int status )
+{
+    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context;
+    size_t bytes_received = des->des_src_cnt;
+
+    OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des));
+    /* Call into the BTL so it can free the descriptor. At this point, it is
+     * known that the data has been copied out of the descriptor. */
+    des->des_cbfunc(NULL, (struct mca_btl_base_endpoint_t *)des->des_cbdata, des, 0);
+
+    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
+
+    /* check completion status */
+    if(recv_request_pml_complete_check(recvreq) == false &&
+       recvreq->req_rdma_offset < recvreq->req_send_offset) {
+        /* schedule additional rdma operations */
+        mca_pml_ob1_recv_request_schedule(recvreq, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /*
  * Update the recv request status to reflect the number of bytes
  * received and actually delivered to the application.
@@ -701,6 +784,17 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq
         /* schedule additional rdma operations */
         mca_pml_ob1_recv_request_schedule(recvreq, NULL);
     }
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
+    /* If BTL supports it and this is a CUDA buffer being received into,
+     * have all subsequent FRAGS copied in asynchronously. */
+    if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
+        (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV)) {
+        void *strm = mca_common_cuda_get_htod_stream();
+        opal_cuda_set_copy_function_async(&recvreq->req_recv.req_base.req_convertor, strm);
+    }
+#endif
 }
 
 /*
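Taken together with the recvfrag change above, the receive side of a FRAG now splits into two halves around the asynchronous copy. A condensed sketch, using only names added by this commit (the real progress loop lives in pml_ob1_progress.c):

/* First half: unpack through the convertor, which queues a cuMemcpyAsync on
 * the htod stream, then record an event and stash the request in the
 * descriptor's unused fields. */
mca_pml_ob1_recv_request_frag_copy_start(recvreq, btl, segments, num_segments, des);

/* PML progress later polls the htod event queue; each completed event resumes
 * the second half, which credits the bytes and lets the BTL free the frag. */
while (1 == progress_one_cuda_htod_event(&des)) {
    mca_pml_ob1_recv_request_frag_copy_finished(NULL, NULL, des, 0);
}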

View file

@@ -305,6 +305,19 @@ void mca_pml_ob1_recv_request_progress_frag(
     mca_btl_base_segment_t* segments,
     size_t num_segments);
 
+#if OMPI_CUDA_SUPPORT
+void mca_pml_ob1_recv_request_frag_copy_start(
+    mca_pml_ob1_recv_request_t* req,
+    struct mca_btl_base_module_t* btl,
+    mca_btl_base_segment_t* segments,
+    size_t num_segments,
+    mca_btl_base_descriptor_t* des);
+
+void mca_pml_ob1_recv_request_frag_copy_finished(struct mca_btl_base_module_t* btl,
+                                                 struct mca_btl_base_endpoint_t* ep,
+                                                 struct mca_btl_base_descriptor_t* des,
+                                                 int status );
+#endif /* OMPI_CUDA_SUPPORT */
+
 /**
  *
  */

View file

@@ -332,6 +332,39 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+/**
+ * This function is called when the copy of the frag from the GPU buffer
+ * to the internal buffer is complete. Used to support asynchronous
+ * copies from GPU to host buffers. Now the data can be sent.
+ */
+static void
+mca_pml_ob1_copy_frag_completion( mca_btl_base_module_t* btl,
+                                  struct mca_btl_base_endpoint_t* ep,
+                                  struct mca_btl_base_descriptor_t* des,
+                                  int status )
+{
+    int rc;
+    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+
+    des->des_cbfunc = mca_pml_ob1_frag_completion;
+    /* Reset the BTL ownership flag as the BTL can free it after completion. */
+    des->des_flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+    opal_output(-1, "copy_frag_completion FRAG frag=%p", (void *)des);
+    /* Currently, we cannot support a failure in the send. In the blocking
+     * case, the counters tracking the fragments being sent are not adjusted
+     * until the function returns success, so it handles the error by leaving
+     * all the buffer counters intact. In this case, it is too late so
+     * we just abort. In theory, a new queue could be created to hold this
+     * fragment and then attempt to send it out on another BTL. */
+    rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
+    if(OPAL_UNLIKELY(rc < 0)) {
+        opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
+        orte_errmgr.abort(-1, NULL);
+    }
+}
+#endif /* OMPI_CUDA_SUPPORT */
+
 /**
  * Buffer the entire message and mark as complete.
  */
@@ -1030,6 +1063,32 @@ cannot_pack:
                                       &(sendreq->req_send.req_base), size, PERUSE_SEND);
 #endif /* OMPI_WANT_PERUSE */
 
+#if OMPI_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
+        /* At this point, check to see if the BTL is doing an asynchronous
+         * copy. This would have been initiated in the mca_bml_base_prepare_src
+         * called above. The flag is checked here as we let the hdr be
+         * set up prior to checking.
+         */
+        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
+            opal_output(-1, "Initiating async copy on FRAG frag=%p", (void *)des);
+            /* Need to make sure BTL does not free frag after completion
+             * of asynchronous copy as we still need to send the fragment. */
+            des->des_flags &= ~MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
+            /* Unclear that this flag needs to be set but to be sure, set it */
+            des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+            des->des_cbfunc = mca_pml_ob1_copy_frag_completion;
+            range->range_btls[btl_idx].length -= size;
+            range->range_send_length -= size;
+            range->range_send_offset += size;
+            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            if(range->range_send_length == 0) {
+                range = get_next_send_range(sendreq, range);
+                prev_bytes_remaining = 0;
+            }
+            continue;
+        }
+#endif /* OMPI_CUDA_SUPPORT */
+
         /* initiate send - note that this may complete before the call returns */
         rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG);
         if( OPAL_LIKELY(rc >= 0) ) {

View file

@@ -41,7 +41,7 @@
 #if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_datatype_cuda.h"
 #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor,
@@ -55,7 +55,7 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
     convertor->remoteArch = opal_local_arch;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
 #if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
+    convertor->cbmemcpy = &opal_cuda_memcpy;
 #endif
 }

View file

@@ -55,6 +55,7 @@ BEGIN_C_DECLS
 #define CONVERTOR_NO_OP 0x00100000
 #define CONVERTOR_WITH_CHECKSUM 0x00200000
 #define CONVERTOR_CUDA 0x00400000
+#define CONVERTOR_CUDA_ASYNC 0x00800000
 #define CONVERTOR_TYPE_MASK 0x00FF0000
 #define CONVERTOR_STATE_START 0x01000000
 #define CONVERTOR_STATE_COMPLETE 0x02000000
@@ -69,7 +70,7 @@ typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor,
                                             uint32_t* out_size,
                                             size_t* max_data );
 typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata );
-typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n );
+typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor );
 
 /* The master convertor struct (defined in convertor_internal.h) */
 struct opal_convertor_master_t;
@@ -116,6 +117,7 @@ struct opal_convertor_t {
 #if OPAL_CUDA_SUPPORT
     memcpy_fct_t cbmemcpy;   /**< memcpy or cuMemcpy */
+    void * stream;           /**< CUstream for async copy */
 #endif
     /* size: 248, cachelines: 4, members: 20 */
     /* last cacheline: 56 bytes */
@@ -164,9 +166,6 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor )
         convertor->pStack = convertor->static_stack;
         convertor->stack_size = DT_STATIC_STACK_SIZE;
     }
-#if OPAL_CUDA_SUPPORT
-    convertor->cbmemcpy = &memcpy;
-#endif
     convertor->pDesc = NULL;
     convertor->stack_pos = 0;
     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;

View file

@@ -78,7 +78,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
 #undef MEM_OP_NAME
 #define MEM_OP_NAME non_overlap_cuda
 #undef MEM_OP
-#define MEM_OP opal_cuda_memcpy
+#define MEM_OP opal_cuda_memcpy_sync
 #include "opal_datatype_copy.h"
 #undef MEM_OP_NAME

View file

@@ -86,11 +86,41 @@ bool opal_cuda_check_bufs(char *dest, char *src)
 }
 
 /*
- * Need intermediate cuMemcpy function so we can check the return code
- * of the call. If we see an error, abort as there is no recovery at
- * this point.
+ * With CUDA enabled, all contiguous copies will pass through this function.
+ * Therefore, the first check is to see if the convertor is a GPU buffer.
+ * Note that if there is an error with any of the CUDA calls, the program
+ * aborts as there is no recovering.
  */
-void *opal_cuda_memcpy(void *dest, void *src, size_t size)
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
+{
+    int res;
+
+    if (!(convertor->flags & CONVERTOR_CUDA)) {
+        return memcpy(dest, src, size);
+    }
+
+    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
+        res = cuMemcpyAsync((CUdeviceptr)dest, (CUdeviceptr)src, size,
+                            (CUstream)convertor->stream);
+    } else {
+        res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
+    }
+
+    if (res != CUDA_SUCCESS) {
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
+                    res, dest, src, (int)size);
+        abort();
+    } else {
+        return dest;
+    }
+}
+
+/*
+ * This function is needed in cases where we do not have contiguous
+ * datatypes. The current code has macros that cannot handle a convertor
+ * argument to the memcpy call.
+ */
+void *opal_cuda_memcpy_sync(void *dest, void *src, size_t size)
 {
     int res;
     res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
@@ -170,3 +200,13 @@ static void opal_cuda_support_init(void)
 
     initialized = true;
 }
+
+/**
+ * Tell the convertor that copies will be asynchronous CUDA copies. The
+ * flags are cleared when the convertor is reinitialized.
+ */
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
+{
+    convertor->flags |= CONVERTOR_CUDA_ASYNC;
+    convertor->stream = stream;
+}
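The convertor-side contract is small: once opal_cuda_set_copy_function_async() has been called, every contiguous copy routed through cbmemcpy on a GPU buffer becomes a stream-ordered cuMemcpyAsync instead of a blocking cuMemcpy. A minimal sketch of the intended use (the stream here is created only for illustration; the PML actually passes the shared dtoh/htod streams from common_cuda):

CUstream stream;
cuStreamCreate(&stream, 0);
opal_cuda_set_copy_function_async(convertor, (void *)stream);  /* sets CONVERTOR_CUDA_ASYNC */

/* later, inside pack/unpack of a GPU buffer: */
convertor->cbmemcpy(dst, src, len, convertor);                  /* -> cuMemcpyAsync(..., stream) */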

View file

@@ -12,8 +12,10 @@
 void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
 bool opal_cuda_check_bufs(char *dest, char *src);
-void* opal_cuda_memcpy(void * dest, void * src, size_t size);
+void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
+void* opal_cuda_memcpy_sync(void * dest, void * src, size_t size);
 void* opal_cuda_memmove(void * dest, void * src, size_t size);
 void opal_cuda_add_initialization_function(int (*fptr)(void));
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
 
 #endif

View file

@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,

View file

@@ -324,7 +324,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
              * memory, need to use the special host to device memory copy.
              * Note this code path was only seen on large receives of
              * noncontiguous data via buffered sends. */
-            pConvertor->cbmemcpy(saved_data, real_data, data_length );
+            pConvertor->cbmemcpy(saved_data, real_data, data_length, pConvertor );
 #else
             /* Save the content of the user memory */
             MEMCPY( saved_data, real_data, data_length );
@@ -347,10 +347,10 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
              * data via buffered sends. */
            {
                char resaved_data[16];
-               pConvertor->cbmemcpy(resaved_data, real_data, data_length );
+               pConvertor->cbmemcpy(resaved_data, real_data, data_length, pConvertor );
                for( i = 0; i < data_length; i++ ) {
                    if( unused_byte == resaved_data[i] )
-                       pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1);
+                       pConvertor->cbmemcpy(&real_data[i], &saved_data[i], 1, pConvertor);
                }
            }
 #else

View file

@@ -26,7 +26,7 @@
 /* Make use of existing macro to do CUDA style memcpy */
 #undef MEMCPY_CSUM
 #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH) )
+    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
 
 #include "opal/datatype/opal_convertor.h"