From 433cfa36658e6e5e429d9fd630c215a92bb6a78a Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 25 Oct 2011 18:38:42 +0000 Subject: [PATCH] use single copy for some sends This commit was SVN r25365. --- ompi/mca/btl/vader/btl_vader.c | 26 +++++++++++--- ompi/mca/btl/vader/btl_vader_component.c | 46 ++++++++++++++++-------- ompi/mca/btl/vader/btl_vader_frag.h | 16 ++++++--- ompi/mca/btl/vader/btl_vader_send.c | 5 +++ 4 files changed, 69 insertions(+), 24 deletions(-) diff --git a/ompi/mca/btl/vader/btl_vader.c b/ompi/mca/btl/vader/btl_vader.c index ea7ddf517a..2c6cde7751 100644 --- a/ompi/mca/btl/vader/btl_vader.c +++ b/ompi/mca/btl/vader/btl_vader.c @@ -31,6 +31,8 @@ #include "btl_vader_endpoint.h" #include "btl_vader_fifo.h" +int mca_btl_vader_max_inline_send = 256; + static int vader_del_procs (struct mca_btl_base_module_t *btl, size_t nprocs, struct ompi_proc_t **procs, struct mca_btl_base_endpoint_t **peers); @@ -306,7 +308,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) i = ompi_free_list_init_new(&component->vader_frags_eager, sizeof (mca_btl_vader_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t), - sizeof (mca_btl_vader_hdr_t) + component->eager_limit, + sizeof (mca_btl_vader_hdr_t) + mca_btl_vader_max_inline_send, opal_cache_line_size, component->vader_free_list_num, component->vader_free_list_max, @@ -634,7 +636,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ mca_btl_vader_frag_t *frag; uint32_t iov_count = 1; void *data_ptr; - struct iovec iov; + struct iovec iov, *lcl_mem; int rc; if (OPAL_LIKELY(reserve)) { @@ -645,6 +647,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ } if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { + /* non-contiguous data requires the convertor */ iov.iov_len = mca_btl_vader_component.eager_limit - reserve; iov.iov_base = (IOVBASE_TYPE *)(((uintptr_t)(frag->segment.seg_addr.pval)) + @@ -655,13 +658,26 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ MCA_BTL_VADER_FRAG_RETURN(frag); return NULL; } + frag->segment.seg_len = reserve + *size; + } else if ((*size + reserve) > mca_btl_vader_max_inline_send) { + /* single copy send */ + /* pack the iovec after the reserved memory */ + lcl_mem = (struct iovec *) ((uintptr_t)(frag->hdr + 1) + reserve); + + frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY; + opal_convertor_get_current_pointer (convertor, &data_ptr); + + lcl_mem->iov_base = data_ptr; + lcl_mem->iov_len = *size; + + frag->segment.seg_len = reserve; } else { /* NTH: the covertor adds some latency so we bypass it if possible */ - opal_convertor_get_current_pointer (convertor, &data_ptr); - memmove ((void *)((uintptr_t)frag->segment.seg_addr.pval + reserve), data_ptr, *size); + opal_convertor_get_current_pointer (convertor, &data_ptr); + memmove ((void *)((uintptr_t)frag->segment.seg_addr.pval + reserve), data_ptr, *size); + frag->segment.seg_len = reserve + *size; } - frag->segment.seg_len = reserve + *size; } else { /* put/get fragment */ MCA_BTL_VADER_FRAG_ALLOC_USER(frag, rc); diff --git a/ompi/mca/btl/vader/btl_vader_component.c b/ompi/mca/btl/vader/btl_vader_component.c index 00cbac689b..d1fd05198a 100644 --- a/ompi/mca/btl/vader/btl_vader_component.c +++ b/ompi/mca/btl/vader/btl_vader_component.c @@ -129,12 +129,12 @@ static int mca_btl_vader_component_register (void) msb(mca_btl_vader_param_register_int("segment_multiple", mca_btl_vader_segment_multiple)); mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH; - mca_btl_vader.super.btl_eager_limit = 4*1024; - mca_btl_vader.super.btl_rndv_eager_limit = 4*1024; - mca_btl_vader.super.btl_max_send_size = 4*1024; - mca_btl_vader.super.btl_rdma_pipeline_send_length = 4*1024; - mca_btl_vader.super.btl_rdma_pipeline_frag_size = 4*1024; - mca_btl_vader.super.btl_min_rdma_pipeline_size = 4*1024; + mca_btl_vader.super.btl_eager_limit = 64 * 1024; + mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit; + mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit; + mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit; + mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit; + mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit; mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE; @@ -264,7 +264,7 @@ static inline void mca_btl_vader_progress_sends (void) frag = (mca_btl_vader_frag_t *) item; next = opal_list_get_next (item); - if (frag->hdr->complete) { + if (OPAL_LIKELY(frag->hdr->complete)) { opal_list_remove_item (&mca_btl_vader_component.active_sends, item); if (OPAL_UNLIKELY(MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) { @@ -289,13 +289,15 @@ static int mca_btl_vader_component_progress (void) mca_btl_active_message_callback_t *reg; mca_btl_vader_frag_t frag; mca_btl_vader_hdr_t *hdr; + mca_btl_base_segment_t segments[2]; + mca_mpool_base_registration_t *xpmem_reg = NULL; + bool single_copy; /* check active sends for completion */ mca_btl_vader_progress_sends (); /* poll the fifo once */ hdr = (mca_btl_vader_hdr_t *) vader_fifo_read (fifo); - if (VADER_FIFO_FREE == hdr) { return 0; } @@ -304,13 +306,29 @@ static int mca_btl_vader_component_progress (void) * memory address, to a true virtual address */ hdr = (mca_btl_vader_hdr_t *) RELATIVE2VIRTUAL(hdr); - /* recv upcall */ reg = mca_btl_base_active_message_trigger + hdr->tag; - frag.segment.seg_addr.pval = (void *) (hdr + 1); - frag.segment.seg_len = hdr->len; - frag.base.des_dst_cnt = 1; - frag.base.des_dst = &(frag.segment); - reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata); + frag.base.des_dst = segments; + + segments[0].seg_addr.pval = (void *) (hdr + 1); + segments[0].seg_len = hdr->len; + + if (OPAL_UNLIKELY(hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY)) { + struct iovec *rem_mem = (struct iovec *) ((uintptr_t)segments[0].seg_addr.pval + hdr->len); + + xpmem_reg = vader_get_registation (hdr->my_smp_rank, rem_mem->iov_base, + rem_mem->iov_len, 0); + + segments[1].seg_addr.pval = vader_reg_to_ptr (xpmem_reg, rem_mem->iov_base); + segments[1].seg_len = rem_mem->iov_len; + + /* recv upcall */ + frag.base.des_dst_cnt = 2; + reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata); + vader_return_registration (xpmem_reg, hdr->my_smp_rank); + } else { + frag.base.des_dst_cnt = 1; + reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata); + } /* return the fragment */ hdr->complete = true; diff --git a/ompi/mca/btl/vader/btl_vader_frag.h b/ompi/mca/btl/vader/btl_vader_frag.h index dda27d01b7..07041e07a2 100644 --- a/ompi/mca/btl/vader/btl_vader_frag.h +++ b/ompi/mca/btl/vader/btl_vader_frag.h @@ -26,12 +26,17 @@ #include "ompi_config.h" +#define MCA_BTL_VADER_FLAG_INLINE 0 +#define MCA_BTL_VADER_FLAG_SINGLE_COPY 1 + struct mca_btl_vader_hdr_t { - size_t len; /* length of data following this header */ - int my_smp_rank; /* smp rank of owning process */ - mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */ - volatile void *next; /* next item in fifo */ - volatile bool complete; /* fragment completion */ + volatile void *next; /* next item in fifo. many peers may touch this */ + char pad[2]; + volatile bool complete; /* fragment completion (usually 1 byte) */ + mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */ + int flags; /* vader send flags */ + int my_smp_rank; /* smp rank of owning process */ + size_t len; /* length of data following this header */ }; typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t; @@ -57,6 +62,7 @@ OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t); OMPI_FREE_LIST_GET(&mca_btl_vader_component.vader_frags_eager, item, rc); \ frag = (mca_btl_vader_frag_t *) item; \ frag->hdr->complete = false; \ + frag->hdr->flags = MCA_BTL_VADER_FLAG_INLINE; \ frag->my_list = &mca_btl_vader_component.vader_frags_eager; \ } while (0) diff --git a/ompi/mca/btl/vader/btl_vader_send.c b/ompi/mca/btl/vader/btl_vader_send.c index 93741168f2..87e01fa0a3 100644 --- a/ompi/mca/btl/vader/btl_vader_send.c +++ b/ompi/mca/btl/vader/btl_vader_send.c @@ -53,6 +53,11 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl, vader_fifo_write ((void *) VIRTUAL2RELATIVE(frag->hdr), mca_btl_vader_component.fifo[endpoint->peer_smp_rank]); + if (frag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) { + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + return 0; + } + /* data is gone */ return 1; }