From ee087de073ba87247a5ef05bfb9cc442573bdfee Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 27 Oct 2011 20:22:46 +0000 Subject: [PATCH] added fast boxes to vader This commit was SVN r25376. --- ompi/mca/btl/vader/Makefile.am | 1 + ompi/mca/btl/vader/btl_vader.c | 48 ++++++++- ompi/mca/btl/vader/btl_vader.h | 17 +++ ompi/mca/btl/vader/btl_vader_component.c | 5 + ompi/mca/btl/vader/btl_vader_fbox.h | 128 +++++++++++++++++++++++ ompi/mca/btl/vader/btl_vader_frag.c | 1 - ompi/mca/btl/vader/btl_vader_frag.h | 3 + ompi/mca/btl/vader/btl_vader_get.c | 7 +- ompi/mca/btl/vader/btl_vader_put.c | 9 +- ompi/mca/btl/vader/btl_vader_send.c | 11 ++ ompi/mca/btl/vader/btl_vader_sendi.c | 20 +++- 11 files changed, 226 insertions(+), 24 deletions(-) create mode 100644 ompi/mca/btl/vader/btl_vader_fbox.h diff --git a/ompi/mca/btl/vader/Makefile.am b/ompi/mca/btl/vader/Makefile.am index 1f4122ba47..b26dff00f6 100644 --- a/ompi/mca/btl/vader/Makefile.am +++ b/ompi/mca/btl/vader/Makefile.am @@ -33,6 +33,7 @@ libmca_btl_vader_la_sources = \ btl_vader_frag.h \ btl_vader_send.c \ btl_vader_sendi.c \ + btl_vader_fbox.h \ btl_vader_get.c \ btl_vader_put.c diff --git a/ompi/mca/btl/vader/btl_vader.c b/ompi/mca/btl/vader/btl_vader.c index 31d8db59f5..3430ec09d1 100644 --- a/ompi/mca/btl/vader/btl_vader.c +++ b/ompi/mca/btl/vader/btl_vader.c @@ -30,6 +30,7 @@ #include "btl_vader.h" #include "btl_vader_endpoint.h" #include "btl_vader_fifo.h" +#include "btl_vader_fbox.h" static int vader_del_procs (struct mca_btl_base_module_t *btl, size_t nprocs, struct ompi_proc_t **procs, @@ -274,8 +275,8 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) component->shm_bases[component->my_smp_rank] = (char *)component->vader_mpool_base; component->shm_seg_ids[component->my_smp_rank] = my_segid; - /* initialize the array of fifo's "owned" by this process */ - posix_memalign ((void **)&my_fifos, getpagesize (), sizeof (vader_fifo_t)); + /* initialize the fifo and fast boxes "owned" by this process */ + posix_memalign ((void **)&my_fifos, getpagesize (), (n + 1) * getpagesize ()); if(NULL == my_fifos) return OMPI_ERR_OUT_OF_RESOURCE; @@ -297,6 +298,22 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) if (NULL == component->xpmem_rcaches) return OMPI_ERR_OUT_OF_RESOURCE; + component->vader_fboxes_in = (char **) calloc (n, sizeof (char *)); + if (NULL == component->vader_fboxes_in) + return OMPI_ERR_OUT_OF_RESOURCE; + + component->vader_fboxes_out = (char **) calloc (n, sizeof (char *)); + if (NULL == component->vader_fboxes_out) + return OMPI_ERR_OUT_OF_RESOURCE; + + component->vader_next_fbox_in = (unsigned char *) calloc (64, 1); + if (NULL == component->vader_next_fbox_in) + return OMPI_ERR_OUT_OF_RESOURCE; + + component->vader_next_fbox_out = (unsigned char *) calloc (64, 1); + if (NULL == component->vader_next_fbox_out) + return OMPI_ERR_OUT_OF_RESOURCE; + /* initialize fragment descriptor free lists */ /* initialize free list for send fragments */ i = ompi_free_list_init_new(&component->vader_frags_eager, @@ -482,8 +499,19 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl, /* get a persistent pointer to the peer's fifo */ component->fifo[peer_smp_rank] = vader_reg_to_ptr (vader_get_registation (peer_smp_rank, rem_ptr, - sizeof (vader_fifo_t), + (n_local_procs + 1) * getpagesize (), MCA_MPOOL_FLAGS_PERSIST), rem_ptr); + + /* fast boxes are allocated at the same time as the fifos */ + component->vader_fboxes_in[peer_smp_rank] = (char *) component->fifo[my_smp_rank] + + (peer_smp_rank + 1) * getpagesize (); + component->vader_fboxes_out[peer_smp_rank] = (char *) component->fifo[peer_smp_rank] + + (my_smp_rank + 1) * getpagesize (); + + component->vader_next_fbox_in[peer_smp_rank] = 0; + component->vader_next_fbox_out[peer_smp_rank] = 0; + + memset (component->vader_fboxes_in[peer_smp_rank], MCA_BTL_VADER_FBOX_FREE, getpagesize()); } } @@ -643,7 +671,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ struct iovec iov, *lcl_mem; mca_btl_vader_frag_t *frag; uint32_t iov_count = 1; - void *data_ptr; + void *data_ptr, *fbox_ptr; int rc; opal_convertor_get_current_pointer (convertor, &data_ptr); @@ -688,8 +716,18 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ frag->segment.seg_len = reserve; } else { /* inline send */ + + /* try to reserve a fast box for this transfer */ + fbox_ptr = mca_btl_vader_reserve_fbox (endpoint->peer_smp_rank, reserve + *size); + + if (fbox_ptr) { + frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX; + frag->segment.seg_addr.pval = fbox_ptr; + } + /* NTH: the covertor adds some latency so we bypass it here */ - memmove ((void *)((uintptr_t)frag->segment.seg_addr.pval + reserve), data_ptr, *size); + vader_memmove ((void *)((uintptr_t)frag->segment.seg_addr.pval + reserve), + data_ptr, *size); frag->segment.seg_len = reserve + *size; } } diff --git a/ompi/mca/btl/vader/btl_vader.h b/ompi/mca/btl/vader/btl_vader.h index 8b57e921b1..0e2546a85d 100644 --- a/ompi/mca/btl/vader/btl_vader.h +++ b/ompi/mca/btl/vader/btl_vader.h @@ -114,6 +114,12 @@ struct mca_btl_vader_component_t { opal_list_t active_sends; /**< list of outstanding fragments */ + char **vader_fboxes_in; /**< incomming fast boxes (memory belongs to this process) */ + char **vader_fboxes_out; /**< outgoing fast boxes (memory belongs to remote peers) */ + + unsigned char *vader_next_fbox_in; /**< indices of fast boxes to poll */ + unsigned char *vader_next_fbox_out; /**< indices of fast boxes to write */ + struct mca_btl_base_endpoint_t **vader_peers; }; typedef struct mca_btl_vader_component_t mca_btl_vader_component_t; @@ -256,6 +262,17 @@ static inline void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void * (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base)); } +/* memcpy is faster at larger sizes but is undefined if the + pointers are aliased (TODO -- readd alias check) */ +static inline void vader_memmove (void *dst, void *src, size_t size) +{ + if (size >= mca_btl_vader_memcpy_limit) { + memcpy (dst, src, size); + } else { + memmove (dst, src, size); + } +} + /** * Initiate a send to the peer. * diff --git a/ompi/mca/btl/vader/btl_vader_component.c b/ompi/mca/btl/vader/btl_vader_component.c index b34d271444..0a06a6c31d 100644 --- a/ompi/mca/btl/vader/btl_vader_component.c +++ b/ompi/mca/btl/vader/btl_vader_component.c @@ -35,6 +35,7 @@ #include "btl_vader.h" #include "btl_vader_frag.h" #include "btl_vader_fifo.h" +#include "btl_vader_fbox.h" static int mca_btl_vader_component_progress (void); static int mca_btl_vader_component_open(void); @@ -286,6 +287,7 @@ static inline void mca_btl_vader_progress_sends (void) } } + static int mca_btl_vader_component_progress (void) { int my_smp_rank = mca_btl_vader_component.my_smp_rank; @@ -297,6 +299,9 @@ static int mca_btl_vader_component_progress (void) mca_mpool_base_registration_t *xpmem_reg = NULL; bool single_copy; + /* check for messages in fast boxes */ + mca_btl_vader_check_fboxes (); + /* check active sends for completion */ mca_btl_vader_progress_sends (); diff --git a/ompi/mca/btl/vader/btl_vader_fbox.h b/ompi/mca/btl/vader/btl_vader_fbox.h new file mode 100644 index 0000000000..b17b2a4fb2 --- /dev/null +++ b/ompi/mca/btl/vader/btl_vader_fbox.h @@ -0,0 +1,128 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2011 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_VADER_FBOX_H) +#define MCA_BTL_VADER_FBOX_H + +#include "btl_vader.h" +#include "btl_vader_endpoint.h" + +/* XXX -- FIXME -- make no assumptions if possible */ +/* Assumptions: page size: 4096, cache line: 64 or 128 bytes, tag = 1 byte */ +#define FBOX_SIZE 128 /* 2-4 cache lines */ +#define LAST_FBOX 31 /* page size assumtion: 4096 */ +#define MAX_MSG 126 /* 1 byte used each for size and tag */ + +enum {MCA_BTL_VADER_FBOX_FREE = 0xfe, MCA_BTL_VADER_FBOX_RESERVED = 0xff}; + +#define MCA_BTL_VADER_FBOX_OUT_PTR(peer_smp_rank, fbox) \ + (mca_btl_vader_component.vader_fboxes_out[peer_smp_rank] + FBOX_SIZE * (fbox)) + +#define MCA_BTL_VADER_FBOX_IN_PTR(peer_smp_rank, fbox) \ + (mca_btl_vader_component.vader_fboxes_in[peer_smp_rank] + FBOX_SIZE * (fbox)) + +static inline unsigned char *mca_btl_vader_reserve_fbox (int peer_smp_rank, size_t size) +{ + int next_fbox = mca_btl_vader_component.vader_next_fbox_out[peer_smp_rank]; + unsigned char *fbox = MCA_BTL_VADER_FBOX_OUT_PTR(peer_smp_rank, next_fbox); + + /* todo -- need thread locks here for the multi-threaded case */ + + if (size > MAX_MSG || fbox[0] != MCA_BTL_VADER_FBOX_FREE) { + /* fall back on fifo */ + return NULL; + } + + mca_btl_vader_component.vader_next_fbox_out[peer_smp_rank] = + next_fbox == LAST_FBOX ? 0 : next_fbox + 1; + + /* mark this fast box as in use */ + fbox[0] = MCA_BTL_VADER_FBOX_RESERVED; + + return fbox + 2; +} + +static inline void mca_btl_vader_fbox_send (unsigned char *fbox, unsigned char tag, size_t size) +{ + fbox[-1] = tag; + + /* ensure data writes have completed before we mark the data as available */ + opal_atomic_wmb (); + + fbox[-2] = size; +} + +static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endpoint, char tag, + void *header, size_t header_size, + void *payload, size_t payload_size) +{ + unsigned char *fbox; + + fbox = mca_btl_vader_reserve_fbox(endpoint->peer_smp_rank, header_size + payload_size); + if (NULL == fbox) { + return 0; + } + + memcpy (fbox, header, header_size); + if (OPAL_UNLIKELY(payload)) { + /* inline sends are typically just pml headers (due to MCA_BTL_FLAGS_SEND_INPLACE) */ + memcpy (fbox + header_size, payload, payload_size); + } + + /* mark the fbox as sent */ + mca_btl_vader_fbox_send (fbox, tag, header_size + payload_size); + + /* send complete */ + return 1; +} + +static inline void mca_btl_vader_check_fboxes (void) +{ + int my_smp_rank = mca_btl_vader_component.my_smp_rank; + mca_btl_active_message_callback_t *reg; + mca_btl_vader_frag_t frag; + unsigned char size, tag; + int i; + + for (i = 0 ; i < mca_btl_vader_component.num_smp_procs ; ++i) { + int next_fbox = mca_btl_vader_component.vader_next_fbox_in[i]; + unsigned char *fbox = MCA_BTL_VADER_FBOX_IN_PTR(i, next_fbox); + + if (my_smp_rank == i) { + continue; + } + + /* process all fast-box messages */ + while (0xfe != ((size = fbox[0]) & 0xfe)) { + opal_atomic_rmb (); + + tag = fbox[1]; + + reg = mca_btl_base_active_message_trigger + tag; + + frag.segment.seg_addr.pval = fbox + 2; + frag.segment.seg_len = size; + + frag.base.des_dst = &frag.segment; + frag.base.des_dst_cnt = 1; + reg->cbfunc(&mca_btl_vader.super, tag, &(frag.base), reg->cbdata); + + fbox[0] = MCA_BTL_VADER_FBOX_FREE; + + next_fbox = next_fbox == LAST_FBOX ? 0 : next_fbox + 1; + fbox = MCA_BTL_VADER_FBOX_IN_PTR(i, next_fbox); + } + + mca_btl_vader_component.vader_next_fbox_in[i] = next_fbox; + } +} + +#endif /* !defined(MCA_BTL_VADER_FBOX_H) */ diff --git a/ompi/mca/btl/vader/btl_vader_frag.c b/ompi/mca/btl/vader/btl_vader_frag.c index 97ca79bf18..89ed83987b 100644 --- a/ompi/mca/btl/vader/btl_vader_frag.c +++ b/ompi/mca/btl/vader/btl_vader_frag.c @@ -29,7 +29,6 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag) { frag->hdr = (mca_btl_vader_hdr_t*)frag->base.super.ptr; if(frag->hdr != NULL) { - frag->segment.seg_addr.pval = (char *)(frag->hdr + 1); frag->hdr->my_smp_rank = mca_btl_vader_component.my_smp_rank; } } diff --git a/ompi/mca/btl/vader/btl_vader_frag.h b/ompi/mca/btl/vader/btl_vader_frag.h index 2a55ab5250..f7db2a8ff5 100644 --- a/ompi/mca/btl/vader/btl_vader_frag.h +++ b/ompi/mca/btl/vader/btl_vader_frag.h @@ -28,6 +28,7 @@ #define MCA_BTL_VADER_FLAG_INLINE 0 #define MCA_BTL_VADER_FLAG_SINGLE_COPY 1 +#define MCA_BTL_VADER_FLAG_FBOX 2 struct mca_btl_vader_hdr_t { volatile void *next; /* next item in fifo. many peers may touch this */ @@ -63,6 +64,7 @@ OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t); frag = (mca_btl_vader_frag_t *) item; \ frag->hdr->complete = false; \ frag->hdr->flags = MCA_BTL_VADER_FLAG_INLINE; \ + frag->segment.seg_addr.pval = (char *)(frag->hdr + 1); \ frag->my_list = &mca_btl_vader_component.vader_frags_eager; \ } while (0) @@ -73,6 +75,7 @@ OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t); frag = (mca_btl_vader_frag_t *) item; \ frag->hdr->complete = false; \ frag->hdr->flags = MCA_BTL_VADER_FLAG_INLINE; \ + frag->segment.seg_addr.pval = (char *)(frag->hdr + 1); \ frag->my_list = &mca_btl_vader_component.vader_frags_user; \ } while (0) diff --git a/ompi/mca/btl/vader/btl_vader_get.c b/ompi/mca/btl/vader/btl_vader_get.c index 6fc8e9643f..821d3eadea 100644 --- a/ompi/mca/btl/vader/btl_vader_get.c +++ b/ompi/mca/btl/vader/btl_vader_get.c @@ -42,12 +42,7 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl, rem_ptr = vader_reg_to_ptr (reg, (void *) src->seg_key.ptr); - if (OPAL_LIKELY((uintptr_t)rem_ptr != src->seg_key.ptr) && - src->seg_len >= mca_btl_vader_memcpy_limit) { - memcpy ((void *) dst->seg_key.ptr, rem_ptr, size); - } else { - memmove ((void *) dst->seg_key.ptr, rem_ptr, size); - } + vader_memmove ((void *) dst->seg_key.ptr, rem_ptr, size); vader_return_registration (reg, endpoint->peer_smp_rank); diff --git a/ompi/mca/btl/vader/btl_vader_put.c b/ompi/mca/btl/vader/btl_vader_put.c index ad13f5d0f9..a0f2300523 100644 --- a/ompi/mca/btl/vader/btl_vader_put.c +++ b/ompi/mca/btl/vader/btl_vader_put.c @@ -42,14 +42,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl, rem_ptr = vader_reg_to_ptr (reg, (void *) dst->seg_key.ptr); - if (OPAL_LIKELY((uintptr_t)rem_ptr != dst->seg_key.ptr) && - dst->seg_len >= mca_btl_vader_memcpy_limit) { - /* memcpy is faster at certain sizes but is undefined if the - pointers are aliased */ - memcpy (rem_ptr, (void *) src->seg_key.ptr, size); - } else { - memmove (rem_ptr, (void *) src->seg_key.ptr, size); - } + vader_memmove (rem_ptr, (void *) src->seg_key.ptr, size); vader_return_registration (reg, endpoint->peer_smp_rank); diff --git a/ompi/mca/btl/vader/btl_vader_send.c b/ompi/mca/btl/vader/btl_vader_send.c index cdc1cbe823..88f5cc761a 100644 --- a/ompi/mca/btl/vader/btl_vader_send.c +++ b/ompi/mca/btl/vader/btl_vader_send.c @@ -26,6 +26,7 @@ #include "btl_vader.h" #include "btl_vader_frag.h" #include "btl_vader_fifo.h" +#include "btl_vader_fbox.h" /** * Initiate a send to the peer. @@ -40,6 +41,16 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl, { mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor; + if (frag->hdr->flags & MCA_BTL_VADER_FLAG_FBOX) { + mca_btl_vader_fbox_send (frag->segment.seg_addr.pval, tag, frag->segment.seg_len); + + if (OPAL_LIKELY(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { + MCA_BTL_VADER_FRAG_RETURN(frag); + } + + return 1; + } + /* available header space */ frag->hdr->len = frag->segment.seg_len; /* type of message, pt-2-pt, one-sided, etc */ diff --git a/ompi/mca/btl/vader/btl_vader_sendi.c b/ompi/mca/btl/vader/btl_vader_sendi.c index 2fc40892b2..af9c07df55 100644 --- a/ompi/mca/btl/vader/btl_vader_sendi.c +++ b/ompi/mca/btl/vader/btl_vader_sendi.c @@ -27,6 +27,8 @@ #include "btl_vader_frag.h" #include "btl_vader_fifo.h" +#include "btl_vader_fbox.h" + /** * Initiate an inline send to the peer. * @@ -46,7 +48,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, uint32_t iov_count = 1; struct iovec iov; size_t max_data; - void *data_ptr; + void *data_ptr = NULL; assert (length < mca_btl_vader_component.eager_limit); assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)); @@ -54,9 +56,19 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, /* we won't ever return a descriptor */ *descriptor = NULL; + if (OPAL_LIKELY(!(payload_size && opal_convertor_need_buffers (convertor)))) { + if (payload_size) { + opal_convertor_get_current_pointer (convertor, &data_ptr); + } + + if (mca_btl_vader_fbox_sendi (endpoint, tag, header, header_size, data_ptr, payload_size)) { + return OMPI_SUCCESS; + } + } + /* allocate a fragment, giving up if we can't get one */ - frag = mca_btl_vader_alloc (btl, endpoint, order, length, - flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, length, + flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); if (OPAL_UNLIKELY(NULL == frag)) { return OMPI_ERR_OUT_OF_RESOURCE; } @@ -85,7 +97,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, } else if (payload_size) { /* bypassing the convertor may speed things up a little */ opal_convertor_get_current_pointer (convertor, &data_ptr); - memcpy ((uintptr_t)frag->segment.seg_addr.pval + header_size, data_ptr, payload_size); + memcpy ((void *)((uintptr_t)frag->segment.seg_addr.pval + header_size), data_ptr, payload_size); } opal_list_append (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);