From 12bfd13150bb35caded229fb00a5f14f5a44cc50 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 23 Sep 2014 18:11:22 +0000 Subject: [PATCH] btl/vader: improve performance for both single and multiple threads This is a large update that does the following: - Only allocate fast boxes for a peer if a send count threshold has been reached (default: 16). This will greatly reduce the memory usage with large numbers of local peers. - Improve performance by limiting the number of fast boxes that can be allocated per peer (default: 32). This will reduce the amount of time spent polling for fast box messages. - Provide new MCA variables to configure the size, maximum count, and send count thresholds for fast box allocations. - Update the buffer design to increase the range of message sizes that can be sent with a fast box. - Add thread protection around fast box allocation (locks). When spin locks are available this should be updated to use spin locks. - Various fixes and cleanup. This commit was SVN r32774. --- opal/mca/btl/vader/btl_vader.h | 49 ++-- opal/mca/btl/vader/btl_vader_component.c | 243 +++++++++++++------ opal/mca/btl/vader/btl_vader_endpoint.h | 77 ++++-- opal/mca/btl/vader/btl_vader_fbox.h | 295 ++++++++++++++++------- opal/mca/btl/vader/btl_vader_fifo.h | 81 +++++-- opal/mca/btl/vader/btl_vader_frag.c | 12 +- opal/mca/btl/vader/btl_vader_frag.h | 33 ++- opal/mca/btl/vader/btl_vader_get.c | 8 + opal/mca/btl/vader/btl_vader_module.c | 130 ++++++---- opal/mca/btl/vader/btl_vader_put.c | 2 + opal/mca/btl/vader/btl_vader_send.c | 23 +- opal/mca/btl/vader/btl_vader_sendi.c | 16 +- opal/mca/btl/vader/btl_vader_xpmem.c | 2 + opal/mca/btl/vader/btl_vader_xpmem.h | 2 +- 14 files changed, 675 insertions(+), 298 deletions(-) diff --git a/opal/mca/btl/vader/btl_vader.h b/opal/mca/btl/vader/btl_vader.h index 0d29298ff3..77e43d0ee5 100644 --- a/opal/mca/btl/vader/btl_vader.h +++ b/opal/mca/btl/vader/btl_vader.h @@ -64,10 +64,12 @@ #include "opal/mca/btl/base/base.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" +#include "opal/mca/btl/base/btl_base_error.h" #include "opal/util/proc.h" - #include "btl_vader_endpoint.h" +#include "opal/mca/pmix/pmix.h" + BEGIN_C_DECLS #define min(a,b) ((a) < (b) ?
(a) : (b)) @@ -97,30 +99,39 @@ struct mca_btl_vader_component_t { mca_btl_base_component_2_0_0_t super; /**< base BTL component */ int vader_free_list_num; /**< initial size of free lists */ int vader_free_list_max; /**< maximum size of free lists */ - int vader_free_list_inc; /**< number of elements to alloc - * when growing free lists */ + int vader_free_list_inc; /**< number of elements to alloc when growing free lists */ #if OPAL_BTL_VADER_HAVE_XPMEM - xpmem_segid_t my_seg_id; /* this rank's xpmem segment id */ + xpmem_segid_t my_seg_id; /**< this rank's xpmem segment id */ #else - opal_shmem_ds_t seg_ds; /* this rank's shared memory segment */ + opal_shmem_ds_t seg_ds; /**< this rank's shared memory segment */ #endif - char *my_segment; /* this rank's base pointer */ - size_t segment_size; /* size of my_segment */ - size_t segment_offset; /* start of unused portion of my_segment */ + opal_mutex_t lock; /**< lock to protect concurrent updates to this structure's members */ + char *my_segment; /**< this rank's base pointer */ + size_t segment_size; /**< size of my_segment */ + size_t segment_offset; /**< start of unused portion of my_segment */ int32_t num_smp_procs; /**< current number of smp procs on this host */ ompi_free_list_t vader_frags_eager; /**< free list of vader send frags */ #if !OPAL_BTL_VADER_HAVE_XPMEM - ompi_free_list_t vader_frags_max_send; + ompi_free_list_t vader_frags_max_send; /**< free list of vader max send frags (large fragments) */ #endif ompi_free_list_t vader_frags_user; /**< free list of vader put/get frags */ - int memcpy_limit; /** Limit where we switch from memmove to memcpy */ - int log_attach_align; /** Log of the alignment for xpmem segments */ - unsigned int max_inline_send; /** Limit for copy-in-copy-out fragments */ + unsigned int fbox_threshold; /**< number of sends required before we setup a send fast box for a peer */ + unsigned int fbox_max; /**< maximum number of send fast boxes to allocate */ + unsigned int fbox_size; /**< size of each peer fast box allocation */ + unsigned int fbox_count; /**< number of send fast boxes allocated */ - struct mca_btl_base_endpoint_t *endpoints; - struct vader_fifo_t *my_fifo; + int memcpy_limit; /**< Limit where we switch from memmove to memcpy */ + int log_attach_align; /**< Log of the alignment for xpmem segments */ + unsigned int max_inline_send; /**< Limit for copy-in-copy-out fragments */ + + mca_btl_base_endpoint_t *endpoints; /**< array of local endpoints (one for each local peer including myself) */ + mca_btl_base_endpoint_t **fbox_in_endpoints; /**< array of fast box in endpoints */ + unsigned int num_fbox_in_endpoints; /**< number of fast boxes to poll */ + struct vader_fifo_t *my_fifo; /**< pointer to the local fifo */ + + opal_list_t pending_endpoints; /**< list of endpoints with pending fragments */ }; typedef struct mca_btl_vader_component_t mca_btl_vader_component_t; OPAL_MODULE_DECLSPEC extern mca_btl_vader_component_t mca_btl_vader_component; @@ -136,16 +147,6 @@ struct mca_btl_vader_t { typedef struct mca_btl_vader_t mca_btl_vader_t; OPAL_MODULE_DECLSPEC extern mca_btl_vader_t mca_btl_vader; -/*** - * One or more FIFO components may be a pointer that must be - * accessed by multiple processes. Since the shared region may - * be mmapped differently into each process's address space, - * these pointers will be relative to some base address. Here, - * we define macros to translate between relative addresses and - * virtual addresses. 
- */ - - /* number of peers on the node (not including self) */ #define MCA_BTL_VADER_NUM_LOCAL_PEERS opal_process_info.num_local_peers diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index da800d8d6a..8eea6fc0e9 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -25,13 +25,14 @@ #include "opal_config.h" #include "opal/util/output.h" +#include "opal/threads/mutex.h" #include "opal/mca/btl/base/btl_base_error.h" -#include "opal/mca/pmix/pmix.h" #include "btl_vader.h" #include "btl_vader_frag.h" #include "btl_vader_fifo.h" #include "btl_vader_fbox.h" +#include "btl_vader_xpmem.h" #include @@ -104,6 +105,7 @@ static int mca_btl_vader_component_register (void) NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.memcpy_limit); +#if OPAL_BTL_VADER_HAVE_XPMEM mca_btl_vader_component.log_attach_align = 21; (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version, "log_align", "Log base 2 of the alignment to use for xpmem " @@ -112,6 +114,7 @@ static int mca_btl_vader_component_register (void) MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.log_attach_align); +#endif #if OPAL_BTL_VADER_HAVE_XPMEM && 64 == MCA_BTL_VADER_BITNESS mca_btl_vader_component.segment_size = 1 << 24; @@ -139,6 +142,27 @@ static int mca_btl_vader_component_register (void) MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.max_inline_send); + mca_btl_vader_component.fbox_threshold = 16; + (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version, + "fbox_threshold", "Number of sends required " + "before an eager send buffer is setup for a peer " + "(default: 16)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, + 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.fbox_threshold); + + mca_btl_vader_component.fbox_max = 32; + (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version, + "fbox_max", "Maximum number of eager send buffers " + "to allocate (default: 32)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, + NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.fbox_max); + + mca_btl_vader_component.fbox_size = 4096; + (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version, + "fbox_size", "Size of per-peer fast transfer buffers (default: 4k)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.fbox_size); + mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH; #if OPAL_BTL_VADER_HAVE_XPMEM mca_btl_vader.super.btl_eager_limit = 32 * 1024; @@ -189,6 +213,8 @@ static int mca_btl_vader_component_open(void) #if !OPAL_BTL_VADER_HAVE_XPMEM OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_max_send, ompi_free_list_t); #endif + OBJ_CONSTRUCT(&mca_btl_vader_component.lock, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_vader_component.pending_endpoints, opal_list_t); return OPAL_SUCCESS; } @@ -205,6 +231,8 @@ static int mca_btl_vader_component_close(void) #if !OPAL_BTL_VADER_HAVE_XPMEM OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_max_send); #endif + OBJ_DESTRUCT(&mca_btl_vader_component.lock); + OBJ_DESTRUCT(&mca_btl_vader_component.pending_endpoints); if (NULL != mca_btl_vader_component.my_segment) { munmap (mca_btl_vader_component.my_segment, 
mca_btl_vader_component.segment_size); @@ -216,8 +244,7 @@ static int mca_btl_vader_component_close(void) static int mca_btl_base_vader_modex_send (void) { struct vader_modex_t modex; - int modex_size; - int rc; + int modex_size, rc; #if OPAL_BTL_VADER_HAVE_XPMEM modex.seg_id = mca_btl_vader_component.my_seg_id; @@ -229,9 +256,9 @@ static int mca_btl_base_vader_modex_send (void) memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size); #endif - OPAL_MODEX_SEND(rc, PMIX_ASYNC_RDY, PMIX_LOCAL, - &mca_btl_vader_component.super.btl_version, - &modex, modex_size); + OPAL_MODEX_SEND(rc, PMIX_SYNC_REQD, PMIX_LOCAL, + &mca_btl_vader_component.super.btl_version, &modex, modex_size); + return rc; } @@ -254,13 +281,14 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, return NULL; } +#if OPAL_BTL_VADER_HAVE_XPMEM /* limit segment alignment to be between 4k and 16M */ - - if (mca_btl_vader_component.log_attach_align < 12) { - mca_btl_vader_component.log_attach_align = 12; - } else if (mca_btl_vader_component.log_attach_align > 25) { - mca_btl_vader_component.log_attach_align = 25; + if (component->log_attach_align < 12) { + component->log_attach_align = 12; + } else if (component->log_attach_align > 25) { + component->log_attach_align = 25; } +#endif btls = (mca_btl_base_module_t **) calloc (1, sizeof (mca_btl_base_module_t *)); if (NULL == btls) { @@ -268,28 +296,36 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, } /* ensure a sane segment size */ - if (mca_btl_vader_component.segment_size < (2 << 20)) { - mca_btl_vader_component.segment_size = (2 << 20); + if (component->segment_size < (2 << 20)) { + component->segment_size = (2 << 20); } - if (mca_btl_vader_component.segment_size > (1ul << MCA_BTL_VADER_OFFSET_BITS)) { - mca_btl_vader_component.segment_size = 2ul << MCA_BTL_VADER_OFFSET_BITS; + component->fbox_size = (component->fbox_size + MCA_BTL_VADER_FBOX_ALIGNMENT_MASK) & ~MCA_BTL_VADER_FBOX_ALIGNMENT_MASK; + + if (component->segment_size > (1ul << MCA_BTL_VADER_OFFSET_BITS)) { + component->segment_size = 2ul << MCA_BTL_VADER_OFFSET_BITS; } + /* no fast boxes allocated initially */ + component->num_fbox_in_endpoints = 0; + component->fbox_count = 0; + #if OPAL_BTL_VADER_HAVE_XPMEM + component->my_segment = mmap (NULL, component->segment_size, PROT_READ | + PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if ((void *)-1 == component->my_segment) { + BTL_VERBOSE(("Could not create anonymous memory segment")); + free (btls); + return NULL; + } + /* create an xpmem segment for the entire memory space */ component->my_seg_id = xpmem_make (0, VADER_MAX_ADDRESS, XPMEM_PERMIT_MODE, (void *)0666); if (-1 == component->my_seg_id) { BTL_VERBOSE(("Could not create xpmem segment")); free (btls); - return NULL; - } - - component->my_segment = mmap (NULL, mca_btl_vader_component.segment_size, PROT_READ | - PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); - if ((void *)-1 == component->my_segment) { - BTL_VERBOSE(("Could not create anonymous memory segment")); - free (btls); + munmap (component->my_segment, component->segment_size); + component->my_segment = NULL; return NULL; } #else @@ -303,7 +339,7 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, return NULL; } - rc = opal_shmem_segment_create (&mca_btl_vader_component.seg_ds, sm_file, mca_btl_vader_component.segment_size); + rc = opal_shmem_segment_create (&component->seg_ds, sm_file, component->segment_size); free (sm_file); if (OPAL_SUCCESS != rc) { 
BTL_VERBOSE(("Could not create shared memory segment")); @@ -311,7 +347,7 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, return NULL; } - component->my_segment = opal_shmem_segment_attach (&mca_btl_vader_component.seg_ds); + component->my_segment = opal_shmem_segment_attach (&component->seg_ds); if (NULL == component->my_segment) { BTL_VERBOSE(("Could not attach to just created shared memory segment")); goto failed; @@ -321,15 +357,8 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, component->segment_offset = 0; - memset (component->my_segment + MCA_BTL_VADER_FIFO_SIZE, 0, MCA_BTL_VADER_NUM_LOCAL_PEERS * - MCA_BTL_VADER_FBOX_PEER_SIZE); - /* initialize my fifo */ - rc = vader_fifo_init ((struct vader_fifo_t *) component->my_segment); - if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("Error initializing FIFO")); - goto failed; - } + vader_fifo_init ((struct vader_fifo_t *) component->my_segment); rc = mca_btl_base_vader_modex_send (); if (OPAL_SUCCESS != rc) { @@ -348,9 +377,9 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, return btls; failed: #if OPAL_BTL_VADER_HAVE_XPMEM - munmap (component->my_segment, mca_btl_vader_component.segment_size); + munmap (component->my_segment, component->segment_size); #else - opal_shmem_unlink (&mca_btl_vader_component.seg_ds); + opal_shmem_unlink (&component->seg_ds); #endif if (btls) { @@ -360,70 +389,130 @@ failed: return NULL; } +void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *endpoint) +{ + mca_btl_vader_frag_t frag = {.base = {.des_local = frag.segments, .des_local_count = 1}}; + const mca_btl_active_message_callback_t *reg; + + if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) { + mca_btl_vader_frag_complete (hdr->frag); + return; + } + + reg = mca_btl_base_active_message_trigger + hdr->tag; + frag.segments[0].seg_addr.pval = (void *) (hdr + 1); + frag.segments[0].seg_len = hdr->len; + + if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) { + mca_mpool_base_registration_t *xpmem_reg; + + xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base, + hdr->sc_iov.iov_len, 0, + &frag.segments[1].seg_addr.pval); + + frag.segments[1].seg_len = hdr->sc_iov.iov_len; + frag.base.des_local_count = 2; + + /* recv upcall */ + reg->cbfunc(&mca_btl_vader.super, hdr->tag, &frag.base, reg->cbdata); + vader_return_registration (xpmem_reg, endpoint); + } else { + reg->cbfunc(&mca_btl_vader.super, hdr->tag, &frag.base, reg->cbdata); + } + + if (OPAL_UNLIKELY(MCA_BTL_VADER_FLAG_SETUP_FBOX & hdr->flags)) { + mca_btl_vader_endpoint_setup_fbox_recv (endpoint, relative2virtual(hdr->fbox_base)); + mca_btl_vader_component.fbox_in_endpoints[mca_btl_vader_component.num_fbox_in_endpoints++] = endpoint; + } + + hdr->flags = MCA_BTL_VADER_FLAG_COMPLETE; + vader_fifo_write_back (hdr, endpoint); +} + static inline int mca_btl_vader_poll_fifo (void) { - const mca_btl_active_message_callback_t *reg; struct mca_btl_base_endpoint_t *endpoint; mca_btl_vader_hdr_t *hdr; /* poll the fifo until it is empty or a limit has been hit (8 is arbitrary) */ - for (int fifo_count = 0 ; fifo_count < 16 ; ++fifo_count) { - mca_btl_vader_frag_t frag = {.base = {.des_local = frag.segments, .des_local_count = 1}}; - + for (int fifo_count = 0 ; fifo_count < 8 ; ++fifo_count) { hdr = vader_fifo_read (mca_btl_vader_component.my_fifo, &endpoint); if (NULL == hdr) { return fifo_count; } - if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) { - mca_btl_vader_frag_complete (hdr->frag); 
- continue; - } - - reg = mca_btl_base_active_message_trigger + hdr->tag; - frag.segments[0].seg_addr.pval = (void *) (hdr + 1); - frag.segments[0].seg_len = hdr->len; - - if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) { - mca_mpool_base_registration_t *xpmem_reg; - - xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base, - hdr->sc_iov.iov_len, 0, - &frag.segments[1].seg_addr.pval); - - frag.segments[1].seg_len = hdr->sc_iov.iov_len; - - /* recv upcall */ - frag.base.des_local_count = 2; - reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata); - vader_return_registration (xpmem_reg, endpoint); - } else { - reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata); - } - - /* return the fragment */ - hdr->flags = MCA_BTL_VADER_FLAG_COMPLETE; - vader_fifo_write_back (hdr, endpoint); + mca_btl_vader_poll_handle_frag (hdr, endpoint); } return 1; } +/** + * Progress pending messages on an endpoint + * + * @param ep (IN) Vader BTL endpoint + */ +static void mca_btl_vader_progress_waiting (mca_btl_base_endpoint_t *ep) +{ + mca_btl_vader_frag_t *frag; + + OPAL_THREAD_LOCK(&ep->lock); + ep->waiting = false; + while (NULL != (frag = (mca_btl_vader_frag_t *) opal_list_remove_first (&ep->pending_frags))) { + OPAL_THREAD_UNLOCK(&ep->lock); + if (!vader_fifo_write_ep (frag->hdr, ep)) { + opal_list_prepend (&ep->pending_frags, (opal_list_item_t *) frag); + opal_list_append (&mca_btl_vader_component.pending_endpoints, &ep->super); + ep->waiting = true; + break; + } + OPAL_THREAD_LOCK(&ep->lock); + } + OPAL_THREAD_UNLOCK(&ep->lock); +} + +/** + * Progress pending messages on all waiting endpoints + */ +static void mca_btl_vader_progress_endpoints (void) +{ + int count; + + count = opal_list_get_size (&mca_btl_vader_component.pending_endpoints); + + for (int i = 0 ; i < count ; ++i) { + mca_btl_vader_progress_waiting ((mca_btl_base_endpoint_t *) opal_list_remove_first (&mca_btl_vader_component.pending_endpoints)); + } +} + static int mca_btl_vader_component_progress (void) { - bool fboxed; + static int32_t lock = 0; + int count = 0; - /* check for messages in fast boxes */ - for (int spin_count = 5 ; spin_count ; --spin_count) { - fboxed = (int) mca_btl_vader_check_fboxes (); - if (fboxed) { - break; + if (opal_using_threads()) { + if (opal_atomic_swap_32 (&lock, 1)) { + return 0; } } - if (VADER_FIFO_FREE == mca_btl_vader_component.my_fifo->fifo_head) { - return (int) fboxed; + /* check for messages in fast boxes */ + if (mca_btl_vader_component.num_fbox_in_endpoints) { + count = mca_btl_vader_check_fboxes (); } - return mca_btl_vader_poll_fifo () + (int) fboxed; + mca_btl_vader_progress_endpoints (); + + if (VADER_FIFO_FREE == mca_btl_vader_component.my_fifo->fifo_head) { + lock = 0; + return count; + } + + count += mca_btl_vader_poll_fifo (); + opal_atomic_mb (); + lock = 0; + + return count; } diff --git a/opal/mca/btl/vader/btl_vader_endpoint.h b/opal/mca/btl/vader/btl_vader_endpoint.h index 7e38ea0360..3d5b8abcd1 100644 --- a/opal/mca/btl/vader/btl_vader_endpoint.h +++ b/opal/mca/btl/vader/btl_vader_endpoint.h @@ -39,6 +39,9 @@ #include "opal/mca/shmem/base/base.h" #endif +#define MCA_BTL_VADER_FBOX_ALIGNMENT 32 +#define MCA_BTL_VADER_FBOX_ALIGNMENT_MASK (MCA_BTL_VADER_FBOX_ALIGNMENT - 1) + struct vader_fifo_t; /** @@ -49,28 +52,68 @@ struct vader_fifo_t; struct mca_btl_vader_fbox_t; -struct mca_btl_base_endpoint_t { - int peer_smp_rank; /**< My peer's SMP process rank.
Used for accessing - * SMP specfic data structures. */ +typedef struct mca_btl_base_endpoint_t { + opal_list_item_t super; + + /* per peer buffers */ + struct { + unsigned char *buffer; + unsigned int start, seq; + uint32_t *startp; + } fbox_in; + + struct { + unsigned char *buffer; + unsigned int start, end, seq; + uint32_t *startp; + } fbox_out; + + int32_t peer_smp_rank; /**< my peer's SMP process rank. Used for accessing + * SMP specific data structures. */ + uint32_t send_count; /**< number of fragments sent to this peer */ char *segment_base; + struct vader_fifo_t *fifo; -#if OPAL_BTL_VADER_HAVE_XPMEM - xpmem_apid_t apid; -#else - pid_t pid; - opal_shmem_ds_t seg_ds; -#endif - struct mca_btl_vader_fbox_t * restrict fbox_out; - struct mca_btl_vader_fbox_t * restrict fbox_in; - int next_fbox_out; - int next_fbox_in; + + opal_mutex_t lock; + #if OPAL_BTL_VADER_HAVE_XPMEM struct mca_rcache_base_module_t *rcache; + xpmem_apid_t apid; /**< xpmem apid for remote peer */ +#else + pid_t pid; /**< pid of remote peer (used for CMA) */ + opal_shmem_ds_t *seg_ds; /**< stored segment information for detach */ #endif - /* enforce ordering */ - uint16_t next_sequence; - uint16_t expected_sequence; -}; + /** fragments pending fast box space */ + opal_list_t pending_frags; + /** endpoint is on the component wait list */ + bool waiting; +} mca_btl_base_endpoint_t; + +typedef mca_btl_base_endpoint_t mca_btl_vader_endpoint_t; + +OBJ_CLASS_DECLARATION(mca_btl_vader_endpoint_t); + +static inline void mca_btl_vader_endpoint_setup_fbox_recv (struct mca_btl_base_endpoint_t *endpoint, void *base) +{ + endpoint->fbox_in.buffer = base; + endpoint->fbox_in.startp = (uint32_t *) base; + endpoint->fbox_in.startp[0] = MCA_BTL_VADER_FBOX_ALIGNMENT; + endpoint->fbox_in.start = MCA_BTL_VADER_FBOX_ALIGNMENT; + endpoint->fbox_in.seq = 0; +} + +static inline void mca_btl_vader_endpoint_setup_fbox_send (struct mca_btl_base_endpoint_t *endpoint, void *base) +{ + endpoint->fbox_out.buffer = base; + endpoint->fbox_out.start = MCA_BTL_VADER_FBOX_ALIGNMENT; + endpoint->fbox_out.end = MCA_BTL_VADER_FBOX_ALIGNMENT; + endpoint->fbox_out.startp = (uint32_t *) base; + endpoint->fbox_out.seq = 0; + + /* zero out the first header in the fast box */ + memset ((char *) base + MCA_BTL_VADER_FBOX_ALIGNMENT, 0, MCA_BTL_VADER_FBOX_ALIGNMENT); +} #endif /* MCA_BTL_VADER_ENDPOINT_H */ diff --git a/opal/mca/btl/vader/btl_vader_fbox.h b/opal/mca/btl/vader/btl_vader_fbox.h index 680e65a0a8..1adf80ef30 100644 --- a/opal/mca/btl/vader/btl_vader_fbox.h +++ b/opal/mca/btl/vader/btl_vader_fbox.h @@ -13,90 +13,160 @@ #define MCA_BTL_VADER_FBOX_H #include "btl_vader.h" -#include "btl_vader_endpoint.h" -#include "btl_vader_xpmem.h" -#include - -/* these hard-coded settings are based on the ideal setup for an Opteron 61xx chip and - * may need to be adjusted for other systems. adding an MCA variable is possible but - * can cost 20-40 ns on the fast path.
this size is limited to 256 maximum bytes */ -#define MCA_BTL_VADER_FBOX_SIZE 64 -/* there should be a power of two number of fast boxes to simplify the math in the - * critical path */ -#define MCA_BTL_VADER_LAST_FBOX 63 #define MCA_BTL_VADER_POLL_COUNT 31 -/* two bytes are reserved for tag and size (update if the header is modified) */ -#define MCA_BTL_VADER_FBOX_HDR_SIZE 4 -#define MCA_BTL_VADER_FBOX_MAX_SIZE (MCA_BTL_VADER_FBOX_SIZE - MCA_BTL_VADER_FBOX_HDR_SIZE) -/* total size of all the fast boxes assigned to a particular peer */ -#define MCA_BTL_VADER_FBOX_PEER_SIZE (MCA_BTL_VADER_FBOX_SIZE * (MCA_BTL_VADER_LAST_FBOX + 1)) -typedef struct mca_btl_vader_fbox_t { - union { - struct { - uint8_t size; - uint8_t tag; - uint16_t seqn; - } hdr_data; - uint32_t ival; - } hdr; +typedef union mca_btl_vader_fbox_hdr_t { + struct { + uint16_t tag; + uint16_t size; + uint32_t seq; + } data; + uint64_t ival; +} mca_btl_vader_fbox_hdr_t; - uint8_t data[MCA_BTL_VADER_FBOX_MAX_SIZE]; -} mca_btl_vader_fbox_t; +#define MCA_BTL_VADER_FBOX_HDR(x) ((mca_btl_vader_fbox_hdr_t *) (x)) -#define MCA_BTL_VADER_FBOX_OUT_PTR(ep, fbox) ((ep)->fbox_out + (fbox)) -#define MCA_BTL_VADER_FBOX_IN_PTR(ep, fbox) ((ep)->fbox_in + (fbox)) -#define MCA_BTL_VADER_NEXT_FBOX(fbox) (((fbox) + 1) & MCA_BTL_VADER_LAST_FBOX) +#define MCA_BTL_VADER_FBOX_OFFSET_MASK 0x7fffffff +#define MCA_BTL_VADER_FBOX_HB_MASK 0x80000000 -static inline mca_btl_vader_fbox_t * mca_btl_vader_reserve_fbox (struct mca_btl_base_endpoint_t *ep, const size_t size) +/* if the two offsets are equal and the high bit matches the buffer is empty else the buffer is full. + * note that start will never be end - 1 so this simplified conditional will always produce the correct + * result */ +#define BUFFER_FREE(s,e,hbm,size) (((s + !hbm) > (e)) ? 
(s) - (e) : (size - (e))) + +/** macro for checking if the high bit is set */ +#define MCA_BTL_VADER_FBOX_OFFSET_HBS(v) (!!((v) & MCA_BTL_VADER_FBOX_HB_MASK)) + +void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, mca_btl_base_endpoint_t *ep); + +/* attempt to reserve a contiguous segment from the remote ep */ +static inline unsigned char *mca_btl_vader_reserve_fbox (mca_btl_base_endpoint_t *ep, size_t size) { - const int next_fbox = ep->next_fbox_out; - mca_btl_vader_fbox_t * restrict fbox = MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox); + const unsigned int fbox_size = mca_btl_vader_component.fbox_size; + unsigned int start, end, buffer_free; + size_t data_size = size; + unsigned char *dst; + bool hbs, hbm; - opal_atomic_mb (); - - /* todo -- need thread locks/atomics here for the multi-threaded case */ - if (OPAL_LIKELY(size <= MCA_BTL_VADER_FBOX_MAX_SIZE && 0 == fbox->hdr.ival)) { - /* mark this fast box as in use */ - fbox->hdr.hdr_data.size = size; - ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox); - opal_atomic_mb (); - return fbox; + /* don't try to use the per-peer buffer for messages that will fill up more than 25% of the buffer */ + if (OPAL_UNLIKELY(NULL == ep->fbox_out.buffer || size > (fbox_size >> 2))) { + return NULL; } - return NULL; + OPAL_THREAD_LOCK(&ep->lock); + + /* the high bit helps determine if the buffer is empty or full */ + hbs = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_out.end); + hbm = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_out.start) == hbs; + + /* read current start and end offsets and check for free space */ + start = ep->fbox_out.start & MCA_BTL_VADER_FBOX_OFFSET_MASK; + end = ep->fbox_out.end & MCA_BTL_VADER_FBOX_OFFSET_MASK; + buffer_free = BUFFER_FREE(start, end, hbm, fbox_size); + + /* need space for the fragment + the header */ + size = (size + sizeof (mca_btl_vader_fbox_hdr_t) + MCA_BTL_VADER_FBOX_ALIGNMENT_MASK) & ~MCA_BTL_VADER_FBOX_ALIGNMENT_MASK; + + dst = ep->fbox_out.buffer + end; + + if (OPAL_UNLIKELY(buffer_free < size)) { + /* check if we need to free up space for this fragment */ + BTL_VERBOSE(("not enough room for a fragment of size %u. in use buffer segment: {start: %x, end: %x, high bit matches: %d}", + (unsigned) size, start, end, (int) hbm)); + + /* read the current start pointer from the remote peer and recalculate the available buffer space */ + start = ep->fbox_out.start = ep->fbox_out.startp[0]; + + /* recalculate how much buffer space is available */ + start &= MCA_BTL_VADER_FBOX_OFFSET_MASK; + hbm = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_out.start) == hbs; + buffer_free = BUFFER_FREE(start, end, hbm, fbox_size); + + opal_atomic_rmb (); + + /* if this is the end of the buffer and the fragment doesn't fit then mark the remaining buffer space to + * be skipped and check if the fragment can be written at the beginning of the buffer. */ + if (OPAL_UNLIKELY(buffer_free > 0 && buffer_free < size && start <= end)) { + mca_btl_vader_fbox_hdr_t tmp = {.data = {.size = buffer_free - sizeof (mca_btl_vader_fbox_hdr_t), + .seq = ep->fbox_out.seq++, .tag = 0xff}}; + + BTL_VERBOSE(("message will not fit in remaining buffer space. 
skipping to beginning")); + + MCA_BTL_VADER_FBOX_HDR(dst)->ival = tmp.ival; + + end = MCA_BTL_VADER_FBOX_ALIGNMENT; + /* toggle the high bit */ + hbs = !hbs; + /* toggle the high bit match */ + buffer_free = BUFFER_FREE(start, end, !hbm, fbox_size); + dst = ep->fbox_out.buffer + end; + } + + if (OPAL_UNLIKELY(buffer_free < size)) { + ep->fbox_out.end = (hbs << 31) | end; + OPAL_THREAD_UNLOCK(&ep->lock); + return NULL; + } + } + + BTL_VERBOSE(("writing fragment of size %u to offset %u {start: 0x%x, end: 0x%x (hbs: %d)} of peer's buffer. free = %u", + (unsigned int) size, end, start, end, hbs, buffer_free)); + + /* write out part of the header now. the tag will be written when the data is available */ + { + mca_btl_vader_fbox_hdr_t tmp = {.data = {.size = data_size, .tag = 0, .seq = ep->fbox_out.seq++}}; + + MCA_BTL_VADER_FBOX_HDR(dst)->ival = tmp.ival; + } + + end += size; + + if (OPAL_UNLIKELY(fbox_size == end)) { + /* toggle the high bit */ + hbs = !hbs; + /* reset the end pointer to the beginning of the buffer */ + end = MCA_BTL_VADER_FBOX_ALIGNMENT; + } else if (buffer_free > size) { + MCA_BTL_VADER_FBOX_HDR(ep->fbox_out.buffer + end)->ival = 0; + } + + /* align the buffer */ + ep->fbox_out.end = ((uint32_t) hbs << 31) | end; + OPAL_THREAD_UNLOCK(&ep->lock); + + return dst + sizeof (mca_btl_vader_fbox_hdr_t); } -static inline void mca_btl_vader_fbox_send (mca_btl_vader_fbox_t * restrict fbox, unsigned char tag, - struct mca_btl_base_endpoint_t *endpoint) +static inline void mca_btl_vader_fbox_send (unsigned char * restrict fbox, unsigned char tag) { /* ensure data writes have completed before we mark the data as available */ opal_atomic_wmb (); - fbox->hdr.hdr_data.seqn = endpoint->next_sequence++; - fbox->hdr.hdr_data.tag = tag; - opal_atomic_wmb (); + + /* the header precedes the fbox buffer */ + MCA_BTL_VADER_FBOX_HDR ((intptr_t) fbox)[-1].data.tag = tag; } -static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endpoint, char tag, +static inline int mca_btl_vader_fbox_sendi (mca_btl_base_endpoint_t *ep, char tag, void * restrict header, const size_t header_size, void * restrict payload, const size_t payload_size) { - mca_btl_vader_fbox_t * restrict fbox; + const size_t total_size = header_size + payload_size; + unsigned char * restrict fbox; - fbox = mca_btl_vader_reserve_fbox(endpoint, header_size + payload_size); + fbox = mca_btl_vader_reserve_fbox(ep, total_size); if (OPAL_UNLIKELY(NULL == fbox)) { return 0; } - memcpy (fbox->data, header, header_size); + memcpy (fbox, header, header_size); if (payload) { /* inline sends are typically just pml headers (due to MCA_BTL_FLAGS_SEND_INPLACE) */ - memcpy (fbox->data + header_size, payload, payload_size); + memcpy (fbox + header_size, payload, payload_size); } /* mark the fbox as sent */ - mca_btl_vader_fbox_send (fbox, tag, endpoint); + mca_btl_vader_fbox_send (fbox, tag); /* send complete */ return 1; @@ -104,59 +174,108 @@ static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endp static inline bool mca_btl_vader_check_fboxes (void) { - const mca_btl_active_message_callback_t *reg; - struct mca_btl_base_endpoint_t *endpoint; - mca_btl_vader_fbox_t * restrict fbox; - mca_btl_base_segment_t segment; - mca_btl_base_descriptor_t desc; + const unsigned int fbox_size = mca_btl_vader_component.fbox_size; bool processed = false; - int next_fbox; - for (endpoint = mca_btl_vader_component.endpoints ; endpoint->peer_smp_rank != -1 ; ++endpoint) { - next_fbox = endpoint->next_fbox_in; - fbox =
MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox); + for (unsigned int i = 0 ; i < mca_btl_vader_component.num_fbox_in_endpoints ; ++i) { + mca_btl_base_endpoint_t *ep = mca_btl_vader_component.fbox_in_endpoints[i]; + unsigned int start = ep->fbox_in.start & MCA_BTL_VADER_FBOX_OFFSET_MASK; - if (NULL == endpoint->fbox_in || 0 == fbox->hdr.hdr_data.tag) { - continue; - } + /* save the current high bit state */ + bool hbs = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_in.start); + int poll_count; - desc.des_local = &segment; - desc.des_local_count = 1; + for (poll_count = 0 ; poll_count <= MCA_BTL_VADER_POLL_COUNT ; ++poll_count) { + const mca_btl_vader_fbox_hdr_t hdr = {.ival = MCA_BTL_VADER_FBOX_HDR(ep->fbox_in.buffer + start)->ival}; - processed = true; - - /* process all fast-box messages */ - for (int count = 0 ; count <= MCA_BTL_VADER_POLL_COUNT && 0 != fbox->hdr.hdr_data.tag ; ++count) { - if (OPAL_UNLIKELY(endpoint->expected_sequence != fbox->hdr.hdr_data.seqn)) { + /* check for a valid tag and sequence number */ + if (0 == hdr.data.tag || hdr.data.seq != ep->fbox_in.seq) { break; } - opal_atomic_mb (); - ++endpoint->expected_sequence; - reg = mca_btl_base_active_message_trigger + fbox->hdr.hdr_data.tag; + ++ep->fbox_in.seq; - segment.seg_addr.pval = fbox->data; - segment.seg_len = fbox->hdr.hdr_data.size; + /* force all prior reads to complete before continuing */ + opal_atomic_rmb (); - reg->cbfunc(&mca_btl_vader.super, fbox->hdr.hdr_data.tag, &desc, reg->cbdata); + BTL_VERBOSE(("got frag with header {.tag = %d, .size = %d} from offset %u", hdr.data.tag, + hdr.data.size, start)); - if (segment.seg_len > MCA_BTL_VADER_FBOX_MAX_SIZE) { - fbox[1].hdr.ival = 0; - opal_atomic_mb (); - ++next_fbox; + /* the 0xff tag indicates we should skip the rest of the buffer */ + if (OPAL_LIKELY((0xfe & hdr.data.tag) != 0xfe)) { + mca_btl_base_segment_t segment; + mca_btl_base_descriptor_t desc = {.des_local = &segment, .des_local_count = 1}; + const mca_btl_active_message_callback_t *reg = + mca_btl_base_active_message_trigger + hdr.data.tag; + + /* fragment fits entirely in the remaining buffer space. some + * btl users do not handle fragmented data so we can't split + * the fragment without introducing another copy here. this + * limitation has not appeared to cause any performance + * degradation.
*/ + segment.seg_len = hdr.data.size; + segment.seg_addr.pval = (void *) (ep->fbox_in.buffer + start + sizeof (hdr)); + + /* call the registered callback function */ + reg->cbfunc(&mca_btl_vader.super, hdr.data.tag, &desc, reg->cbdata); + } else if (OPAL_LIKELY(0xfe == hdr.data.tag)) { + /* process fragment header */ + fifo_value_t *value = (fifo_value_t *)(ep->fbox_in.buffer + start + sizeof (hdr)); + mca_btl_vader_hdr_t *hdr = relative2virtual(*value); + mca_btl_vader_poll_handle_frag (hdr, ep); } - fbox->hdr.ival = 0; - next_fbox = MCA_BTL_VADER_NEXT_FBOX(next_fbox); - fbox = (mca_btl_vader_fbox_t * restrict) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox); + start = (start + hdr.data.size + sizeof (hdr) + MCA_BTL_VADER_FBOX_ALIGNMENT_MASK) & ~MCA_BTL_VADER_FBOX_ALIGNMENT_MASK; + if (OPAL_UNLIKELY(fbox_size == start)) { + /* jump to the beginning of the buffer */ + start = MCA_BTL_VADER_FBOX_ALIGNMENT; + /* toggle the high bit */ + hbs = !hbs; + } } - opal_atomic_mb (); + if (poll_count) { + BTL_VERBOSE(("left off at offset %u (hbs: %d)", start, hbs)); - endpoint->next_fbox_in = next_fbox; + /* save where we left off */ + /* let the sender know where we stopped */ + ep->fbox_in.start = ep->fbox_in.startp[0] = ((uint32_t) hbs << 31) | start; + processed = true; + } } + /* return the number of fragments processed */ return processed; } +static inline void mca_btl_vader_try_fbox_setup (mca_btl_base_endpoint_t *ep, mca_btl_vader_hdr_t *hdr) +{ + if (NULL == ep->fbox_out.buffer && mca_btl_vader_component.fbox_max > mca_btl_vader_component.fbox_count && + mca_btl_vader_component.fbox_threshold <= ++ep->send_count) { + + /* protect access to mca_btl_vader_component.segment_offset */ + OPAL_THREAD_LOCK(&mca_btl_vader_component.lock); + + if (mca_btl_vader_component.segment_size >= mca_btl_vader_component.segment_offset + mca_btl_vader_component.fbox_size) { + /* verify the remote side will accept another fbox */ + if (0 <= opal_atomic_add_32 (&ep->fifo->fbox_available, -1)) { + void *fbox_base = mca_btl_vader_component.my_segment + mca_btl_vader_component.segment_offset; + mca_btl_vader_component.segment_offset += mca_btl_vader_component.fbox_size; + + /* zero out the fast box */ + memset (fbox_base, 0, mca_btl_vader_component.fbox_size); + mca_btl_vader_endpoint_setup_fbox_send (ep, fbox_base); + + hdr->flags |= MCA_BTL_VADER_FLAG_SETUP_FBOX; + hdr->fbox_base = virtual2relative((char *) ep->fbox_out.buffer); + ++mca_btl_vader_component.fbox_count; + } else { + opal_atomic_add_32 (&ep->fifo->fbox_available, 1); + } + } + + OPAL_THREAD_UNLOCK(&mca_btl_vader_component.lock); + } +} + #endif /* !defined(MCA_BTL_VADER_FBOX_H) */ diff --git a/opal/mca/btl/vader/btl_vader_fifo.h b/opal/mca/btl/vader/btl_vader_fifo.h index ebbb4153b8..d63ffb49ad 100644 --- a/opal/mca/btl/vader/btl_vader_fifo.h +++ b/opal/mca/btl/vader/btl_vader_fifo.h @@ -70,11 +70,21 @@ typedef struct vader_fifo_t { volatile fifo_value_t fifo_head; volatile fifo_value_t fifo_tail; + volatile int32_t fbox_available; } vader_fifo_t; /* large enough to ensure the fifo is on its own cache line */ #define MCA_BTL_VADER_FIFO_SIZE 128 +/*** + * One or more FIFO components may be a pointer that must be + * accessed by multiple processes. Since the shared region may + * be mmapped differently into each process's address space, + * these pointers will be relative to some base address. Here, + * we define inline functions to translate between relative + * addresses and virtual addresses. 
+ */ + /* This only works for finding the relative address for a pointer within my_segment */ static inline fifo_value_t virtual2relative (char *addr) { @@ -91,18 +101,26 @@ static inline void *relative2virtual (fifo_value_t offset) return (void *)(intptr_t)((offset & MCA_BTL_VADER_OFFSET_MASK) + mca_btl_vader_component.endpoints[offset >> MCA_BTL_VADER_OFFSET_BITS].segment_base); } +#include "btl_vader_fbox.h" + +/** + * vader_fifo_read: + * + * @brief reads a single fragment from a local fifo + * + * @param[inout] fifo - FIFO to read from + * @param[out] ep - returns the endpoint the fifo element was read from + * + * @returns a fragment header or NULL + * + * This function does not currently support multiple readers. + */ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct mca_btl_base_endpoint_t **ep) { mca_btl_vader_hdr_t *hdr; fifo_value_t value; - static volatile int32_t lock = 0; - - if (opal_atomic_swap_32 (&lock, 1)) { - return NULL; - } if (VADER_FIFO_FREE == fifo->fifo_head) { - lock = 0; return NULL; } @@ -113,13 +131,7 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct m *ep = &mca_btl_vader_component.endpoints[value >> MCA_BTL_VADER_OFFSET_BITS]; hdr = (mca_btl_vader_hdr_t *) relative2virtual (value); - if (OPAL_UNLIKELY(!(hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) && ((*ep)->expected_sequence != hdr->seqn))) { - lock = 0; - return NULL; - } - fifo->fifo_head = VADER_FIFO_FREE; - ++(*ep)->expected_sequence; assert (hdr->next != value); @@ -138,16 +150,14 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct m } opal_atomic_wmb (); - lock = 0; return hdr; } -static inline int vader_fifo_init (vader_fifo_t *fifo) +static inline void vader_fifo_init (vader_fifo_t *fifo) { fifo->fifo_head = fifo->fifo_tail = VADER_FIFO_FREE; + fifo->fbox_available = mca_btl_vader_component.fbox_max; mca_btl_vader_component.my_fifo = fifo; - - return OPAL_SUCCESS; } static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value) @@ -170,15 +180,44 @@ static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value) opal_atomic_wmb (); } -/* write a frag (relative to this process' base) to another rank's fifo */ -static inline void vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep) +/** + * vader_fifo_write_ep: + * + * @brief write a frag (relative to this process' base) to another rank's fifo + * + * @param[in] hdr - fragment header to write + * @param[in] ep - endpoint to write the fragment to + * + * This function is used to send a fragment to a remote peer. {hdr} must belong + * to the current process. + */ +static inline bool vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep) { + fifo_value_t rhdr = virtual2relative ((char *) hdr); + if (ep->fbox_out.buffer) { + /* if there is a fast box for this peer then use the fast box to send the fragment header. + * this is done to ensure fragment ordering */ + opal_atomic_wmb (); + return mca_btl_vader_fbox_sendi (ep, 0xfe, &rhdr, sizeof (rhdr), NULL, 0); + } + mca_btl_vader_try_fbox_setup (ep, hdr); hdr->next = VADER_FIFO_FREE; - hdr->seqn = ep->next_sequence++; - vader_fifo_write (ep->fifo, virtual2relative ((char *) hdr)); + vader_fifo_write (ep->fifo, rhdr); + + return true; } -/* write a frag (relative to the remote process' base) to the remote fifo. 
note the remote peer must own hdr */ +/** + * vader_fifo_write_back: + * + * @brief write a frag (relative to the remote process' base) to the remote fifo + * + * @param[in] hdr - fragment header to write + * @param[in] ep - endpoint the fragment belongs to + * + * This function is used to return a fragment to the sending process. It differs from vader_fifo_write_ep + * in that it uses the {ep} to produce the relative address. + */ static inline void vader_fifo_write_back (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep) { hdr->next = VADER_FIFO_FREE; diff --git a/opal/mca/btl/vader/btl_vader_frag.c b/opal/mca/btl/vader/btl_vader_frag.c index 9945f73985..2796627916 100644 --- a/opal/mca/btl/vader/btl_vader_frag.c +++ b/opal/mca/btl/vader/btl_vader_frag.c @@ -42,12 +42,18 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag) void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx) { mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) item; - unsigned int frag_size = (unsigned int)(uintptr_t) ctx; - unsigned int data_size = frag_size - sizeof (mca_btl_vader_hdr_t); + unsigned int data_size = (unsigned int)(uintptr_t) ctx; + unsigned int frag_size = data_size + sizeof (mca_btl_vader_hdr_t); assert (data_size > 0); + /* ensure next fragment is aligned on a cache line */ + frag_size = (frag_size + 63) & ~63; + + OPAL_THREAD_LOCK(&mca_btl_vader_component.lock); + if (mca_btl_vader_component.segment_size < mca_btl_vader_component.segment_offset + frag_size) { + OPAL_THREAD_UNLOCK(&mca_btl_vader_component.lock); item->ptr = NULL; return; } @@ -69,6 +75,8 @@ void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx) item->ptr = mca_btl_vader_component.my_segment + mca_btl_vader_component.segment_offset; mca_btl_vader_component.segment_offset += frag_size; + OPAL_THREAD_UNLOCK(&mca_btl_vader_component.lock); + mca_btl_vader_frag_constructor ((mca_btl_vader_frag_t *) item); } diff --git a/opal/mca/btl/vader/btl_vader_frag.h b/opal/mca/btl/vader/btl_vader_frag.h index b653c671e4..3da2853f8c 100644 --- a/opal/mca/btl/vader/btl_vader_frag.h +++ b/opal/mca/btl/vader/btl_vader_frag.h @@ -30,19 +30,30 @@ enum { MCA_BTL_VADER_FLAG_INLINE = 0, MCA_BTL_VADER_FLAG_SINGLE_COPY = 1, MCA_BTL_VADER_FLAG_COMPLETE = 2, + MCA_BTL_VADER_FLAG_SETUP_FBOX = 4, }; struct mca_btl_vader_frag_t; struct mca_btl_vader_fbox_t; +/** + * FIFO fragment header + */ struct mca_btl_vader_hdr_t { - volatile intptr_t next; /* next item in fifo. many peers may touch this */ + /** next item in fifo. 
many peers may touch this */ + volatile intptr_t next; + /** pointer back to the fragment */ struct mca_btl_vader_frag_t *frag; - mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */ - uint8_t flags; /* vader send flags */ - uint16_t seqn; - int32_t len; /* length of data following this header */ - struct iovec sc_iov; /* io vector containing pointer to single-copy data */ + /** tag associated with this fragment (used to lookup callback) */ + mca_btl_base_tag_t tag; + /** vader send flags (inline, complete, setup fbox, etc) */ + uint8_t flags; + /** length of data following this header */ + int32_t len; + /** io vector containing pointer to single-copy data */ + struct iovec sc_iov; + /** if the fragment indicates to setup a fast box the base is stored here */ + intptr_t fbox_base; }; typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t; @@ -50,11 +61,17 @@ typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t; * shared memory send fragment derived type. */ struct mca_btl_vader_frag_t { + /** base object */ mca_btl_base_descriptor_t base; + /** storage for segment data (max 2) */ mca_btl_base_segment_t segments[2]; + /** endpoint this fragment is active on */ struct mca_btl_base_endpoint_t *endpoint; - struct mca_btl_vader_fbox_t *fbox; - mca_btl_vader_hdr_t *hdr; /* in the shared memory region */ + /** fast box in use (or NULL) */ + unsigned char * restrict fbox; + /** fragment header (in the shared memory region) */ + mca_btl_vader_hdr_t *hdr; + /** free list this fragment was allocated within */ ompi_free_list_t *my_list; }; diff --git a/opal/mca/btl/vader/btl_vader_get.c b/opal/mca/btl/vader/btl_vader_get.c index 213814e9d5..56294a3667 100644 --- a/opal/mca/btl/vader/btl_vader_get.c +++ b/opal/mca/btl/vader/btl_vader_get.c @@ -53,6 +53,10 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl, vader_return_registration (reg, endpoint); + /* always call the callback function */ + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + + frag->endpoint = endpoint; mca_btl_vader_frag_complete (frag); return OPAL_SUCCESS; @@ -76,6 +80,10 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl, return OPAL_ERROR; } + /* always call the callback function */ + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + + frag->endpoint = endpoint; mca_btl_vader_frag_complete (frag); return OPAL_SUCCESS; diff --git a/opal/mca/btl/vader/btl_vader_module.c b/opal/mca/btl/vader/btl_vader_module.c index da03362aee..b40d656ede 100644 --- a/opal/mca/btl/vader/btl_vader_module.c +++ b/opal/mca/btl/vader/btl_vader_module.c @@ -24,8 +24,6 @@ #include "opal_config.h" -#include "opal/mca/pmix/pmix.h" - #include "btl_vader.h" #include "btl_vader_endpoint.h" #include "btl_vader_fifo.h" @@ -105,8 +103,13 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) /* generate the endpoints */ component->endpoints = (struct mca_btl_base_endpoint_t *) calloc (n + 1, sizeof (struct mca_btl_base_endpoint_t)); component->endpoints[n].peer_smp_rank = -1; + component->fbox_in_endpoints = calloc (n + 1, sizeof (void *)); - component->segment_offset = (n - 1) * MCA_BTL_VADER_FBOX_PEER_SIZE + MCA_BTL_VADER_FIFO_SIZE; + if (NULL == component->endpoints || NULL == component->fbox_in_endpoints) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + component->segment_offset = MCA_BTL_VADER_FIFO_SIZE; /* initialize fragment descriptor free lists */ /* initialize free list for put/get/single copy/inline fragments */ rc = ompi_free_list_init_ex_new (&component->vader_frags_user, sizeof (mca_btl_vader_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t), 0, opal_cache_line_size, component->vader_free_list_num, component->vader_free_list_max, component->vader_free_list_inc, NULL, mca_btl_vader_frag_init, - (void *) (sizeof(mca_btl_vader_hdr_t) + - mca_btl_vader_component.max_inline_send)); + (void *)(intptr_t) mca_btl_vader_component.max_inline_send); if (OPAL_SUCCESS != rc) { return rc; } @@ -118,8 +121,7 @@ static int
vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) component->vader_free_list_max, component->vader_free_list_inc, NULL, mca_btl_vader_frag_init, - (void *) (sizeof(mca_btl_vader_hdr_t) + - mca_btl_vader_component.max_inline_send)); + (void *)(intptr_t) mca_btl_vader_component.max_inline_send); if (OPAL_SUCCESS != rc) { return rc; } @@ -133,8 +135,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) component->vader_free_list_max, component->vader_free_list_inc, NULL, mca_btl_vader_frag_init, - (void *) (sizeof (mca_btl_vader_hdr_t) + - mca_btl_vader.super.btl_eager_limit)); + (void *)(intptr_t) mca_btl_vader.super.btl_eager_limit); if (OPAL_SUCCESS != rc) { return rc; } @@ -149,8 +150,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) component->vader_free_list_max, component->vader_free_list_inc, NULL, mca_btl_vader_frag_init, - (void *) (sizeof (mca_btl_vader_hdr_t) + - mca_btl_vader.super.btl_max_send_size)); + (void *)(intptr_t) mca_btl_vader.super.btl_max_send_size); if (OPAL_SUCCESS != rc) { return rc; } @@ -164,18 +164,17 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) { - const int fbox_in_offset = MCA_BTL_VADER_LOCAL_RANK - (MCA_BTL_VADER_LOCAL_RANK > remote_rank); - const int fbox_out_offset = remote_rank - (MCA_BTL_VADER_LOCAL_RANK < remote_rank); mca_btl_vader_component_t *component = &mca_btl_vader_component; struct vader_modex_t *modex; size_t msg_size; int rc; + OBJ_CONSTRUCT(ep, mca_btl_vader_endpoint_t); + ep->peer_smp_rank = remote_rank; if (remote_rank != MCA_BTL_VADER_LOCAL_RANK) { - OPAL_MODEX_RECV(rc, &component->super.btl_version, - proc, (uint8_t**)&modex, &msg_size); + OPAL_MODEX_RECV(rc, &component->super.btl_version, proc, (void **) &modex, &msg_size); if (OPAL_SUCCESS != rc) { return rc; } @@ -189,24 +188,23 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_ MCA_MPOOL_FLAGS_PERSIST, (void **) &ep->segment_base); #else msg_size -= offsetof (struct vader_modex_t, seg_ds); - memcpy (&ep->seg_ds, &modex->seg_ds, msg_size); - ep->segment_base = opal_shmem_segment_attach (&ep->seg_ds); + + /* store a copy of the segment information for detach */ + ep->seg_ds = malloc (msg_size); + if (NULL == ep->seg_ds) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + memcpy (ep->seg_ds, &modex->seg_ds, msg_size); + + ep->segment_base = opal_shmem_segment_attach (ep->seg_ds); if (NULL == ep->segment_base) { return rc; } #endif + OBJ_CONSTRUCT(&ep->lock, opal_mutex_t); free (modex); - - ep->next_fbox_out = 0; - ep->next_fbox_in = 0; - ep->next_sequence = 0; - ep->expected_sequence = 0; - - ep->fbox_in = (struct mca_btl_vader_fbox_t * restrict) (ep->segment_base + MCA_BTL_VADER_FIFO_SIZE + - fbox_in_offset * MCA_BTL_VADER_FBOX_PEER_SIZE); - ep->fbox_out = (struct mca_btl_vader_fbox_t * restrict) (component->my_segment + MCA_BTL_VADER_FIFO_SIZE + - fbox_out_offset * MCA_BTL_VADER_FBOX_PEER_SIZE); } else { /* set up the segment base so we can calculate a virtual to real for local pointers */ ep->segment_base = component->my_segment; @@ -220,36 +218,52 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_ static int fini_vader_endpoint (struct mca_btl_base_endpoint_t *ep) { - if (NULL != ep->fbox_out) { #if OPAL_BTL_VADER_HAVE_XPMEM - if (ep->rcache) { - /* clean out the registration cache */ - const int nregs = 100; - 
mca_mpool_base_registration_t *regs[nregs]; - int reg_cnt; + if (ep->rcache) { + /* clean out the registration cache */ + const int nregs = 100; + mca_mpool_base_registration_t *regs[nregs]; + int reg_cnt; - do { - reg_cnt = ep->rcache->rcache_find_all(ep->rcache, 0, (size_t)-1, + do { + reg_cnt = ep->rcache->rcache_find_all(ep->rcache, 0, (size_t)-1, regs, nregs); - for (int i = 0 ; i < reg_cnt ; ++i) { - /* otherwise dereg will fail on assert */ - regs[i]->ref_count = 0; - OBJ_RELEASE(regs[i]); - } - } while (reg_cnt == nregs); + for (int i = 0 ; i < reg_cnt ; ++i) { + /* otherwise dereg will fail on assert */ + regs[i]->ref_count = 0; + OBJ_RELEASE(regs[i]); + } + } while (reg_cnt == nregs); - ep->rcache = NULL; - } - xpmem_release (ep->apid); -#else - opal_shmem_segment_detach (&ep->seg_ds); -#endif + ep->rcache = NULL; } - ep->fbox_in = ep->fbox_out = NULL; + if (ep->segment_base) { + xpmem_release (ep->apid); + ep->apid = 0; + } +#else + if (ep->seg_ds) { + opal_shmem_ds_t seg_ds; + + /* opal_shmem_segment_detach expects a opal_shmem_ds_t and will + * stomp past the end of the seg_ds if it is too small (which + * ep->seg_ds probably is) */ + memcpy (&seg_ds, ep->seg_ds, opal_shmem_sizeof_shmem_ds (ep->seg_ds)); + free (ep->seg_ds); + ep->seg_ds = NULL; + + /* disconnect from the peer's segment */ + opal_shmem_segment_detach (&seg_ds); + } +#endif + + ep->fbox_in.buffer = ep->fbox_out.buffer = NULL; ep->segment_base = NULL; + OBJ_DESTRUCT(ep); + return OPAL_SUCCESS; } @@ -378,8 +392,13 @@ static int vader_finalize(struct mca_btl_base_module_t *btl) } free (component->endpoints); + component->endpoints = NULL; + vader_btl->btl_inited = false; + free (component->fbox_in_endpoints); + component->fbox_in_endpoints = NULL; + #if !OPAL_BTL_VADER_HAVE_XPMEM opal_shmem_unlink (&mca_btl_vader_component.seg_ds); opal_shmem_segment_detach (&mca_btl_vader_component.seg_ds); @@ -488,8 +507,8 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ uint32_t flags) { const size_t total_size = reserve + *size; - mca_btl_vader_fbox_t *fbox; mca_btl_vader_frag_t *frag; + unsigned char *fbox; void *data_ptr; int rc; @@ -562,15 +581,14 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ * fragment does not belong to the caller */ fbox = mca_btl_vader_reserve_fbox (endpoint, total_size); if (OPAL_LIKELY(fbox)) { - frag->segments[0].seg_addr.pval = fbox->data; + frag->segments[0].seg_addr.pval = fbox; } frag->fbox = fbox; } /* NTH: the covertor adds some latency so we bypass it here */ - vader_memmove ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), - data_ptr, *size); + memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), data_ptr, *size); frag->segments[0].seg_len = total_size; #if OPAL_BTL_VADER_HAVE_XPMEM } @@ -602,3 +620,15 @@ static int vader_ft_event (int state) { return OPAL_SUCCESS; } + +static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep) +{ + OBJ_CONSTRUCT(&ep->pending_frags, opal_list_t); +} + +static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep) +{ + OBJ_DESTRUCT(&ep->pending_frags); +} + +OBJ_CLASS_INSTANCE(mca_btl_vader_endpoint_t, opal_list_item_t, mca_btl_vader_endpoint_constructor, mca_btl_vader_endpoint_destructor); diff --git a/opal/mca/btl/vader/btl_vader_put.c b/opal/mca/btl/vader/btl_vader_put.c index 41b865d523..e59dbf526a 100644 --- a/opal/mca/btl/vader/btl_vader_put.c +++ b/opal/mca/btl/vader/btl_vader_put.c @@ -56,6 +56,7 @@ int 
mca_btl_vader_put (struct mca_btl_base_module_t *btl, /* always call the callback function */ frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + frag->endpoint = endpoint; mca_btl_vader_frag_complete (frag); return OPAL_SUCCESS; @@ -79,6 +80,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl, return OPAL_ERROR; } + frag->endpoint = endpoint; mca_btl_vader_frag_complete (frag); return OPAL_SUCCESS; diff --git a/opal/mca/btl/vader/btl_vader_send.c b/opal/mca/btl/vader/btl_vader_send.c index 53327c0f96..1d21cd1b84 100644 --- a/opal/mca/btl/vader/btl_vader_send.c +++ b/opal/mca/btl/vader/btl_vader_send.c @@ -12,8 +12,8 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. - * All rights reserved. + * Copyright (c) 2010-2014 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,27 +40,38 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl, mca_btl_base_tag_t tag) { mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor; + const size_t total_size = frag->segments[0].seg_len; if (OPAL_LIKELY(frag->fbox)) { - mca_btl_vader_fbox_send (frag->fbox, tag, endpoint); + mca_btl_vader_fbox_send (frag->fbox, tag); mca_btl_vader_frag_complete (frag); return 1; } /* header (+ optional inline data) */ - frag->hdr->len = frag->segments[0].seg_len; + frag->hdr->len = total_size; /* type of message, pt-2-pt, one-sided, etc */ frag->hdr->tag = tag; /* post the relative address of the descriptor into the peer's fifo */ - vader_fifo_write_ep (frag->hdr, endpoint); + if (opal_list_get_size (&endpoint->pending_frags) || !vader_fifo_write_ep (frag->hdr, endpoint)) { + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + OPAL_THREAD_LOCK(&endpoint->lock); + opal_list_append (&endpoint->pending_frags, (opal_list_item_t *) frag); + if (!endpoint->waiting) { + opal_list_append (&mca_btl_vader_component.pending_endpoints, &endpoint->super); + endpoint->waiting = true; + } + OPAL_THREAD_UNLOCK(&endpoint->lock); + return OPAL_SUCCESS; + } if ((frag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) || !(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - return 0; + return OPAL_SUCCESS; } /* data is gone (from the pml's perspective). frag callback/release will diff --git a/opal/mca/btl/vader/btl_vader_sendi.c b/opal/mca/btl/vader/btl_vader_sendi.c index 4a9a4c7980..25b5d691b6 100644 --- a/opal/mca/btl/vader/btl_vader_sendi.c +++ b/opal/mca/btl/vader/btl_vader_sendi.c @@ -12,8 +12,8 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. - * All rights reserved. + * Copyright (c) 2010-2014 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -47,6 +47,12 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, void *data_ptr = NULL; size_t length; + /* don't attempt sendi if there are pending fragments on the endpoint */ + if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->pending_frags))) { + *descriptor = NULL; + return OPAL_ERR_OUT_OF_RESOURCE; + } + if (payload_size) { opal_convertor_get_current_pointer (convertor, &data_ptr); } @@ -56,7 +62,6 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, return OPAL_SUCCESS; } - length = header_size + payload_size; /* allocate a fragment, giving up if we can't get one */ @@ -92,7 +97,10 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, } /* write the fragment pointer to peer's the FIFO. the progress function will return the fragment */ - vader_fifo_write_ep (frag->hdr, endpoint); + if (!vader_fifo_write_ep (frag->hdr, endpoint)) { + *descriptor = &frag->base; + return OPAL_ERR_OUT_OF_RESOURCE; + } return OPAL_SUCCESS; } diff --git a/opal/mca/btl/vader/btl_vader_xpmem.c b/opal/mca/btl/vader/btl_vader_xpmem.c index 5962fb6b6c..58ea426b9d 100644 --- a/opal/mca/btl/vader/btl_vader_xpmem.c +++ b/opal/mca/btl/vader/btl_vader_xpmem.c @@ -9,6 +9,8 @@ * $HEADER$ */ +#include "btl_vader.h" + #include "opal/include/opal/align.h" #include "btl_vader_xpmem.h" #include "opal/mca/memchecker/base/base.h" diff --git a/opal/mca/btl/vader/btl_vader_xpmem.h b/opal/mca/btl/vader/btl_vader_xpmem.h index 11ff28863f..1634ff1ee8 100644 --- a/opal/mca/btl/vader/btl_vader_xpmem.h +++ b/opal/mca/btl/vader/btl_vader_xpmem.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ *
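A note on the new fast-box layout introduced in btl_vader_fbox.h above: the subtlest piece is the occupancy accounting done by the BUFFER_FREE macro together with the MCA_BTL_VADER_FBOX_HB_MASK high bit. Below is a minimal, self-contained sketch of that accounting, assuming the 4096-byte default box size; fbox_buffer_free and the values in main are illustrative names and numbers, not code from the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FBOX_SIZE   4096u        /* mirrors the new fbox_size default */
#define OFFSET_MASK 0x7fffffffu  /* low 31 bits: byte offset into the box */
#define HB_MASK     0x80000000u  /* bit 31: toggled each time an offset wraps */

/* contiguous bytes a writer may use starting at offset e. s and e are already
 * masked; hbm is true when start and end carry the same high bit (same lap).
 * this is the same expression as the patch's BUFFER_FREE macro. */
static unsigned fbox_buffer_free (unsigned s, unsigned e, bool hbm, unsigned size)
{
    return ((s + !hbm) > e) ? s - e : size - e;
}

int main (void)
{
    uint32_t start = 64, end = 64;
    bool hbm = !((start ^ end) & HB_MASK);

    /* equal offsets, matching high bits: empty, the whole tail is writable */
    printf ("empty: %u bytes free\n",
            fbox_buffer_free (start & OFFSET_MASK, end & OFFSET_MASK, hbm, FBOX_SIZE));

    /* writer wrapped once: equal offsets, differing high bits: completely full */
    end = 64 | HB_MASK;
    hbm = !((start ^ end) & HB_MASK);
    printf ("full: %u bytes free\n",
            fbox_buffer_free (start & OFFSET_MASK, end & OFFSET_MASK, hbm, FBOX_SIZE));
    return 0;
}

The high bit acts as a one-bit lap counter: without it, start == end would be ambiguous between completely empty and completely full, an ambiguity every circular buffer has to resolve somehow.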
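The comment block moved into btl_vader_fifo.h explains why fifo and fragment pointers are exchanged as relative addresses: each process may map the same shared segments at different base addresses. A self-contained sketch of the round trip follows; OFFSET_BITS, segment_base, and the my_-prefixed helpers are assumptions for illustration, where the real code uses MCA_BTL_VADER_OFFSET_BITS and the component's endpoint array.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define OFFSET_BITS 32  /* placeholder; vader uses MCA_BTL_VADER_OFFSET_BITS */
#define OFFSET_MASK ((UINT64_C(1) << OFFSET_BITS) - 1)

static char *segment_base[2];  /* where this process mapped each peer's segment */

/* encode a pointer into rank's segment as a (rank, offset) token */
static uint64_t my_virtual2relative (int rank, char *addr)
{
    return ((uint64_t) rank << OFFSET_BITS) | (uint64_t) (addr - segment_base[rank]);
}

/* decode a token against this process's own mapping of that segment */
static void *my_relative2virtual (uint64_t rel)
{
    return segment_base[rel >> OFFSET_BITS] + (rel & OFFSET_MASK);
}

int main (void)
{
    segment_base[0] = malloc (4096);  /* stand-ins for the mmapped segments */
    segment_base[1] = malloc (4096);

    char *addr = segment_base[1] + 128;
    uint64_t rel = my_virtual2relative (1, addr);

    /* the round trip recovers the pointer; another process would decode the
     * same token against its own segment_base table and get its own view */
    assert (my_relative2virtual (rel) == addr);

    free (segment_base[0]);
    free (segment_base[1]);
    return 0;
}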
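Finally, the barrier discipline shared by mca_btl_vader_fbox_send and mca_btl_vader_check_fboxes (write the payload, write barrier, publish the tag; read the tag, read barrier, consume the payload) is a standard single-producer/single-consumer publication pattern. The compressed sketch below substitutes C11 release/acquire atomics for opal_atomic_wmb/opal_atomic_rmb; that substitution and the toy_ names are assumptions, since the patch itself uses OPAL's barrier macros and a combined 64-bit header word.

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

typedef struct {
    _Atomic uint16_t tag;  /* 0 = slot empty; nonzero publishes the payload */
    uint16_t size;
} toy_hdr_t;

static _Alignas (toy_hdr_t) unsigned char box[64];
static toy_hdr_t *hdr = (toy_hdr_t *) box;

/* producer: payload and size first, then a release store of the tag */
static void toy_fbox_send (const void *data, uint16_t len, uint16_t tag)
{
    memcpy (box + sizeof (toy_hdr_t), data, len);
    hdr->size = len;
    atomic_store_explicit (&hdr->tag, tag, memory_order_release);
}

/* consumer: acquire load of the tag; only then are size and payload safe */
static int toy_fbox_poll (void *out)
{
    if (0 == atomic_load_explicit (&hdr->tag, memory_order_acquire)) {
        return 0;
    }
    memcpy (out, box + sizeof (toy_hdr_t), hdr->size);
    atomic_store_explicit (&hdr->tag, 0, memory_order_relaxed);
    return 1;
}

The release/acquire pair is what lets the reader trust hdr->size and the payload bytes without a lock, the same guarantee the wmb/rmb pair provides on the fast path in the patch.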