diff --git a/ompi/mca/btl/vader/btl_vader_component.c b/ompi/mca/btl/vader/btl_vader_component.c
index b6e8bd8319..08794e0bf4 100644
--- a/ompi/mca/btl/vader/btl_vader_component.c
+++ b/ompi/mca/btl/vader/btl_vader_component.c
@@ -12,7 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2014 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2011      NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
@@ -361,17 +361,17 @@ failed:
     return NULL;
 }
 
-static int mca_btl_vader_poll_fifo (void)
+static inline int mca_btl_vader_poll_fifo (void)
 {
     const mca_btl_active_message_callback_t *reg;
     struct mca_btl_base_endpoint_t *endpoint;
     mca_btl_vader_hdr_t *hdr;
 
     /* poll the fifo until it is empty or a limit has been hit (8 is arbitrary) */
-    for (int fifo_count = 0 ; fifo_count < 8 ; ++fifo_count) {
+    for (int fifo_count = 0 ; fifo_count < 16 ; ++fifo_count) {
         mca_btl_vader_frag_t frag = {.base = {.des_dst = frag.segments, .des_dst_cnt = 1}};
 
-        hdr = vader_fifo_read (mca_btl_vader_component.my_fifo);
+        hdr = vader_fifo_read (mca_btl_vader_component.my_fifo, &endpoint);
         if (NULL == hdr) {
             return fifo_count;
         }
@@ -385,8 +385,6 @@ static int mca_btl_vader_poll_fifo (void)
         frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
         frag.segments[0].seg_len = hdr->len;
 
-        endpoint = mca_btl_vader_component.endpoints + hdr->src_smp_rank;
-
         if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) {
             mca_mpool_base_registration_t *xpmem_reg;
 
diff --git a/ompi/mca/btl/vader/btl_vader_endpoint.h b/ompi/mca/btl/vader/btl_vader_endpoint.h
index 037467d2ef..bcee8ef718 100644
--- a/ompi/mca/btl/vader/btl_vader_endpoint.h
+++ b/ompi/mca/btl/vader/btl_vader_endpoint.h
@@ -11,6 +11,8 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -65,6 +67,10 @@ struct mca_btl_base_endpoint_t {
 #if OMPI_BTL_VADER_HAVE_XPMEM
     struct mca_rcache_base_module_t *rcache;
 #endif
+
+    /* enforce ordering */
+    uint16_t next_sequence;
+    uint16_t expected_sequence;
 };
 
 #endif /* MCA_BTL_VADER_ENDPOINT_H */
diff --git a/ompi/mca/btl/vader/btl_vader_fbox.h b/ompi/mca/btl/vader/btl_vader_fbox.h
index 57ebb507a9..5be2d275ef 100644
--- a/ompi/mca/btl/vader/btl_vader_fbox.h
+++ b/ompi/mca/btl/vader/btl_vader_fbox.h
@@ -1,7 +1,7 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
- *                         All rights reserved.
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -25,6 +25,7 @@
 /* there should be a power of two number of fast boxes to simplify the math in the
  * critical path */
 #define MCA_BTL_VADER_LAST_FBOX 63
+#define MCA_BTL_VADER_POLL_COUNT 31
 /* two bytes are reserved for tag and size (update if the header is modified) */
 #define MCA_BTL_VADER_FBOX_HDR_SIZE 4
 #define MCA_BTL_VADER_FBOX_MAX_SIZE (MCA_BTL_VADER_FBOX_SIZE - MCA_BTL_VADER_FBOX_HDR_SIZE)
@@ -34,8 +35,9 @@
 typedef struct mca_btl_vader_fbox_t {
     union {
         struct {
-            uint16_t size;
-            uint16_t tag;
+            uint8_t  size;
+            uint8_t  tag;
+            uint16_t seqn;
         } hdr_data;
         uint32_t ival;
     } hdr;
@@ -52,19 +54,14 @@ static inline mca_btl_vader_fbox_t * restrict mca_btl_vader_reserve_fbox (struct
     const int next_fbox = ep->next_fbox_out;
     mca_btl_vader_fbox_t * restrict fbox = MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
 
+    opal_atomic_mb ();
+
     /* todo -- need thread locks/atomics here for the multi-threaded case */
     if (OPAL_LIKELY(size <= MCA_BTL_VADER_FBOX_MAX_SIZE && 0 == fbox->hdr.ival)) {
         /* mark this fast box as in use */
         fbox->hdr.hdr_data.size = size;
-
         ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox);
-        return fbox;
-    } else if (OPAL_LIKELY(size <= (MCA_BTL_VADER_FBOX_MAX_SIZE + MCA_BTL_VADER_FBOX_SIZE) && MCA_BTL_VADER_LAST_FBOX != next_fbox &&
-                           0 == fbox->hdr.ival && 0 == fbox[1].hdr.ival)) {
-        /* aggregate two fast boxes */
-        fbox->hdr.hdr_data.size = size;
-
-        ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox + 1);
+        opal_atomic_mb ();
         return fbox;
     }
 
@@ -72,12 +69,13 @@
 }
 
 static inline void mca_btl_vader_fbox_send (mca_btl_vader_fbox_t * restrict fbox, unsigned char tag,
-                                            size_t size)
+                                            struct mca_btl_base_endpoint_t *endpoint)
 {
     /* ensure data writes have completed before we mark the data as available */
     opal_atomic_wmb ();
-
+    fbox->hdr.hdr_data.seqn = endpoint->next_sequence++;
     fbox->hdr.hdr_data.tag = tag;
+    opal_atomic_wmb ();
 }
 
 static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endpoint, char tag,
@@ -98,7 +96,7 @@
     }
 
     /* mark the fbox as sent */
-    mca_btl_vader_fbox_send (fbox, tag, header_size + payload_size);
+    mca_btl_vader_fbox_send (fbox, tag, endpoint);
 
     /* send complete */
     return 1;
@@ -128,8 +126,12 @@ static inline bool mca_btl_vader_check_fboxes (void)
         processed = true;
 
         /* process all fast-box messages */
-        while (0 != fbox->hdr.hdr_data.tag) {
-            opal_atomic_rmb ();
+        for (int count = 0 ; count <= MCA_BTL_VADER_POLL_COUNT && 0 != fbox->hdr.hdr_data.tag ; ++count) {
+            if (OPAL_UNLIKELY(endpoint->expected_sequence != fbox->hdr.hdr_data.seqn)) {
+                break;
+            }
+            opal_atomic_mb ();
+            ++endpoint->expected_sequence;
 
             reg = mca_btl_base_active_message_trigger + fbox->hdr.hdr_data.tag;
 
@@ -138,8 +140,9 @@ static inline bool mca_btl_vader_check_fboxes (void)
 
             reg->cbfunc(&mca_btl_vader.super, fbox->hdr.hdr_data.tag, &desc, reg->cbdata);
 
-            if (fbox->hdr.hdr_data.size > MCA_BTL_VADER_FBOX_MAX_SIZE) {
+            if (segment.seg_len > MCA_BTL_VADER_FBOX_MAX_SIZE) {
                 fbox[1].hdr.ival = 0;
+                opal_atomic_mb ();
                 ++next_fbox;
             }
             fbox->hdr.ival = 0;
@@ -148,6 +151,8 @@ static inline bool mca_btl_vader_check_fboxes (void)
             fbox = (mca_btl_vader_fbox_t * restrict) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
         }
 
+        opal_atomic_mb ();
+
         endpoint->next_fbox_in = next_fbox;
     }
 
diff --git a/ompi/mca/btl/vader/btl_vader_fifo.h b/ompi/mca/btl/vader/btl_vader_fifo.h
index 55f1bf5204..64029b613c 100644
--- a/ompi/mca/btl/vader/btl_vader_fifo.h
+++ b/ompi/mca/btl/vader/btl_vader_fifo.h
@@ -91,21 +91,36 @@ static inline void *relative2virtual (fifo_value_t offset)
     return (void *)(intptr_t)((offset & MCA_BTL_VADER_OFFSET_MASK) + mca_btl_vader_component.endpoints[offset >> MCA_BTL_VADER_OFFSET_BITS].segment_base);
 }
 
-static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
+static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct mca_btl_base_endpoint_t **ep)
 {
     mca_btl_vader_hdr_t *hdr;
     fifo_value_t value;
+    static volatile int32_t lock = 0;
 
-    opal_atomic_rmb ();
-
-    value = vader_item_swap (&fifo->fifo_head, VADER_FIFO_FREE);
-    if (VADER_FIFO_FREE == value) {
-        /* fifo is empty or we lost the race with another thread */
+    if (opal_atomic_swap_32 (&lock, 1)) {
         return NULL;
     }
 
+    if (VADER_FIFO_FREE == fifo->fifo_head) {
+        lock = 0;
+        return NULL;
+    }
+
+    opal_atomic_rmb ();
+
+    value = fifo->fifo_head;
+
+    *ep = &mca_btl_vader_component.endpoints[value >> MCA_BTL_VADER_OFFSET_BITS];
     hdr = (mca_btl_vader_hdr_t *) relative2virtual (value);
 
+    if (OPAL_UNLIKELY(!(hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) && ((*ep)->expected_sequence != hdr->seqn))) {
+        lock = 0;
+        return NULL;
+    }
+
+    fifo->fifo_head = VADER_FIFO_FREE;
+    ++(*ep)->expected_sequence;
+
     assert (hdr->next != value);
 
     if (OPAL_UNLIKELY(VADER_FIFO_FREE == hdr->next)) {
@@ -123,7 +138,7 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
     }
 
     opal_atomic_wmb ();
-
+    lock = 0;
     return hdr;
 }
 
@@ -159,6 +174,7 @@ static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value)
 static inline void vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
 {
     hdr->next = VADER_FIFO_FREE;
+    hdr->seqn = ep->next_sequence++;
     vader_fifo_write (ep->fifo, virtual2relative ((char *) hdr));
 }
 
diff --git a/ompi/mca/btl/vader/btl_vader_frag.c b/ompi/mca/btl/vader/btl_vader_frag.c
index 91a4b8619f..b102e4d95e 100644
--- a/ompi/mca/btl/vader/btl_vader_frag.c
+++ b/ompi/mca/btl/vader/btl_vader_frag.c
@@ -29,7 +29,6 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
 {
     frag->hdr = (mca_btl_vader_hdr_t*)frag->base.super.ptr;
     if(frag->hdr != NULL) {
-        frag->hdr->src_smp_rank = MCA_BTL_VADER_LOCAL_RANK;
         frag->hdr->frag = frag;
         frag->hdr->flags = 0;
         frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);
diff --git a/ompi/mca/btl/vader/btl_vader_frag.h b/ompi/mca/btl/vader/btl_vader_frag.h
index c100b6eaa5..fa67ed81c7 100644
--- a/ompi/mca/btl/vader/btl_vader_frag.h
+++ b/ompi/mca/btl/vader/btl_vader_frag.h
@@ -40,7 +40,7 @@ struct mca_btl_vader_hdr_t {
     struct mca_btl_vader_frag_t *frag;
     mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */
     uint8_t flags;          /* vader send flags */
-    uint16_t src_smp_rank;  /* smp rank of owning process */
+    uint16_t seqn;
     int32_t len;            /* length of data following this header */
     struct iovec sc_iov;    /* io vector containing pointer to single-copy data */
 };
diff --git a/ompi/mca/btl/vader/btl_vader_module.c b/ompi/mca/btl/vader/btl_vader_module.c
index 32b2c99a6b..8410452d56 100644
--- a/ompi/mca/btl/vader/btl_vader_module.c
+++ b/ompi/mca/btl/vader/btl_vader_module.c
@@ -194,6 +194,8 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct ompi_
 
     ep->next_fbox_out = 0;
     ep->next_fbox_in = 0;
+    ep->next_sequence = 0;
+    ep->expected_sequence = 0;
 
     ep->fbox_in = (struct mca_btl_vader_fbox_t * restrict) (ep->segment_base + MCA_BTL_VADER_FIFO_SIZE +
                                                             fbox_in_offset * MCA_BTL_VADER_FBOX_PEER_SIZE);
@@ -526,6 +528,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
             frag->base.des_src_cnt = 2;
         } else {
 #endif
 
+        /* inline send */
         if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
             /* try to reserve a fast box for this transfer only if the
diff --git a/ompi/mca/btl/vader/btl_vader_send.c b/ompi/mca/btl/vader/btl_vader_send.c
index 0b8ede0fbf..f05ae73787 100644
--- a/ompi/mca/btl/vader/btl_vader_send.c
+++ b/ompi/mca/btl/vader/btl_vader_send.c
@@ -42,7 +42,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
     mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor;
 
     if (OPAL_LIKELY(frag->fbox)) {
-        mca_btl_vader_fbox_send (frag->fbox, tag, frag->segments[0].seg_len);
+        mca_btl_vader_fbox_send (frag->fbox, tag, endpoint);
 
        mca_btl_vader_frag_complete (frag);
        return 1;
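Note on the ordering scheme introduced by this patch: the sender stamps every fast-box message and fifo fragment with a per-endpoint next_sequence counter, and the receiver only consumes a message whose seqn equals its expected_sequence counter, deferring anything that arrives out of order. The sketch below is not Open MPI code -- the mailbox type, the function names and the slot size are invented for illustration, and the real implementation also relies on the memory barriers shown in the patch -- but it captures the single-producer/single-consumer counter pairing under those assumptions.

/* Hypothetical, self-contained sketch of the next_sequence/expected_sequence
 * pairing; the types and names below are NOT part of the Open MPI API. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct toy_mailbox_t {
    uint8_t  tag;                   /* 0 == slot free (like hdr.ival == 0) */
    uint16_t seqn;                  /* stamped from the sender's counter   */
    char     payload[32];
} toy_mailbox_t;

typedef struct toy_peer_t {
    uint16_t next_sequence;         /* sender side   */
    uint16_t expected_sequence;     /* receiver side */
} toy_peer_t;

/* sender: stamp the message, then publish it by writing the tag last */
static void toy_send (toy_peer_t *peer, toy_mailbox_t *slot, uint8_t tag, const char *msg)
{
    strncpy (slot->payload, msg, sizeof (slot->payload) - 1);
    slot->payload[sizeof (slot->payload) - 1] = '\0';
    slot->seqn = peer->next_sequence++;
    slot->tag  = tag;
}

/* receiver: deliver only the message carrying the expected sequence number */
static int toy_poll (toy_peer_t *peer, toy_mailbox_t *slot)
{
    if (0 == slot->tag || slot->seqn != peer->expected_sequence) {
        return 0;                   /* empty, or out of order: retry later */
    }

    ++peer->expected_sequence;
    printf ("delivered seqn %u: %s\n", (unsigned) slot->seqn, slot->payload);
    slot->tag = 0;                  /* hand the slot back to the sender */
    return 1;
}

int main (void)
{
    toy_peer_t peer = { 0, 0 };
    toy_mailbox_t slot = { 0, 0, "" };

    toy_send (&peer, &slot, 1, "first");
    toy_poll (&peer, &slot);        /* prints: delivered seqn 0: first  */
    toy_send (&peer, &slot, 1, "second");
    toy_poll (&peer, &slot);        /* prints: delivered seqn 1: second */
    return 0;
}

In the patch itself the same check appears in two places: mca_btl_vader_check_fboxes breaks out of its polling loop when fbox->hdr.hdr_data.seqn does not match endpoint->expected_sequence, and vader_fifo_read leaves a non-completion header on the fifo until its seqn matches the endpoint's expected_sequence.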