
btl/vader: improve performance under heavy load and eliminate a racy feature
This commit should fix a hang seen when running some of the one-sided
tests. The downside of this fix is that it reduces the maximum size of the
messages that use the fast boxes. I will fix this in a later commit.
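
As an illustration (not part of this commit's diff): the smaller limit follows from the new 4-byte fast-box header, whose size field is now an 8-bit integer, and from the removal of the two-box aggregation path. The sketch below is hypothetical — fbox_payload_fits and the FBOX_SIZE value are assumptions for the example, not identifiers from the btl/vader sources.

/* illustrative sketch only -- not btl/vader code */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define FBOX_SIZE     64                          /* per-box size, assumed for the example */
#define FBOX_HDR_SIZE 4                           /* uint8_t size + uint8_t tag + uint16_t seqn */
#define FBOX_MAX_SIZE (FBOX_SIZE - FBOX_HDR_SIZE)

static bool fbox_payload_fits (size_t payload)
{
    return payload <= FBOX_MAX_SIZE &&   /* a single box must hold it: aggregation is gone */
           payload <= UINT8_MAX;         /* and the length must fit the 8-bit size field */
}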

To improve performance under heavy load I introduced sequencing to
ensure messages are given to the pml in order. I have seen little to no
impact on message rate or latency from this change, and there is a
clear improvement in the heavy message-rate case.
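
A minimal sketch of the sequencing idea (mirroring the next_sequence/expected_sequence counters added to the endpoint in the diff below; the endpoint_t type and helper names here are hypothetical, not the actual btl/vader code): the sender stamps each message with a per-peer sequence number, and the receiver only hands a message up when it matches the sequence it expects, otherwise it leaves it for a later poll.

/* illustrative sketch only -- not btl/vader code */
#include <stdbool.h>
#include <stdint.h>

typedef struct endpoint_t {
    uint16_t next_sequence;      /* next number the sender will stamp on a message */
    uint16_t expected_sequence;  /* number the receiver will accept next */
} endpoint_t;

/* sender side: stamp the header just before publishing the message */
static void stamp_sequence (endpoint_t *ep, uint16_t *hdr_seqn)
{
    *hdr_seqn = ep->next_sequence++;   /* wraps naturally at UINT16_MAX */
}

/* receiver side: deliver only in order, defer anything that arrives early */
static bool deliver_in_order (endpoint_t *ep, uint16_t hdr_seqn)
{
    if (hdr_seqn != ep->expected_sequence) {
        return false;                  /* leave it for a later poll */
    }
    ++ep->expected_sequence;
    return true;                       /* safe to hand to the pml */
}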

Let's let this sit in the trunk for a couple of days to ensure that
everything is working correctly.

cmr=v1.8.2:reviewer=jsquyres

This commit was SVN r31522.
This commit is contained in:
Nathan Hjelm 2014-04-24 17:36:03 +00:00
parent e243805ed8
commit 0849d61e38
8 changed files with 61 additions and 34 deletions

View file

@@ -12,7 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2014 Los Alamos National Security, LLC.
  * All rights reserved.
  * Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
@@ -361,17 +361,17 @@ failed:
     return NULL;
 }
 
-static int mca_btl_vader_poll_fifo (void)
+static inline int mca_btl_vader_poll_fifo (void)
 {
     const mca_btl_active_message_callback_t *reg;
     struct mca_btl_base_endpoint_t *endpoint;
     mca_btl_vader_hdr_t *hdr;
 
     /* poll the fifo until it is empty or a limit has been hit (8 is arbitrary) */
-    for (int fifo_count = 0 ; fifo_count < 8 ; ++fifo_count) {
+    for (int fifo_count = 0 ; fifo_count < 16 ; ++fifo_count) {
         mca_btl_vader_frag_t frag = {.base = {.des_dst = frag.segments, .des_dst_cnt = 1}};
 
-        hdr = vader_fifo_read (mca_btl_vader_component.my_fifo);
+        hdr = vader_fifo_read (mca_btl_vader_component.my_fifo, &endpoint);
         if (NULL == hdr) {
             return fifo_count;
         }
@@ -385,8 +385,6 @@ static int mca_btl_vader_poll_fifo (void)
         frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
         frag.segments[0].seg_len = hdr->len;
 
-        endpoint = mca_btl_vader_component.endpoints + hdr->src_smp_rank;
-
         if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) {
             mca_mpool_base_registration_t *xpmem_reg;

View file

@@ -11,6 +11,8 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2014 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -65,6 +67,10 @@ struct mca_btl_base_endpoint_t {
 #if OMPI_BTL_VADER_HAVE_XPMEM
     struct mca_rcache_base_module_t *rcache;
 #endif
+
+    /* enforce ordering */
+    uint16_t next_sequence;
+    uint16_t expected_sequence;
 };
 
 #endif /* MCA_BTL_VADER_ENDPOINT_H */

View file

@@ -1,7 +1,7 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
- * All rights reserved.
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -25,6 +25,7 @@
 /* there should be a power of two number of fast boxes to simplify the math in the
  * critical path */
 #define MCA_BTL_VADER_LAST_FBOX 63
+#define MCA_BTL_VADER_POLL_COUNT 31
 /* two bytes are reserved for tag and size (update if the header is modified) */
 #define MCA_BTL_VADER_FBOX_HDR_SIZE 4
 #define MCA_BTL_VADER_FBOX_MAX_SIZE (MCA_BTL_VADER_FBOX_SIZE - MCA_BTL_VADER_FBOX_HDR_SIZE)
@@ -34,8 +35,9 @@
 typedef struct mca_btl_vader_fbox_t {
     union {
         struct {
-            uint16_t size;
-            uint16_t tag;
+            uint8_t size;
+            uint8_t tag;
+            uint16_t seqn;
         } hdr_data;
         uint32_t ival;
     } hdr;
@@ -52,19 +54,14 @@ static inline mca_btl_vader_fbox_t * restrict mca_btl_vader_reserve_fbox (struct
     const int next_fbox = ep->next_fbox_out;
     mca_btl_vader_fbox_t * restrict fbox = MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
 
+    opal_atomic_mb ();
+
     /* todo -- need thread locks/atomics here for the multi-threaded case */
     if (OPAL_LIKELY(size <= MCA_BTL_VADER_FBOX_MAX_SIZE && 0 == fbox->hdr.ival)) {
         /* mark this fast box as in use */
         fbox->hdr.hdr_data.size = size;
         ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox);
-        return fbox;
-    } else if (OPAL_LIKELY(size <= (MCA_BTL_VADER_FBOX_MAX_SIZE + MCA_BTL_VADER_FBOX_SIZE) && MCA_BTL_VADER_LAST_FBOX != next_fbox &&
-                           0 == fbox->hdr.ival && 0 == fbox[1].hdr.ival)) {
-        /* aggregate two fast boxes */
-        fbox->hdr.hdr_data.size = size;
-        ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox + 1);
+        opal_atomic_mb ();
         return fbox;
     }
@@ -72,12 +69,13 @@ static inline mca_btl_vader_fbox_t * restrict mca_btl_vader_reserve_fbox (struct
 }
 
 static inline void mca_btl_vader_fbox_send (mca_btl_vader_fbox_t * restrict fbox, unsigned char tag,
-                                            size_t size)
+                                            struct mca_btl_base_endpoint_t *endpoint)
 {
     /* ensure data writes have completed before we mark the data as available */
     opal_atomic_wmb ();
+    fbox->hdr.hdr_data.seqn = endpoint->next_sequence++;
     fbox->hdr.hdr_data.tag = tag;
+    opal_atomic_wmb ();
 }
 
 static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endpoint, char tag,
@@ -98,7 +96,7 @@ static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endp
     }
 
     /* mark the fbox as sent */
-    mca_btl_vader_fbox_send (fbox, tag, header_size + payload_size);
+    mca_btl_vader_fbox_send (fbox, tag, endpoint);
 
     /* send complete */
     return 1;
@@ -128,8 +126,12 @@ static inline bool mca_btl_vader_check_fboxes (void)
         processed = true;
 
         /* process all fast-box messages */
-        while (0 != fbox->hdr.hdr_data.tag) {
-            opal_atomic_rmb ();
+        for (int count = 0 ; count <= MCA_BTL_VADER_POLL_COUNT && 0 != fbox->hdr.hdr_data.tag ; ++count) {
+            if (OPAL_UNLIKELY(endpoint->expected_sequence != fbox->hdr.hdr_data.seqn)) {
+                break;
+            }
+            opal_atomic_mb ();
+            ++endpoint->expected_sequence;
 
             reg = mca_btl_base_active_message_trigger + fbox->hdr.hdr_data.tag;
@@ -138,8 +140,9 @@ static inline bool mca_btl_vader_check_fboxes (void)
             reg->cbfunc(&mca_btl_vader.super, fbox->hdr.hdr_data.tag, &desc, reg->cbdata);
 
-            if (fbox->hdr.hdr_data.size > MCA_BTL_VADER_FBOX_MAX_SIZE) {
+            if (segment.seg_len > MCA_BTL_VADER_FBOX_MAX_SIZE) {
                 fbox[1].hdr.ival = 0;
+                opal_atomic_mb ();
                 ++next_fbox;
             }
 
             fbox->hdr.ival = 0;
@@ -148,6 +151,8 @@ static inline bool mca_btl_vader_check_fboxes (void)
             fbox = (mca_btl_vader_fbox_t * restrict) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
         }
 
+        opal_atomic_mb ();
+
         endpoint->next_fbox_in = next_fbox;
     }

View file

@@ -91,21 +91,36 @@ static inline void *relative2virtual (fifo_value_t offset)
     return (void *)(intptr_t)((offset & MCA_BTL_VADER_OFFSET_MASK) + mca_btl_vader_component.endpoints[offset >> MCA_BTL_VADER_OFFSET_BITS].segment_base);
 }
 
-static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
+static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct mca_btl_base_endpoint_t **ep)
 {
     mca_btl_vader_hdr_t *hdr;
     fifo_value_t value;
+    static volatile int32_t lock = 0;
 
-    opal_atomic_rmb ();
-
-    value = vader_item_swap (&fifo->fifo_head, VADER_FIFO_FREE);
-    if (VADER_FIFO_FREE == value) {
-        /* fifo is empty or we lost the race with another thread */
+    if (opal_atomic_swap_32 (&lock, 1)) {
         return NULL;
     }
 
+    if (VADER_FIFO_FREE == fifo->fifo_head) {
+        lock = 0;
+        return NULL;
+    }
+
+    opal_atomic_rmb ();
+
+    value = fifo->fifo_head;
+    *ep = &mca_btl_vader_component.endpoints[value >> MCA_BTL_VADER_OFFSET_BITS];
     hdr = (mca_btl_vader_hdr_t *) relative2virtual (value);
+
+    if (OPAL_UNLIKELY(!(hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) && ((*ep)->expected_sequence != hdr->seqn))) {
+        lock = 0;
+        return NULL;
+    }
+
+    fifo->fifo_head = VADER_FIFO_FREE;
+    ++(*ep)->expected_sequence;
 
     assert (hdr->next != value);
 
     if (OPAL_UNLIKELY(VADER_FIFO_FREE == hdr->next)) {
@@ -123,7 +138,7 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
     }
 
     opal_atomic_wmb ();
+    lock = 0;
 
     return hdr;
 }
@@ -159,6 +174,7 @@ static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value)
 static inline void vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
 {
     hdr->next = VADER_FIFO_FREE;
+    hdr->seqn = ep->next_sequence++;
 
     vader_fifo_write (ep->fifo, virtual2relative ((char *) hdr));
 }

View file

@@ -29,7 +29,6 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
 {
     frag->hdr = (mca_btl_vader_hdr_t*)frag->base.super.ptr;
     if(frag->hdr != NULL) {
-        frag->hdr->src_smp_rank = MCA_BTL_VADER_LOCAL_RANK;
         frag->hdr->frag = frag;
         frag->hdr->flags = 0;
         frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);

View file

@@ -40,7 +40,7 @@ struct mca_btl_vader_hdr_t {
     struct mca_btl_vader_frag_t *frag;
     mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */
     uint8_t flags; /* vader send flags */
-    uint16_t src_smp_rank; /* smp rank of owning process */
+    uint16_t seqn;
     int32_t len; /* length of data following this header */
     struct iovec sc_iov; /* io vector containing pointer to single-copy data */
 };

View file

@@ -194,6 +194,8 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct ompi_
     ep->next_fbox_out = 0;
     ep->next_fbox_in = 0;
+    ep->next_sequence = 0;
+    ep->expected_sequence = 0;
 
     ep->fbox_in = (struct mca_btl_vader_fbox_t * restrict) (ep->segment_base + MCA_BTL_VADER_FIFO_SIZE +
                                                             fbox_in_offset * MCA_BTL_VADER_FBOX_PEER_SIZE);
@@ -526,6 +528,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
         frag->base.des_src_cnt = 2;
     } else {
 #endif
+        /* inline send */
         if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
             /* try to reserve a fast box for this transfer only if the

View file

@@ -42,7 +42,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
     mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor;
 
     if (OPAL_LIKELY(frag->fbox)) {
-        mca_btl_vader_fbox_send (frag->fbox, tag, frag->segments[0].seg_len);
+        mca_btl_vader_fbox_send (frag->fbox, tag, endpoint);
 
         mca_btl_vader_frag_complete (frag);
         return 1;