
btl/vader: improve performance under heavy load and eliminate a racy feature

This commit should fix a hang seen when running some of the one-sided
tests. The downside of this fix is that it reduces the maximum size of
the messages that use the fast boxes. I will fix this in a later commit.

To improve performance under heavy load, I introduced sequencing to
ensure messages are given to the pml in order. I have seen little to no
impact on the message rate or latency with this change, and there is a
clear improvement in the heavy message rate case.
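
(Illustrative aside, not part of the commit message: a minimal sketch of the sequencing idea as the diffs below apply it, assuming one 16-bit counter per peer in each direction; peer_seq_t, stamp_sequence, and deliver_in_order are invented names for illustration.)

#include <stdbool.h>
#include <stdint.h>

/* hypothetical per-peer state: one counter per direction */
typedef struct {
    uint16_t seq_send;    /* next sequence number to stamp on an outgoing message */
    uint16_t seq_expect;  /* sequence number the receiver will accept next */
} peer_seq_t;

/* sender side: stamp each outgoing header */
static inline uint16_t stamp_sequence (peer_seq_t *peer)
{
    return peer->seq_send++;  /* the 16-bit counter wraps around harmlessly */
}

/* receiver side: hand a message up only if it is the one expected */
static inline bool deliver_in_order (peer_seq_t *peer, uint16_t seqn)
{
    if (seqn != peer->seq_expect) {
        return false;         /* out of order: leave it queued and poll again */
    }
    ++peer->seq_expect;
    return true;              /* in order: safe to give to the pml */
}

This mirrors the next_sequence/expected_sequence pair added to the endpoint structure in the diffs below.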

Let's let this sit in the trunk for a couple of days to ensure that
everything is working correctly.

cmr=v1.8.2:reviewer=jsquyres

This commit was SVN r31522.
This commit is contained in:
Nathan Hjelm 2014-04-24 17:36:03 +00:00
parent e243805ed8
commit 0849d61e38
8 changed files with 61 additions and 34 deletions

View file

@@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
@@ -361,17 +361,17 @@ failed:
return NULL;
}
static int mca_btl_vader_poll_fifo (void)
static inline int mca_btl_vader_poll_fifo (void)
{
const mca_btl_active_message_callback_t *reg;
struct mca_btl_base_endpoint_t *endpoint;
mca_btl_vader_hdr_t *hdr;
/* poll the fifo until it is empty or a limit has been hit (8 is arbitrary) */
for (int fifo_count = 0 ; fifo_count < 8 ; ++fifo_count) {
for (int fifo_count = 0 ; fifo_count < 16 ; ++fifo_count) {
mca_btl_vader_frag_t frag = {.base = {.des_dst = frag.segments, .des_dst_cnt = 1}};
hdr = vader_fifo_read (mca_btl_vader_component.my_fifo);
hdr = vader_fifo_read (mca_btl_vader_component.my_fifo, &endpoint);
if (NULL == hdr) {
return fifo_count;
}
@@ -385,8 +385,6 @@ static int mca_btl_vader_poll_fifo (void)
frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
frag.segments[0].seg_len = hdr->len;
endpoint = mca_btl_vader_component.endpoints + hdr->src_smp_rank;
if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) {
mca_mpool_base_registration_t *xpmem_reg;

View file

@@ -11,6 +11,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -65,6 +67,10 @@ struct mca_btl_base_endpoint_t {
#if OMPI_BTL_VADER_HAVE_XPMEM
struct mca_rcache_base_module_t *rcache;
#endif
/* enforce ordering */
uint16_t next_sequence;
uint16_t expected_sequence;
};
#endif /* MCA_BTL_VADER_ENDPOINT_H */

View file

@@ -1,7 +1,7 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -25,6 +25,7 @@
/* there should be a power of two number of fast boxes to simplify the math in the
* critical path */
#define MCA_BTL_VADER_LAST_FBOX 63
#define MCA_BTL_VADER_POLL_COUNT 31
/* two bytes are reserved for tag and size (update if the header is modified) */
#define MCA_BTL_VADER_FBOX_HDR_SIZE 4
#define MCA_BTL_VADER_FBOX_MAX_SIZE (MCA_BTL_VADER_FBOX_SIZE - MCA_BTL_VADER_FBOX_HDR_SIZE)
@@ -34,8 +35,9 @@
typedef struct mca_btl_vader_fbox_t {
union {
struct {
uint16_t size;
uint16_t tag;
uint8_t size;
uint8_t tag;
uint16_t seqn;
} hdr_data;
uint32_t ival;
} hdr;
@@ -52,19 +54,14 @@ static inline mca_btl_vader_fbox_t * restrict mca_btl_vader_reserve_fbox (struct
const int next_fbox = ep->next_fbox_out;
mca_btl_vader_fbox_t * restrict fbox = MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
opal_atomic_mb ();
/* todo -- need thread locks/atomics here for the multi-threaded case */
if (OPAL_LIKELY(size <= MCA_BTL_VADER_FBOX_MAX_SIZE && 0 == fbox->hdr.ival)) {
/* mark this fast box as in use */
fbox->hdr.hdr_data.size = size;
ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox);
return fbox;
} else if (OPAL_LIKELY(size <= (MCA_BTL_VADER_FBOX_MAX_SIZE + MCA_BTL_VADER_FBOX_SIZE) && MCA_BTL_VADER_LAST_FBOX != next_fbox &&
0 == fbox->hdr.ival && 0 == fbox[1].hdr.ival)) {
/* aggregate two fast boxes */
fbox->hdr.hdr_data.size = size;
ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox + 1);
opal_atomic_mb ();
return fbox;
}
@@ -72,12 +69,13 @@ static inline mca_btl_vader_fbox_t * restrict mca_btl_vader_reserve_fbox (struct
}
static inline void mca_btl_vader_fbox_send (mca_btl_vader_fbox_t * restrict fbox, unsigned char tag,
size_t size)
struct mca_btl_base_endpoint_t *endpoint)
{
/* ensure data writes have completed before we mark the data as available */
opal_atomic_wmb ();
fbox->hdr.hdr_data.seqn = endpoint->next_sequence++;
fbox->hdr.hdr_data.tag = tag;
opal_atomic_wmb ();
}
static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endpoint, char tag,
@@ -98,7 +96,7 @@ static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endp
}
/* mark the fbox as sent */
mca_btl_vader_fbox_send (fbox, tag, header_size + payload_size);
mca_btl_vader_fbox_send (fbox, tag, endpoint);
/* send complete */
return 1;
@@ -128,8 +126,12 @@ static inline bool mca_btl_vader_check_fboxes (void)
processed = true;
/* process all fast-box messages */
while (0 != fbox->hdr.hdr_data.tag) {
opal_atomic_rmb ();
for (int count = 0 ; count <= MCA_BTL_VADER_POLL_COUNT && 0 != fbox->hdr.hdr_data.tag ; ++count) {
if (OPAL_UNLIKELY(endpoint->expected_sequence != fbox->hdr.hdr_data.seqn)) {
break;
}
opal_atomic_mb ();
++endpoint->expected_sequence;
reg = mca_btl_base_active_message_trigger + fbox->hdr.hdr_data.tag;
@@ -138,8 +140,9 @@ static inline bool mca_btl_vader_check_fboxes (void)
reg->cbfunc(&mca_btl_vader.super, fbox->hdr.hdr_data.tag, &desc, reg->cbdata);
if (fbox->hdr.hdr_data.size > MCA_BTL_VADER_FBOX_MAX_SIZE) {
if (segment.seg_len > MCA_BTL_VADER_FBOX_MAX_SIZE) {
fbox[1].hdr.ival = 0;
opal_atomic_mb ();
++next_fbox;
}
fbox->hdr.ival = 0;
@@ -148,6 +151,8 @@ static inline bool mca_btl_vader_check_fboxes (void)
fbox = (mca_btl_vader_fbox_t * restrict) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
}
opal_atomic_mb ();
endpoint->next_fbox_in = next_fbox;
}
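
(Illustrative aside, not part of the commit: the fast-box path above writes the payload first and publishes the packed 4-byte header last, with a write barrier in between, so a reader that sees a non-zero header is guaranteed to see the data. Below is a self-contained sketch of that publish/consume pattern using C11 atomics rather than opal's barrier macros; mailbox_t and pack_hdr are invented here, and the bit layout only needs to be self-consistent, not match vader's union exactly.)

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

/* hypothetical mailbox modeled on the fast box: a non-zero header word
 * means "message present", and the header is always written last */
typedef struct {
    _Atomic uint32_t hdr;   /* packed size/tag/seqn; 0 while the box is free */
    unsigned char    data[60];
} mailbox_t;

static inline uint32_t pack_hdr (uint8_t size, uint8_t tag, uint16_t seqn)
{
    return (uint32_t) size | ((uint32_t) tag << 8) | ((uint32_t) seqn << 16);
}

/* producer: copy the payload first, then publish the header with release
 * semantics so the tag can never become visible before the data */
static void mailbox_send (mailbox_t *box, const void *buf, uint8_t size,
                          uint8_t tag, uint16_t seqn)
{
    memcpy (box->data, buf, size);
    atomic_store_explicit (&box->hdr, pack_hdr (size, tag, seqn),
                           memory_order_release);
}

/* consumer: acquire-load the header; a non-zero value guarantees the
 * payload bytes are visible, mirroring the wmb/rmb pairing above */
static int mailbox_recv (mailbox_t *box, void *buf)
{
    uint32_t hdr = atomic_load_explicit (&box->hdr, memory_order_acquire);
    if (0 == hdr) {
        return -1;          /* box is empty */
    }
    uint8_t size = (uint8_t) (hdr & 0xff);
    memcpy (buf, box->data, size);
    atomic_store_explicit (&box->hdr, 0, memory_order_release); /* free the box */
    return (int) size;
}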

View file

@@ -91,21 +91,36 @@ static inline void *relative2virtual (fifo_value_t offset)
return (void *)(intptr_t)((offset & MCA_BTL_VADER_OFFSET_MASK) + mca_btl_vader_component.endpoints[offset >> MCA_BTL_VADER_OFFSET_BITS].segment_base);
}
static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct mca_btl_base_endpoint_t **ep)
{
mca_btl_vader_hdr_t *hdr;
fifo_value_t value;
static volatile int32_t lock = 0;
opal_atomic_rmb ();
value = vader_item_swap (&fifo->fifo_head, VADER_FIFO_FREE);
if (VADER_FIFO_FREE == value) {
/* fifo is empty or we lost the race with another thread */
if (opal_atomic_swap_32 (&lock, 1)) {
return NULL;
}
if (VADER_FIFO_FREE == fifo->fifo_head) {
lock = 0;
return NULL;
}
opal_atomic_rmb ();
value = fifo->fifo_head;
*ep = &mca_btl_vader_component.endpoints[value >> MCA_BTL_VADER_OFFSET_BITS];
hdr = (mca_btl_vader_hdr_t *) relative2virtual (value);
if (OPAL_UNLIKELY(!(hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) && ((*ep)->expected_sequence != hdr->seqn))) {
lock = 0;
return NULL;
}
fifo->fifo_head = VADER_FIFO_FREE;
++(*ep)->expected_sequence;
assert (hdr->next != value);
if (OPAL_UNLIKELY(VADER_FIFO_FREE == hdr->next)) {
@@ -123,7 +138,7 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
}
opal_atomic_wmb ();
lock = 0;
return hdr;
}
@@ -159,6 +174,7 @@ static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value)
static inline void vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
{
hdr->next = VADER_FIFO_FREE;
hdr->seqn = ep->next_sequence++;
vader_fifo_write (ep->fifo, virtual2relative ((char *) hdr));
}
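
(Illustrative aside, not part of the commit: vader_fifo_read above now serializes readers through opal_atomic_swap_32 on a static lock word, and a thread that loses the swap simply backs off instead of spinning. A minimal sketch of that swap-based try-lock in C11 atomics; fifo_trylock and fifo_unlock are invented names.)

#include <stdatomic.h>
#include <stdbool.h>

/* hypothetical try-lock built on an atomic swap: whoever swaps 0 -> 1
 * owns the critical section; everyone else backs off immediately */
static atomic_int fifo_lock = 0;

static inline bool fifo_trylock (void)
{
    /* the exchange returns the previous value; 0 means we got the lock */
    return 0 == atomic_exchange_explicit (&fifo_lock, 1, memory_order_acquire);
}

static inline void fifo_unlock (void)
{
    atomic_store_explicit (&fifo_lock, 0, memory_order_release);
}

A reader that fails the try-lock just reports an empty fifo and retries on the next progress call, which is cheaper under contention than every thread spinning on the fifo head.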

View file

@@ -29,7 +29,6 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
{
frag->hdr = (mca_btl_vader_hdr_t*)frag->base.super.ptr;
if(frag->hdr != NULL) {
frag->hdr->src_smp_rank = MCA_BTL_VADER_LOCAL_RANK;
frag->hdr->frag = frag;
frag->hdr->flags = 0;
frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);

View file

@@ -40,7 +40,7 @@ struct mca_btl_vader_hdr_t {
struct mca_btl_vader_frag_t *frag;
mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */
uint8_t flags; /* vader send flags */
uint16_t src_smp_rank; /* smp rank of owning process */
uint16_t seqn;
int32_t len; /* length of data following this header */
struct iovec sc_iov; /* io vector containing pointer to single-copy data */
};

View file

@@ -194,6 +194,8 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct ompi_
ep->next_fbox_out = 0;
ep->next_fbox_in = 0;
ep->next_sequence = 0;
ep->expected_sequence = 0;
ep->fbox_in = (struct mca_btl_vader_fbox_t * restrict) (ep->segment_base + MCA_BTL_VADER_FIFO_SIZE +
fbox_in_offset * MCA_BTL_VADER_FBOX_PEER_SIZE);
@@ -526,6 +528,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
frag->base.des_src_cnt = 2;
} else {
#endif
/* inline send */
if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
/* try to reserve a fast box for this transfer only if the

View file

@@ -42,7 +42,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor;
if (OPAL_LIKELY(frag->fbox)) {
mca_btl_vader_fbox_send (frag->fbox, tag, frag->segments[0].seg_len);
mca_btl_vader_fbox_send (frag->fbox, tag, endpoint);
mca_btl_vader_frag_complete (frag);
return 1;