12bfd13150
This is a large update that does the following: - Only allocate fast boxes for a peer if a send count threshold has been reached (default: 16). This will greatly reduce the memory usage with large numbers of local peers. - Improve performance by limiting the number of fast boxes that can be allocated per peer (default: 32). This will reduce the amount of time spent polling for fast box messages. - Provide new MCA variables to configure the size, maximum count, and send count thresholds for fast boxes allocations. - Updated buffer design to increase the range of message sizes that can be sent with a fast box. - Add thread protection around fast box allocation (locks). When spin locks are available this should be updated to use spin locks. - Various fixes and cleanup. This commit was SVN r32774.
228 строки
7.3 KiB
C
228 строки
7.3 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
|
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
/**
|
|
* @file
|
|
*/
|
|
#ifndef MCA_BTL_VADER_FIFO_H
|
|
#define MCA_BTL_VADER_FIFO_H
|
|
|
|
#include "btl_vader.h"
|
|
#include "btl_vader_endpoint.h"
|
|
#include "btl_vader_frag.h"
|
|
|
|
#if SIZEOF_VOID_P == 8
|
|
#define vader_item_cmpset(x, y, z) opal_atomic_cmpset_64((volatile int64_t *)(x), (int64_t)(y), (int64_t)(z))
|
|
#define vader_item_swap(x, y) opal_atomic_swap_64((volatile int64_t *)(x), (int64_t)(y))
|
|
|
|
#define MCA_BTL_VADER_OFFSET_MASK 0xffffffffll
|
|
#define MCA_BTL_VADER_OFFSET_BITS 32
|
|
#define MCA_BTL_VADER_BITNESS 64
|
|
|
|
typedef int64_t fifo_value_t;
|
|
#else
|
|
#define vader_item_cmpset(x, y, z) opal_atomic_cmpset_32((volatile int32_t *)(x), (int32_t)(y), (int32_t)(z))
|
|
#define vader_item_swap(x, y) opal_atomic_swap_32((volatile int32_t *)(x), (int32_t)(y))
|
|
|
|
#define MCA_BTL_VADER_OFFSET_MASK 0x00ffffffl
|
|
#define MCA_BTL_VADER_OFFSET_BITS 24
|
|
#define MCA_BTL_VADER_BITNESS 32
|
|
|
|
typedef int32_t fifo_value_t;
|
|
#endif
|
|
|
|
#define VADER_FIFO_FREE ((fifo_value_t)-2)
|
|
|
|
/*
|
|
* Shared Memory FIFOs
|
|
*
|
|
* The FIFO is implemented as a linked list of frag headers. The fifo has multiple
|
|
* producers and a single consumer (in the single thread case) so the tail needs
|
|
* to be modified by an atomic or protected by a atomic lock.
|
|
*
|
|
* Since the frags live in shared memory that is mapped differently into
|
|
* each address space, the head and tail pointers are relative (each process must
|
|
* add its own offset).
|
|
*
|
|
* We introduce some padding at the end of the structure but it is probably unnecessary.
|
|
*/
|
|
|
|
/* lock free fifo */
|
|
typedef struct vader_fifo_t {
|
|
volatile fifo_value_t fifo_head;
|
|
volatile fifo_value_t fifo_tail;
|
|
volatile int32_t fbox_available;
|
|
} vader_fifo_t;
|
|
|
|
/* large enough to ensure the fifo is on its own cache line */
|
|
#define MCA_BTL_VADER_FIFO_SIZE 128
|
|
|
|
/***
|
|
* One or more FIFO components may be a pointer that must be
|
|
* accessed by multiple processes. Since the shared region may
|
|
* be mmapped differently into each process's address space,
|
|
* these pointers will be relative to some base address. Here,
|
|
* we define inline functions to translate between relative
|
|
* addresses and virtual addresses.
|
|
*/
|
|
|
|
/* This only works for finding the relative address for a pointer within my_segment */
|
|
static inline fifo_value_t virtual2relative (char *addr)
|
|
{
|
|
return (fifo_value_t) ((intptr_t) (addr - mca_btl_vader_component.my_segment)) | ((fifo_value_t)MCA_BTL_VADER_LOCAL_RANK << MCA_BTL_VADER_OFFSET_BITS);
|
|
}
|
|
|
|
static inline fifo_value_t virtual2relativepeer (struct mca_btl_base_endpoint_t *endpoint, char *addr)
|
|
{
|
|
return (fifo_value_t) ((intptr_t) (addr - endpoint->segment_base)) | ((fifo_value_t)endpoint->peer_smp_rank << MCA_BTL_VADER_OFFSET_BITS);
|
|
}
|
|
|
|
static inline void *relative2virtual (fifo_value_t offset)
|
|
{
|
|
return (void *)(intptr_t)((offset & MCA_BTL_VADER_OFFSET_MASK) + mca_btl_vader_component.endpoints[offset >> MCA_BTL_VADER_OFFSET_BITS].segment_base);
|
|
}
|
|
|
|
#include "btl_vader_fbox.h"
|
|
|
|
/**
|
|
* vader_fifo_read:
|
|
*
|
|
* @brief reads a single fragment from a local fifo
|
|
*
|
|
* @param[inout] fifo - FIFO to read from
|
|
* @param[out] ep - returns the endpoint the fifo element was read from
|
|
*
|
|
* @returns a fragment header or NULL
|
|
*
|
|
* This function does not currently support multiple readers.
|
|
*/
|
|
static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct mca_btl_base_endpoint_t **ep)
|
|
{
|
|
mca_btl_vader_hdr_t *hdr;
|
|
fifo_value_t value;
|
|
|
|
if (VADER_FIFO_FREE == fifo->fifo_head) {
|
|
return NULL;
|
|
}
|
|
|
|
opal_atomic_rmb ();
|
|
|
|
value = fifo->fifo_head;
|
|
|
|
*ep = &mca_btl_vader_component.endpoints[value >> MCA_BTL_VADER_OFFSET_BITS];
|
|
hdr = (mca_btl_vader_hdr_t *) relative2virtual (value);
|
|
|
|
fifo->fifo_head = VADER_FIFO_FREE;
|
|
|
|
assert (hdr->next != value);
|
|
|
|
if (OPAL_UNLIKELY(VADER_FIFO_FREE == hdr->next)) {
|
|
opal_atomic_rmb();
|
|
|
|
if (!vader_item_cmpset (&fifo->fifo_tail, value, VADER_FIFO_FREE)) {
|
|
while (VADER_FIFO_FREE == hdr->next) {
|
|
opal_atomic_rmb ();
|
|
}
|
|
|
|
fifo->fifo_head = hdr->next;
|
|
}
|
|
} else {
|
|
fifo->fifo_head = hdr->next;
|
|
}
|
|
|
|
opal_atomic_wmb ();
|
|
return hdr;
|
|
}
|
|
|
|
static inline void vader_fifo_init (vader_fifo_t *fifo)
|
|
{
|
|
fifo->fifo_head = fifo->fifo_tail = VADER_FIFO_FREE;
|
|
fifo->fbox_available = mca_btl_vader_component.fbox_max;
|
|
mca_btl_vader_component.my_fifo = fifo;
|
|
}
|
|
|
|
static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value)
|
|
{
|
|
fifo_value_t prev;
|
|
|
|
opal_atomic_wmb ();
|
|
prev = vader_item_swap (&fifo->fifo_tail, value);
|
|
opal_atomic_rmb ();
|
|
|
|
assert (prev != value);
|
|
|
|
if (OPAL_LIKELY(VADER_FIFO_FREE != prev)) {
|
|
mca_btl_vader_hdr_t *hdr = (mca_btl_vader_hdr_t *) relative2virtual (prev);
|
|
hdr->next = value;
|
|
} else {
|
|
fifo->fifo_head = value;
|
|
}
|
|
|
|
opal_atomic_wmb ();
|
|
}
|
|
|
|
/**
|
|
* vader_fifo_write_ep:
|
|
*
|
|
* @brief write a frag (relative to this process' base) to another rank's fifo
|
|
*
|
|
* @param[in] hdr - fragment header to write
|
|
* @param[in] ep - endpoint to write the fragment to
|
|
*
|
|
* This function is used to send a fragment to a remote peer. {hdr} must belong
|
|
* to the current process.
|
|
*/
|
|
static inline bool vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
|
|
{
|
|
fifo_value_t rhdr = virtual2relative ((char *) hdr);
|
|
if (ep->fbox_out.buffer) {
|
|
/* if there is a fast box for this peer then use the fast box to send the fragment header.
|
|
* this is done to ensure fragment ordering */
|
|
opal_atomic_wmb ();
|
|
return mca_btl_vader_fbox_sendi (ep, 0xfe, &rhdr, sizeof (rhdr), NULL, 0);
|
|
}
|
|
mca_btl_vader_try_fbox_setup (ep, hdr);
|
|
hdr->next = VADER_FIFO_FREE;
|
|
vader_fifo_write (ep->fifo, rhdr);
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* vader_fifo_write_back:
|
|
*
|
|
* @brief write a frag (relative to the remote process' base) to the remote fifo
|
|
*
|
|
* @param[in] hdr - fragment header to write
|
|
* @param[in] ep - endpoint the fragment belongs to
|
|
*
|
|
* This function is used to return a fragment to the sending process. It differs from vader_fifo_write_ep
|
|
* in that it uses the {ep} to produce the relative address.
|
|
*/
|
|
static inline void vader_fifo_write_back (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
|
|
{
|
|
hdr->next = VADER_FIFO_FREE;
|
|
vader_fifo_write(ep->fifo, virtual2relativepeer (ep, (char *) hdr));
|
|
}
|
|
|
|
#endif /* MCA_BTL_VADER_FIFO_H */
|