btl/vader: improve performance for both single and multiple threads

This is a large update that does the following:

- Only allocate fast boxes for a peer if a send count threshold has been reached (default: 16). This will greatly reduce the memory usage with large numbers of local peers.
- Improve performance by limiting the number of fast boxes that can be allocated per peer (default: 32). This will reduce the amount of time spent polling for fast box messages.
- Provide new MCA variables to configure the size, maximum count, and send count thresholds for fast box allocations.
- Update the buffer design to increase the range of message sizes that can be sent with a fast box.
- Add thread protection around fast box allocation (locks). When spin locks are available this should be updated to use spin locks.
- Various fixes and cleanup.

This commit was SVN r32774.
Parent: 1508a01325
Commit: 12bfd13150
--- a/opal/mca/btl/vader/btl_vader.h
+++ b/opal/mca/btl/vader/btl_vader.h
@@ -64,10 +64,12 @@
 #include "opal/mca/btl/base/base.h"
 #include "opal/mca/rcache/rcache.h"
 #include "opal/mca/rcache/base/base.h"
 #include "opal/mca/btl/base/btl_base_error.h"
 #include "opal/util/proc.h"
 
 #include "btl_vader_endpoint.h"
 
+#include "opal/mca/pmix/pmix.h"
+
 BEGIN_C_DECLS
 
 #define min(a,b) ((a) < (b) ? (a) : (b))
@@ -97,30 +99,39 @@ struct mca_btl_vader_component_t {
     mca_btl_base_component_2_0_0_t super;   /**< base BTL component */
     int vader_free_list_num;                /**< initial size of free lists */
     int vader_free_list_max;                /**< maximum size of free lists */
-    int vader_free_list_inc;                /**< number of elements to alloc
-                                             * when growing free lists */
+    int vader_free_list_inc;                /**< number of elements to alloc when growing free lists */
 #if OPAL_BTL_VADER_HAVE_XPMEM
-    xpmem_segid_t my_seg_id;                /* this rank's xpmem segment id */
+    xpmem_segid_t my_seg_id;                /**< this rank's xpmem segment id */
 #else
-    opal_shmem_ds_t seg_ds;                 /* this rank's shared memory segment */
+    opal_shmem_ds_t seg_ds;                 /**< this rank's shared memory segment */
 #endif
 
-    char *my_segment;                       /* this rank's base pointer */
-    size_t segment_size;                    /* size of my_segment */
-    size_t segment_offset;                  /* start of unused portion of my_segment */
+    opal_mutex_t lock;                      /**< lock to protect concurrent updates to this structure's members */
+    char *my_segment;                       /**< this rank's base pointer */
+    size_t segment_size;                    /**< size of my_segment */
+    size_t segment_offset;                  /**< start of unused portion of my_segment */
     int32_t num_smp_procs;                  /**< current number of smp procs on this host */
     ompi_free_list_t vader_frags_eager;     /**< free list of vader send frags */
 #if !OPAL_BTL_VADER_HAVE_XPMEM
-    ompi_free_list_t vader_frags_max_send;
+    ompi_free_list_t vader_frags_max_send;  /**< free list of vader max send frags (large fragments) */
 #endif
     ompi_free_list_t vader_frags_user;      /**< free list of vader put/get frags */
 
-    int memcpy_limit;                       /** Limit where we switch from memmove to memcpy */
-    int log_attach_align;                   /** Log of the alignment for xpmem segments */
-    unsigned int max_inline_send;           /** Limit for copy-in-copy-out fragments */
-
-    struct mca_btl_base_endpoint_t *endpoints;
-    struct vader_fifo_t *my_fifo;
+    unsigned int fbox_threshold;            /**< number of sends required before we setup a send fast box for a peer */
+    unsigned int fbox_max;                  /**< maximum number of send fast boxes to allocate */
+    unsigned int fbox_size;                 /**< size of each peer fast box allocation */
+    unsigned int fbox_count;                /**< number of send fast boxes allocated */
+
+    int memcpy_limit;                       /**< Limit where we switch from memmove to memcpy */
+    int log_attach_align;                   /**< Log of the alignment for xpmem segments */
+    unsigned int max_inline_send;           /**< Limit for copy-in-copy-out fragments */
+
+    mca_btl_base_endpoint_t *endpoints;     /**< array of local endpoints (one for each local peer including myself) */
+    mca_btl_base_endpoint_t **fbox_in_endpoints; /**< array of fast box in endpoints */
+    unsigned int num_fbox_in_endpoints;     /**< number of fast boxes to poll */
+    struct vader_fifo_t *my_fifo;           /**< pointer to the local fifo */
+
+    opal_list_t pending_endpoints;          /**< list of endpoints with pending fragments */
 };
 typedef struct mca_btl_vader_component_t mca_btl_vader_component_t;
 OPAL_MODULE_DECLSPEC extern mca_btl_vader_component_t mca_btl_vader_component;
@@ -136,16 +147,6 @@ struct mca_btl_vader_t {
 typedef struct mca_btl_vader_t mca_btl_vader_t;
 OPAL_MODULE_DECLSPEC extern mca_btl_vader_t mca_btl_vader;
 
-/***
- * One or more FIFO components may be a pointer that must be
- * accessed by multiple processes. Since the shared region may
- * be mmapped differently into each process's address space,
- * these pointers will be relative to some base address. Here,
- * we define macros to translate between relative addresses and
- * virtual addresses.
- */
-
 /* number of peers on the node (not including self) */
 #define MCA_BTL_VADER_NUM_LOCAL_PEERS opal_process_info.num_local_peers
--- a/opal/mca/btl/vader/btl_vader_component.c
+++ b/opal/mca/btl/vader/btl_vader_component.c
@@ -25,13 +25,14 @@
 #include "opal_config.h"
 
 #include "opal/util/output.h"
+#include "opal/threads/mutex.h"
 #include "opal/mca/btl/base/btl_base_error.h"
 #include "opal/mca/pmix/pmix.h"
 
 #include "btl_vader.h"
 #include "btl_vader_frag.h"
 #include "btl_vader_fifo.h"
 #include "btl_vader_fbox.h"
 #include "btl_vader_xpmem.h"
 
 #include <sys/mman.h>
@@ -104,6 +105,7 @@ static int mca_btl_vader_component_register (void)
                                            NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_vader_component.memcpy_limit);
+#if OPAL_BTL_VADER_HAVE_XPMEM
     mca_btl_vader_component.log_attach_align = 21;
     (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
                                            "log_align", "Log base 2 of the alignment to use for xpmem "
@@ -112,6 +114,7 @@ static int mca_btl_vader_component_register (void)
                                            MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_vader_component.log_attach_align);
+#endif
 
 #if OPAL_BTL_VADER_HAVE_XPMEM && 64 == MCA_BTL_VADER_BITNESS
     mca_btl_vader_component.segment_size = 1 << 24;
@@ -139,6 +142,27 @@ static int mca_btl_vader_component_register (void)
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_vader_component.max_inline_send);
 
+    mca_btl_vader_component.fbox_threshold = 16;
+    (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
+                                           "fbox_threshold", "Number of sends required "
+                                           "before an eager send buffer is setup for a peer "
+                                           "(default: 16)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL,
+                                           0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
+                                           MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.fbox_threshold);
+
+    mca_btl_vader_component.fbox_max = 32;
+    (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
+                                           "fbox_max", "Maximum number of eager send buffers "
+                                           "to allocate (default: 32)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
+                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
+                                           MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.fbox_max);
+
+    mca_btl_vader_component.fbox_size = 4096;
+    (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
+                                           "fbox_size", "Size of per-peer fast transfer buffers (default: 4k)",
+                                           MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
+                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_vader_component.fbox_size);
+
     mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
 #if OPAL_BTL_VADER_HAVE_XPMEM
     mca_btl_vader.super.btl_eager_limit = 32 * 1024;
@@ -189,6 +213,8 @@ static int mca_btl_vader_component_open(void)
 #if !OPAL_BTL_VADER_HAVE_XPMEM
     OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_max_send, ompi_free_list_t);
 #endif
+    OBJ_CONSTRUCT(&mca_btl_vader_component.lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&mca_btl_vader_component.pending_endpoints, opal_list_t);
 
     return OPAL_SUCCESS;
 }
@@ -205,6 +231,8 @@ static int mca_btl_vader_component_close(void)
 #if !OPAL_BTL_VADER_HAVE_XPMEM
     OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_max_send);
 #endif
+    OBJ_DESTRUCT(&mca_btl_vader_component.lock);
+    OBJ_DESTRUCT(&mca_btl_vader_component.pending_endpoints);
 
     if (NULL != mca_btl_vader_component.my_segment) {
         munmap (mca_btl_vader_component.my_segment, mca_btl_vader_component.segment_size);
@@ -216,8 +244,7 @@ static int mca_btl_vader_component_close(void)
 static int mca_btl_base_vader_modex_send (void)
 {
     struct vader_modex_t modex;
-    int modex_size;
-    int rc;
+    int modex_size, rc;
 
 #if OPAL_BTL_VADER_HAVE_XPMEM
     modex.seg_id = mca_btl_vader_component.my_seg_id;
@@ -229,9 +256,9 @@ static int mca_btl_base_vader_modex_send (void)
     memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
 #endif
 
-    OPAL_MODEX_SEND(rc, PMIX_ASYNC_RDY, PMIX_LOCAL,
-                    &mca_btl_vader_component.super.btl_version,
-                    &modex, modex_size);
+    OPAL_MODEX_SEND(rc, PMIX_SYNC_REQD, PMIX_LOCAL,
+                    &mca_btl_vader_component.super.btl_version, &modex, modex_size);
 
     return rc;
 }
@@ -254,13 +281,14 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
         return NULL;
     }
 
 #if OPAL_BTL_VADER_HAVE_XPMEM
     /* limit segment alignment to be between 4k and 16M */
-    if (mca_btl_vader_component.log_attach_align < 12) {
-        mca_btl_vader_component.log_attach_align = 12;
-    } else if (mca_btl_vader_component.log_attach_align > 25) {
-        mca_btl_vader_component.log_attach_align = 25;
+    if (component->log_attach_align < 12) {
+        component->log_attach_align = 12;
+    } else if (component->log_attach_align > 25) {
+        component->log_attach_align = 25;
     }
 #endif
 
     btls = (mca_btl_base_module_t **) calloc (1, sizeof (mca_btl_base_module_t *));
     if (NULL == btls) {
@@ -268,28 +296,36 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
     }
 
     /* ensure a sane segment size */
-    if (mca_btl_vader_component.segment_size < (2 << 20)) {
-        mca_btl_vader_component.segment_size = (2 << 20);
+    if (component->segment_size < (2 << 20)) {
+        component->segment_size = (2 << 20);
     }
 
-    if (mca_btl_vader_component.segment_size > (1ul << MCA_BTL_VADER_OFFSET_BITS)) {
-        mca_btl_vader_component.segment_size = 2ul << MCA_BTL_VADER_OFFSET_BITS;
+    component->fbox_size = (component->fbox_size + MCA_BTL_VADER_FBOX_ALIGNMENT_MASK) & ~MCA_BTL_VADER_FBOX_ALIGNMENT_MASK;
+
+    if (component->segment_size > (1ul << MCA_BTL_VADER_OFFSET_BITS)) {
+        component->segment_size = 2ul << MCA_BTL_VADER_OFFSET_BITS;
     }
 
+    /* no fast boxes allocated initially */
+    component->num_fbox_in_endpoints = 0;
+    component->fbox_count = 0;
+
 #if OPAL_BTL_VADER_HAVE_XPMEM
-    component->my_segment = mmap (NULL, component->segment_size, PROT_READ |
-                                  PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
-    if ((void *)-1 == component->my_segment) {
-        BTL_VERBOSE(("Could not create anonymous memory segment"));
-        free (btls);
-        return NULL;
-    }
-
     /* create an xpmem segment for the entire memory space */
     component->my_seg_id = xpmem_make (0, VADER_MAX_ADDRESS, XPMEM_PERMIT_MODE, (void *)0666);
     if (-1 == component->my_seg_id) {
         BTL_VERBOSE(("Could not create xpmem segment"));
         free (btls);
         return NULL;
     }
 
+    component->my_segment = mmap (NULL, mca_btl_vader_component.segment_size, PROT_READ |
+                                  PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+    if ((void *)-1 == component->my_segment) {
+        BTL_VERBOSE(("Could not create anonymous memory segment"));
+        free (btls);
+        munmap (component->my_segment, component->segment_size);
+        component->my_segment = NULL;
+        return NULL;
+    }
 #else
@@ -303,7 +339,7 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
         return NULL;
     }
 
-    rc = opal_shmem_segment_create (&mca_btl_vader_component.seg_ds, sm_file, mca_btl_vader_component.segment_size);
+    rc = opal_shmem_segment_create (&component->seg_ds, sm_file, component->segment_size);
     free (sm_file);
     if (OPAL_SUCCESS != rc) {
         BTL_VERBOSE(("Could not create shared memory segment"));
@@ -311,7 +347,7 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
         return NULL;
     }
 
-    component->my_segment = opal_shmem_segment_attach (&mca_btl_vader_component.seg_ds);
+    component->my_segment = opal_shmem_segment_attach (&component->seg_ds);
     if (NULL == component->my_segment) {
         BTL_VERBOSE(("Could not attach to just created shared memory segment"));
         goto failed;
@@ -321,15 +357,8 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
 
     component->segment_offset = 0;
 
-    memset (component->my_segment + MCA_BTL_VADER_FIFO_SIZE, 0, MCA_BTL_VADER_NUM_LOCAL_PEERS *
-            MCA_BTL_VADER_FBOX_PEER_SIZE);
-
     /* initialize my fifo */
-    rc = vader_fifo_init ((struct vader_fifo_t *) component->my_segment);
-    if (OPAL_SUCCESS != rc) {
-        BTL_VERBOSE(("Error initializing FIFO"));
-        goto failed;
-    }
+    vader_fifo_init ((struct vader_fifo_t *) component->my_segment);
 
     rc = mca_btl_base_vader_modex_send ();
     if (OPAL_SUCCESS != rc) {
@@ -348,9 +377,9 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
     return btls;
 failed:
 #if OPAL_BTL_VADER_HAVE_XPMEM
-    munmap (component->my_segment, mca_btl_vader_component.segment_size);
+    munmap (component->my_segment, component->segment_size);
 #else
-    opal_shmem_unlink (&mca_btl_vader_component.seg_ds);
+    opal_shmem_unlink (&component->seg_ds);
 #endif
 
     if (btls) {
@@ -360,70 +389,130 @@ failed:
     return NULL;
 }
 
+void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *endpoint)
+{
+    mca_btl_vader_frag_t frag = {.base = {.des_local = frag.segments, .des_local_count = 1}};
+    const mca_btl_active_message_callback_t *reg;
+
+    if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) {
+        mca_btl_vader_frag_complete (hdr->frag);
+        return;
+    }
+
+    reg = mca_btl_base_active_message_trigger + hdr->tag;
+    frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
+    frag.segments[0].seg_len = hdr->len;
+
+    if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) {
+        mca_mpool_base_registration_t *xpmem_reg;
+
+        xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base,
+                                           hdr->sc_iov.iov_len, 0,
+                                           &frag.segments[1].seg_addr.pval);
+
+        frag.segments[1].seg_len = hdr->sc_iov.iov_len;
+        frag.base.des_local_count = 2;
+
+        /* recv upcall */
+        reg->cbfunc(&mca_btl_vader.super, hdr->tag, &frag.base, reg->cbdata);
+        vader_return_registration (xpmem_reg, endpoint);
+    } else {
+        reg->cbfunc(&mca_btl_vader.super, hdr->tag, &frag.base, reg->cbdata);
+    }
+
+    if (OPAL_UNLIKELY(MCA_BTL_VADER_FLAG_SETUP_FBOX & hdr->flags)) {
+        mca_btl_vader_endpoint_setup_fbox_recv (endpoint, relative2virtual(hdr->fbox_base));
+        mca_btl_vader_component.fbox_in_endpoints[mca_btl_vader_component.num_fbox_in_endpoints++] = endpoint;
+    }
+
+    hdr->flags = MCA_BTL_VADER_FLAG_COMPLETE;
+    vader_fifo_write_back (hdr, endpoint);
+}
+
 static inline int mca_btl_vader_poll_fifo (void)
 {
-    const mca_btl_active_message_callback_t *reg;
     struct mca_btl_base_endpoint_t *endpoint;
     mca_btl_vader_hdr_t *hdr;
 
     /* poll the fifo until it is empty or a limit has been hit (8 is arbitrary) */
-    for (int fifo_count = 0 ; fifo_count < 16 ; ++fifo_count) {
-        mca_btl_vader_frag_t frag = {.base = {.des_local = frag.segments, .des_local_count = 1}};
-
+    for (int fifo_count = 0 ; fifo_count < 8 ; ++fifo_count) {
         hdr = vader_fifo_read (mca_btl_vader_component.my_fifo, &endpoint);
         if (NULL == hdr) {
             return fifo_count;
         }
 
-        if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) {
-            mca_btl_vader_frag_complete (hdr->frag);
-            continue;
-        }
-
-        reg = mca_btl_base_active_message_trigger + hdr->tag;
-        frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
-        frag.segments[0].seg_len = hdr->len;
-
-        if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) {
-            mca_mpool_base_registration_t *xpmem_reg;
-
-            xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base,
-                                               hdr->sc_iov.iov_len, 0,
-                                               &frag.segments[1].seg_addr.pval);
-
-            frag.segments[1].seg_len = hdr->sc_iov.iov_len;
-
-            /* recv upcall */
-            frag.base.des_local_count = 2;
-            reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
-            vader_return_registration (xpmem_reg, endpoint);
-        } else {
-            reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
-        }
-
-        /* return the fragment */
-        hdr->flags = MCA_BTL_VADER_FLAG_COMPLETE;
-        vader_fifo_write_back (hdr, endpoint);
+        mca_btl_vader_poll_handle_frag (hdr, endpoint);
     }
 
     return 1;
 }
 
+/**
+ * Progress pending messages on an endpoint
+ *
+ * @param ep (IN) Vader BTL endpoint
+ */
+static void mca_btl_vader_progress_waiting (mca_btl_base_endpoint_t *ep)
+{
+    mca_btl_vader_frag_t *frag;
+
+    OPAL_THREAD_LOCK(&ep->lock);
+    ep->waiting = false;
+    while (NULL != (frag = (mca_btl_vader_frag_t *) opal_list_remove_first (&ep->pending_frags))) {
+        OPAL_THREAD_UNLOCK(&ep->lock);
+        if (!vader_fifo_write_ep (frag->hdr, ep)) {
+            opal_list_prepend (&ep->pending_frags, (opal_list_item_t *) frag);
+            opal_list_append (&mca_btl_vader_component.pending_endpoints, &ep->super);
+            ep->waiting = true;
+            break;
+        }
+        OPAL_THREAD_LOCK(&ep->lock);
+    }
+    OPAL_THREAD_UNLOCK(&ep->lock);
+}
+
+/**
+ * Progress pending messages on all waiting endpoints
+ *
+ * @param ep (IN) Vader BTL endpoint
+ */
+static void mca_btl_vader_progress_endpoints (void)
+{
+    int count;
+
+    count = opal_list_get_size (&mca_btl_vader_component.pending_endpoints);
+
+    for (int i = 0 ; i < count ; ++i) {
+        mca_btl_vader_progress_waiting ((mca_btl_base_endpoint_t *) opal_list_remove_first (&mca_btl_vader_component.pending_endpoints));
+    }
+}
+
 static int mca_btl_vader_component_progress (void)
 {
-    bool fboxed;
+    static int32_t lock = 0;
+    int count = 0;
+
+    if (opal_using_threads()) {
+        if (opal_atomic_swap_32 (&lock, 1)) {
+            return 0;
+        }
+    }
 
-    /* check for messages in fast boxes */
-    for (int spin_count = 5 ; spin_count ; --spin_count) {
-        fboxed = (int) mca_btl_vader_check_fboxes ();
-        if (fboxed) {
-            break;
-        }
-    }
+    /* check for messages in fast boxes */
+    if (mca_btl_vader_component.num_fbox_in_endpoints) {
+        count = mca_btl_vader_check_fboxes ();
+    }
+
+    mca_btl_vader_progress_endpoints ();
 
     if (VADER_FIFO_FREE == mca_btl_vader_component.my_fifo->fifo_head) {
-        return (int) fboxed;
+        lock = 0;
+        return count;
     }
 
-    return mca_btl_vader_poll_fifo () + (int) fboxed;
+    count += mca_btl_vader_poll_fifo ();
+    opal_atomic_mb ();
+    lock = 0;
+
+    return count;
 }
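The rewritten progress function gates the whole progress path behind a single atomic swap: the first thread to swap the flag from 0 to 1 polls the fast boxes and the fifo, and every other thread returns immediately instead of contending. A minimal standalone sketch of the same pattern, assuming opal_atomic_swap_32 behaves like C11 atomic_exchange (names here are illustrative, not vader's):

#include <stdatomic.h>
#include <stdio.h>

/* first thread to flip the flag from 0 to 1 owns the progress pass */
static atomic_int progress_lock;

int try_progress (void)
{
    if (atomic_exchange (&progress_lock, 1)) {
        return 0;                       /* someone else is already progressing */
    }
    /* ... poll fast boxes and the fifo here ... */
    atomic_store (&progress_lock, 0);   /* release the guard */
    return 1;
}

int main (void)
{
    printf ("progressed: %d\n", try_progress ());
    return 0;
}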
--- a/opal/mca/btl/vader/btl_vader_endpoint.h
+++ b/opal/mca/btl/vader/btl_vader_endpoint.h
@@ -39,6 +39,9 @@
 #include "opal/mca/shmem/base/base.h"
 #endif
 
+#define MCA_BTL_VADER_FBOX_ALIGNMENT      32
+#define MCA_BTL_VADER_FBOX_ALIGNMENT_MASK (MCA_BTL_VADER_FBOX_ALIGNMENT - 1)
+
 struct vader_fifo_t;
 
 /**
@@ -49,28 +52,68 @@ struct vader_fifo_t;
 
 struct mca_btl_vader_fbox_t;
 
-struct mca_btl_base_endpoint_t {
-    int peer_smp_rank;      /**< My peer's SMP process rank. Used for accessing
-                             * SMP specfic data structures. */
+typedef struct mca_btl_base_endpoint_t {
+    opal_list_item_t super;
+
+    /* per peer buffers */
+    struct {
+        unsigned char *buffer;
+        unsigned int start, seq;
+        uint32_t *startp;
+    } fbox_in;
+
+    struct {
+        unsigned char *buffer;
+        unsigned int start, end, seq;
+        uint32_t *startp;
+    } fbox_out;
+
+    int32_t peer_smp_rank;  /**< my peer's SMP process rank. Used for accessing
+                             * SMP specfic data structures. */
+    uint32_t send_count;    /**< number of fragments sent to this peer */
     char *segment_base;
 
     struct vader_fifo_t *fifo;
-#if OPAL_BTL_VADER_HAVE_XPMEM
-    xpmem_apid_t apid;
-#else
-    pid_t pid;
-    opal_shmem_ds_t seg_ds;
-#endif
-    struct mca_btl_vader_fbox_t * restrict fbox_out;
-    struct mca_btl_vader_fbox_t * restrict fbox_in;
-    int next_fbox_out;
-    int next_fbox_in;
 
-    /* enforce ordering */
-    uint16_t next_sequence;
-    uint16_t expected_sequence;
-};
+    opal_mutex_t lock;
+
+#if OPAL_BTL_VADER_HAVE_XPMEM
+    struct mca_rcache_base_module_t *rcache;
+    xpmem_apid_t apid;       /**< xpmem apid for remote peer */
+#else
+    pid_t pid;               /**< pid of remote peer (used for CMA) */
+    opal_shmem_ds_t *seg_ds; /**< stored segment information for detach */
+#endif
+
+    /** fragments pending fast box space */
+    opal_list_t pending_frags;
+    /** endpoint is on the component wait list */
+    bool waiting;
+} mca_btl_base_endpoint_t;
+
+typedef mca_btl_base_endpoint_t mca_btl_vader_endpoint_t;
+
+OBJ_CLASS_DECLARATION(mca_btl_vader_endpoint_t);
+
+static inline void mca_btl_vader_endpoint_setup_fbox_recv (struct mca_btl_base_endpoint_t *endpoint, void *base)
+{
+    endpoint->fbox_in.buffer = base;
+    endpoint->fbox_in.startp = (uint32_t *) base;
+    endpoint->fbox_in.startp[0] = MCA_BTL_VADER_FBOX_ALIGNMENT;
+    endpoint->fbox_in.start = MCA_BTL_VADER_FBOX_ALIGNMENT;
+    endpoint->fbox_in.seq = 0;
+}
+
+static inline void mca_btl_vader_endpoint_setup_fbox_send (struct mca_btl_base_endpoint_t *endpoint, void *base)
+{
+    endpoint->fbox_out.buffer = base;
+    endpoint->fbox_out.start = MCA_BTL_VADER_FBOX_ALIGNMENT;
+    endpoint->fbox_out.end = MCA_BTL_VADER_FBOX_ALIGNMENT;
+    endpoint->fbox_out.startp = (uint32_t *) base;
+    endpoint->fbox_out.seq = 0;
+
+    /* zero out the first header in the fast box */
+    memset ((char *) base + MCA_BTL_VADER_FBOX_ALIGNMENT, 0, MCA_BTL_VADER_FBOX_ALIGNMENT);
+}
 
 #endif /* MCA_BTL_VADER_ENDPOINT_H */
--- a/opal/mca/btl/vader/btl_vader_fbox.h
+++ b/opal/mca/btl/vader/btl_vader_fbox.h
@@ -13,90 +13,160 @@
 #define MCA_BTL_VADER_FBOX_H
 
 #include "btl_vader.h"
 #include "btl_vader_endpoint.h"
 #include "btl_vader_xpmem.h"
 
 #include <string.h>
 
-/* these hard-coded settings are based on the ideal setup for an Opteron 61xx chip and
- * may need to be adjusted for other systems. adding an MCA variable is possible but
- * can cost 20-40 ns on the fast path. this size is limited to 256 maximum bytes */
-#define MCA_BTL_VADER_FBOX_SIZE 64
-/* there should be a power of two number of fast boxes to simplify the math in the
- * critical path */
-#define MCA_BTL_VADER_LAST_FBOX 63
 #define MCA_BTL_VADER_POLL_COUNT 31
-/* two bytes are reserved for tag and size (update if the header is modified) */
-#define MCA_BTL_VADER_FBOX_HDR_SIZE 4
-#define MCA_BTL_VADER_FBOX_MAX_SIZE (MCA_BTL_VADER_FBOX_SIZE - MCA_BTL_VADER_FBOX_HDR_SIZE)
-/* total size of all the fast boxes assigned to a particular peer */
-#define MCA_BTL_VADER_FBOX_PEER_SIZE (MCA_BTL_VADER_FBOX_SIZE * (MCA_BTL_VADER_LAST_FBOX + 1))
 
-typedef struct mca_btl_vader_fbox_t {
-    union {
-        struct {
-            uint8_t  size;
-            uint8_t  tag;
-            uint16_t seqn;
-        } hdr_data;
-        uint32_t ival;
-    } hdr;
-
-    uint8_t data[MCA_BTL_VADER_FBOX_MAX_SIZE];
-} mca_btl_vader_fbox_t;
+typedef union mca_btl_vader_fbox_hdr_t {
+    struct {
+        uint16_t tag;
+        uint16_t size;
+        uint32_t seq;
+    } data;
+    uint64_t ival;
+} mca_btl_vader_fbox_hdr_t;
+
+#define MCA_BTL_VADER_FBOX_HDR(x) ((mca_btl_vader_fbox_hdr_t *) (x))
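The new 64-bit header union exists so that tag, size, and sequence number can be published or read with a single aligned access of .ival, which keeps a receiver from ever observing a partially written header. A standalone illustration with the layout mirrored from the union above (names shortened; the struct name is hypothetical):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* tag + size + sequence packed so one uint64_t store publishes all three */
typedef union fbox_hdr {
    struct {
        uint16_t tag;
        uint16_t size;
        uint32_t seq;
    } data;
    uint64_t ival;
} fbox_hdr_t;

int main (void)
{
    fbox_hdr_t hdr = {.data = {.tag = 0x42, .size = 128, .seq = 7}};

    assert (sizeof (fbox_hdr_t) == sizeof (uint64_t));

    /* a single 64-bit store publishes all three fields at once */
    volatile uint64_t shared;
    shared = hdr.ival;

    /* the reader reloads the whole header in one access */
    fbox_hdr_t rx = {.ival = shared};
    printf ("tag=0x%x size=%u seq=%u\n", (unsigned) rx.data.tag,
            (unsigned) rx.data.size, (unsigned) rx.data.seq);
    return 0;
}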
-#define MCA_BTL_VADER_FBOX_OUT_PTR(ep, fbox) ((ep)->fbox_out + (fbox))
-#define MCA_BTL_VADER_FBOX_IN_PTR(ep, fbox) ((ep)->fbox_in + (fbox))
-#define MCA_BTL_VADER_NEXT_FBOX(fbox) (((fbox) + 1) & MCA_BTL_VADER_LAST_FBOX)
+#define MCA_BTL_VADER_FBOX_OFFSET_MASK 0x7fffffff
+#define MCA_BTL_VADER_FBOX_HB_MASK     0x80000000
 
-static inline mca_btl_vader_fbox_t * mca_btl_vader_reserve_fbox (struct mca_btl_base_endpoint_t *ep, const size_t size)
+/* if the two offsets are equal and the high bit matches the buffer is empty else the buffer is full.
+ * note that start will never be end - 1 so this simplified conditional will always produce the correct
+ * result */
+#define BUFFER_FREE(s,e,hbm,size) (((s + !hbm) > (e)) ? (s) - (e) : (size - (e)))
+
+/** macro for checking if the high bit is set */
+#define MCA_BTL_VADER_FBOX_OFFSET_HBS(v) (!!((v) & MCA_BTL_VADER_FBOX_HB_MASK))
+
+void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, mca_btl_base_endpoint_t *ep);
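BUFFER_FREE treats the two offsets as positions in a circular buffer whose high bit records the wrap parity: equal offsets with matching high bits mean the buffer is empty, while equal offsets with differing high bits mean it is full. A small self-contained check of those cases (the 4096-byte size is just an example; vader's fbox_size is configurable):

#include <assert.h>

#define BUFFER_FREE(s,e,hbm,size) (((s + !hbm) > (e)) ? (s) - (e) : (size - (e)))

int main (void)
{
    const unsigned size = 4096;

    /* empty: offsets equal, high bits match (hbm == 1) ->
     * everything up to the end of the buffer is writable */
    unsigned start = 64, end = 64;
    assert (BUFFER_FREE(start, end, 1, size) == size - 64);

    /* full: offsets equal but the writer has wrapped once more than
     * the reader (high bits differ, hbm == 0) -> no space */
    assert (BUFFER_FREE(start, end, 0, size) == 0);

    /* reader ahead of writer (writer wrapped, so hbm == 0):
     * free space is the gap up to the reader */
    start = 1024; end = 512;
    assert (BUFFER_FREE(start, end, 0, size) == 512);
    return 0;
}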
+/* attempt to reserve a contiguous segment from the remote ep */
+static inline unsigned char *mca_btl_vader_reserve_fbox (mca_btl_base_endpoint_t *ep, size_t size)
 {
-    const int next_fbox = ep->next_fbox_out;
-    mca_btl_vader_fbox_t * restrict fbox = MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
+    const unsigned int fbox_size = mca_btl_vader_component.fbox_size;
+    unsigned int start, end, buffer_free;
+    size_t data_size = size;
+    unsigned char *dst;
+    bool hbs, hbm;
 
-    opal_atomic_mb ();
-
-    /* todo -- need thread locks/atomics here for the multi-threaded case */
-    if (OPAL_LIKELY(size <= MCA_BTL_VADER_FBOX_MAX_SIZE && 0 == fbox->hdr.ival)) {
-        /* mark this fast box as in use */
-        fbox->hdr.hdr_data.size = size;
-        ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox);
-        opal_atomic_mb ();
-        return fbox;
+    /* don't try to use the per-peer buffer for messages that will fill up more than 25% of the buffer */
+    if (OPAL_UNLIKELY(NULL == ep->fbox_out.buffer || size > (fbox_size >> 2))) {
+        return NULL;
     }
 
-    return NULL;
+    OPAL_THREAD_LOCK(&ep->lock);
+
+    /* the high bit helps determine if the buffer is empty or full */
+    hbs = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_out.end);
+    hbm = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_out.start) == hbs;
+
+    /* read current start and end offsets and check for free space */
+    start = ep->fbox_out.start & MCA_BTL_VADER_FBOX_OFFSET_MASK;
+    end = ep->fbox_out.end & MCA_BTL_VADER_FBOX_OFFSET_MASK;
+    buffer_free = BUFFER_FREE(start, end, hbm, fbox_size);
+
+    /* need space for the fragment + the header */
+    size = (size + sizeof (mca_btl_vader_fbox_hdr_t) + MCA_BTL_VADER_FBOX_ALIGNMENT_MASK) & ~MCA_BTL_VADER_FBOX_ALIGNMENT_MASK;
+
+    dst = ep->fbox_out.buffer + end;
+
+    if (OPAL_UNLIKELY(buffer_free < size)) {
+        /* check if we need to free up space for this fragment */
+        BTL_VERBOSE(("not enough room for a fragment of size %u. in use buffer segment: {start: %x, end: %x, high bit matches: %d}",
+                     (unsigned) size, start, end, (int) hbm));
+
+        /* read the current start pointer from the remote peer and recalculate the available buffer space */
+        start = ep->fbox_out.start = ep->fbox_out.startp[0];
+
+        /* recalculate how much buffer space is available */
+        start &= MCA_BTL_VADER_FBOX_OFFSET_MASK;
+        hbm = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_out.start) == hbs;
+        buffer_free = BUFFER_FREE(start, end, hbm, fbox_size);
+
+        opal_atomic_rmb ();
+
+        /* if this is the end of the buffer and the fragment doesn't fit then mark the remaining buffer space to
+         * be skipped and check if the fragment can be written at the beginning of the buffer. */
+        if (OPAL_UNLIKELY(buffer_free > 0 && buffer_free < size && start <= end)) {
+            mca_btl_vader_fbox_hdr_t tmp = {.data = {.size = buffer_free - sizeof (mca_btl_vader_fbox_hdr_t),
+                                                     .seq = ep->fbox_out.seq++, .tag = 0xff}};
+
+            BTL_VERBOSE(("message will not fit in remaining buffer space. skipping to beginning"));
+
+            MCA_BTL_VADER_FBOX_HDR(dst)->ival = tmp.ival;
+
+            end = MCA_BTL_VADER_FBOX_ALIGNMENT;
+            /* toggle the high bit */
+            hbs = !hbs;
+            /* toggle the high bit match */
+            buffer_free = BUFFER_FREE(start, end, !hbm, fbox_size);
+            dst = ep->fbox_out.buffer + end;
+        }
+
+        if (OPAL_UNLIKELY(buffer_free < size)) {
+            ep->fbox_out.end = (hbs << 31) | end;
+            OPAL_THREAD_UNLOCK(&ep->lock);
+            return NULL;
+        }
+    }
+
+    BTL_VERBOSE(("writing fragment of size %u to offset %u {start: 0x%x, end: 0x%x (hbs: %d)} of peer's buffer. free = %u",
+                 (unsigned int) size, end, start, end, hbs, buffer_free));
+
+    /* write out part of the header now. the tag will be written when the data is available */
+    {
+        mca_btl_vader_fbox_hdr_t tmp = {.data = {.size = data_size, .tag = 0, .seq = ep->fbox_out.seq++}};
+
+        MCA_BTL_VADER_FBOX_HDR(dst)->ival = tmp.ival;
+    }
+
+    end += size;
+
+    if (OPAL_UNLIKELY(fbox_size == end)) {
+        /* toggle the high bit */
+        hbs = !hbs;
+        /* reset the end pointer to the beginning of the buffer */
+        end = MCA_BTL_VADER_FBOX_ALIGNMENT;
+    } else if (buffer_free > size) {
+        MCA_BTL_VADER_FBOX_HDR(ep->fbox_out.buffer + end)->ival = 0;
+    }
+
+    /* align the buffer */
+    ep->fbox_out.end = ((uint32_t) hbs << 31) | end;
+    OPAL_THREAD_UNLOCK(&ep->lock);
+
+    return dst + sizeof (mca_btl_vader_fbox_hdr_t);
+}
-static inline void mca_btl_vader_fbox_send (mca_btl_vader_fbox_t * restrict fbox, unsigned char tag,
-                                            struct mca_btl_base_endpoint_t *endpoint)
+static inline void mca_btl_vader_fbox_send (unsigned char * restrict fbox, unsigned char tag)
 {
     /* ensure data writes have completed before we mark the data as available */
     opal_atomic_wmb ();
-    fbox->hdr.hdr_data.seqn = endpoint->next_sequence++;
-    fbox->hdr.hdr_data.tag = tag;
-    opal_atomic_wmb ();
+
+    /* the header proceeds the fbox buffer */
+    MCA_BTL_VADER_FBOX_HDR ((intptr_t) fbox)[-1].data.tag = tag;
 }
 
-static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endpoint, char tag,
+static inline int mca_btl_vader_fbox_sendi (mca_btl_base_endpoint_t *ep, char tag,
                                             void * restrict header, const size_t header_size,
                                             void * restrict payload, const size_t payload_size)
 {
-    mca_btl_vader_fbox_t * restrict fbox;
+    const size_t total_size = header_size + payload_size;
+    unsigned char * restrict fbox;
 
-    fbox = mca_btl_vader_reserve_fbox(endpoint, header_size + payload_size);
+    fbox = mca_btl_vader_reserve_fbox(ep, total_size);
     if (OPAL_UNLIKELY(NULL == fbox)) {
         return 0;
     }
 
-    memcpy (fbox->data, header, header_size);
+    memcpy (fbox, header, header_size);
     if (payload) {
         /* inline sends are typically just pml headers (due to MCA_BTL_FLAGS_SEND_INPLACE) */
-        memcpy (fbox->data + header_size, payload, payload_size);
+        memcpy (fbox + header_size, payload, payload_size);
     }
 
     /* mark the fbox as sent */
-    mca_btl_vader_fbox_send (fbox, tag, endpoint);
+    mca_btl_vader_fbox_send (fbox, tag);
 
     /* send complete */
     return 1;
@@ -104,59 +174,108 @@ static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endp
 
 static inline bool mca_btl_vader_check_fboxes (void)
 {
-    const mca_btl_active_message_callback_t *reg;
-    struct mca_btl_base_endpoint_t *endpoint;
-    mca_btl_vader_fbox_t * restrict fbox;
-    mca_btl_base_segment_t segment;
-    mca_btl_base_descriptor_t desc;
+    const unsigned int fbox_size = mca_btl_vader_component.fbox_size;
     bool processed = false;
-    int next_fbox;
 
-    for (endpoint = mca_btl_vader_component.endpoints ; endpoint->peer_smp_rank != -1 ; ++endpoint) {
-        next_fbox = endpoint->next_fbox_in;
-        fbox = MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
+    for (unsigned int i = 0 ; i < mca_btl_vader_component.num_fbox_in_endpoints ; ++i) {
+        mca_btl_base_endpoint_t *ep = mca_btl_vader_component.fbox_in_endpoints[i];
+        unsigned int start = ep->fbox_in.start & MCA_BTL_VADER_FBOX_OFFSET_MASK;
 
-        if (NULL == endpoint->fbox_in || 0 == fbox->hdr.hdr_data.tag) {
-            continue;
-        }
-
-        desc.des_local = &segment;
-        desc.des_local_count = 1;
-
-        processed = true;
+        /* save the current high bit state */
+        bool hbs = MCA_BTL_VADER_FBOX_OFFSET_HBS(ep->fbox_in.start);
+        int poll_count;
 
-        /* process all fast-box messages */
-        for (int count = 0 ; count <= MCA_BTL_VADER_POLL_COUNT && 0 != fbox->hdr.hdr_data.tag ; ++count) {
-            if (OPAL_UNLIKELY(endpoint->expected_sequence != fbox->hdr.hdr_data.seqn)) {
+        for (poll_count = 0 ; poll_count <= MCA_BTL_VADER_POLL_COUNT ; ++poll_count) {
+            const mca_btl_vader_fbox_hdr_t hdr = {.ival = MCA_BTL_VADER_FBOX_HDR(ep->fbox_in.buffer + start)->ival};
+
+            /* check for a valid tag a sequence number */
+            if (0 == hdr.data.tag || hdr.data.seq != ep->fbox_in.seq) {
                 break;
             }
-            opal_atomic_mb ();
-            ++endpoint->expected_sequence;
 
-            reg = mca_btl_base_active_message_trigger + fbox->hdr.hdr_data.tag;
+            ++ep->fbox_in.seq;
 
-            segment.seg_addr.pval = fbox->data;
-            segment.seg_len = fbox->hdr.hdr_data.size;
+            /* force all prior reads to complete before continuing */
+            opal_atomic_rmb ();
 
-            reg->cbfunc(&mca_btl_vader.super, fbox->hdr.hdr_data.tag, &desc, reg->cbdata);
+            BTL_VERBOSE(("got frag with header {.tag = %d, .size = %d} from offset %u", hdr.data.tag,
+                         hdr.data.size, start));
 
-            if (segment.seg_len > MCA_BTL_VADER_FBOX_MAX_SIZE) {
-                fbox[1].hdr.ival = 0;
-                opal_atomic_mb ();
-                ++next_fbox;
+            /* the 0xff tag indicates we should skip the rest of the buffer */
+            if (OPAL_LIKELY((0xfe & hdr.data.tag) != 0xfe)) {
+                mca_btl_base_segment_t segment;
+                mca_btl_base_descriptor_t desc = {.des_local = &segment, .des_local_count = 1};
+                const mca_btl_active_message_callback_t *reg =
+                    mca_btl_base_active_message_trigger + hdr.data.tag;
+
+                /* fragment fits entirely in the remaining buffer space. some
+                 * btl users do not handle fragmented data so we can't split
+                 * the fragment without introducing another copy here. this
+                 * limitation has not appeared to cause any performance
+                 * degradation. */
+                segment.seg_len = hdr.data.size;
+                segment.seg_addr.pval = (void *) (ep->fbox_in.buffer + start + sizeof (hdr));
+
+                /* call the registered callback function */
+                reg->cbfunc(&mca_btl_vader.super, hdr.data.tag, &desc, reg->cbdata);
+            } else if (OPAL_LIKELY(0xfe == hdr.data.tag)) {
+                /* process fragment header */
+                fifo_value_t *value = (fifo_value_t *)(ep->fbox_in.buffer + start + sizeof (hdr));
+                mca_btl_vader_hdr_t *hdr = relative2virtual(*value);
+                mca_btl_vader_poll_handle_frag (hdr, ep);
             }
-            fbox->hdr.ival = 0;
 
-            next_fbox = MCA_BTL_VADER_NEXT_FBOX(next_fbox);
-            fbox = (mca_btl_vader_fbox_t * restrict) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
+            start = (start + hdr.data.size + sizeof (hdr) + MCA_BTL_VADER_FBOX_ALIGNMENT_MASK) & ~MCA_BTL_VADER_FBOX_ALIGNMENT_MASK;
+            if (OPAL_UNLIKELY(fbox_size == start)) {
+                /* jump to the beginning of the buffer */
+                start = MCA_BTL_VADER_FBOX_ALIGNMENT;
+                /* toggle the high bit */
+                hbs = !hbs;
+            }
         }
 
-        opal_atomic_mb ();
-
-        /* save where we left off */
-        endpoint->next_fbox_in = next_fbox;
+        if (poll_count) {
+            BTL_VERBOSE(("left off at offset %u (hbs: %d)", start, hbs));
+
+            /* save where we left off */
+            /* let the sender know where we stopped */
+            ep->fbox_in.start = ep->fbox_in.startp[0] = ((uint32_t) hbs << 31) | start;
+            processed = true;
+        }
     }
 
     /* return the number of fragments processed */
     return processed;
 }
+static inline void mca_btl_vader_try_fbox_setup (mca_btl_base_endpoint_t *ep, mca_btl_vader_hdr_t *hdr)
+{
+    if (NULL == ep->fbox_out.buffer && mca_btl_vader_component.fbox_max > mca_btl_vader_component.fbox_count &&
+        mca_btl_vader_component.fbox_threshold <= ++ep->send_count) {
+        /* protect access to mca_btl_vader_component.segment_offset */
+        OPAL_THREAD_LOCK(&mca_btl_vader_component.lock);
+
+        if (mca_btl_vader_component.segment_size >= mca_btl_vader_component.segment_offset + mca_btl_vader_component.fbox_size) {
+            /* verify the remote side will accept another fbox */
+            if (0 <= opal_atomic_add_32 (&ep->fifo->fbox_available, -1)) {
+                void *fbox_base = mca_btl_vader_component.my_segment + mca_btl_vader_component.segment_offset;
+                mca_btl_vader_component.segment_offset += mca_btl_vader_component.fbox_size;
+
+                /* zero out the fast box */
+                memset (fbox_base, 0, mca_btl_vader_component.fbox_size);
+                mca_btl_vader_endpoint_setup_fbox_send (ep, fbox_base);
+
+                hdr->flags |= MCA_BTL_VADER_FLAG_SETUP_FBOX;
+                hdr->fbox_base = virtual2relative((char *) ep->fbox_out.buffer);
+                ++mca_btl_vader_component.fbox_count;
+            } else {
+                opal_atomic_add_32 (&ep->fifo->fbox_available, 1);
+            }
+        }
+
+        OPAL_THREAD_UNLOCK(&mca_btl_vader_component.lock);
+    }
+}
+
 #endif /* !defined(MCA_BTL_VADER_FBOX_H) */
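mca_btl_vader_try_fbox_setup only claims a new fast box when the receiver still has polling capacity: it decrements the shared fbox_available credit counter and rolls the decrement back if the count went negative. A minimal sketch of that decrement-with-rollback pattern in C11 atomics, assuming opal_atomic_add_32 returns the updated value (names here are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int fbox_available = 2;   /* receiver will accept two more fast boxes */

/* try to take one credit; undo the decrement if none were left */
static bool take_fbox_credit (void)
{
    /* atomic_fetch_sub returns the old value, so old - 1 is the new count */
    if (atomic_fetch_sub (&fbox_available, 1) - 1 >= 0) {
        return true;
    }
    atomic_fetch_add (&fbox_available, 1);  /* roll back the failed claim */
    return false;
}

int main (void)
{
    for (int i = 0 ; i < 4 ; ++i) {
        printf ("attempt %d: %s\n", i, take_fbox_credit () ? "granted" : "denied");
    }
    return 0;
}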
--- a/opal/mca/btl/vader/btl_vader_fifo.h
+++ b/opal/mca/btl/vader/btl_vader_fifo.h
@@ -70,11 +70,21 @@
 typedef struct vader_fifo_t {
     volatile fifo_value_t fifo_head;
     volatile fifo_value_t fifo_tail;
+    volatile int32_t fbox_available;
 } vader_fifo_t;
 
 /* large enough to ensure the fifo is on its own cache line */
 #define MCA_BTL_VADER_FIFO_SIZE 128
 
+/***
+ * One or more FIFO components may be a pointer that must be
+ * accessed by multiple processes. Since the shared region may
+ * be mmapped differently into each process's address space,
+ * these pointers will be relative to some base address. Here,
+ * we define inline functions to translate between relative
+ * addresses and virtual addresses.
+ */
+
 /* This only works for finding the relative address for a pointer within my_segment */
 static inline fifo_value_t virtual2relative (char *addr)
 {
@@ -91,18 +101,26 @@ static inline void *relative2virtual (fifo_value_t offset)
     return (void *)(intptr_t)((offset & MCA_BTL_VADER_OFFSET_MASK) + mca_btl_vader_component.endpoints[offset >> MCA_BTL_VADER_OFFSET_BITS].segment_base);
 }
 
+#include "btl_vader_fbox.h"
+
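Because each process maps the shared segments at different virtual addresses, virtual2relative/relative2virtual exchange pointers as a (peer index, segment offset) pair packed into one integer. A self-contained sketch of the same encoding (the 32-bit offset width and function bodies here are illustrative; vader derives the split from MCA_BTL_VADER_OFFSET_BITS):

#include <assert.h>
#include <stdint.h>

#define OFFSET_BITS 32
#define OFFSET_MASK ((UINT64_C(1) << OFFSET_BITS) - 1)

/* encode: which peer's segment, plus the byte offset inside it */
static uint64_t my_virtual2relative (int peer, char *base, char *addr)
{
    return ((uint64_t) peer << OFFSET_BITS) | (uint64_t)(addr - base);
}

/* decode against the local mapping of that peer's segment */
static char *my_relative2virtual (uint64_t rel, char *local_bases[])
{
    return local_bases[rel >> OFFSET_BITS] + (rel & OFFSET_MASK);
}

int main (void)
{
    static char segment[4096];          /* stands in for a shared-memory mapping */
    char *bases[1] = { segment };

    uint64_t rel = my_virtual2relative (0, segment, segment + 128);
    assert (my_relative2virtual (rel, bases) == segment + 128);
    return 0;
}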
+/**
+ * vader_fifo_read:
+ *
+ * @brief reads a single fragment from a local fifo
+ *
+ * @param[inout] fifo - FIFO to read from
+ * @param[out] ep - returns the endpoint the fifo element was read from
+ *
+ * @returns a fragment header or NULL
+ *
+ * This function does not currently support multiple readers.
+ */
 static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct mca_btl_base_endpoint_t **ep)
 {
     mca_btl_vader_hdr_t *hdr;
     fifo_value_t value;
-    static volatile int32_t lock = 0;
-
-    if (opal_atomic_swap_32 (&lock, 1)) {
-        return NULL;
-    }
 
     if (VADER_FIFO_FREE == fifo->fifo_head) {
-        lock = 0;
         return NULL;
     }
 
@@ -113,13 +131,7 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct m
     *ep = &mca_btl_vader_component.endpoints[value >> MCA_BTL_VADER_OFFSET_BITS];
     hdr = (mca_btl_vader_hdr_t *) relative2virtual (value);
 
-    if (OPAL_UNLIKELY(!(hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) && ((*ep)->expected_sequence != hdr->seqn))) {
-        lock = 0;
-        return NULL;
-    }
-
     fifo->fifo_head = VADER_FIFO_FREE;
-    ++(*ep)->expected_sequence;
 
     assert (hdr->next != value);
 
@@ -138,16 +150,14 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo, struct m
     }
 
     opal_atomic_wmb ();
-    lock = 0;
     return hdr;
 }
 
-static inline int vader_fifo_init (vader_fifo_t *fifo)
+static inline void vader_fifo_init (vader_fifo_t *fifo)
 {
     fifo->fifo_head = fifo->fifo_tail = VADER_FIFO_FREE;
+    fifo->fbox_available = mca_btl_vader_component.fbox_max;
     mca_btl_vader_component.my_fifo = fifo;
-
-    return OPAL_SUCCESS;
 }
 
 static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value)
@@ -170,15 +180,44 @@ static inline void vader_fifo_write (vader_fifo_t *fifo, fifo_value_t value)
     opal_atomic_wmb ();
 }
 
-/* write a frag (relative to this process' base) to another rank's fifo */
-static inline void vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
+/**
+ * vader_fifo_write_ep:
+ *
+ * @brief write a frag (relative to this process' base) to another rank's fifo
+ *
+ * @param[in] hdr - fragment header to write
+ * @param[in] ep - endpoint to write the fragment to
+ *
+ * This function is used to send a fragment to a remote peer. {hdr} must belong
+ * to the current process.
+ */
+static inline bool vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
 {
+    fifo_value_t rhdr = virtual2relative ((char *) hdr);
+
+    if (ep->fbox_out.buffer) {
+        /* if there is a fast box for this peer then use the fast box to send the fragment header.
+         * this is done to ensure fragment ordering */
+        opal_atomic_wmb ();
+        return mca_btl_vader_fbox_sendi (ep, 0xfe, &rhdr, sizeof (rhdr), NULL, 0);
+    }
+
+    mca_btl_vader_try_fbox_setup (ep, hdr);
+
     hdr->next = VADER_FIFO_FREE;
-    hdr->seqn = ep->next_sequence++;
-    vader_fifo_write (ep->fifo, virtual2relative ((char *) hdr));
+    vader_fifo_write (ep->fifo, rhdr);
+
+    return true;
 }
 
-/* write a frag (relative to the remote process' base) to the remote fifo. note the remote peer must own hdr */
+/**
+ * vader_fifo_write_back:
+ *
+ * @brief write a frag (relative to the remote process' base) to the remote fifo
+ *
+ * @param[in] hdr - fragment header to write
+ * @param[in] ep - endpoint the fragment belongs to
+ *
+ * This function is used to return a fragment to the sending process. It differs from vader_fifo_write_ep
+ * in that it uses the {ep} to produce the relative address.
+ */
 static inline void vader_fifo_write_back (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
 {
     hdr->next = VADER_FIFO_FREE;
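vader_fifo_write itself is mostly outside this diff, but the two wrappers above rely on its multi-producer/single-consumer linked-list discipline: a producer atomically swaps itself into the tail and then links the previous tail (or the head, when the fifo was empty) to the new element. A simplified single-process sketch of that ordering with ordinary pointers, assuming this is how the fifo behaves (vader instead stores relative offsets and a VADER_FIFO_FREE sentinel):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

typedef struct node { struct node *volatile next; int payload; } node_t;

typedef struct fifo {
    node_t *volatile head;      /* read by the single consumer */
    _Atomic(node_t *) tail;     /* swapped by many producers */
} fifo_t;

/* multiple producers may call this concurrently */
static void fifo_write (fifo_t *f, node_t *n)
{
    n->next = NULL;
    node_t *prev = atomic_exchange (&f->tail, n);  /* claim the tail slot */
    if (NULL == prev) {
        f->head = n;            /* fifo was empty: publish to the reader */
    } else {
        prev->next = n;         /* link behind the previous element */
    }
}

int main (void)
{
    fifo_t f = { NULL, NULL };
    node_t a = { NULL, 1 }, b = { NULL, 2 };

    fifo_write (&f, &a);
    fifo_write (&f, &b);

    for (node_t *n = f.head ; n ; n = n->next) {
        printf ("%d\n", n->payload);
    }
    return 0;
}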
--- a/opal/mca/btl/vader/btl_vader_frag.c
+++ b/opal/mca/btl/vader/btl_vader_frag.c
@@ -42,12 +42,18 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
 void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx)
 {
     mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) item;
-    unsigned int frag_size = (unsigned int)(uintptr_t) ctx;
-    unsigned int data_size = frag_size - sizeof (mca_btl_vader_hdr_t);
+    unsigned int data_size = (unsigned int)(uintptr_t) ctx;
+    unsigned int frag_size = data_size + sizeof (mca_btl_vader_hdr_t);
+
+    assert (data_size > 0);
 
     /* ensure next fragment is aligned on a cache line */
     frag_size = (frag_size + 63) & ~63;
 
+    OPAL_THREAD_LOCK(&mca_btl_vader_component.lock);
+
     if (mca_btl_vader_component.segment_size < mca_btl_vader_component.segment_offset + frag_size) {
+        OPAL_THREAD_UNLOCK(&mca_btl_vader_component.lock);
         item->ptr = NULL;
         return;
     }
@@ -69,6 +75,8 @@ void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx)
     item->ptr = mca_btl_vader_component.my_segment + mca_btl_vader_component.segment_offset;
     mca_btl_vader_component.segment_offset += frag_size;
 
+    OPAL_THREAD_UNLOCK(&mca_btl_vader_component.lock);
+
     mca_btl_vader_frag_constructor ((mca_btl_vader_frag_t *) item);
 }
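The (frag_size + 63) & ~63 expression above is the standard round-up idiom for a power-of-two alignment, here keeping each fragment on its own 64-byte cache line; the fast-box path uses the same trick with MCA_BTL_VADER_FBOX_ALIGNMENT_MASK. A quick check of the idiom:

#include <assert.h>

int main (void)
{
    /* round up to a 64-byte boundary: add (align - 1), then clear the low bits */
    assert (((1 + 63) & ~63) == 64);
    assert (((64 + 63) & ~63) == 64);    /* already-aligned values are unchanged */
    assert (((65 + 63) & ~63) == 128);
    return 0;
}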
--- a/opal/mca/btl/vader/btl_vader_frag.h
+++ b/opal/mca/btl/vader/btl_vader_frag.h
@@ -30,19 +30,30 @@ enum {
     MCA_BTL_VADER_FLAG_INLINE      = 0,
     MCA_BTL_VADER_FLAG_SINGLE_COPY = 1,
     MCA_BTL_VADER_FLAG_COMPLETE    = 2,
+    MCA_BTL_VADER_FLAG_SETUP_FBOX  = 4,
 };
 
 struct mca_btl_vader_frag_t;
-struct mca_btl_vader_fbox_t;
 
 /**
  * FIFO fragment header
  */
 struct mca_btl_vader_hdr_t {
-    volatile intptr_t next; /* next item in fifo. many peers may touch this */
+    /** next item in fifo. many peers may touch this */
+    volatile intptr_t next;
+    /** pointer back the the fragment */
     struct mca_btl_vader_frag_t *frag;
-    mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */
-    uint8_t flags;          /* vader send flags */
-    uint16_t seqn;
-    int32_t len;            /* length of data following this header */
-    struct iovec sc_iov;    /* io vector containing pointer to single-copy data */
+    /** tag associated with this fragment (used to lookup callback) */
+    mca_btl_base_tag_t tag;
+    /** vader send flags (inline, complete, setup fbox, etc) */
+    uint8_t flags;
+    /** length of data following this header */
+    int32_t len;
+    /** io vector containing pointer to single-copy data */
+    struct iovec sc_iov;
+    /** if the fragment indicates to setup a fast box the base is stored here */
+    intptr_t fbox_base;
 };
 typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t;
 
@@ -50,11 +61,17 @@ typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t;
  * shared memory send fragment derived type.
  */
 struct mca_btl_vader_frag_t {
+    /** base object */
     mca_btl_base_descriptor_t base;
+    /** storage for segment data (max 2) */
     mca_btl_base_segment_t segments[2];
+    /** endpoint this fragment is active on */
     struct mca_btl_base_endpoint_t *endpoint;
-    struct mca_btl_vader_fbox_t *fbox;
-    mca_btl_vader_hdr_t *hdr; /* in the shared memory region */
+    /** fast box in use (or NULL) */
+    unsigned char * restrict fbox;
+    /** fragment header (in the shared memory region) */
+    mca_btl_vader_hdr_t *hdr;
+    /** free list this fragment was allocated within */
     ompi_free_list_t *my_list;
 };
--- a/opal/mca/btl/vader/btl_vader_get.c
+++ b/opal/mca/btl/vader/btl_vader_get.c
@@ -53,6 +53,10 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
 
     vader_return_registration (reg, endpoint);
 
+    /* always call the callback function */
+    frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+
     frag->endpoint = endpoint;
     mca_btl_vader_frag_complete (frag);
 
     return OPAL_SUCCESS;
@@ -76,6 +80,10 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
         return OPAL_ERROR;
     }
 
+    /* always call the callback function */
+    frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+
     frag->endpoint = endpoint;
     mca_btl_vader_frag_complete (frag);
 
     return OPAL_SUCCESS;
--- a/opal/mca/btl/vader/btl_vader_module.c
+++ b/opal/mca/btl/vader/btl_vader_module.c
@@ -24,8 +24,6 @@
 #include "opal_config.h"
 
-#include "opal/mca/pmix/pmix.h"
-
 #include "btl_vader.h"
 #include "btl_vader_endpoint.h"
 #include "btl_vader_fifo.h"
@@ -105,8 +103,13 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
     /* generate the endpoints */
     component->endpoints = (struct mca_btl_base_endpoint_t *) calloc (n + 1, sizeof (struct mca_btl_base_endpoint_t));
-    component->endpoints[n].peer_smp_rank = -1;
+    component->fbox_in_endpoints = calloc (n + 1, sizeof (void *));
 
-    component->segment_offset = (n - 1) * MCA_BTL_VADER_FBOX_PEER_SIZE + MCA_BTL_VADER_FIFO_SIZE;
+    if (NULL == component->endpoints || NULL == component->fbox_in_endpoints) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    component->segment_offset = MCA_BTL_VADER_FIFO_SIZE;
 
     /* initialize fragment descriptor free lists */
     /* initialize free list for put/get/single copy/inline fragments */
@@ -118,8 +121,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
                              component->vader_free_list_max,
                              component->vader_free_list_inc,
                              NULL, mca_btl_vader_frag_init,
-                             (void *) (sizeof(mca_btl_vader_hdr_t) +
-                                       mca_btl_vader_component.max_inline_send));
+                             (void *)(intptr_t) mca_btl_vader_component.max_inline_send);
     if (OPAL_SUCCESS != rc) {
         return rc;
     }
@@ -133,8 +135,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
                              component->vader_free_list_max,
                              component->vader_free_list_inc,
                              NULL, mca_btl_vader_frag_init,
-                             (void *) (sizeof (mca_btl_vader_hdr_t) +
-                                       mca_btl_vader.super.btl_eager_limit));
+                             (void *)(intptr_t) mca_btl_vader.super.btl_eager_limit);
     if (OPAL_SUCCESS != rc) {
         return rc;
     }
@@ -149,8 +150,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
                              component->vader_free_list_max,
                              component->vader_free_list_inc,
                              NULL, mca_btl_vader_frag_init,
-                             (void *) (sizeof (mca_btl_vader_hdr_t) +
-                                       mca_btl_vader.super.btl_max_send_size));
+                             (void *)(intptr_t) mca_btl_vader.super.btl_max_send_size);
     if (OPAL_SUCCESS != rc) {
         return rc;
     }
@@ -164,18 +164,17 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
 
 static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) {
-    const int fbox_in_offset = MCA_BTL_VADER_LOCAL_RANK - (MCA_BTL_VADER_LOCAL_RANK > remote_rank);
-    const int fbox_out_offset = remote_rank - (MCA_BTL_VADER_LOCAL_RANK < remote_rank);
     mca_btl_vader_component_t *component = &mca_btl_vader_component;
     struct vader_modex_t *modex;
     size_t msg_size;
     int rc;
 
+    OBJ_CONSTRUCT(ep, mca_btl_vader_endpoint_t);
+
     ep->peer_smp_rank = remote_rank;
 
     if (remote_rank != MCA_BTL_VADER_LOCAL_RANK) {
-        OPAL_MODEX_RECV(rc, &component->super.btl_version,
-                        proc, (uint8_t**)&modex, &msg_size);
+        OPAL_MODEX_RECV(rc, &component->super.btl_version, proc, (void **) &modex, &msg_size);
         if (OPAL_SUCCESS != rc) {
             return rc;
         }
@@ -189,24 +188,23 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
                             MCA_MPOOL_FLAGS_PERSIST, (void **) &ep->segment_base);
 #else
         msg_size -= offsetof (struct vader_modex_t, seg_ds);
-        memcpy (&ep->seg_ds, &modex->seg_ds, msg_size);
-        ep->segment_base = opal_shmem_segment_attach (&ep->seg_ds);
+
+        /* store a copy of the segment information for detach */
+        ep->seg_ds = malloc (msg_size);
+        if (NULL == ep->seg_ds) {
+            return OPAL_ERR_OUT_OF_RESOURCE;
+        }
+
+        memcpy (ep->seg_ds, &modex->seg_ds, msg_size);
+
+        ep->segment_base = opal_shmem_segment_attach (ep->seg_ds);
         if (NULL == ep->segment_base) {
             return rc;
         }
 #endif
+        OBJ_CONSTRUCT(&ep->lock, opal_mutex_t);
 
         free (modex);
 
-        ep->next_fbox_out = 0;
-        ep->next_fbox_in = 0;
-        ep->next_sequence = 0;
-        ep->expected_sequence = 0;
-
-        ep->fbox_in = (struct mca_btl_vader_fbox_t * restrict) (ep->segment_base + MCA_BTL_VADER_FIFO_SIZE +
-                                                                fbox_in_offset * MCA_BTL_VADER_FBOX_PEER_SIZE);
-        ep->fbox_out = (struct mca_btl_vader_fbox_t * restrict) (component->my_segment + MCA_BTL_VADER_FIFO_SIZE +
-                                                                 fbox_out_offset * MCA_BTL_VADER_FBOX_PEER_SIZE);
     } else {
         /* set up the segment base so we can calculate a virtual to real for local pointers */
         ep->segment_base = component->my_segment;
@@ -220,36 +218,52 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
 
 static int fini_vader_endpoint (struct mca_btl_base_endpoint_t *ep)
 {
-    if (NULL != ep->fbox_out) {
 #if OPAL_BTL_VADER_HAVE_XPMEM
-        if (ep->rcache) {
-            /* clean out the registration cache */
-            const int nregs = 100;
-            mca_mpool_base_registration_t *regs[nregs];
-            int reg_cnt;
+    if (ep->rcache) {
+        /* clean out the registration cache */
+        const int nregs = 100;
+        mca_mpool_base_registration_t *regs[nregs];
+        int reg_cnt;
 
-            do {
-                reg_cnt = ep->rcache->rcache_find_all(ep->rcache, 0, (size_t)-1,
-                                                      regs, nregs);
+        do {
+            reg_cnt = ep->rcache->rcache_find_all(ep->rcache, 0, (size_t)-1,
+                                                  regs, nregs);
 
-                for (int i = 0 ; i < reg_cnt ; ++i) {
-                    /* otherwise dereg will fail on assert */
-                    regs[i]->ref_count = 0;
-                    OBJ_RELEASE(regs[i]);
-                }
-            } while (reg_cnt == nregs);
+            for (int i = 0 ; i < reg_cnt ; ++i) {
+                /* otherwise dereg will fail on assert */
+                regs[i]->ref_count = 0;
+                OBJ_RELEASE(regs[i]);
+            }
+        } while (reg_cnt == nregs);
 
-            ep->rcache = NULL;
-        }
-        xpmem_release (ep->apid);
-#else
-        opal_shmem_segment_detach (&ep->seg_ds);
-#endif
+        ep->rcache = NULL;
     }
 
-    ep->fbox_in = ep->fbox_out = NULL;
+    if (ep->segment_base) {
+        xpmem_release (ep->apid);
+        ep->apid = 0;
+    }
+#else
+    if (ep->seg_ds) {
+        opal_shmem_ds_t seg_ds;
+
+        /* opal_shmem_segment_detach expects a opal_shmem_ds_t and will
+         * stomp past the end of the seg_ds if it is too small (which
+         * ep->seg_ds probably is) */
+        memcpy (&seg_ds, ep->seg_ds, opal_shmem_sizeof_shmem_ds (ep->seg_ds));
+        free (ep->seg_ds);
+        ep->seg_ds = NULL;
+
+        /* disconnect from the peer's segment */
+        opal_shmem_segment_detach (&seg_ds);
+    }
+#endif
+
+    ep->fbox_in.buffer = ep->fbox_out.buffer = NULL;
+    ep->segment_base = NULL;
+
+    OBJ_DESTRUCT(ep);
 
     return OPAL_SUCCESS;
 }
@@ -378,8 +392,13 @@ static int vader_finalize(struct mca_btl_base_module_t *btl)
     }
 
     free (component->endpoints);
     component->endpoints = NULL;
 
     vader_btl->btl_inited = false;
 
+    free (component->fbox_in_endpoints);
+    component->fbox_in_endpoints = NULL;
+
 #if !OPAL_BTL_VADER_HAVE_XPMEM
     opal_shmem_unlink (&mca_btl_vader_component.seg_ds);
     opal_shmem_segment_detach (&mca_btl_vader_component.seg_ds);
@@ -488,8 +507,8 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
                                                             uint32_t flags)
 {
     const size_t total_size = reserve + *size;
-    mca_btl_vader_fbox_t *fbox;
     mca_btl_vader_frag_t *frag;
+    unsigned char *fbox;
     void *data_ptr;
     int rc;
 
@@ -562,15 +581,14 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
              * fragment does not belong to the caller */
             fbox = mca_btl_vader_reserve_fbox (endpoint, total_size);
             if (OPAL_LIKELY(fbox)) {
-                frag->segments[0].seg_addr.pval = fbox->data;
+                frag->segments[0].seg_addr.pval = fbox;
             }
 
             frag->fbox = fbox;
         }
 
         /* NTH: the covertor adds some latency so we bypass it here */
-        vader_memmove ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve),
-                       data_ptr, *size);
+        memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), data_ptr, *size);
         frag->segments[0].seg_len = total_size;
 #if OPAL_BTL_VADER_HAVE_XPMEM
     }
@@ -602,3 +620,15 @@ static int vader_ft_event (int state)
 {
     return OPAL_SUCCESS;
 }
 
+static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
+{
+    OBJ_CONSTRUCT(&ep->pending_frags, opal_list_t);
+}
+
+static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
+{
+    OBJ_DESTRUCT(&ep->pending_frags);
+}
+
+OBJ_CLASS_INSTANCE(mca_btl_vader_endpoint_t, opal_list_item_t, mca_btl_vader_endpoint_constructor, mca_btl_vader_endpoint_destructor);
--- a/opal/mca/btl/vader/btl_vader_put.c
+++ b/opal/mca/btl/vader/btl_vader_put.c
@@ -56,6 +56,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
     /* always call the callback function */
     frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
 
+    frag->endpoint = endpoint;
     mca_btl_vader_frag_complete (frag);
 
     return OPAL_SUCCESS;
@@ -79,6 +80,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
         return OPAL_ERROR;
     }
 
+    frag->endpoint = endpoint;
     mca_btl_vader_frag_complete (frag);
 
     return OPAL_SUCCESS;
--- a/opal/mca/btl/vader/btl_vader_send.c
+++ b/opal/mca/btl/vader/btl_vader_send.c
@@ -12,8 +12,8 @@
  * All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
- *                         All rights reserved.
+ * Copyright (c) 2010-2014 Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -40,27 +40,38 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
                         mca_btl_base_tag_t tag)
 {
     mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor;
+    const size_t total_size = frag->segments[0].seg_len;
 
     if (OPAL_LIKELY(frag->fbox)) {
-        mca_btl_vader_fbox_send (frag->fbox, tag, endpoint);
+        mca_btl_vader_fbox_send (frag->fbox, tag);
         mca_btl_vader_frag_complete (frag);
 
         return 1;
     }
 
     /* header (+ optional inline data) */
-    frag->hdr->len = frag->segments[0].seg_len;
+    frag->hdr->len = total_size;
     /* type of message, pt-2-pt, one-sided, etc */
     frag->hdr->tag = tag;
 
     /* post the relative address of the descriptor into the peer's fifo */
-    vader_fifo_write_ep (frag->hdr, endpoint);
+    if (opal_list_get_size (&endpoint->pending_frags) || !vader_fifo_write_ep (frag->hdr, endpoint)) {
+        frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
+        OPAL_THREAD_LOCK(&endpoint->lock);
+        opal_list_append (&endpoint->pending_frags, (opal_list_item_t *) frag);
+        if (!endpoint->waiting) {
+            opal_list_append (&mca_btl_vader_component.pending_endpoints, &endpoint->super);
+            endpoint->waiting = true;
+        }
+        OPAL_THREAD_UNLOCK(&endpoint->lock);
+        return OPAL_SUCCESS;
+    }
 
     if ((frag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) ||
         !(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
         frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
 
-        return 0;
+        return OPAL_SUCCESS;
     }
 
     /* data is gone (from the pml's perspective). frag callback/release will
--- a/opal/mca/btl/vader/btl_vader_sendi.c
+++ b/opal/mca/btl/vader/btl_vader_sendi.c
@@ -12,8 +12,8 @@
  * All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
- *                         All rights reserved.
+ * Copyright (c) 2010-2014 Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -47,6 +47,12 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
     void *data_ptr = NULL;
     size_t length;
 
+    /* don't attempt sendi if there are pending fragments on the endpoint */
+    if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->pending_frags))) {
+        *descriptor = NULL;
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
     if (payload_size) {
         opal_convertor_get_current_pointer (convertor, &data_ptr);
     }
@@ -56,7 +62,6 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
         return OPAL_SUCCESS;
     }
 
-
     length = header_size + payload_size;
 
     /* allocate a fragment, giving up if we can't get one */
@@ -92,7 +97,10 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
     }
 
     /* write the fragment pointer to peer's the FIFO. the progress function will return the fragment */
-    vader_fifo_write_ep (frag->hdr, endpoint);
+    if (!vader_fifo_write_ep (frag->hdr, endpoint)) {
+        *descriptor = &frag->base;
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
 
     return OPAL_SUCCESS;
 }
--- a/opal/mca/btl/vader/btl_vader_xpmem.c
+++ b/opal/mca/btl/vader/btl_vader_xpmem.c
@@ -9,6 +9,8 @@
  * $HEADER$
  */
 
+#include "btl_vader.h"
+
 #include "opal/include/opal/align.h"
 #include "btl_vader_xpmem.h"
 #include "opal/mca/memchecker/base/base.h"
@@ -1,6 +1,6 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *