btl/vader: do not use common/sm for shared memory fragments
This commit was SVN r28250.
Contained in: parent d12eed0703, commit 113fadd749

Rather than allocating fragments from a common/sm mapped file, each process now mmaps its own anonymous shared segment, publishes the segment's xpmem id and base address through the modex, and peers attach to it directly with xpmem.
@@ -35,7 +35,8 @@ libmca_btl_vader_la_sources = \
     btl_vader_sendi.c \
     btl_vader_fbox.h \
     btl_vader_get.c \
-    btl_vader_put.c
+    btl_vader_put.c \
+    btl_vader_xpmem.c
 
 # Make the output library in this directory, and name it either
 # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
@@ -12,7 +12,7 @@
  *                         All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * $COPYRIGHT$
  *
@@ -45,44 +45,39 @@
 /* xpmem is required by vader atm */
 #include <xpmem.h>
 
 #include "opal/include/opal/align.h"
 #include "opal/class/opal_free_list.h"
 #include "opal/sys/atomic.h"
 #include "ompi/mca/btl/btl.h"
 
 #include "ompi/mca/mpool/mpool.h"
 #include "ompi/mca/mpool/base/base.h"
-#include "ompi/mca/mpool/sm/mpool_sm.h"
-#include "ompi/mca/common/sm/common_sm.h"
 #include "ompi/mca/btl/base/base.h"
 
 #include "ompi/runtime/ompi_module_exchange.h"
 
 #include "ompi/mca/rcache/rcache.h"
 #include "ompi/mca/rcache/base/base.h"
 
 #include "btl_vader_endpoint.h"
 
 BEGIN_C_DECLS
 
 #define min(a,b) ((a) < (b) ? (a) : (b))
 
-extern int mca_btl_vader_memcpy_limit;
-extern int mca_btl_vader_log_align;
-extern int mca_btl_vader_max_inline_send;
-
-/* We can't use opal_cache_line_size here because we need a
-   compile-time constant for padding the struct.  We can't really have
-   a compile-time constant that is portable, either (e.g., compile on
-   one machine and run on another).  So just use a big enough cache
-   line that should hopefully be good in most places. */
-#define VADER_CACHE_LINE_PAD 128
-
 /* largest address we can attach to using xpmem */
 #define VADER_MAX_ADDRESS ((uintptr_t)0x7ffffffff000)
 
 /*
  * Shared Memory resource management
  */
 
 struct vader_fifo_t;
 
+/*
+ * Modex data
+ */
+struct vader_modex_t {
+    xpmem_segid_t seg_id;
+    void *segment_base;
+};
+
 /**
  * Shared Memory (VADER) BTL module.
  */
@@ -92,20 +87,10 @@ struct mca_btl_vader_component_t {
     int vader_free_list_max;                /**< maximum size of free lists */
     int vader_free_list_inc;                /**< number of elements to alloc
                                              * when growing free lists */
-    mca_mpool_base_module_t *vader_mpool;   /**< mpool on local node */
-    void *vader_mpool_base;                 /**< base address of shared memory pool */
-    size_t eager_limit;                     /**< send fragment size */
-    mca_common_sm_module_t *vader_seg;      /**< description of shared memory segment */
-    volatile struct vader_fifo_t **shm_fifo;/**< pointer to fifo 2D array in
-                                             * shared memory */
-    char **shm_bases;                       /**< pointer to base pointers in
-                                             * shared memory */
     xpmem_segid_t my_seg_id;                /* this rank's xpmem segment id */
-    xpmem_segid_t *shm_seg_ids;             /* xpmem segment ids */
-    struct vader_fifo_t **fifo;             /**< cached copy of the pointer to
-                                             * the 2D fifo array. */
-    struct mca_rcache_base_module_t **xpmem_rcaches;
-    xpmem_apid_t *apids;                    /* xpmem apids */
+    char *my_segment;                       /* this rank's base pointer */
+    size_t segment_size;                    /* size of my_segment */
+    size_t segment_offset;                  /* start of unused portion of my_segment */
     int32_t num_smp_procs;                  /**< current number of smp procs on this host */
     int32_t my_smp_rank;                    /**< My SMP process rank.  Used for accessing
                                              * SMP specific data structures. */
@@ -114,13 +99,11 @@ struct mca_btl_vader_component_t {
 
     opal_list_t active_sends;               /**< list of outstanding fragments */
 
-    unsigned char **vader_fboxes_in;        /**< incoming fast boxes (memory belongs to this process) */
-    unsigned char **vader_fboxes_out;       /**< outgoing fast boxes (memory belongs to remote peers) */
-
-    unsigned char *vader_next_fbox_in;      /**< indices of fast boxes to poll */
-    unsigned char *vader_next_fbox_out;     /**< indices of fast boxes to write */
+    int memcpy_limit;                       /** Limit where we switch from memmove to memcpy */
+    int log_attach_align;                   /** Log of the alignment for xpmem segments */
+    int max_inline_send;                    /** Limit for copy-in-copy-out fragments */
 
-    struct mca_btl_base_endpoint_t **vader_peers;
+    struct mca_btl_base_endpoint_t *endpoints;
 };
 typedef struct mca_btl_vader_component_t mca_btl_vader_component_t;
 OMPI_MODULE_DECLSPEC extern mca_btl_vader_component_t mca_btl_vader_component;
@@ -144,117 +127,38 @@ OMPI_MODULE_DECLSPEC extern mca_btl_vader_t mca_btl_vader;
  * we define macros to translate between relative addresses and
  * virtual addresses.
  */
-#define VIRTUAL2RELATIVE(VADDR ) ((intptr_t)(VADDR)  - (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
-#define RELATIVE2VIRTUAL(OFFSET) ((intptr_t)(OFFSET) + (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
 
-/* look up the remote pointer in the peer rcache and attach if
- * necessary */
-static inline mca_mpool_base_registration_t *vader_get_registation (int peer_smp_rank, void *rem_ptr,
-                                                                    size_t size, int flags)
+/* This only works for finding the relative address for a pointer within my_segment */
+static inline int64_t virtual2relative (char *addr)
 {
-    struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
-    mca_mpool_base_registration_t *regs[10], *reg = NULL;
-    struct xpmem_addr xpmem_addr;
-    uintptr_t base, bound;
-    int rc, i;
-
-    if (OPAL_UNLIKELY(peer_smp_rank == mca_btl_vader_component.my_smp_rank)) {
-        return rem_ptr;
-    }
-
-    base = (uintptr_t) down_align_addr(rem_ptr, mca_btl_vader_log_align);
-    bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1),
-                                      mca_btl_vader_log_align) + 1;
-    if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
-        bound = VADER_MAX_ADDRESS;
-    }
-
-    /* several segments may match the base pointer */
-    rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10);
-    for (i = 0 ; i < rc ; ++i) {
-        if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
-            opal_atomic_add (&regs[i]->ref_count, 1);
-            return regs[i];
-        }
-
-        if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) {
-            continue;
-        }
-
-        /* remove this pointer from the rcache and decrement its reference count
-           (so it is detached later) */
-        rc = rcache->rcache_delete (rcache, regs[i]);
-        if (OPAL_UNLIKELY(0 != rc)) {
-            /* someone beat us to it? */
-            break;
-        }
-
-        /* start the new segment from the lower of the two bases */
-        base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
-
-        opal_atomic_add (&regs[i]->ref_count, -1);
-
-        if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
-            /* this pointer is not in use */
-            (void) xpmem_detach (regs[i]->alloc_base);
-            OBJ_RELEASE(regs[i]);
-        }
-
-        break;
-    }
-
-    reg = OBJ_NEW(mca_mpool_base_registration_t);
-    if (OPAL_LIKELY(NULL != reg)) {
-        /* stick around for awhile */
-        reg->ref_count = 2;
-        reg->base  = (unsigned char *) base;
-        reg->bound = (unsigned char *) bound;
-        reg->flags = flags;
-
-        xpmem_addr.apid   = mca_btl_vader_component.apids[peer_smp_rank];
-        xpmem_addr.offset = base;
-
-        reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL);
-        if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) {
-            OBJ_RELEASE(reg);
-            reg = NULL;
-        } else {
-            rcache->rcache_insert (rcache, reg, 0);
-        }
-    }
-
-    return reg;
+    return (int64_t)(uintptr_t) (addr - mca_btl_vader_component.my_segment) | ((int64_t)mca_btl_vader_component.my_smp_rank << 32);
 }
 
-static inline void vader_return_registration (mca_mpool_base_registration_t *reg, int peer_smp_rank)
+static inline void *relative2virtual (int64_t offset)
 {
-    struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
-
-    opal_atomic_add (&reg->ref_count, -1);
-    if (OPAL_UNLIKELY(0 == reg->ref_count && !(reg->flags & MCA_MPOOL_FLAGS_PERSIST))) {
-        rcache->rcache_delete (rcache, reg);
-        (void)xpmem_detach (reg->alloc_base);
-        OBJ_RELEASE (reg);
-    }
-}
-
-static inline void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr)
-{
-    return (void *) ((uintptr_t) reg->alloc_base +
-                     (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
+    return (void *)(uintptr_t)((offset & 0xffffffffull) + mca_btl_vader_component.endpoints[offset >> 32].segment_base);
 }
 
 /* memcpy is faster at larger sizes but is undefined if the
    pointers are aliased (TODO -- readd alias check) */
 static inline void vader_memmove (void *dst, void *src, size_t size)
 {
-    if (size >= (size_t) mca_btl_vader_memcpy_limit) {
+    if (size >= (size_t) mca_btl_vader_component.memcpy_limit) {
         memcpy (dst, src, size);
     } else {
         memmove (dst, src, size);
     }
 }
 
+/* look up the remote pointer in the peer rcache and attach if
+ * necessary */
+mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr,
+                                                      size_t size, int flags);
+
+void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint);
+
+void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr);
+
 /**
  * Initiate a send to the peer.
  *
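Editor's note: the virtual2relative()/relative2virtual() pair above replaces the old shm_bases-based macros. An in-segment offset is packed into the low 32 bits and the owning SMP rank into the high 32 bits, so any local process can translate the value back through its own mapping of that rank's segment. A minimal standalone sketch of the round trip, where segment_base and my_rank are hypothetical stand-ins for the component/endpoint state:

    #include <stdint.h>

    static char *segment_base[4];   /* stand-in: one mapped base per local rank */
    static const int my_rank = 1;

    static int64_t encode (char *addr)
    {
        /* low 32 bits: offset within my segment; high 32 bits: my rank */
        return (int64_t)(uintptr_t)(addr - segment_base[my_rank]) |
               ((int64_t) my_rank << 32);
    }

    static char *decode (int64_t rel)
    {
        /* recover a pointer through our (possibly different) mapping of the
           owner's segment -- this is what relative2virtual() does per endpoint */
        return segment_base[rel >> 32] + (rel & 0xffffffffull);
    }

This is also why the scheme only works for pointers inside my_segment and why the per-rank offsets must stay below 4 GiB.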
@@ -12,7 +12,7 @@
  *                         All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009-2010 Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
@@ -26,6 +26,7 @@
 #include "ompi/constants.h"
 #include "opal/util/output.h"
 
+#include "opal/mca/base/mca_base_param.h"
 #include "ompi/mca/btl/base/btl_base_error.h"
 
 #include "btl_vader.h"
@@ -33,6 +34,8 @@
 #include "btl_vader_fifo.h"
 #include "btl_vader_fbox.h"
 
+#include <sys/mman.h>
+
 static int mca_btl_vader_component_progress (void);
 static int mca_btl_vader_component_open(void);
 static int mca_btl_vader_component_close(void);
@@ -41,38 +44,32 @@ static mca_btl_base_module_t** mca_btl_vader_component_init(int *num_btls,
                                                             bool enable_progress_threads,
                                                             bool enable_mpi_threads);
 
-/* limit where we should switch from bcopy to memcpy */
-int mca_btl_vader_memcpy_limit = 524288;
-int mca_btl_vader_log_align = 21; /* 2 MiB */
-/* maximum size for using copy-in-copy out semantics for contiguous sends */
-int mca_btl_vader_max_inline_send = 256;
-
 /*
  * Shared Memory (VADER) component instance.
  */
 mca_btl_vader_component_t mca_btl_vader_component = {
-    {
+    .super = {
         /* First, the mca_base_component_t struct containing meta information
            about the component itself */
-        {
+        .btl_version = {
             MCA_BTL_BASE_VERSION_2_0_0,
-
-            "vader", /* MCA component name */
-            OMPI_MAJOR_VERSION,  /* MCA component major version */
-            OMPI_MINOR_VERSION,  /* MCA component minor version */
-            OMPI_RELEASE_VERSION,  /* MCA component release version */
-            mca_btl_vader_component_open,  /* component open */
-            mca_btl_vader_component_close,  /* component close */
-            NULL,
-            mca_btl_vader_component_register,
+            .mca_component_name = "vader",
+            .mca_component_major_version = OMPI_MAJOR_VERSION,
+            .mca_component_minor_version = OMPI_MINOR_VERSION,
+            .mca_component_release_version = OMPI_RELEASE_VERSION,
+            .mca_open_component = mca_btl_vader_component_open,
+            .mca_close_component = mca_btl_vader_component_close,
+            .mca_query_component = NULL,
+            .mca_register_component_params = mca_btl_vader_component_register,
         },
-        {
+        .btl_data = {
             /* The component is checkpoint ready */
-            MCA_BASE_METADATA_PARAM_CHECKPOINT
+            .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
         },
 
-        mca_btl_vader_component_init,
-        mca_btl_vader_component_progress,
+        .btl_init = mca_btl_vader_component_init,
+        .btl_progress = mca_btl_vader_component_progress,
     }  /* end super */
 };
 
@@ -84,60 +81,73 @@ static int mca_btl_vader_component_register (void)
     /* register VADER component variables */
     mca_btl_vader_component.vader_free_list_num = 8;
     (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
-                                           "free_list_num", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
-                                           MCA_BASE_VAR_FLAG_SETTABLE,
-                                           OPAL_INFO_LVL_9,
+                                           "free_list_num", "Initial number of fragments "
+                                           "to allocate for shared memory communication.",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_vader_component.vader_free_list_num);
     mca_btl_vader_component.vader_free_list_max = 8192;
     (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
-                                           "free_list_max", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
-                                           MCA_BASE_VAR_FLAG_SETTABLE,
-                                           OPAL_INFO_LVL_9,
+                                           "free_list_max", "Maximum number of fragments "
+                                           "to allocate for shared memory communication.",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_vader_component.vader_free_list_max);
     mca_btl_vader_component.vader_free_list_inc = 64;
     (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
-                                           "free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
-                                           MCA_BASE_VAR_FLAG_SETTABLE,
-                                           OPAL_INFO_LVL_9,
+                                           "free_list_inc", "Number of fragments to create "
+                                           "on each allocation.", MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
                                            &mca_btl_vader_component.vader_free_list_inc);
 
-    mca_btl_vader_memcpy_limit = 524288;
+    mca_btl_vader_component.memcpy_limit = 524288;
     (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
-                                           "memcpy_limit", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
-                                           MCA_BASE_VAR_FLAG_SETTABLE,
-                                           OPAL_INFO_LVL_5,
+                                           "memcpy_limit", "Message size to switch from using "
+                                           "memmove to memcpy. The relative speed of these two "
+                                           "routines can vary by size.", MCA_BASE_VAR_TYPE_INT,
+                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
-                                           &mca_btl_vader_memcpy_limit);
-    mca_btl_vader_log_align = 21;
+                                           &mca_btl_vader_component.memcpy_limit);
+    mca_btl_vader_component.log_attach_align = 21;
     (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
-                                           "log_align", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
-                                           MCA_BASE_VAR_FLAG_SETTABLE,
-                                           OPAL_INFO_LVL_5,
+                                           "log_align", "Log base 2 of the alignment to use for xpmem "
+                                           "segments (default: 21, minimum: 12, maximum: 25)",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_LOCAL,
-                                           &mca_btl_vader_log_align);
+                                           &mca_btl_vader_component.log_attach_align);
 
-    mca_btl_vader_max_inline_send = 256;
+    mca_btl_vader_component.segment_size = 1 << 24;
     (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
-                                           "max_inline_send", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
-                                           MCA_BASE_VAR_FLAG_SETTABLE,
-                                           OPAL_INFO_LVL_5,
-                                           MCA_BASE_VAR_SCOPE_ALL_EQ,
-                                           &mca_btl_vader_max_inline_send);
+                                           "segment_size", "Maximum size of all shared "
+                                           "memory buffers (default: 16M)",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
+                                           MCA_BASE_VAR_SCOPE_LOCAL,
+                                           &mca_btl_vader_component.segment_size);
 
-    mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
-    mca_btl_vader.super.btl_eager_limit = 64 * 1024;
-    mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
-    mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit;
+    mca_btl_vader_component.max_inline_send = 256;
+    (void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
+                                           "max_inline_send", "Maximum size to transfer "
+                                           "using copy-in copy-out semantics",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
+                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
+                                           MCA_BASE_VAR_SCOPE_LOCAL,
+                                           &mca_btl_vader_component.max_inline_send);
+
+    mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
+    mca_btl_vader.super.btl_eager_limit = 64 * 1024;
+    mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
+    mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit;
     mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit;
-    mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit;
-    mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
-    mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT |
-        MCA_BTL_FLAGS_SEND_INPLACE;
-    mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
+    mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit;
+    mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
+
+    mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
+    mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
     mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */
     mca_btl_vader.super.btl_latency = 1; /* Microsecs */
 
@@ -155,16 +165,6 @@ static int mca_btl_vader_component_register (void)
 
 static int mca_btl_vader_component_open(void)
 {
-    /* limit segment alignment to be between 4k and 16M */
-    if (mca_btl_vader_log_align < 12) {
-        mca_btl_vader_log_align = 12;
-    } else if (mca_btl_vader_log_align > 25) {
-        mca_btl_vader_log_align = 25;
-    }
-
-    mca_btl_vader_component.eager_limit = mca_btl_vader.super.btl_eager_limit;
-
     /* initialize objects */
     OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t);
     OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t);
@@ -180,39 +180,26 @@ static int mca_btl_vader_component_open(void)
 
 static int mca_btl_vader_component_close(void)
 {
-    int return_value = OMPI_SUCCESS;
-
-    /**
-     * We don't have to destroy the fragment lists. They are allocated
-     * directly into the mmapped file, they will auto-magically disappear
-     * when the file get unmapped.
-     */
-    /*OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);*/
-
-    /* unmap the shared memory control structure */
-    if(mca_btl_vader_component.vader_seg != NULL) {
-        return_value = mca_common_sm_fini( mca_btl_vader_component.vader_seg );
-        if( OMPI_SUCCESS != return_value ) {
-            return_value=OMPI_ERROR;
-            opal_output(0," mca_common_sm_fini failed\n");
-            goto CLEANUP;
-        }
-
-        /* unlink file, so that it will be deleted when all references
-         * to it are gone - no error checking, since we want all procs
-         * to call this, so that in an abnormal termination scenario,
-         * this file will still get cleaned up */
-        /* XXX LANL TODO -- remove unlink once the shmem segment uses xpmem */
-        unlink(mca_btl_vader_component.vader_seg->shmem_ds.seg_name);
-        OBJ_RELEASE(mca_btl_vader_component.vader_seg);
-    }
+    OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);
+    OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_user);
+    OBJ_DESTRUCT(&mca_btl_vader_component.active_sends);
 
-CLEANUP:
+    if (NULL != mca_btl_vader_component.my_segment) {
+        munmap (mca_btl_vader_component.my_segment, mca_btl_vader_component.segment_size);
+    }
 
-    /* return */
-    return return_value;
+    return OMPI_SUCCESS;
 }
 
+static int mca_btl_base_vader_modex_send (void)
+{
+    struct vader_modex_t modex;
+
+    modex.seg_id = mca_btl_vader_component.my_seg_id;
+    modex.segment_base = mca_btl_vader_component.my_segment;
+
+    return ompi_modex_send(&mca_btl_vader_component.super.btl_version,
+                           &modex, sizeof (modex));
+}
+
 /*
@@ -224,18 +211,17 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
 {
     mca_btl_vader_component_t *component = &mca_btl_vader_component;
     mca_btl_base_module_t **btls = NULL;
+    int rc;
 
     *num_btls = 0;
 
     /* if no session directory was created, then we cannot be used */
+    /* XXX LANL FIXME -- this is not the case. we can use an anonymous segment */
     if (NULL == ompi_process_info.job_session_dir) {
        return NULL;
     }
 
-    /* lookup/create shared memory pool only when used */
-    component->vader_mpool      = NULL;
-    component->vader_mpool_base = NULL;
+    /* limit segment alignment to be between 4k and 16M */
+    if (mca_btl_vader_component.segment_size < 12) {
+        mca_btl_vader_component.segment_size = 12;
+    } else if (mca_btl_vader_component.segment_size > 25) {
+        mca_btl_vader_component.segment_size = 25;
+    }
 
     btls = (mca_btl_base_module_t **) calloc (1, sizeof (mca_btl_base_module_t *));
     if (NULL == btls) {
@@ -250,6 +236,35 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
         return NULL;
     }
 
+    /* ensure a sane segment size */
+    if (mca_btl_vader_component.segment_size < (1 << 20)) {
+        mca_btl_vader_component.segment_size = (1 << 20);
+    }
+
+    component->my_segment = mmap (NULL, mca_btl_vader_component.segment_size, PROT_READ |
+                                  PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
+    if ((void *)-1 == component->my_segment) {
+        free (btls);
+        return NULL;
+    }
+
+    component->segment_offset = 0;
+
+    /* initialize my fifo */
+    rc = vader_fifo_init ((struct vader_fifo_t *) component->my_segment);
+    if (OMPI_SUCCESS != rc) {
+        free (btls);
+        munmap (component->my_segment, mca_btl_vader_component.segment_size);
+        return NULL;
+    }
+
+    rc = mca_btl_base_vader_modex_send ();
+    if (OMPI_SUCCESS != rc) {
+        free (btls);
+        munmap (component->my_segment, mca_btl_vader_component.segment_size);
+        return NULL;
+    }
+
     *num_btls = 1;
 
     /* get pointer to the btls */
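Editor's note: the hunk above is the heart of the change. Instead of a common/sm file created in the session directory, each process mmap()s its own anonymous shared segment and advertises it through the modex. A compressed, self-contained sketch of just the mapping step (the modex publication is represented by a printf):

    #include <stdio.h>
    #include <sys/mman.h>

    int main (void)
    {
        size_t segment_size = 1 << 24;   /* 16 MiB, the new default */

        /* no backing file: the segment is private to this process until
           peers attach to it through xpmem */
        char *my_segment = mmap (NULL, segment_size, PROT_READ | PROT_WRITE,
                                 MAP_ANON | MAP_SHARED, -1, 0);
        if (MAP_FAILED == my_segment) {
            return 1;
        }

        /* the BTL publishes (my_seg_id, my_segment) via ompi_modex_send() */
        printf ("publish base %p size %zu\n", (void *) my_segment, segment_size);

        munmap (my_segment, segment_size);
        return 0;
    }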
@@ -268,21 +283,14 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
 
 static inline void mca_btl_vader_progress_sends (void)
 {
-    opal_list_t *list = &mca_btl_vader_component.active_sends;
-    opal_list_item_t *item, *next;
-    mca_btl_vader_frag_t *frag;
-
-    for (item = opal_list_get_first (list) ; item != opal_list_get_end (list) ; ) {
-        frag = (mca_btl_vader_frag_t *) item;
-        next = opal_list_get_next (item);
+    mca_btl_vader_frag_t *frag, *next;
 
+    OPAL_LIST_FOREACH_SAFE(frag, next, &mca_btl_vader_component.active_sends, mca_btl_vader_frag_t) {
         if (OPAL_LIKELY(frag->hdr->complete)) {
-            opal_list_remove_item (&mca_btl_vader_component.active_sends, item);
+            opal_list_remove_item (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);
 
             mca_btl_vader_frag_complete (frag);
         }
-
-        item = next;
     }
 }
 
@@ -290,11 +298,11 @@ static inline void mca_btl_vader_progress_sends (void)
 static int mca_btl_vader_component_progress (void)
 {
     int my_smp_rank = mca_btl_vader_component.my_smp_rank;
-    vader_fifo_t *fifo = mca_btl_vader_component.fifo[my_smp_rank];
     mca_btl_active_message_callback_t *reg;
     mca_btl_vader_frag_t frag;
     mca_btl_vader_hdr_t *hdr;
     mca_mpool_base_registration_t *xpmem_reg = NULL;
+    struct mca_btl_base_endpoint_t *endpoint;
 
     /* check active sends for completion */
     mca_btl_vader_progress_sends ();
@@ -303,7 +311,7 @@ static int mca_btl_vader_component_progress (void)
     mca_btl_vader_check_fboxes ();
 
     /* poll the fifo once */
-    hdr = vader_fifo_read (fifo);
+    hdr = vader_fifo_read (mca_btl_vader_component.endpoints[my_smp_rank].fifo);
     if (NULL == hdr) {
         return 0;
     }
@@ -314,7 +322,8 @@ static int mca_btl_vader_component_progress (void)
     frag.segments[0].seg_len = hdr->len;
 
     if (OPAL_UNLIKELY(hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY)) {
-        xpmem_reg = vader_get_registation (hdr->my_smp_rank, hdr->sc_iov.iov_base,
+        endpoint = mca_btl_vader_component.endpoints + hdr->my_smp_rank;
+        xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base,
                                            hdr->sc_iov.iov_len, 0);
 
         frag.segments[1].seg_addr.pval = vader_reg_to_ptr (xpmem_reg, hdr->sc_iov.iov_base);
@@ -323,7 +332,7 @@ static int mca_btl_vader_component_progress (void)
         /* recv upcall */
         frag.base.des_dst_cnt = 2;
         reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
-        vader_return_registration (xpmem_reg, hdr->my_smp_rank);
+        vader_return_registration (xpmem_reg, endpoint);
     } else {
         frag.base.des_dst_cnt = 1;
         reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
@@ -24,6 +24,10 @@
 #ifndef MCA_BTL_VADER_ENDPOINT_H
 #define MCA_BTL_VADER_ENDPOINT_H
 
+#include <xpmem.h>
+
+struct vader_fifo_t;
+
 /**
  * An abstraction that represents a connection to a endpoint process.
  * An instance of mca_ptl_base_endpoint_t is associated w/ each process
@@ -31,10 +35,16 @@
  */
 
 struct mca_btl_base_endpoint_t {
-    int my_smp_rank;    /**< My SMP process rank.  Used for accessing
-                         *   SMP specific data structures. */
     int peer_smp_rank;  /**< My peer's SMP process rank.  Used for accessing
                         *   SMP specific data structures. */
+    char *segment_base;
+    struct vader_fifo_t *fifo;
+    xpmem_apid_t apid;
+    char *fbox_out;
+    char *fbox_in;
+    int next_fbox_out;
+    int next_fbox_in;
+    struct mca_rcache_base_module_t *rcache;
 };
 
 #endif /* MCA_BTL_VADER_ENDPOINT_H */
@@ -23,16 +23,13 @@
 
 enum {MCA_BTL_VADER_FBOX_FREE = 0xfe, MCA_BTL_VADER_FBOX_RESERVED = 0xff};
 
-#define MCA_BTL_VADER_FBOX_OUT_PTR(peer_smp_rank, fbox) \
-    (mca_btl_vader_component.vader_fboxes_out[peer_smp_rank] + FBOX_SIZE * (fbox))
+#define MCA_BTL_VADER_FBOX_OUT_PTR(ep, fbox) ((ep)->fbox_out + FBOX_SIZE * (fbox))
+#define MCA_BTL_VADER_FBOX_IN_PTR(ep, fbox)  ((ep)->fbox_in + FBOX_SIZE * (fbox))
 
-#define MCA_BTL_VADER_FBOX_IN_PTR(peer_smp_rank, fbox) \
-    (mca_btl_vader_component.vader_fboxes_in[peer_smp_rank] + FBOX_SIZE * (fbox))
-
-static inline unsigned char *mca_btl_vader_reserve_fbox (int peer_smp_rank, size_t size)
+static inline unsigned char *mca_btl_vader_reserve_fbox (struct mca_btl_base_endpoint_t *ep, size_t size)
 {
-    int next_fbox = mca_btl_vader_component.vader_next_fbox_out[peer_smp_rank];
-    unsigned char *fbox = MCA_BTL_VADER_FBOX_OUT_PTR(peer_smp_rank, next_fbox);
+    int next_fbox = ep->next_fbox_out;
+    unsigned char *fbox = (unsigned char *) MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
 
     /* todo -- need thread locks/atomics here for the multi-threaded case */
     if (OPAL_UNLIKELY(size > MAX_MSG || fbox[0] != MCA_BTL_VADER_FBOX_FREE)) {
@@ -40,7 +37,7 @@ static inline unsigned char *mca_btl_vader_reserve_fbox (int peer_smp_rank, size
         return NULL;
     }
 
-    mca_btl_vader_component.vader_next_fbox_out[peer_smp_rank] = (next_fbox + 1) & LAST_FBOX;
+    ep->next_fbox_out = (next_fbox + 1) & LAST_FBOX;
 
     /* mark this fast box as in use */
     fbox[0] = MCA_BTL_VADER_FBOX_RESERVED;
@@ -64,7 +61,7 @@ static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endp
 {
     unsigned char *fbox;
 
-    fbox = mca_btl_vader_reserve_fbox(endpoint->peer_smp_rank, header_size + payload_size);
+    fbox = mca_btl_vader_reserve_fbox(endpoint, header_size + payload_size);
     if (OPAL_UNLIKELY(NULL == fbox)) {
         return 0;
     }
@@ -86,18 +83,20 @@ static inline void mca_btl_vader_check_fboxes (void)
 {
     int my_smp_rank = mca_btl_vader_component.my_smp_rank;
     mca_btl_active_message_callback_t *reg;
+    struct mca_btl_base_endpoint_t *endpoint;
+    unsigned char size, tag, *fbox;
     mca_btl_vader_frag_t frag;
-    unsigned char size, tag;
-    int i;
+    int i, next_fbox;
 
     for (i = 0 ; i < mca_btl_vader_component.num_smp_procs ; ++i) {
-        int next_fbox = mca_btl_vader_component.vader_next_fbox_in[i];
-        unsigned char *fbox = MCA_BTL_VADER_FBOX_IN_PTR(i, next_fbox);
-
         if (my_smp_rank == i) {
             continue;
         }
 
+        endpoint = mca_btl_vader_component.endpoints + i;
+        next_fbox = endpoint->next_fbox_in;
+        fbox = (unsigned char *) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
+
         /* process all fast-box messages */
         while (0xfe != ((size = fbox[0]) & 0xfe)) {
             opal_atomic_rmb ();
@@ -116,10 +115,10 @@ static inline void mca_btl_vader_check_fboxes (void)
             fbox[0] = MCA_BTL_VADER_FBOX_FREE;
 
             next_fbox = next_fbox == LAST_FBOX ? 0 : next_fbox + 1;
-            fbox = MCA_BTL_VADER_FBOX_IN_PTR(i, next_fbox);
+            fbox = (unsigned char *) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
         }
 
-        mca_btl_vader_component.vader_next_fbox_in[i] = next_fbox;
+        endpoint->next_fbox_in = next_fbox;
     }
 }
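Editor's note: the endpoint-based fast boxes keep the original protocol: the first byte of each slot is a state flag (0xfe free, 0xff reserved, anything else the message size), and sender and receiver walk the ring with the masked next_fbox indices. A simplified sender-side sketch; FBOX_SIZE and LAST_FBOX here are assumed values, not the BTL's actual constants:

    #include <stddef.h>

    #define FBOX_SIZE 64          /* assumed slot size */
    #define LAST_FBOX 63          /* ring holds LAST_FBOX + 1 slots */
    enum { FBOX_FREE = 0xfe, FBOX_RESERVED = 0xff };

    static unsigned char *reserve_slot (unsigned char *fbox_out, int *next_out,
                                        size_t size)
    {
        unsigned char *slot = fbox_out + FBOX_SIZE * *next_out;

        /* slot still owned by the receiver, or payload too big for one slot */
        if (size > FBOX_SIZE - 1 || FBOX_FREE != slot[0]) {
            return NULL;
        }

        *next_out = (*next_out + 1) & LAST_FBOX;
        slot[0] = FBOX_RESERVED;  /* later overwritten with the actual size */
        return slot + 1;          /* payload follows the flag byte */
    }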
@@ -30,7 +30,7 @@
 #include "btl_vader_endpoint.h"
 #include "btl_vader_frag.h"
 
-#define VADER_FIFO_FREE  ((intptr_t)-2)
+#define VADER_FIFO_FREE  ((int64_t)-2)
 
 /*
  * Shared Memory FIFOs
@@ -48,9 +48,10 @@
 
 /* lock free fifo */
 typedef struct vader_fifo_t {
-    volatile intptr_t fifo_head;
-    volatile intptr_t fifo_tail;
-    char pad[VADER_CACHE_LINE_PAD - 2 * sizeof (intptr_t)];
+    volatile int64_t fifo_head;
+    volatile int64_t fifo_tail;
+    /* pad out to fill a cache line (64 or 128 bytes) */
+    char pad[128 - 2 * sizeof (int64_t)];
 } vader_fifo_t;
 
 static inline int vader_fifo_init (vader_fifo_t *fifo)
@@ -60,19 +61,21 @@ static inline int vader_fifo_init (vader_fifo_t *fifo)
     return OMPI_SUCCESS;
 }
 
-static inline void vader_fifo_write (mca_btl_vader_hdr_t *hdr, int rank)
+static inline void vader_fifo_write (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
 {
-    vader_fifo_t *fifo = mca_btl_vader_component.fifo[rank];
-    intptr_t prev, value = VIRTUAL2RELATIVE(hdr);
+    vader_fifo_t *fifo = ep->fifo;
+    int64_t prev, value = virtual2relative ((char *) hdr);
 
     hdr->next = VADER_FIFO_FREE;
 
     opal_atomic_wmb ();
-    prev = opal_atomic_swap_ptr (&fifo->fifo_tail, value);
+    prev = opal_atomic_swap_64 (&fifo->fifo_tail, value);
     opal_atomic_rmb ();
 
+    assert (prev != value);
+
     if (OPAL_LIKELY(VADER_FIFO_FREE != prev)) {
-        hdr = (mca_btl_vader_hdr_t *) RELATIVE2VIRTUAL(prev);
+        hdr = (mca_btl_vader_hdr_t *) relative2virtual (prev);
         hdr->next = value;
     } else {
         fifo->fifo_head = value;
|
||||
static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
|
||||
{
|
||||
mca_btl_vader_hdr_t *hdr;
|
||||
intptr_t value;
|
||||
int64_t value;
|
||||
|
||||
opal_atomic_rmb ();
|
||||
|
||||
value = opal_atomic_swap_ptr (&fifo->fifo_head, VADER_FIFO_FREE);
|
||||
value = opal_atomic_swap_64 (&fifo->fifo_head, VADER_FIFO_FREE);
|
||||
if (VADER_FIFO_FREE == value) {
|
||||
/* fifo is empty or we lost the race with another thread */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
hdr = (mca_btl_vader_hdr_t *) RELATIVE2VIRTUAL(value);
|
||||
hdr = (mca_btl_vader_hdr_t *) relative2virtual (value);
|
||||
|
||||
assert (hdr->next != value);
|
||||
|
||||
if (OPAL_UNLIKELY(VADER_FIFO_FREE == hdr->next)) {
|
||||
opal_atomic_rmb();
|
||||
|
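Editor's note: the FIFO stays a lock-free multiple-writer/single-reader linked list; only the representation of the links changed (explicit 64-bit relative values instead of pointer-sized offsets). A compact model of the enqueue, using C11 atomics in place of the opal wrappers and addressing nodes by index the way the BTL addresses headers by relative offset:

    #include <stdatomic.h>
    #include <stdint.h>

    #define FIFO_FREE ((int64_t) -2)

    typedef struct { _Atomic int64_t next; } node_t;
    typedef struct { _Atomic int64_t head, tail; } fifo_t;

    static void fifo_write (fifo_t *fifo, node_t *nodes, int64_t value)
    {
        int64_t prev;

        atomic_store (&nodes[value].next, FIFO_FREE);

        /* a single atomic swap claims the tail; no CAS retry loop needed */
        prev = atomic_exchange (&fifo->tail, value);

        if (FIFO_FREE != prev) {
            atomic_store (&nodes[prev].next, value);  /* link behind old tail */
        } else {
            atomic_store (&fifo->head, value);        /* fifo was empty */
        }
    }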
@@ -11,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2011      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
  *                         reserved.
 * $COPYRIGHT$
 *
@@ -31,6 +31,35 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
     if(frag->hdr != NULL) {
         frag->hdr->my_smp_rank = mca_btl_vader_component.my_smp_rank;
     }
+
+    frag->base.des_src     = frag->segments;
+    frag->base.des_src_cnt = 1;
+    frag->base.des_dst     = frag->segments;
+    frag->base.des_dst_cnt = 1;
 }
 
+void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx)
+{
+    unsigned int frag_size = (unsigned int)(uintptr_t) ctx;
+
+    if (mca_btl_vader_component.segment_size < mca_btl_vader_component.segment_offset + frag_size) {
+        item->ptr = NULL;
+    }
+
+    item->ptr = mca_btl_vader_component.my_segment + mca_btl_vader_component.segment_offset;
+    mca_btl_vader_component.segment_offset += frag_size;
+
+    mca_btl_vader_frag_constructor ((mca_btl_vader_frag_t *) item);
+}
+
+void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag)
+{
+    frag->base.des_src     = frag->segments;
+    frag->base.des_src_cnt = 1;
+    frag->base.des_dst     = frag->segments;
+    frag->base.des_dst_cnt = 1;
+
+    OMPI_FREE_LIST_RETURN(frag->my_list, (ompi_free_list_item_t *)frag);
+}
+
 OBJ_CLASS_INSTANCE(mca_btl_vader_frag_t, mca_btl_base_descriptor_t,
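Editor's note: mca_btl_vader_frag_init() above carves each fragment out of my_segment with a simple bump of segment_offset, which is why the fragment free lists no longer need an mpool. A minimal model of that allocation step (the NULL path mirrors the exhaustion check in the hunk):

    #include <stddef.h>

    typedef struct {
        char  *base;    /* the mmap'd segment */
        size_t size;    /* total segment size */
        size_t offset;  /* first unused byte */
    } segment_t;

    static void *segment_alloc (segment_t *seg, size_t frag_size)
    {
        if (seg->size < seg->offset + frag_size) {
            return NULL;              /* segment exhausted */
        }

        void *ptr = seg->base + seg->offset;
        seg->offset += frag_size;     /* bump; fragments are never unmapped */
        return ptr;
    }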
@@ -12,7 +12,7 @@
  *                         All rights reserved.
  * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
  *                         reserved.
 * $COPYRIGHT$
 *
@@ -34,7 +34,6 @@ struct mca_btl_vader_hdr_t {
     volatile intptr_t next; /* next item in fifo. many peers may touch this */
     volatile bool complete; /* fragment completion (usually 1 byte) */
     mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */
-    char pad[2];
     int flags;              /* vader send flags */
     int my_smp_rank;        /* smp rank of owning process */
     size_t len;             /* length of data following this header */
@@ -64,6 +63,12 @@ static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_fr
     OMPI_FREE_LIST_GET(list, item, rc);
     *frag = (mca_btl_vader_frag_t *) item;
     if (OPAL_LIKELY(NULL != item)) {
+        if (NULL == (*frag)->hdr) {
+            OMPI_FREE_LIST_RETURN(list, (ompi_free_list_item_t *)*frag);
+            *frag = NULL;
+            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
+        }
+
         (*frag)->hdr->complete = false;
         (*frag)->hdr->flags = MCA_BTL_VADER_FLAG_INLINE;
         (*frag)->segments[0].seg_addr.pval = (char *)((*frag)->hdr + 1);
@@ -73,12 +78,14 @@
     return rc;
 }
 
+void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag);
+
 #define MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag) \
     mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_eager)
 #define MCA_BTL_VADER_FRAG_ALLOC_USER(frag) \
     mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_user)
-#define MCA_BTL_VADER_FRAG_RETURN(frag) \
-    OMPI_FREE_LIST_RETURN((frag)->my_list, (ompi_free_list_item_t *)(frag))
+#define MCA_BTL_VADER_FRAG_RETURN(frag) mca_btl_vader_frag_return(frag)
 
 
 static inline void mca_btl_vader_frag_complete (mca_btl_vader_frag_t *frag) {
     if (OPAL_UNLIKELY(MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
@@ -92,4 +99,6 @@ static inline void mca_btl_vader_frag_complete (mca_btl_vader_frag_t *frag) {
     }
 }
 
+void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx);
+
 #endif /* MCA_BTL_VADER_SEND_FRAG_H */
@@ -1,6 +1,6 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2010-2011 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
 * $COPYRIGHT$
 *
@@ -33,8 +33,7 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
     mca_mpool_base_registration_t *reg;
     void *rem_ptr;
 
-    reg = vader_get_registation (endpoint->peer_smp_rank,
-                                 (void *)(uintptr_t) src->seg_addr.lval,
+    reg = vader_get_registation (endpoint, (void *)(uintptr_t) src->seg_addr.lval,
                                  src->seg_len, 0);
     if (OPAL_UNLIKELY(NULL == reg)) {
         return OMPI_ERROR;
@@ -44,7 +43,7 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
 
     vader_memmove ((void *)(uintptr_t) dst->seg_addr.lval, rem_ptr, size);
 
-    vader_return_registration (reg, endpoint->peer_smp_rank);
+    vader_return_registration (reg, endpoint);
 
     mca_btl_vader_frag_complete (frag);
 
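Editor's note: mca_btl_vader_get() is now fully endpoint-based: look up (or create) an xpmem registration covering the remote range, translate the remote pointer through it, copy, and drop the reference. A self-contained sketch of the pointer-rebasing step, with hypothetical stand-in types for the rcache/xpmem machinery:

    #include <stddef.h>
    #include <string.h>

    /* hypothetical stand-in for an xpmem registration */
    typedef struct { char *local_base; char *remote_base; } registration_t;

    static void *reg_to_local (registration_t *reg, void *rem_ptr)
    {
        /* same arithmetic as vader_reg_to_ptr(): rebase the peer's virtual
           address into our locally attached mapping */
        return reg->local_base + ((char *) rem_ptr - reg->remote_base);
    }

    static void shm_get (registration_t *reg, void *rem_src, void *loc_dst,
                         size_t size)
    {
        memmove (loc_dst, reg_to_local (reg, rem_src), size);
    }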
@@ -12,7 +12,7 @@
  *                         All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
  *                         All rights reserved.
 * $COPYRIGHT$
 *
@@ -70,204 +70,76 @@ static int vader_ft_event (int state);
 mca_btl_vader_t mca_btl_vader = {
     {
         &mca_btl_vader_component.super,
-        0, /* btl_eager_limit */
-        0, /* btl_rndv_eager_limit */
-        0, /* btl_max_send_size */
-        0, /* btl_rdma_pipeline_send_length */
-        0, /* btl_rdma_pipeline_frag_size */
-        0, /* btl_min_rdma_pipeline_size */
-        0, /* btl_exclusivity */
-        0, /* btl_latency */
-        0, /* btl_bandwidth */
-        0, /* btl_flags */
-        0, /* btl segment size */
-        vader_add_procs,
-        vader_del_procs,
-        NULL, /* btl_register */
-        vader_finalize,
-        mca_btl_vader_alloc,
-        vader_free,
-        vader_prepare_src,
-        vader_prepare_dst,
-        mca_btl_vader_send,
-        mca_btl_vader_sendi,
-        mca_btl_vader_put,
-        mca_btl_vader_get,
-        mca_btl_base_dump,
-        NULL, /* btl_mpool */
-        vader_register_error_cb, /* register error */
-        vader_ft_event
+        .btl_eager_limit = 0,
+        .btl_rndv_eager_limit = 0,
+        .btl_max_send_size = 0,
+        .btl_rdma_pipeline_send_length = 0,
+        .btl_rdma_pipeline_frag_size = 0,
+        .btl_min_rdma_pipeline_size = 0,
+        .btl_exclusivity = 0,
+        .btl_latency = 0,
+        .btl_bandwidth = 0,
+        .btl_flags = 0,
+        .btl_seg_size = 0,
+        .btl_add_procs = vader_add_procs,
+        .btl_del_procs = vader_del_procs,
+        .btl_register = NULL,
+        .btl_finalize = vader_finalize,
+        .btl_alloc = mca_btl_vader_alloc,
+        .btl_free = vader_free,
+        .btl_prepare_src = vader_prepare_src,
+        .btl_prepare_dst = vader_prepare_dst,
+        .btl_send = mca_btl_vader_send,
+        .btl_sendi = mca_btl_vader_sendi,
+        .btl_put = mca_btl_vader_put,
+        .btl_get = mca_btl_vader_get,
+        .btl_dump = mca_btl_base_dump,
+        .btl_mpool = NULL,
+        .btl_register_error = vader_register_error_cb,
+        .btl_ft_event = vader_ft_event
     }
 };
 
-static inline int vader_init_mpool (mca_btl_vader_t *vader_btl, int n)
-{
-    mca_btl_vader_component_t *component = &mca_btl_vader_component;
-    mca_mpool_base_resources_t res;
-
-    res.mem_node = -1;
-
-    /* determine how much memory to create */
-    /*
-     * This heuristic formula mostly says that we request memory for:
-     * - a vader fifo
-     * - eager fragments (2 * n of them, allocated in vader_free_list_inc chunks)
-     *
-     * On top of all that, we sprinkle in some number of "opal_cache_line_size"
-     * additions to account for some padding and edge effects that may lie
-     * in the allocator.
-     */
-    res.size = sizeof (vader_fifo_t) + 4 * opal_cache_line_size +
-        (2 * n + component->vader_free_list_inc) * (component->eager_limit + 2 * opal_cache_line_size);
-
-    /* before we multiply by n, make sure the result won't overflow */
-    /* Stick that little pad in, particularly since we'll eventually
-     * need a little extra space.  E.g., in mca_mpool_vader_init() in
-     * mpool_vader_component.c when sizeof(mca_common_sm_module_t) is
-     * added.
-     */
-    if ( ((double) res.size) * n > LONG_MAX - 4096 )
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    res.size *= n;
-
-    /* now, create it */
-    component->vader_mpool =
-        mca_mpool_base_module_create("sm", vader_btl, &res);
-    /* Sanity check to ensure that we found it */
-    if(NULL == component->vader_mpool) {
-        return OMPI_ERR_OUT_OF_RESOURCE;
-    }
-
-    component->vader_mpool_base =
-        component->vader_mpool->mpool_base (component->vader_mpool);
-
-    return OMPI_SUCCESS;
-}
-
 static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
 {
     mca_btl_vader_component_t *component = &mca_btl_vader_component;
-    size_t size;
-    char *vader_ctl_file;
-    vader_fifo_t *my_fifos;
-    ompi_proc_t **procs;
-    size_t num_procs;
-    int i, rc;
+    int rc;
 
-    rc = vader_init_mpool (vader_btl, n);
+    /* generate the endpoints */
+    component->endpoints = (struct mca_btl_base_endpoint_t *) calloc (n, sizeof (struct mca_btl_base_endpoint_t));
+
+    component->segment_offset = (n + 1) * 4096;
+
+    /* initialize fragment descriptor free lists */
+    /* initialize free list for send fragments */
+    rc = ompi_free_list_init_ex_new(&component->vader_frags_eager,
+                                    sizeof (mca_btl_vader_frag_t),
+                                    opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
+                                    0, opal_cache_line_size,
+                                    component->vader_free_list_num,
+                                    component->vader_free_list_max,
+                                    component->vader_free_list_inc,
+                                    NULL, mca_btl_vader_frag_init,
+                                    (void *) (sizeof (mca_btl_vader_hdr_t) +
+                                              mca_btl_vader.super.btl_eager_limit));
     if (OMPI_SUCCESS != rc) {
         return rc;
     }
 
-    /* create a list of peers */
-    component->vader_peers = (struct mca_btl_base_endpoint_t **)
-        calloc(n, sizeof(struct mca_btl_base_endpoint_t *));
-    if(NULL == component->vader_peers)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    /* Allocate Shared Memory BTL process coordination
-     * data structure.  This will reside in shared memory */
-
-    /* set file name */
-    if(asprintf(&vader_ctl_file, "%s"OPAL_PATH_SEP"vader_btl_module.%s",
-                ompi_process_info.job_session_dir,
-                ompi_process_info.nodename) < 0)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    /* Pass in a data segment alignment of 0 to get no data
-       segment (only the shared control structure) */
-    size = sizeof (mca_common_sm_seg_header_t) +
-        n * (sizeof (vader_fifo_t *) + sizeof (char *)
-             + sizeof (xpmem_segid_t)) + opal_cache_line_size;
-    procs = ompi_proc_world(&num_procs);
-    if (!(mca_btl_vader_component.vader_seg =
-          mca_common_sm_init(procs, num_procs, size, vader_ctl_file,
-                             sizeof (mca_common_sm_seg_header_t),
-                             opal_cache_line_size))) {
-        opal_output(0, "vader_add_procs: unable to create shared memory "
-                    "BTL coordinating structure :: size %lu \n",
-                    (unsigned long) size);
-        free(procs);
-        free(vader_ctl_file);
-        return OMPI_ERROR;
-    }
-    free(procs);
-    free(vader_ctl_file);
-
-    component->shm_fifo = (volatile vader_fifo_t **) component->vader_seg->module_data_addr;
-    component->shm_bases = (char **)(component->shm_fifo + n);
-    component->shm_seg_ids = (xpmem_segid_t *)(component->shm_bases + n);
-
-    /* set the base of the shared memory segment */
-    component->shm_bases[component->my_smp_rank] = (char *)component->vader_mpool_base;
-    component->shm_seg_ids[component->my_smp_rank] = component->my_seg_id;
-
-    /* initialize the fifo and fast boxes "owned" by this process */
-    posix_memalign ((void **)&my_fifos, getpagesize (), (n + 1) * getpagesize ());
-    if(NULL == my_fifos)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    /* cache the pointer to the 2d fifo array.  These addresses
-     * are valid in the current process space */
-    component->fifo = (vader_fifo_t **) calloc (n, sizeof(vader_fifo_t *));
-    if(NULL == component->fifo)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    component->shm_fifo[component->my_smp_rank] =
-        component->fifo[component->my_smp_rank] = my_fifos;
-
-    component->apids = (xpmem_apid_t *) calloc (n, sizeof (xpmem_apid_t));
-    if (NULL == component->apids)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    component->xpmem_rcaches =
-        (struct mca_rcache_base_module_t **) calloc (n, sizeof (struct mca_rcache_base_module_t *));
-    if (NULL == component->xpmem_rcaches)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    component->vader_fboxes_in = (unsigned char **) calloc (n, sizeof (char *));
-    if (NULL == component->vader_fboxes_in)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    component->vader_fboxes_out = (unsigned char **) calloc (n, sizeof (char *));
-    if (NULL == component->vader_fboxes_out)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    component->vader_next_fbox_in = (unsigned char *) calloc (64, 1);
-    if (NULL == component->vader_next_fbox_in)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    component->vader_next_fbox_out = (unsigned char *) calloc (64, 1);
-    if (NULL == component->vader_next_fbox_out)
-        return OMPI_ERR_OUT_OF_RESOURCE;
-
-    /* initialize fragment descriptor free lists */
-    /* initialize free list for send fragments */
-    i = ompi_free_list_init_new(&component->vader_frags_eager,
-                                sizeof (mca_btl_vader_frag_t),
-                                opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
-                                sizeof (mca_btl_vader_hdr_t) + component->eager_limit,
-                                opal_cache_line_size,
-                                component->vader_free_list_num,
-                                component->vader_free_list_max,
-                                component->vader_free_list_inc,
-                                component->vader_mpool);
-    if (OMPI_SUCCESS != i)
-        return i;
-
     /* initialize free list for put/get fragments */
-    i = ompi_free_list_init_new(&component->vader_frags_user,
-                                sizeof(mca_btl_vader_frag_t),
-                                opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
-                                sizeof(mca_btl_vader_hdr_t) + mca_btl_vader_max_inline_send,
-                                opal_cache_line_size,
-                                component->vader_free_list_num,
-                                component->vader_free_list_max,
-                                component->vader_free_list_inc,
-                                component->vader_mpool);
-    if (OMPI_SUCCESS != i)
-        return i;
+    rc = ompi_free_list_init_ex_new(&component->vader_frags_user,
+                                    sizeof(mca_btl_vader_frag_t),
+                                    opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
+                                    0, opal_cache_line_size,
+                                    component->vader_free_list_num,
+                                    component->vader_free_list_max,
+                                    component->vader_free_list_inc,
+                                    NULL, mca_btl_vader_frag_init,
+                                    (void *) (sizeof(mca_btl_vader_hdr_t) +
+                                              mca_btl_vader_component.max_inline_send));
+    if (OMPI_SUCCESS != rc) {
+        return rc;
+    }
 
     /* set flag indicating btl has been inited */
     vader_btl->btl_inited = true;
@@ -275,16 +147,38 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
     return OMPI_SUCCESS;
 }
 
-static struct
-mca_btl_base_endpoint_t *create_vader_endpoint (int local_proc, struct ompi_proc_t *proc)
-{
-    struct mca_btl_base_endpoint_t *ep = (struct mca_btl_base_endpoint_t *)
-        calloc(1, sizeof (struct mca_btl_base_endpoint_t));
-    if(NULL != ep) {
-        ep->peer_smp_rank = local_proc + mca_btl_vader_component.num_smp_procs;
+static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct ompi_proc_t *proc, int local_rank) {
+    mca_btl_vader_component_t *component = &mca_btl_vader_component;
+    struct vader_modex_t *modex;
+    size_t msg_size;
+    int rc;
+
+    ep->peer_smp_rank = local_rank;
+
+    if (OMPI_SUCCESS != (rc = ompi_modex_recv(&component->super.btl_version,
+                                              proc, (void *)&modex, &msg_size))) {
+        return rc;
     }
 
-    return ep;
+    ep->apid   = xpmem_get (modex->seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
+    ep->rcache = mca_rcache_base_module_create("vma");
+    ep->next_fbox_out = 0;
+    ep->next_fbox_in  = 0;
+
+    /* Attach to the remote process' segment */
+    ep->segment_base =
+        vader_reg_to_ptr (vader_get_registation (ep, modex->segment_base, mca_btl_vader_component.segment_size,
+                                                 MCA_MPOOL_FLAGS_PERSIST),
+                          modex->segment_base);
+
+    ep->fifo = (struct vader_fifo_t *) ep->segment_base;
+    ep->fbox_out = ep->segment_base + (1 + component->my_smp_rank) * 4096;
+    ep->fbox_in  = component->my_segment + (1 + local_rank) * 4096;
+
+    memset (ep->fbox_in, MCA_BTL_VADER_FBOX_FREE, 4096);
+
+    return OMPI_SUCCESS;
 }
 
 /**
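Editor's note: init_vader_endpoint() above is the consumer side of the modex: it turns the peer's (seg_id, segment_base) pair into a local mapping. The xpmem sequence hidden behind vader_get_registation(), shown in isolation (error handling trimmed; the segid would come from the received modex message):

    #include <xpmem.h>
    #include <sys/types.h>
    #include <stddef.h>

    static void *attach_peer_segment (xpmem_segid_t segid, off_t offset, size_t size)
    {
        struct xpmem_addr addr;

        /* get an access handle for the peer's exported segment */
        xpmem_apid_t apid = xpmem_get (segid, XPMEM_RDWR, XPMEM_PERMIT_MODE,
                                       (void *) 0666);
        if (-1 == apid) {
            return NULL;
        }

        addr.apid   = apid;
        addr.offset = offset;

        /* map [offset, offset + size) of the peer's segment into our address
           space; xpmem_attach returns (void *) -1 on failure */
        return xpmem_attach (addr, size, NULL);
    }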
@@ -308,7 +202,7 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
 {
     mca_btl_vader_component_t *component = &mca_btl_vader_component;
     mca_btl_vader_t *vader_btl = (mca_btl_vader_t *) btl;
-    int32_t n_local_procs = 0, proc, i, my_smp_rank = -1;
+    int32_t n_local_procs = 0, proc, local_rank, my_smp_rank = -1;
     bool have_connected_peer = false;
     ompi_proc_t *my_proc;
     int rc = OMPI_SUCCESS;
@ -329,130 +223,65 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
|
||||
if they're on my local host and in my job) */
|
||||
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
|
||||
!OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
|
||||
peers[proc] = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check to see if this is me */
|
||||
if (my_proc == procs[proc]) {
|
||||
my_smp_rank = component->my_smp_rank = n_local_procs++;
|
||||
continue;
|
||||
}
|
||||
|
||||
-        /* we have someone to talk to */
-        have_connected_peer = true;
-
-        if (!(peers[proc] = create_vader_endpoint (n_local_procs, procs[proc]))) {
-            rc = OMPI_ERROR;
-            goto CLEANUP;
-        }
         n_local_procs++;
 
-        /* add this proc to shared memory accessibility list */
-        rc = opal_bitmap_set_bit (reachability, proc);
-        if(OMPI_SUCCESS != rc) {
-            goto CLEANUP;
+        if (my_proc != procs[proc]) {
+            /* we have someone to talk to */
+            have_connected_peer = true;
+
+            /* add this proc to shared memory accessibility list */
+            rc = opal_bitmap_set_bit (reachability, proc);
+            if(OMPI_SUCCESS != rc) {
+                return rc;
+            }
+        } else {
+            my_smp_rank = mca_btl_vader_component.my_smp_rank = n_local_procs - 1;
         }
     }
 
     /* jump out if there's not someone we can talk to */
     if (!have_connected_peer) {
-        goto CLEANUP;
+        return OMPI_SUCCESS;
     }
 
     /* make sure that my_smp_rank has been defined */
     if(-1 == my_smp_rank) {
-        rc = OMPI_ERROR;
-        goto CLEANUP;
+        return OMPI_ERROR;
     }
 
     if (!vader_btl->btl_inited) {
-        rc = vader_btl_first_time_init(vader_btl, n_local_procs);
-        if(rc != OMPI_SUCCESS) {
-            goto CLEANUP;
+        rc = vader_btl_first_time_init (vader_btl, n_local_procs);
+        if (rc != OMPI_SUCCESS) {
+            return rc;
         }
     }
 
     /* set local proc's smp rank in the peers structure for
      * rapid access and calculate reachability */
-    for (proc = 0; proc < (int32_t) nprocs; ++proc) {
-        if(NULL == peers[proc])
-            continue;
-        component->vader_peers[peers[proc]->peer_smp_rank] = peers[proc];
-        peers[proc]->my_smp_rank = my_smp_rank;
-    }
-
-    /* initialize own FIFOs */
-    /*
-     * The receiver initializes all its FIFOs. All components will
-     * be allocated near the receiver. Nothing will be local to
-     * "the sender" since there will be many senders.
-     */
-    rc = vader_fifo_init (component->fifo[my_smp_rank]);
-    if (OMPI_SUCCESS != rc) {
-        goto CLEANUP;
-    }
-
-    opal_atomic_wmb();
-
-    /* Sync with other local procs. Force the FIFO initialization to always
-     * happen before the readers access it.
-     */
-    opal_atomic_add_32( &component->vader_seg->module_seg->seg_inited, 1);
-    while (n_local_procs >
-           component->vader_seg->module_seg->seg_inited) {
-        opal_progress();
-        opal_atomic_rmb();
-    }
-
-    /* coordinate with other processes */
-    for (i = 0 ; i < n_local_procs ; ++i) {
-        int peer_smp_rank = i + component->num_smp_procs;
-
-        /* spin until this element is allocated */
-        /* doesn't really wait for that process... FIFO might be allocated, but not initialized */
-        while (NULL == component->shm_fifo[peer_smp_rank]) {
-            opal_progress();
-            opal_atomic_rmb();
-        }
-
-        if (my_smp_rank != peer_smp_rank) {
-            void *rem_ptr = (void *) component->shm_fifo[peer_smp_rank];
-
-            component->apids[peer_smp_rank] =
-                xpmem_get (component->shm_seg_ids[peer_smp_rank],
-                           XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
-            component->xpmem_rcaches[peer_smp_rank] = mca_rcache_base_module_create("vma");
-
-            /* get a persistent pointer to the peer's fifo */
-            component->fifo[peer_smp_rank] =
-                vader_reg_to_ptr (vader_get_registation (peer_smp_rank, rem_ptr,
-                                                         (n_local_procs + 1) * getpagesize (),
-                                                         MCA_MPOOL_FLAGS_PERSIST), rem_ptr);
-
-            /* fast boxes are allocated at the same time as the fifos */
-            component->vader_fboxes_in[peer_smp_rank] = (unsigned char *) component->fifo[my_smp_rank] +
-                (peer_smp_rank + 1) * getpagesize ();
-            component->vader_fboxes_out[peer_smp_rank] = (unsigned char *) component->fifo[peer_smp_rank] +
-                (my_smp_rank + 1) * getpagesize ();
-
-            component->vader_next_fbox_in[peer_smp_rank] = 0;
-            component->vader_next_fbox_out[peer_smp_rank] = 0;
-
-            memset (component->vader_fboxes_in[peer_smp_rank], MCA_BTL_VADER_FBOX_FREE, getpagesize());
-        }
-    }
-
-    /* update the local smp process count */
-    component->num_smp_procs += n_local_procs;
-
-    /* make sure we have enough eager fragments for each process */
-    rc = ompi_free_list_resize(&component->vader_frags_eager,
-                               component->num_smp_procs * 2);
-
-CLEANUP:
-
-    return rc;
+    for (proc = 0, local_rank = 0; proc < (int32_t) nprocs; ++proc) {
+        /* check to see if this proc can be reached via shmem (i.e.,
+           if they're on my local host and in my job) */
+        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
+            !OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
+            peers[proc] = NULL;
+            continue;
+        }
+
+        /* setup endpoint */
+        peers[proc] = component->endpoints + local_rank;
+        init_vader_endpoint (peers[proc], procs[proc], local_rank++);
+
+        /* check to see if this is me */
+        if (my_proc == procs[proc]) {
+            peers[proc] = NULL;
+        }
+    }
+
+    return OMPI_SUCCESS;
 }
 
 /**
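
The per-peer xpmem state that the removed block kept in component-wide arrays (apids, shm_seg_ids, xpmem_rcaches) now lives on each endpoint. A minimal sketch of the per-endpoint setup this implies, assuming the endpoint fields used by btl_vader_xpmem.c below (apid, rcache, peer_smp_rank) and a hypothetical remote_seg_id delivered through the modex; this is not the commit's verbatim init_vader_endpoint:

    /* sketch: per-peer xpmem setup hoisted out of add_procs */
    static void sketch_init_vader_endpoint (struct mca_btl_base_endpoint_t *ep,
                                            xpmem_segid_t remote_seg_id, /* hypothetical: from the modex */
                                            int remote_rank)
    {
        ep->peer_smp_rank = remote_rank;
        /* attach once to the peer's address space... */
        ep->apid = xpmem_get (remote_seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
        /* ...and give the peer its own VMA registration cache */
        ep->rcache = mca_rcache_base_module_create ("vma");
    }
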
@@ -518,9 +347,9 @@ mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl
 {
     mca_btl_vader_frag_t *frag = NULL;
 
-    if (size <= (size_t) mca_btl_vader_max_inline_send) {
+    if (size <= (size_t) mca_btl_vader_component.max_inline_send) {
         (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
-    } else if (size <= mca_btl_vader_component.eager_limit) {
+    } else if (size <= mca_btl_vader.super.btl_eager_limit) {
         (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
     }
 
@@ -530,10 +359,6 @@ mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl
 
         frag->base.des_flags = flags;
         frag->base.order = order;
-        frag->base.des_src = frag->segments;
-        frag->base.des_src_cnt = 1;
-        frag->base.des_dst = frag->segments;
-        frag->base.des_src_cnt = 1;
     }
 
     return (mca_btl_base_descriptor_t *) frag;
@@ -572,8 +397,6 @@ struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t
     frag->segments[0].seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
     frag->segments[0].seg_len = *size;
 
-    frag->base.des_dst = frag->segments;
-    frag->base.des_dst_cnt = 1;
     frag->base.order = order;
     frag->base.des_flags = flags;
 
@@ -594,7 +417,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
                                                             uint8_t order, size_t reserve, size_t *size,
                                                             uint32_t flags)
 {
-    struct iovec iov, *lcl_mem;
+    struct iovec iov;
     mca_btl_vader_frag_t *frag;
     uint32_t iov_count = 1;
     void *data_ptr, *fbox_ptr;
@@ -630,7 +453,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
         return NULL;
     }
 
-    if ((*size + reserve) > (size_t) mca_btl_vader_max_inline_send) {
+    if ((*size + reserve) > (size_t) mca_btl_vader_component.max_inline_send) {
         /* single copy send */
         frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;
 
@@ -646,7 +469,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
         /* inline send */
 
         /* try to reserve a fast box for this transfer */
-        fbox_ptr = mca_btl_vader_reserve_fbox (endpoint->peer_smp_rank, reserve + *size);
+        fbox_ptr = mca_btl_vader_reserve_fbox (endpoint, reserve + *size);
 
         if (fbox_ptr) {
            frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;
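
Taken together with the single-copy hunk above, vader_prepare_src now selects one of three transfer strategies by size alone. A condensed sketch of the decision, using only names that appear in these hunks (the eager FIFO fallback is implied by surrounding code this diff does not show):

    /* sketch: send-side strategy selection */
    if ((*size + reserve) > (size_t) mca_btl_vader_component.max_inline_send) {
        /* large: mark single-copy; the receiver xpmem-attaches and copies
         * directly out of the sender's buffer */
        frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;
    } else if (NULL != (fbox_ptr = mca_btl_vader_reserve_fbox (endpoint, reserve + *size))) {
        /* small and a fast box is free: send inline through the fast-box page */
        frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;
    }
    /* otherwise the fragment travels through the regular eager FIFO path */
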
@@ -1,6 +1,6 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2010-2011 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
  * All rights reserved.
  * $COPYRIGHT$
  *
@@ -33,8 +33,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
     mca_mpool_base_registration_t *reg;
     void *rem_ptr;
 
-    reg = vader_get_registation (endpoint->peer_smp_rank,
-                                 (void *)(uintptr_t) dst->seg_addr.lval,
+    reg = vader_get_registation (endpoint, (void *)(uintptr_t) dst->seg_addr.lval,
                                  dst->seg_len, 0);
     if (OPAL_UNLIKELY(NULL == reg)) {
         return OMPI_ERROR;
@@ -44,7 +43,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
 
     vader_memmove (rem_ptr, (void *)(uintptr_t) src->seg_addr.lval, size);
 
-    vader_return_registration (reg, endpoint->peer_smp_rank);
+    vader_return_registration (reg, endpoint);
 
     /* always call the callback function */
     frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
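
This put path is the canonical consumer of the endpoint-based registration API introduced in btl_vader_xpmem.c below. A minimal usage sketch, with hypothetical local_addr, remote_addr, and size variables:

    /* sketch: single-copy write into a peer's buffer */
    mca_mpool_base_registration_t *reg;
    void *rem_ptr;

    reg = vader_get_registation (endpoint, remote_addr, size, 0);
    if (OPAL_LIKELY(NULL != reg)) {
        /* translate the peer's virtual address into our attached view */
        rem_ptr = vader_reg_to_ptr (reg, remote_addr);
        vader_memmove (rem_ptr, local_addr, size);
        /* drop the reference; non-persistent attachments may detach here */
        vader_return_registration (reg, endpoint);
    }
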
@@ -12,7 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
  * All rights reserved.
  * $COPYRIGHT$
  *
@@ -57,7 +57,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
     opal_list_append (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);
 
     /* post the relative address of the descriptor into the peer's fifo */
-    vader_fifo_write (frag->hdr, endpoint->peer_smp_rank);
+    vader_fifo_write (frag->hdr, endpoint);
 
     if (frag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY ||
         !(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
@@ -12,7 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
  * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2013 Los Alamos National Security, LLC.
  * All rights reserved.
  * $COPYRIGHT$
  *
@@ -50,13 +50,14 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
     size_t max_data;
     void *data_ptr = NULL;
 
-    assert (length < mca_btl_vader_component.eager_limit);
+    assert (length < mca_btl_vader.super.btl_eager_limit);
     assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK));
 
     /* we won't ever return a descriptor */
     *descriptor = NULL;
 
-    if (OPAL_LIKELY(!(payload_size && opal_convertor_need_buffers (convertor)))) {
+    if (OPAL_LIKELY((payload_size + header_size) < mca_btl_vader_component.max_inline_send &&
+                    !opal_convertor_need_buffers (convertor))) {
         if (payload_size) {
             opal_convertor_get_current_pointer (convertor, &data_ptr);
         }
@@ -103,7 +104,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
     opal_list_append (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);
 
     /* write the fragment pointer to the peer's FIFO */
-    vader_fifo_write (frag->hdr, endpoint->peer_smp_rank);
+    vader_fifo_write (frag->hdr, endpoint);
 
     /* the progress function will return the fragment */
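
The rewritten fast-path test bounds the inline case by header plus payload instead of only checking whether the convertor needs buffering. A worked example, with a hypothetical max_inline_send of 256 bytes:

    /* header_size = 32, payload_size = 200: 232 < 256 and data contiguous
     *   -> eligible for the inline path (*descriptor stays NULL either way)
     * header_size = 32, payload_size = 240: 272 >= 256
     *   -> takes the eager-fragment path that ends in vader_fifo_write */
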
ompi/mca/btl/vader/btl_vader_xpmem.c (new file, 113 lines)
@@ -0,0 +1,113 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi/mca/btl/vader/btl_vader.h"
+#include "opal/include/opal/align.h"
+
+/* largest address we can attach to using xpmem */
+#define VADER_MAX_ADDRESS ((uintptr_t)0x7ffffffff000)
+
+/* look up the remote pointer in the peer rcache and attach if
+ * necessary */
+mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr,
+                                                      size_t size, int flags)
+{
+    struct mca_rcache_base_module_t *rcache = endpoint->rcache;
+    mca_mpool_base_registration_t *regs[10], *reg = NULL;
+    struct xpmem_addr xpmem_addr;
+    uintptr_t base, bound;
+    int rc, i;
+
+    if (OPAL_UNLIKELY(endpoint->peer_smp_rank == mca_btl_vader_component.my_smp_rank)) {
+        return rem_ptr;
+    }
+
+    base = (uintptr_t) down_align_addr(rem_ptr, mca_btl_vader_component.log_attach_align);
+    bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1),
+                                      mca_btl_vader_component.log_attach_align) + 1;
+    if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
+        bound = VADER_MAX_ADDRESS;
+    }
+
+    /* several segments may match the base pointer */
+    rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10);
+    for (i = 0 ; i < rc ; ++i) {
+        if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
+            opal_atomic_add (&regs[i]->ref_count, 1);
+            return regs[i];
+        }
+
+        if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) {
+            continue;
+        }
+
+        /* remove this pointer from the rcache and decrement its reference count
+           (so it is detached later) */
+        rc = rcache->rcache_delete (rcache, regs[i]);
+        if (OPAL_UNLIKELY(0 != rc)) {
+            /* someone beat us to it? */
+            break;
+        }
+
+        /* start the new segment from the lower of the two bases */
+        base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
+
+        opal_atomic_add (&regs[i]->ref_count, -1);
+
+        if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
+            /* this pointer is not in use */
+            (void) xpmem_detach (regs[i]->alloc_base);
+            OBJ_RELEASE(regs[i]);
+        }
+
+        break;
+    }
+
+    reg = OBJ_NEW(mca_mpool_base_registration_t);
+    if (OPAL_LIKELY(NULL != reg)) {
+        /* stick around for a while */
+        reg->ref_count = 2;
+        reg->base = (unsigned char *) base;
+        reg->bound = (unsigned char *) bound;
+        reg->flags = flags;
+
+        xpmem_addr.apid = endpoint->apid;
+        xpmem_addr.offset = base;
+
+        reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL);
+        if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) {
+            OBJ_RELEASE(reg);
+            reg = NULL;
+        } else {
+            rcache->rcache_insert (rcache, reg, 0);
+        }
+    }
+
+    return reg;
+}
+
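The base/bound arithmetic above widens every attach to whole aligned blocks so that nearby requests can hit the cache. A worked example, assuming a 4 KiB attach alignment (log_attach_align == 12) and that up_align_addr rounds up to the last byte before the next aligned boundary:

    /* hypothetical request: rem_ptr = (void *) 0x1234567, size = 0x100
     *
     *   base  = down_align_addr (0x1234567, 12)      -> 0x1234000
     *   bound = up_align_addr   (0x1234666, 12) + 1  -> 0x1235000
     *
     * the attach spans 0x1000 bytes although only 0x100 were requested */

Note also that a fresh registration starts with ref_count == 2, one reference for the caller and one held by the cache, so a single vader_return_registration will not immediately detach a mapping that is likely to be reused.
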
+void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint)
+{
+    struct mca_rcache_base_module_t *rcache = endpoint->rcache;
+
+    opal_atomic_add (&reg->ref_count, -1);
+    if (OPAL_UNLIKELY(0 == reg->ref_count && !(reg->flags & MCA_MPOOL_FLAGS_PERSIST))) {
+        rcache->rcache_delete (rcache, reg);
+        (void)xpmem_detach (reg->alloc_base);
+        OBJ_RELEASE (reg);
+    }
+}
+
+void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr)
+{
+    return (void *) ((uintptr_t) reg->alloc_base +
+                     (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
+}
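
vader_reg_to_ptr simply rebases a remote virtual address into the local attachment. Continuing the example above, with a hypothetical attachment address:

    /* if xpmem_attach mapped the peer's pages at alloc_base = 0x7f0000000000
     * and reg->base = 0x1234000, then for rem_ptr = 0x1234567:
     *
     *   local = alloc_base + (rem_ptr - base) = 0x7f0000000000 + 0x567 */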