
btl/vader: do not use common/sm for shared memory fragments

This commit was SVN r28250.
This commit is contained in:
Nathan Hjelm 2013-03-27 22:10:02 +00:00
parent d12eed0703
commit 113fadd749
14 changed files with 501 additions and 600 deletions

View file

@ -35,7 +35,8 @@ libmca_btl_vader_la_sources = \
btl_vader_sendi.c \
btl_vader_fbox.h \
btl_vader_get.c \
btl_vader_put.c
btl_vader_put.c \
btl_vader_xpmem.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

View file

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -45,44 +45,39 @@
/* xpmem is required by vader atm */
#include <xpmem.h>
#include "opal/include/opal/align.h"
#include "opal/class/opal_free_list.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/sm/mpool_sm.h"
#include "ompi/mca/common/sm/common_sm.h"
#include "ompi/mca/btl/base/base.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/rcache/rcache.h"
#include "ompi/mca/rcache/base/base.h"
#include "btl_vader_endpoint.h"
BEGIN_C_DECLS
#define min(a,b) ((a) < (b) ? (a) : (b))
extern int mca_btl_vader_memcpy_limit;
extern int mca_btl_vader_log_align;
extern int mca_btl_vader_max_inline_send;
/* We can't use opal_cache_line_size here because we need a
compile-time constant for padding the struct. We can't really have
a compile-time constant that is portable, either (e.g., compile on
one machine and run on another). So just use a big enough cache
line that should hopefully be good in most places. */
#define VADER_CACHE_LINE_PAD 128
/* largest address we can attach to using xpmem */
#define VADER_MAX_ADDRESS ((uintptr_t)0x7ffffffff000)
/*
* Shared Memory resource management
*/
struct vader_fifo_t;
/*
* Modex data
*/
struct vader_modex_t {
xpmem_segid_t seg_id;
void *segment_base;
};
/**
* Shared Memory (VADER) BTL module.
*/
@ -92,20 +87,10 @@ struct mca_btl_vader_component_t {
int vader_free_list_max; /**< maximum size of free lists */
int vader_free_list_inc; /**< number of elements to alloc
* when growing free lists */
mca_mpool_base_module_t *vader_mpool; /**< mpool on local node */
void *vader_mpool_base; /**< base address of shared memory pool */
size_t eager_limit; /**< send fragment size */
mca_common_sm_module_t *vader_seg; /**< description of shared memory segment */
volatile struct vader_fifo_t **shm_fifo;/**< pointer to fifo 2D array in
* shared memory */
char **shm_bases; /**< pointer to base pointers in
* shared memory */
xpmem_segid_t my_seg_id; /* this rank's xpmem segment id */
xpmem_segid_t *shm_seg_ids; /* xpmem segment ids */
struct vader_fifo_t **fifo; /**< cached copy of the pointer to
* the 2D fifo array. */
struct mca_rcache_base_module_t **xpmem_rcaches;
xpmem_apid_t *apids; /* xpmem apids */
char *my_segment; /* this rank's base pointer */
size_t segment_size; /* size of my_segment */
size_t segment_offset; /* start of unused portion of my_segment */
int32_t num_smp_procs; /**< current number of smp procs on this host */
int32_t my_smp_rank; /**< My SMP process rank. Used for accessing
* SMP specific data structures. */
@ -114,13 +99,11 @@ struct mca_btl_vader_component_t {
opal_list_t active_sends; /**< list of outstanding fragments */
unsigned char **vader_fboxes_in; /**< incoming fast boxes (memory belongs to this process) */
unsigned char **vader_fboxes_out; /**< outgoing fast boxes (memory belongs to remote peers) */
int memcpy_limit; /** Limit where we switch from memmove to memcpy */
int log_attach_align; /** Log of the alignment for xpmem segments */
int max_inline_send; /** Limit for copy-in-copy-out fragments */
unsigned char *vader_next_fbox_in; /**< indices of fast boxes to poll */
unsigned char *vader_next_fbox_out; /**< indices of fast boxes to write */
struct mca_btl_base_endpoint_t **vader_peers;
struct mca_btl_base_endpoint_t *endpoints;
};
typedef struct mca_btl_vader_component_t mca_btl_vader_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_vader_component_t mca_btl_vader_component;
@ -144,117 +127,38 @@ OMPI_MODULE_DECLSPEC extern mca_btl_vader_t mca_btl_vader;
* we define macros to translate between relative addresses and
* virtual addresses.
*/
#define VIRTUAL2RELATIVE(VADDR ) ((intptr_t)(VADDR) - (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
#define RELATIVE2VIRTUAL(OFFSET) ((intptr_t)(OFFSET) + (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
/* look up the remote pointer in the peer rcache and attach if
* necessary */
static inline mca_mpool_base_registration_t *vader_get_registation (int peer_smp_rank, void *rem_ptr,
size_t size, int flags)
/* This only works for finding the relative address for a pointer within my_segment */
static inline int64_t virtual2relative (char *addr)
{
struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
mca_mpool_base_registration_t *regs[10], *reg = NULL;
struct xpmem_addr xpmem_addr;
uintptr_t base, bound;
int rc, i;
if (OPAL_UNLIKELY(peer_smp_rank == mca_btl_vader_component.my_smp_rank)) {
return rem_ptr;
}
base = (uintptr_t) down_align_addr(rem_ptr, mca_btl_vader_log_align);
bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1),
mca_btl_vader_log_align) + 1;
if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
bound = VADER_MAX_ADDRESS;
}
/* several segments may match the base pointer */
rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10);
for (i = 0 ; i < rc ; ++i) {
if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
opal_atomic_add (&regs[i]->ref_count, 1);
return regs[i];
}
if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) {
continue;
}
/* remove this pointer from the rcache and decrement its reference count
(so it is detached later) */
rc = rcache->rcache_delete (rcache, regs[i]);
if (OPAL_UNLIKELY(0 != rc)) {
/* someone beat us to it? */
break;
}
/* start the new segment from the lower of the two bases */
base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
opal_atomic_add (&regs[i]->ref_count, -1);
if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
/* this pointer is not in use */
(void) xpmem_detach (regs[i]->alloc_base);
OBJ_RELEASE(regs[i]);
}
break;
}
reg = OBJ_NEW(mca_mpool_base_registration_t);
if (OPAL_LIKELY(NULL != reg)) {
/* stick around for awhile */
reg->ref_count = 2;
reg->base = (unsigned char *) base;
reg->bound = (unsigned char *) bound;
reg->flags = flags;
xpmem_addr.apid = mca_btl_vader_component.apids[peer_smp_rank];
xpmem_addr.offset = base;
reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL);
if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) {
OBJ_RELEASE(reg);
reg = NULL;
} else {
rcache->rcache_insert (rcache, reg, 0);
}
}
return reg;
return (int64_t)(uintptr_t) (addr - mca_btl_vader_component.my_segment) | ((int64_t)mca_btl_vader_component.my_smp_rank << 32);
}
static inline void vader_return_registration (mca_mpool_base_registration_t *reg, int peer_smp_rank)
static inline void *relative2virtual (int64_t offset)
{
struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
opal_atomic_add (&reg->ref_count, -1);
if (OPAL_UNLIKELY(0 == reg->ref_count && !(reg->flags & MCA_MPOOL_FLAGS_PERSIST))) {
rcache->rcache_delete (rcache, reg);
(void)xpmem_detach (reg->alloc_base);
OBJ_RELEASE (reg);
}
}
static inline void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr)
{
return (void *) ((uintptr_t) reg->alloc_base +
(ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
return (void *)(uintptr_t)((offset & 0xffffffffull) + mca_btl_vader_component.endpoints[offset >> 32].segment_base);
}
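The two inline helpers above replace the old VIRTUAL2RELATIVE/RELATIVE2VIRTUAL macros: a relative address now carries the owning process's SMP rank in its upper 32 bits and the byte offset into that rank's segment in its lower 32 bits, so a receiver can decode it against whichever segment base it attached for that peer. A minimal standalone sketch of the round trip (hypothetical local variables stand in for mca_btl_vader_component and the endpoints array):

#include <assert.h>
#include <stdint.h>

/* Sketch of the rank-plus-offset encoding used by virtual2relative() and
 * relative2virtual() above; my_segment/segment_bases are illustrative
 * stand-ins for the component's my_segment and endpoints[].segment_base. */
int main (void)
{
    char my_segment[1 << 16];                /* pretend this is rank 3's shared segment */
    const int my_rank = 3;
    char *addr = my_segment + 0x1234;        /* e.g. a fragment header inside it */

    /* encode: offset in the low 32 bits, owning rank in the high 32 bits */
    int64_t rel = (int64_t)(uintptr_t)(addr - my_segment) | ((int64_t) my_rank << 32);

    /* decode on the receiving side: index the per-rank attached segment bases */
    char *segment_bases[8] = { [3] = my_segment };
    char *back = segment_bases[rel >> 32] + (rel & 0xffffffffull);

    assert (back == addr && my_rank == (int)(rel >> 32));
    return 0;
}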
/* memcpy is faster at larger sizes but is undefined if the
pointers are aliased (TODO -- readd alias check) */
static inline void vader_memmove (void *dst, void *src, size_t size)
{
if (size >= (size_t) mca_btl_vader_memcpy_limit) {
if (size >= (size_t) mca_btl_vader_component.memcpy_limit) {
memcpy (dst, src, size);
} else {
memmove (dst, src, size);
}
}
/* look up the remote pointer in the peer rcache and attach if
* necessary */
mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr,
size_t size, int flags);
void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint);
void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr);
/**
* Initiate a send to the peer.
*

View file

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
@ -26,6 +26,7 @@
#include "ompi/constants.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "btl_vader.h"
@ -33,6 +34,8 @@
#include "btl_vader_fifo.h"
#include "btl_vader_fbox.h"
#include <sys/mman.h>
static int mca_btl_vader_component_progress (void);
static int mca_btl_vader_component_open(void);
static int mca_btl_vader_component_close(void);
@ -41,38 +44,32 @@ static mca_btl_base_module_t** mca_btl_vader_component_init(int *num_btls,
bool enable_progress_threads,
bool enable_mpi_threads);
/* limit where we should switch from bcopy to memcpy */
int mca_btl_vader_memcpy_limit = 524288;
int mca_btl_vader_log_align = 21; /* 2 MiB */
/* maximum size for using copy-in-copy out semantics for contiguous sends */
int mca_btl_vader_max_inline_send = 256;
/*
* Shared Memory (VADER) component instance.
*/
mca_btl_vader_component_t mca_btl_vader_component = {
{
.super = {
/* First, the mca_base_component_t struct containing meta information
about the component itself */
{
.btl_version = {
MCA_BTL_BASE_VERSION_2_0_0,
"vader", /* MCA component name */
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
mca_btl_vader_component_open, /* component open */
mca_btl_vader_component_close, /* component close */
NULL,
mca_btl_vader_component_register,
.mca_component_name = "vader",
.mca_component_major_version = OMPI_MAJOR_VERSION,
.mca_component_minor_version = OMPI_MINOR_VERSION,
.mca_component_release_version = OMPI_RELEASE_VERSION,
.mca_open_component = mca_btl_vader_component_open,
.mca_close_component = mca_btl_vader_component_close,
.mca_query_component = NULL,
.mca_register_component_params = mca_btl_vader_component_register,
},
{
.btl_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_btl_vader_component_init,
mca_btl_vader_component_progress,
.btl_init = mca_btl_vader_component_init,
.btl_progress = mca_btl_vader_component_progress,
} /* end super */
};
@ -84,60 +81,73 @@ static int mca_btl_vader_component_register (void)
/* register VADER component variables */
mca_btl_vader_component.vader_free_list_num = 8;
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"free_list_num", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9,
"free_list_num", "Initial number of fragments "
"to allocate for shared memory communication.",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_component.vader_free_list_num);
mca_btl_vader_component.vader_free_list_max = 8192;
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"free_list_max", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9,
"free_list_max", "Maximum number of fragments "
"to allocate for shared memory communication.",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_component.vader_free_list_max);
mca_btl_vader_component.vader_free_list_inc = 64;
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9,
"free_list_inc", "Number of fragments to create "
"on each allocation.", MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_component.vader_free_list_inc);
mca_btl_vader_memcpy_limit = 524288;
mca_btl_vader_component.memcpy_limit = 524288;
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"memcpy_limit", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
"memcpy_limit", "Message size to switch from using "
"memove to memcpy. The relative speed of these two "
"routines can vary by size.", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_memcpy_limit);
mca_btl_vader_log_align = 21;
&mca_btl_vader_component.memcpy_limit);
mca_btl_vader_component.log_attach_align = 21;
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"log_align", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
"log_align", "Log base 2 of the alignment to use for xpmem "
"segments (default: 21, minimum: 12, maximum: 25)",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_log_align);
&mca_btl_vader_component.log_attach_align);
mca_btl_vader_max_inline_send = 256;
mca_btl_vader_component.segment_size = 1 << 24;
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"max_inline_send", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_btl_vader_max_inline_send);
"segment_size", "Maximum size of all shared "
"memory buffers (default: 16M)",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_component.segment_size);
mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
mca_btl_vader.super.btl_eager_limit = 64 * 1024;
mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader_component.max_inline_send = 256;
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"max_inline_send", "Maximum size to transfer "
"using copy-in copy-out semantics",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_component.max_inline_send);
mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
mca_btl_vader.super.btl_eager_limit = 64 * 1024;
mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT |
MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */
mca_btl_vader.super.btl_latency = 1; /* Microsecs */
@ -155,16 +165,6 @@ static int mca_btl_vader_component_register (void)
static int mca_btl_vader_component_open(void)
{
/* limit segment alignment to be between 4k and 16M */
if (mca_btl_vader_log_align < 12) {
mca_btl_vader_log_align = 12;
} else if (mca_btl_vader_log_align > 25) {
mca_btl_vader_log_align = 25;
}
mca_btl_vader_component.eager_limit = mca_btl_vader.super.btl_eager_limit;
/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t);
@ -180,39 +180,26 @@ static int mca_btl_vader_component_open(void)
static int mca_btl_vader_component_close(void)
{
int return_value = OMPI_SUCCESS;
/**
* We don't have to destroy the fragment lists. They are allocated
* directly into the mmapped file, they will auto-magically disappear
* when the file get unmapped.
*/
/*OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);*/
/* unmap the shared memory control structure */
if(mca_btl_vader_component.vader_seg != NULL) {
return_value = mca_common_sm_fini( mca_btl_vader_component.vader_seg );
if( OMPI_SUCCESS != return_value ) {
return_value=OMPI_ERROR;
opal_output(0," mca_common_sm_fini failed\n");
goto CLEANUP;
}
/* unlink file, so that it will be deleted when all references
* to it are gone - no error checking, since we want all procs
* to call this, so that in an abnormal termination scenario,
* this file will still get cleaned up */
/* XXX LANL TODO -- remove unlink once the shmem segment uses xpmem */
unlink(mca_btl_vader_component.vader_seg->shmem_ds.seg_name);
OBJ_RELEASE(mca_btl_vader_component.vader_seg);
}
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_user);
OBJ_DESTRUCT(&mca_btl_vader_component.active_sends);
CLEANUP:
if (NULL != mca_btl_vader_component.my_segment) {
munmap (mca_btl_vader_component.my_segment, mca_btl_vader_component.segment_size);
}
/* return */
return return_value;
return OMPI_SUCCESS;
}
static int mca_btl_base_vader_modex_send (void)
{
struct vader_modex_t modex;
modex.seg_id = mca_btl_vader_component.my_seg_id;
modex.segment_base = mca_btl_vader_component.my_segment;
return ompi_modex_send(&mca_btl_vader_component.super.btl_version,
&modex, sizeof (modex));
}
/*
@ -224,18 +211,17 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
{
mca_btl_vader_component_t *component = &mca_btl_vader_component;
mca_btl_base_module_t **btls = NULL;
int rc;
*num_btls = 0;
/* if no session directory was created, then we cannot be used */
/* XXX LANL FIXME -- this is not the case. we can use an anonymous segment */
if (NULL == ompi_process_info.job_session_dir) {
return NULL;
}
/* limit segment alignment to be between 4k and 16M */
/* lookup/create shared memory pool only when used */
component->vader_mpool = NULL;
component->vader_mpool_base = NULL;
if (mca_btl_vader_component.segment_size < 12) {
mca_btl_vader_component.segment_size = 12;
} else if (mca_btl_vader_component.segment_size > 25) {
mca_btl_vader_component.segment_size = 25;
}
btls = (mca_btl_base_module_t **) calloc (1, sizeof (mca_btl_base_module_t *));
if (NULL == btls) {
@ -250,6 +236,35 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
return NULL;
}
/* ensure a sane segment size */
if (mca_btl_vader_component.segment_size < (1 << 20)) {
mca_btl_vader_component.segment_size = (1 << 20);
}
component->my_segment = mmap (NULL, mca_btl_vader_component.segment_size, PROT_READ |
PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
if ((void *)-1 == component->my_segment) {
free (btls);
return NULL;
}
component->segment_offset = 0;
/* initialize my fifo */
rc = vader_fifo_init ((struct vader_fifo_t *) component->my_segment);
if (OMPI_SUCCESS != rc) {
free (btls);
munmap (component->my_segment, mca_btl_vader_component.segment_size);
return NULL;
}
rc = mca_btl_base_vader_modex_send ();
if (OMPI_SUCCESS != rc) {
free (btls);
munmap (component->my_segment, mca_btl_vader_component.segment_size);
return NULL;
}
*num_btls = 1;
/* get pointer to the btls */
@ -268,21 +283,14 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
static inline void mca_btl_vader_progress_sends (void)
{
opal_list_t *list = &mca_btl_vader_component.active_sends;
opal_list_item_t *item, *next;
mca_btl_vader_frag_t *frag;
for (item = opal_list_get_first (list) ; item != opal_list_get_end (list) ; ) {
frag = (mca_btl_vader_frag_t *) item;
next = opal_list_get_next (item);
mca_btl_vader_frag_t *frag, *next;
OPAL_LIST_FOREACH_SAFE(frag, next, &mca_btl_vader_component.active_sends, mca_btl_vader_frag_t) {
if (OPAL_LIKELY(frag->hdr->complete)) {
opal_list_remove_item (&mca_btl_vader_component.active_sends, item);
opal_list_remove_item (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);
mca_btl_vader_frag_complete (frag);
}
item = next;
}
}
@ -290,11 +298,11 @@ static inline void mca_btl_vader_progress_sends (void)
static int mca_btl_vader_component_progress (void)
{
int my_smp_rank = mca_btl_vader_component.my_smp_rank;
vader_fifo_t *fifo = mca_btl_vader_component.fifo[my_smp_rank];
mca_btl_active_message_callback_t *reg;
mca_btl_vader_frag_t frag;
mca_btl_vader_hdr_t *hdr;
mca_mpool_base_registration_t *xpmem_reg = NULL;
struct mca_btl_base_endpoint_t *endpoint;
/* check active sends for completion */
mca_btl_vader_progress_sends ();
@ -303,7 +311,7 @@ static int mca_btl_vader_component_progress (void)
mca_btl_vader_check_fboxes ();
/* poll the fifo once */
hdr = vader_fifo_read (fifo);
hdr = vader_fifo_read (mca_btl_vader_component.endpoints[my_smp_rank].fifo);
if (NULL == hdr) {
return 0;
}
@ -314,7 +322,8 @@ static int mca_btl_vader_component_progress (void)
frag.segments[0].seg_len = hdr->len;
if (OPAL_UNLIKELY(hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY)) {
xpmem_reg = vader_get_registation (hdr->my_smp_rank, hdr->sc_iov.iov_base,
endpoint = mca_btl_vader_component.endpoints + hdr->my_smp_rank;
xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base,
hdr->sc_iov.iov_len, 0);
frag.segments[1].seg_addr.pval = vader_reg_to_ptr (xpmem_reg, hdr->sc_iov.iov_base);
@ -323,7 +332,7 @@ static int mca_btl_vader_component_progress (void)
/* recv upcall */
frag.base.des_dst_cnt = 2;
reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
vader_return_registration (xpmem_reg, hdr->my_smp_rank);
vader_return_registration (xpmem_reg, endpoint);
} else {
frag.base.des_dst_cnt = 1;
reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);

View file

@ -24,6 +24,10 @@
#ifndef MCA_BTL_VADER_ENDPOINT_H
#define MCA_BTL_VADER_ENDPOINT_H
#include <xpmem.h>
struct vader_fifo_t;
/**
* An abstraction that represents a connection to an endpoint process.
* An instance of mca_ptl_base_endpoint_t is associated w/ each process
@ -31,10 +35,16 @@
*/
struct mca_btl_base_endpoint_t {
int my_smp_rank; /**< My SMP process rank. Used for accessing
* SMP specific data structures. */
int peer_smp_rank; /**< My peer's SMP process rank. Used for accessing
* SMP specific data structures. */
char *segment_base;
struct vader_fifo_t *fifo;
xpmem_apid_t apid;
char *fbox_out;
char *fbox_in;
int next_fbox_out;
int next_fbox_in;
struct mca_rcache_base_module_t *rcache;
};
#endif /* MCA_BTL_VADER_ENDPOINT_H */

View file

@ -23,16 +23,13 @@
enum {MCA_BTL_VADER_FBOX_FREE = 0xfe, MCA_BTL_VADER_FBOX_RESERVED = 0xff};
#define MCA_BTL_VADER_FBOX_OUT_PTR(peer_smp_rank, fbox) \
(mca_btl_vader_component.vader_fboxes_out[peer_smp_rank] + FBOX_SIZE * (fbox))
#define MCA_BTL_VADER_FBOX_OUT_PTR(ep, fbox) ((ep)->fbox_out + FBOX_SIZE * (fbox))
#define MCA_BTL_VADER_FBOX_IN_PTR(ep, fbox) ((ep)->fbox_in + FBOX_SIZE * (fbox))
#define MCA_BTL_VADER_FBOX_IN_PTR(peer_smp_rank, fbox) \
(mca_btl_vader_component.vader_fboxes_in[peer_smp_rank] + FBOX_SIZE * (fbox))
static inline unsigned char *mca_btl_vader_reserve_fbox (int peer_smp_rank, size_t size)
static inline unsigned char *mca_btl_vader_reserve_fbox (struct mca_btl_base_endpoint_t *ep, size_t size)
{
int next_fbox = mca_btl_vader_component.vader_next_fbox_out[peer_smp_rank];
unsigned char *fbox = MCA_BTL_VADER_FBOX_OUT_PTR(peer_smp_rank, next_fbox);
int next_fbox = ep->next_fbox_out;
unsigned char *fbox = (unsigned char *) MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
/* todo -- need thread locks/atomics here for the multi-threaded case */
if (OPAL_UNLIKELY(size > MAX_MSG || fbox[0] != MCA_BTL_VADER_FBOX_FREE)) {
@ -40,7 +37,7 @@ static inline unsigned char *mca_btl_vader_reserve_fbox (int peer_smp_rank, size
return NULL;
}
mca_btl_vader_component.vader_next_fbox_out[peer_smp_rank] = (next_fbox + 1) & LAST_FBOX;
ep->next_fbox_out = (next_fbox + 1) & LAST_FBOX;
/* mark this fast box as in use */
fbox[0] = MCA_BTL_VADER_FBOX_RESERVED;
@ -64,7 +61,7 @@ static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endp
{
unsigned char *fbox;
fbox = mca_btl_vader_reserve_fbox(endpoint->peer_smp_rank, header_size + payload_size);
fbox = mca_btl_vader_reserve_fbox(endpoint, header_size + payload_size);
if (OPAL_UNLIKELY(NULL == fbox)) {
return 0;
}
@ -86,18 +83,20 @@ static inline void mca_btl_vader_check_fboxes (void)
{
int my_smp_rank = mca_btl_vader_component.my_smp_rank;
mca_btl_active_message_callback_t *reg;
struct mca_btl_base_endpoint_t *endpoint;
unsigned char size, tag, *fbox;
mca_btl_vader_frag_t frag;
unsigned char size, tag;
int i;
int i, next_fbox;
for (i = 0 ; i < mca_btl_vader_component.num_smp_procs ; ++i) {
int next_fbox = mca_btl_vader_component.vader_next_fbox_in[i];
unsigned char *fbox = MCA_BTL_VADER_FBOX_IN_PTR(i, next_fbox);
if (my_smp_rank == i) {
continue;
}
endpoint = mca_btl_vader_component.endpoints + i;
next_fbox = endpoint->next_fbox_in;
fbox = (unsigned char *) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
/* process all fast-box messages */
while (0xfe != ((size = fbox[0]) & 0xfe)) {
opal_atomic_rmb ();
@ -116,10 +115,10 @@ static inline void mca_btl_vader_check_fboxes (void)
fbox[0] = MCA_BTL_VADER_FBOX_FREE;
next_fbox = next_fbox == LAST_FBOX ? 0 : next_fbox + 1;
fbox = MCA_BTL_VADER_FBOX_IN_PTR(i, next_fbox);
fbox = (unsigned char *) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
}
mca_btl_vader_component.vader_next_fbox_in[i] = next_fbox;
endpoint->next_fbox_in = next_fbox;
}
}

View file

@ -30,7 +30,7 @@
#include "btl_vader_endpoint.h"
#include "btl_vader_frag.h"
#define VADER_FIFO_FREE ((intptr_t)-2)
#define VADER_FIFO_FREE ((int64_t)-2)
/*
* Shared Memory FIFOs
@ -48,9 +48,10 @@
/* lock free fifo */
typedef struct vader_fifo_t {
volatile intptr_t fifo_head;
volatile intptr_t fifo_tail;
char pad[VADER_CACHE_LINE_PAD - 2 * sizeof (intptr_t)];
volatile int64_t fifo_head;
volatile int64_t fifo_tail;
/* pad out to fill a cache line (64 or 128 bytes) */
char pad[128 - 2 * sizeof (int64_t)];
} vader_fifo_t;
static inline int vader_fifo_init (vader_fifo_t *fifo)
@ -60,19 +61,21 @@ static inline int vader_fifo_init (vader_fifo_t *fifo)
return OMPI_SUCCESS;
}
static inline void vader_fifo_write (mca_btl_vader_hdr_t *hdr, int rank)
static inline void vader_fifo_write (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
{
vader_fifo_t *fifo = mca_btl_vader_component.fifo[rank];
intptr_t prev, value = VIRTUAL2RELATIVE(hdr);
vader_fifo_t *fifo = ep->fifo;
int64_t prev, value = virtual2relative ((char *) hdr);
hdr->next = VADER_FIFO_FREE;
opal_atomic_wmb ();
prev = opal_atomic_swap_ptr (&fifo->fifo_tail, value);
prev = opal_atomic_swap_64 (&fifo->fifo_tail, value);
opal_atomic_rmb ();
assert (prev != value);
if (OPAL_LIKELY(VADER_FIFO_FREE != prev)) {
hdr = (mca_btl_vader_hdr_t *) RELATIVE2VIRTUAL(prev);
hdr = (mca_btl_vader_hdr_t *) relative2virtual (prev);
hdr->next = value;
} else {
fifo->fifo_head = value;
@ -84,17 +87,19 @@ static inline void vader_fifo_write (mca_btl_vader_hdr_t *hdr, int rank)
static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
{
mca_btl_vader_hdr_t *hdr;
intptr_t value;
int64_t value;
opal_atomic_rmb ();
value = opal_atomic_swap_ptr (&fifo->fifo_head, VADER_FIFO_FREE);
value = opal_atomic_swap_64 (&fifo->fifo_head, VADER_FIFO_FREE);
if (VADER_FIFO_FREE == value) {
/* fifo is empty or we lost the race with another thread */
return NULL;
}
hdr = (mca_btl_vader_hdr_t *) RELATIVE2VIRTUAL(value);
hdr = (mca_btl_vader_hdr_t *) relative2virtual (value);
assert (hdr->next != value);
if (OPAL_UNLIKELY(VADER_FIFO_FREE == hdr->next)) {
opal_atomic_rmb();

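The fifo used above is the familiar swap-based multiple-writer/single-reader list: a writer atomically exchanges the tail and then links the previous tail's next pointer to the new item, and the owning process claims the head with another exchange (the tail-handling branch of vader_fifo_read is truncated in the hunk above). A self-contained model of that algorithm, offered only as a sketch: it uses C11 atomics and ordinary pointers rather than OPAL atomics and vader's relative addresses, and the empty/last-item handling on the read side is an assumption about the truncated code.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FIFO_FREE ((struct node *)(intptr_t) -2)     /* plays the role of VADER_FIFO_FREE */

struct node { _Atomic(struct node *) next; int payload; };

struct fifo { _Atomic(struct node *) head; _Atomic(struct node *) tail; };

static void fifo_init (struct fifo *f) {
    atomic_store (&f->head, FIFO_FREE);
    atomic_store (&f->tail, FIFO_FREE);
}

/* many writers: swap the tail, then link the previous tail to the new item */
static void fifo_write (struct fifo *f, struct node *n) {
    atomic_store (&n->next, FIFO_FREE);
    struct node *prev = atomic_exchange (&f->tail, n);
    if (FIFO_FREE != prev) {
        atomic_store (&prev->next, n);
    } else {
        atomic_store (&f->head, n);                  /* fifo was empty */
    }
}

/* single reader (the fifo's owner): claim the head, then publish its successor */
static struct node *fifo_read (struct fifo *f) {
    struct node *n = atomic_exchange (&f->head, FIFO_FREE);
    if (FIFO_FREE == n) return NULL;                 /* empty, or lost a race */

    struct node *next = atomic_load (&n->next);
    if (FIFO_FREE == next) {
        /* n looks like the last item: if the tail still points at it the fifo
         * is now empty, otherwise a writer is mid-append, so wait for the link */
        struct node *expected = n;
        if (!atomic_compare_exchange_strong (&f->tail, &expected, FIFO_FREE)) {
            while (FIFO_FREE == (next = atomic_load (&n->next)))
                ;
            atomic_store (&f->head, next);
        }
    } else {
        atomic_store (&f->head, next);
    }
    return n;
}

int main (void) {
    struct fifo f;
    struct node a = { .payload = 1 }, b = { .payload = 2 };
    fifo_init (&f);
    fifo_write (&f, &a);
    fifo_write (&f, &b);
    for (struct node *n; NULL != (n = fifo_read (&f)); ) {
        printf ("%d\n", n->payload);                 /* prints 1 then 2 */
    }
    return 0;
}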
View file

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -31,6 +31,35 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
if(frag->hdr != NULL) {
frag->hdr->my_smp_rank = mca_btl_vader_component.my_smp_rank;
}
frag->base.des_src = frag->segments;
frag->base.des_src_cnt = 1;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
}
void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx)
{
unsigned int frag_size = (unsigned int)(uintptr_t) ctx;
if (mca_btl_vader_component.segment_size < mca_btl_vader_component.segment_offset + frag_size) {
item->ptr = NULL;
}
item->ptr = mca_btl_vader_component.my_segment + mca_btl_vader_component.segment_offset;
mca_btl_vader_component.segment_offset += frag_size;
mca_btl_vader_frag_constructor ((mca_btl_vader_frag_t *) item);
}
void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag)
{
frag->base.des_src = frag->segments;
frag->base.des_src_cnt = 1;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
OMPI_FREE_LIST_RETURN(frag->my_list, (ompi_free_list_item_t *)frag);
}
OBJ_CLASS_INSTANCE(mca_btl_vader_frag_t, mca_btl_base_descriptor_t,

View file

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -34,7 +34,6 @@ struct mca_btl_vader_hdr_t {
volatile intptr_t next; /* next item in fifo. many peers may touch this */
volatile bool complete; /* fragment completion (usually 1 byte) */
mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */
char pad[2];
int flags; /* vader send flags */
int my_smp_rank; /* smp rank of owning process */
size_t len; /* length of data following this header */
@ -64,6 +63,12 @@ static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_fr
OMPI_FREE_LIST_GET(list, item, rc);
*frag = (mca_btl_vader_frag_t *) item;
if (OPAL_LIKELY(NULL != item)) {
if (NULL == (*frag)->hdr) {
OMPI_FREE_LIST_RETURN(list, (ompi_free_list_item_t *)*frag);
*frag = NULL;
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
(*frag)->hdr->complete = false;
(*frag)->hdr->flags = MCA_BTL_VADER_FLAG_INLINE;
(*frag)->segments[0].seg_addr.pval = (char *)((*frag)->hdr + 1);
@ -73,12 +78,14 @@ static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_fr
return rc;
}
void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag);
#define MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag) \
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_eager)
#define MCA_BTL_VADER_FRAG_ALLOC_USER(frag) \
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_user)
#define MCA_BTL_VADER_FRAG_RETURN(frag) \
OMPI_FREE_LIST_RETURN((frag)->my_list, (ompi_free_list_item_t *)(frag))
#define MCA_BTL_VADER_FRAG_RETURN(frag) mca_btl_vader_frag_return(frag)
static inline void mca_btl_vader_frag_complete (mca_btl_vader_frag_t *frag) {
if (OPAL_UNLIKELY(MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
@ -92,4 +99,6 @@ static inline void mca_btl_vader_frag_complete (mca_btl_vader_frag_t *frag) {
}
}
void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx);
#endif /* MCA_BTL_VADER_SEND_FRAG_H */

View file

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -33,8 +33,7 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
mca_mpool_base_registration_t *reg;
void *rem_ptr;
reg = vader_get_registation (endpoint->peer_smp_rank,
(void *)(uintptr_t) src->seg_addr.lval,
reg = vader_get_registation (endpoint, (void *)(uintptr_t) src->seg_addr.lval,
src->seg_len, 0);
if (OPAL_UNLIKELY(NULL == reg)) {
return OMPI_ERROR;
@ -44,7 +43,7 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
vader_memmove ((void *)(uintptr_t) dst->seg_addr.lval, rem_ptr, size);
vader_return_registration (reg, endpoint->peer_smp_rank);
vader_return_registration (reg, endpoint);
mca_btl_vader_frag_complete (frag);

View file

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -70,204 +70,76 @@ static int vader_ft_event (int state);
mca_btl_vader_t mca_btl_vader = {
{
&mca_btl_vader_component.super,
0, /* btl_eager_limit */
0, /* btl_rndv_eager_limit */
0, /* btl_max_send_size */
0, /* btl_rdma_pipeline_send_length */
0, /* btl_rdma_pipeline_frag_size */
0, /* btl_min_rdma_pipeline_size */
0, /* btl_exclusivity */
0, /* bTl_latency */
0, /* btl_bandwidth */
0, /* btl_flags */
0, /* btl segment size */
vader_add_procs,
vader_del_procs,
NULL, /* btl_register */
vader_finalize,
mca_btl_vader_alloc,
vader_free,
vader_prepare_src,
vader_prepare_dst,
mca_btl_vader_send,
mca_btl_vader_sendi,
mca_btl_vader_put,
mca_btl_vader_get,
mca_btl_base_dump,
NULL, /* btl_mpool */
vader_register_error_cb, /* register error */
vader_ft_event
.btl_eager_limit = 0,
.btl_rndv_eager_limit = 0,
.btl_max_send_size = 0,
.btl_rdma_pipeline_send_length = 0,
.btl_rdma_pipeline_frag_size = 0,
.btl_min_rdma_pipeline_size = 0,
.btl_exclusivity = 0,
.btl_latency = 0,
.btl_bandwidth = 0,
.btl_flags = 0,
.btl_seg_size = 0,
.btl_add_procs = vader_add_procs,
.btl_del_procs = vader_del_procs,
.btl_register = NULL,
.btl_finalize = vader_finalize,
.btl_alloc = mca_btl_vader_alloc,
.btl_free = vader_free,
.btl_prepare_src = vader_prepare_src,
.btl_prepare_dst = vader_prepare_dst,
.btl_send = mca_btl_vader_send,
.btl_sendi = mca_btl_vader_sendi,
.btl_put = mca_btl_vader_put,
.btl_get = mca_btl_vader_get,
.btl_dump = mca_btl_base_dump,
.btl_mpool = NULL,
.btl_register_error = vader_register_error_cb,
.btl_ft_event = vader_ft_event
}
};
static inline int vader_init_mpool (mca_btl_vader_t *vader_btl, int n)
{
mca_btl_vader_component_t *component = &mca_btl_vader_component;
mca_mpool_base_resources_t res;
res.mem_node = -1;
/* determine how much memory to create */
/*
* This heuristic formula mostly says that we request memory for:
* - a vader fifo
* - eager fragments (2 * n of them, allocated in vader_free_list_inc chunks)
*
* On top of all that, we sprinkle in some number of "opal_cache_line_size"
* additions to account for some padding and edge effects that may lie
* in the allocator.
*/
res.size = sizeof (vader_fifo_t) + 4 * opal_cache_line_size +
(2 * n + component->vader_free_list_inc) * (component->eager_limit + 2 * opal_cache_line_size);
/* before we multiply by n, make sure the result won't overflow */
/* Stick that little pad in, particularly since we'll eventually
* need a little extra space. E.g., in mca_mpool_vader_init() in
* mpool_vader_component.c when sizeof(mca_common_sm_module_t) is
* added.
*/
if ( ((double) res.size) * n > LONG_MAX - 4096 )
return OMPI_ERR_OUT_OF_RESOURCE;
res.size *= n;
/* now, create it */
component->vader_mpool =
mca_mpool_base_module_create("sm", vader_btl, &res);
/* Sanity check to ensure that we found it */
if(NULL == component->vader_mpool) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
component->vader_mpool_base =
component->vader_mpool->mpool_base (component->vader_mpool);
return OMPI_SUCCESS;
}
static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
{
mca_btl_vader_component_t *component = &mca_btl_vader_component;
size_t size;
char *vader_ctl_file;
vader_fifo_t *my_fifos;
ompi_proc_t **procs;
size_t num_procs;
int i, rc;
int rc;
rc = vader_init_mpool (vader_btl, n);
/* generate the endpoints */
component->endpoints = (struct mca_btl_base_endpoint_t *) calloc (n, sizeof (struct mca_btl_base_endpoint_t));
component->segment_offset = (n + 1) * 4096;
/* initialize fragment descriptor free lists */
/* initialize free list for send fragments */
rc = ompi_free_list_init_ex_new(&component->vader_frags_eager,
sizeof (mca_btl_vader_frag_t),
opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
0, opal_cache_line_size,
component->vader_free_list_num,
component->vader_free_list_max,
component->vader_free_list_inc,
NULL, mca_btl_vader_frag_init,
(void *) (sizeof (mca_btl_vader_hdr_t) +
mca_btl_vader.super.btl_eager_limit));
if (OMPI_SUCCESS != rc) {
return rc;
}
/* create a list of peers */
component->vader_peers = (struct mca_btl_base_endpoint_t **)
calloc(n, sizeof(struct mca_btl_base_endpoint_t *));
if(NULL == component->vader_peers)
return OMPI_ERR_OUT_OF_RESOURCE;
/* Allocate Shared Memory BTL process coordination
* data structure. This will reside in shared memory */
/* set file name */
if(asprintf(&vader_ctl_file, "%s"OPAL_PATH_SEP"vader_btl_module.%s",
ompi_process_info.job_session_dir,
ompi_process_info.nodename) < 0)
return OMPI_ERR_OUT_OF_RESOURCE;
/* Pass in a data segment alignment of 0 to get no data
segment (only the shared control structure) */
size = sizeof (mca_common_sm_seg_header_t) +
n * (sizeof (vader_fifo_t *) + sizeof (char *)
+ sizeof (xpmem_segid_t)) + opal_cache_line_size;
procs = ompi_proc_world(&num_procs);
if (!(mca_btl_vader_component.vader_seg =
mca_common_sm_init(procs, num_procs, size, vader_ctl_file,
sizeof (mca_common_sm_seg_header_t),
opal_cache_line_size))) {
opal_output(0, "vader_add_procs: unable to create shared memory "
"BTL coordinating strucure :: size %lu \n",
(unsigned long) size);
free(procs);
free(vader_ctl_file);
return OMPI_ERROR;
}
free(procs);
free(vader_ctl_file);
component->shm_fifo = (volatile vader_fifo_t **) component->vader_seg->module_data_addr;
component->shm_bases = (char **)(component->shm_fifo + n);
component->shm_seg_ids = (xpmem_segid_t *)(component->shm_bases + n);
/* set the base of the shared memory segment */
component->shm_bases[component->my_smp_rank] = (char *)component->vader_mpool_base;
component->shm_seg_ids[component->my_smp_rank] = component->my_seg_id;
/* initialize the fifo and fast boxes "owned" by this process */
posix_memalign ((void **)&my_fifos, getpagesize (), (n + 1) * getpagesize ());
if(NULL == my_fifos)
return OMPI_ERR_OUT_OF_RESOURCE;
/* cache the pointer to the 2d fifo array. These addresses
* are valid in the current process space */
component->fifo = (vader_fifo_t **) calloc (n, sizeof(vader_fifo_t *));
if(NULL == component->fifo)
return OMPI_ERR_OUT_OF_RESOURCE;
component->shm_fifo[component->my_smp_rank] =
component->fifo[component->my_smp_rank] = my_fifos;
component->apids = (xpmem_apid_t *) calloc (n, sizeof (xpmem_apid_t));
if (NULL == component->apids)
return OMPI_ERR_OUT_OF_RESOURCE;
component->xpmem_rcaches =
(struct mca_rcache_base_module_t **) calloc (n, sizeof (struct mca_rcache_base_module_t *));
if (NULL == component->xpmem_rcaches)
return OMPI_ERR_OUT_OF_RESOURCE;
component->vader_fboxes_in = (unsigned char **) calloc (n, sizeof (char *));
if (NULL == component->vader_fboxes_in)
return OMPI_ERR_OUT_OF_RESOURCE;
component->vader_fboxes_out = (unsigned char **) calloc (n, sizeof (char *));
if (NULL == component->vader_fboxes_out)
return OMPI_ERR_OUT_OF_RESOURCE;
component->vader_next_fbox_in = (unsigned char *) calloc (64, 1);
if (NULL == component->vader_next_fbox_in)
return OMPI_ERR_OUT_OF_RESOURCE;
component->vader_next_fbox_out = (unsigned char *) calloc (64, 1);
if (NULL == component->vader_next_fbox_out)
return OMPI_ERR_OUT_OF_RESOURCE;
/* initialize fragment descriptor free lists */
/* initialize free list for send fragments */
i = ompi_free_list_init_new(&component->vader_frags_eager,
sizeof (mca_btl_vader_frag_t),
opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
sizeof (mca_btl_vader_hdr_t) + component->eager_limit,
opal_cache_line_size,
component->vader_free_list_num,
component->vader_free_list_max,
component->vader_free_list_inc,
component->vader_mpool);
if (OMPI_SUCCESS != i)
return i;
/* initialize free list for put/get fragments */
i = ompi_free_list_init_new(&component->vader_frags_user,
sizeof(mca_btl_vader_frag_t),
opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
sizeof(mca_btl_vader_hdr_t) + mca_btl_vader_max_inline_send,
opal_cache_line_size,
component->vader_free_list_num,
component->vader_free_list_max,
component->vader_free_list_inc,
component->vader_mpool);
if (OMPI_SUCCESS != i)
return i;
rc = ompi_free_list_init_ex_new(&component->vader_frags_user,
sizeof(mca_btl_vader_frag_t),
opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
0, opal_cache_line_size,
component->vader_free_list_num,
component->vader_free_list_max,
component->vader_free_list_inc,
NULL, mca_btl_vader_frag_init,
(void *) (sizeof(mca_btl_vader_hdr_t) +
mca_btl_vader_component.max_inline_send));
if (OMPI_SUCCESS != rc) {
return rc;
}
/* set flag indicating btl has been inited */
vader_btl->btl_inited = true;
@ -275,16 +147,38 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
return OMPI_SUCCESS;
}
static struct
mca_btl_base_endpoint_t *create_vader_endpoint (int local_proc, struct ompi_proc_t *proc)
{
struct mca_btl_base_endpoint_t *ep = (struct mca_btl_base_endpoint_t *)
calloc(1, sizeof (struct mca_btl_base_endpoint_t));
if(NULL != ep) {
ep->peer_smp_rank = local_proc + mca_btl_vader_component.num_smp_procs;
static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct ompi_proc_t *proc, int local_rank) {
mca_btl_vader_component_t *component = &mca_btl_vader_component;
struct vader_modex_t *modex;
size_t msg_size;
int rc;
ep->peer_smp_rank = local_rank;
if (OMPI_SUCCESS != (rc = ompi_modex_recv(&component->super.btl_version,
proc, (void *)&modex, &msg_size))) {
return rc;
}
return ep;
ep->apid = xpmem_get (modex->seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
ep->rcache = mca_rcache_base_module_create("vma");
ep->next_fbox_out = 0;
ep->next_fbox_in = 0;
/* Attach to the remote process' segment */
ep->segment_base =
vader_reg_to_ptr (vader_get_registation (ep, modex->segment_base, mca_btl_vader_component.segment_size,
MCA_MPOOL_FLAGS_PERSIST),
modex->segment_base);
ep->fifo = (struct vader_fifo_t *) ep->segment_base;
ep->fbox_out = ep->segment_base + (1 + component->my_smp_rank) * 4096;
ep->fbox_in = component->my_segment + (1 + local_rank) * 4096;
memset (ep->fbox_in, MCA_BTL_VADER_FBOX_FREE, 4096);
return OMPI_SUCCESS;
}
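Taken together with the mmap/vader_fifo_init code in mca_btl_vader_component_init() and the carve-out in mca_btl_vader_frag_init(), init_vader_endpoint() implies a simple page-granular layout for each process's segment: the fifo occupies the first 4096-byte slot, each local process (indexed by local rank) gets one slot of incoming fast boxes, and everything from slot n+1 onward is handed out as fragments. A small sketch of that arithmetic, assuming the hard-coded 4096-byte slot size above holds throughout (helper names are illustrative, not part of the btl):

/* Sketch of the my_segment layout implied by this commit, for n local processes:
 *
 *   slot 0           : vader_fifo_t, polled by the segment's owner
 *   slot 1 + p       : fast boxes written into this segment by local rank p
 *   slots n+1 .. end : fragment storage carved out by mca_btl_vader_frag_init()
 *
 * A peer sees the same pages through its xpmem attach of this segment, which is
 * why ep->fbox_out above is computed from the remote segment_base while
 * ep->fbox_in is computed from the local my_segment. */
static inline char *sketch_fifo_slot (char *segment)            { return segment; }
static inline char *sketch_fbox_slot (char *segment, int rank)  { return segment + (1 + rank) * 4096; }
static inline char *sketch_frag_region (char *segment, int n)   { return segment + (n + 1) * 4096; }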
/**
@ -308,7 +202,7 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
{
mca_btl_vader_component_t *component = &mca_btl_vader_component;
mca_btl_vader_t *vader_btl = (mca_btl_vader_t *) btl;
int32_t n_local_procs = 0, proc, i, my_smp_rank = -1;
int32_t n_local_procs = 0, proc, local_rank, my_smp_rank = -1;
bool have_connected_peer = false;
ompi_proc_t *my_proc;
int rc = OMPI_SUCCESS;
@ -329,130 +223,65 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
if they're on my local host and in my job) */
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
!OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
peers[proc] = NULL;
continue;
}
/* check to see if this is me */
if (my_proc == procs[proc]) {
my_smp_rank = component->my_smp_rank = n_local_procs++;
continue;
}
/* we have someone to talk to */
have_connected_peer = true;
if (!(peers[proc] = create_vader_endpoint (n_local_procs, procs[proc]))) {
rc = OMPI_ERROR;
goto CLEANUP;
}
n_local_procs++;
/* add this proc to shared memory accessibility list */
rc = opal_bitmap_set_bit (reachability, proc);
if(OMPI_SUCCESS != rc) {
goto CLEANUP;
if (my_proc != procs[proc]) {
/* we have someone to talk to */
have_connected_peer = true;
/* add this proc to shared memory accessibility list */
rc = opal_bitmap_set_bit (reachability, proc);
if(OMPI_SUCCESS != rc) {
return rc;
}
} else {
my_smp_rank = mca_btl_vader_component.my_smp_rank = n_local_procs - 1;
}
}
/* jump out if there's not someone we can talk to */
if (!have_connected_peer) {
goto CLEANUP;
return OMPI_SUCCESS;
}
/* make sure that my_smp_rank has been defined */
if(-1 == my_smp_rank) {
rc = OMPI_ERROR;
goto CLEANUP;
return OMPI_ERROR;
}
if (!vader_btl->btl_inited) {
rc = vader_btl_first_time_init(vader_btl, n_local_procs);
if(rc != OMPI_SUCCESS) {
goto CLEANUP;
rc = vader_btl_first_time_init (vader_btl, n_local_procs);
if (rc != OMPI_SUCCESS) {
return rc;
}
}
/* set local proc's smp rank in the peers structure for
* rapid access and calculate reachability */
for (proc = 0; proc < (int32_t) nprocs; ++proc) {
if(NULL == peers[proc])
for (proc = 0, local_rank = 0; proc < (int32_t) nprocs; ++proc) {
/* check to see if this proc can be reached via shmem (i.e.,
if they're on my local host and in my job) */
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
!OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
peers[proc] = NULL;
continue;
component->vader_peers[peers[proc]->peer_smp_rank] = peers[proc];
peers[proc]->my_smp_rank = my_smp_rank;
}
/* initialize own FIFOs */
/*
* The receiver initializes all its FIFOs. All components will
* be allocated near the receiver. Nothing will be local to
* "the sender" since there will be many senders.
*/
rc = vader_fifo_init (component->fifo[my_smp_rank]);
if (OMPI_SUCCESS != rc) {
goto CLEANUP;
}
opal_atomic_wmb();
/* Sync with other local procs. Force the FIFO initialization to always
* happens before the readers access it.
*/
opal_atomic_add_32( &component->vader_seg->module_seg->seg_inited, 1);
while (n_local_procs >
component->vader_seg->module_seg->seg_inited) {
opal_progress();
opal_atomic_rmb();
}
/* coordinate with other processes */
for (i = 0 ; i < n_local_procs ; ++i) {
int peer_smp_rank = i + component->num_smp_procs;
/* spin until this element is allocated */
/* doesn't really wait for that process... FIFO might be allocated, but not initialized */
while (NULL == component->shm_fifo[peer_smp_rank]) {
opal_progress();
opal_atomic_rmb();
}
if (my_smp_rank != peer_smp_rank) {
void *rem_ptr = (void *) component->shm_fifo[peer_smp_rank];
/* setup endpoint */
peers[proc] = component->endpoints + local_rank;
init_vader_endpoint (peers[proc], procs[proc], local_rank++);
component->apids[peer_smp_rank] =
xpmem_get (component->shm_seg_ids[peer_smp_rank],
XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
component->xpmem_rcaches[peer_smp_rank] = mca_rcache_base_module_create("vma");
/* get a persistent pointer to the peer's fifo */
component->fifo[peer_smp_rank] =
vader_reg_to_ptr (vader_get_registation (peer_smp_rank, rem_ptr,
(n_local_procs + 1) * getpagesize (),
MCA_MPOOL_FLAGS_PERSIST), rem_ptr);
/* fast boxes are allocated at the same time as the fifos */
component->vader_fboxes_in[peer_smp_rank] = (unsigned char *) component->fifo[my_smp_rank] +
(peer_smp_rank + 1) * getpagesize ();
component->vader_fboxes_out[peer_smp_rank] = (unsigned char *) component->fifo[peer_smp_rank] +
(my_smp_rank + 1) * getpagesize ();
component->vader_next_fbox_in[peer_smp_rank] = 0;
component->vader_next_fbox_out[peer_smp_rank] = 0;
memset (component->vader_fboxes_in[peer_smp_rank], MCA_BTL_VADER_FBOX_FREE, getpagesize());
/* check to see if this is me */
if (my_proc == procs[proc]) {
peers[proc] = NULL;
}
}
/* update the local smp process count */
component->num_smp_procs += n_local_procs;
/* make sure we have enough eager fragments for each process */
rc = ompi_free_list_resize(&component->vader_frags_eager,
component->num_smp_procs * 2);
CLEANUP:
return rc;
return OMPI_SUCCESS;
}
/**
@ -518,9 +347,9 @@ mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl
{
mca_btl_vader_frag_t *frag = NULL;
if (size <= (size_t) mca_btl_vader_max_inline_send) {
if (size <= (size_t) mca_btl_vader_component.max_inline_send) {
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
} else if (size <= mca_btl_vader_component.eager_limit) {
} else if (size <= mca_btl_vader.super.btl_eager_limit) {
(void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
}
@ -530,10 +359,6 @@ mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl
frag->base.des_flags = flags;
frag->base.order = order;
frag->base.des_src = frag->segments;
frag->base.des_src_cnt = 1;
frag->base.des_dst = frag->segments;
frag->base.des_src_cnt = 1;
}
return (mca_btl_base_descriptor_t *) frag;
@ -572,8 +397,6 @@ struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t
frag->segments[0].seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
frag->segments[0].seg_len = *size;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
frag->base.order = order;
frag->base.des_flags = flags;
@ -594,7 +417,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
uint8_t order, size_t reserve, size_t *size,
uint32_t flags)
{
struct iovec iov, *lcl_mem;
struct iovec iov;
mca_btl_vader_frag_t *frag;
uint32_t iov_count = 1;
void *data_ptr, *fbox_ptr;
@ -630,7 +453,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
return NULL;
}
if ((*size + reserve) > (size_t) mca_btl_vader_max_inline_send) {
if ((*size + reserve) > (size_t) mca_btl_vader_component.max_inline_send) {
/* single copy send */
frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;
@ -646,7 +469,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
/* inline send */
/* try to reserve a fast box for this transfer */
fbox_ptr = mca_btl_vader_reserve_fbox (endpoint->peer_smp_rank, reserve + *size);
fbox_ptr = mca_btl_vader_reserve_fbox (endpoint, reserve + *size);
if (fbox_ptr) {
frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;

View file

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2010-2011 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -33,8 +33,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
mca_mpool_base_registration_t *reg;
void *rem_ptr;
reg = vader_get_registation (endpoint->peer_smp_rank,
(void *)(uintptr_t) dst->seg_addr.lval,
reg = vader_get_registation (endpoint, (void *)(uintptr_t) dst->seg_addr.lval,
dst->seg_len, 0);
if (OPAL_UNLIKELY(NULL == reg)) {
return OMPI_ERROR;
@ -44,7 +43,7 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
vader_memmove (rem_ptr, (void *)(uintptr_t) src->seg_addr.lval, size);
vader_return_registration (reg, endpoint->peer_smp_rank);
vader_return_registration (reg, endpoint);
/* always call the callback function */
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

View file

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -57,7 +57,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
opal_list_append (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);
/* post the relative address of the descriptor into the peer's fifo */
vader_fifo_write (frag->hdr, endpoint->peer_smp_rank);
vader_fifo_write (frag->hdr, endpoint);
if (frag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY ||
!(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {

View file

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* Copyright (c) 2010-2013 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
@ -50,13 +50,14 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
size_t max_data;
void *data_ptr = NULL;
assert (length < mca_btl_vader_component.eager_limit);
assert (length < mca_btl_vader.super.btl_eager_limit);
assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK));
/* we won't ever return a descriptor */
*descriptor = NULL;
if (OPAL_LIKELY(!(payload_size && opal_convertor_need_buffers (convertor)))) {
if (OPAL_LIKELY((payload_size + header_size) < mca_btl_vader_component.max_inline_send &&
!opal_convertor_need_buffers (convertor))) {
if (payload_size) {
opal_convertor_get_current_pointer (convertor, &data_ptr);
}
@ -103,7 +104,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
opal_list_append (&mca_btl_vader_component.active_sends, (opal_list_item_t *) frag);
/* write the fragment pointer to the peer's FIFO */
vader_fifo_write (frag->hdr, endpoint->peer_smp_rank);
vader_fifo_write (frag->hdr, endpoint);
/* the progress function will return the fragment */

113
ompi/mca/btl/vader/btl_vader_xpmem.c Normal file
View file

@ -0,0 +1,113 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi/mca/btl/vader/btl_vader.h"
#include "opal/include/opal/align.h"
/* largest address we can attach to using xpmem */
#define VADER_MAX_ADDRESS ((uintptr_t)0x7ffffffff000)
/* look up the remote pointer in the peer rcache and attach if
* necessary */
mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr,
size_t size, int flags)
{
struct mca_rcache_base_module_t *rcache = endpoint->rcache;
mca_mpool_base_registration_t *regs[10], *reg = NULL;
struct xpmem_addr xpmem_addr;
uintptr_t base, bound;
int rc, i;
if (OPAL_UNLIKELY(endpoint->peer_smp_rank == mca_btl_vader_component.my_smp_rank)) {
return rem_ptr;
}
base = (uintptr_t) down_align_addr(rem_ptr, mca_btl_vader_component.log_attach_align);
bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1),
mca_btl_vader_component.log_attach_align) + 1;
if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
bound = VADER_MAX_ADDRESS;
}
/* several segments may match the base pointer */
rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10);
for (i = 0 ; i < rc ; ++i) {
if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
opal_atomic_add (&regs[i]->ref_count, 1);
return regs[i];
}
if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) {
continue;
}
/* remove this pointer from the rcache and decrement its reference count
(so it is detached later) */
rc = rcache->rcache_delete (rcache, regs[i]);
if (OPAL_UNLIKELY(0 != rc)) {
/* someone beat us to it? */
break;
}
/* start the new segment from the lower of the two bases */
base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
opal_atomic_add (&regs[i]->ref_count, -1);
if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
/* this pointer is not in use */
(void) xpmem_detach (regs[i]->alloc_base);
OBJ_RELEASE(regs[i]);
}
break;
}
reg = OBJ_NEW(mca_mpool_base_registration_t);
if (OPAL_LIKELY(NULL != reg)) {
/* stick around for awhile */
reg->ref_count = 2;
reg->base = (unsigned char *) base;
reg->bound = (unsigned char *) bound;
reg->flags = flags;
xpmem_addr.apid = endpoint->apid;
xpmem_addr.offset = base;
reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL);
if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) {
OBJ_RELEASE(reg);
reg = NULL;
} else {
rcache->rcache_insert (rcache, reg, 0);
}
}
return reg;
}
void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint)
{
struct mca_rcache_base_module_t *rcache = endpoint->rcache;
opal_atomic_add (&reg->ref_count, -1);
if (OPAL_UNLIKELY(0 == reg->ref_count && !(reg->flags & MCA_MPOOL_FLAGS_PERSIST))) {
rcache->rcache_delete (rcache, reg);
(void)xpmem_detach (reg->alloc_base);
OBJ_RELEASE (reg);
}
}
void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr)
{
return (void *) ((uintptr_t) reg->alloc_base +
(ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
}
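These three helpers bracket every single-copy transfer in this commit; mca_btl_vader_get() and mca_btl_vader_put() earlier in the diff follow exactly this pattern. A condensed sketch, with error handling elided and an illustrative function name:

/* Sketch: attach to (or reuse a cached attach of) the remote region, translate
 * the remote address into this process's view, copy, then drop the reference. */
static void sketch_single_copy_read (struct mca_btl_base_endpoint_t *ep,
                                     void *remote_ptr, void *local_dst, size_t len)
{
    mca_mpool_base_registration_t *reg = vader_get_registation (ep, remote_ptr, len, 0);

    if (OPAL_LIKELY(NULL != reg)) {
        void *mapped = vader_reg_to_ptr (reg, remote_ptr);   /* remote address -> local mapping */
        vader_memmove (local_dst, mapped, len);              /* plain load/store copy via xpmem */
        vader_return_registration (reg, ep);                 /* drop the cache reference */
    }
}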