1
1
openmpi/ompi/mca/btl/vader/btl_vader.h
Nathan Hjelm cf377db823 MCA/base: Add new MCA variable system
Features:
 - Support for an override parameter file (openmpi-mca-param-override.conf).
   Variable values in this file can not be overridden by any file or environment
   value.
 - Support for boolean, unsigned, and unsigned long long variables.
 - Support for true/false values.
 - Support for enumerations on integer variables.
 - Support for MPIT scope, verbosity, and binding.
 - Support for command line source.
 - Support for setting variable source via the environment using
   OMPI_MCA_SOURCE_<var name>=source (either command or file:filename)
 - Cleaner API.
 - Support for variable groups (equivalent to MPIT categories).

Notes:
 - Variables must be created with a backing store (char **, int *, or bool *)
   that must live at least as long as the variable.
 - Creating a variable with the MCA_BASE_VAR_FLAG_SETTABLE enables the use of
   mca_base_var_set_value() to change the value.
 - String values are duplicated when the variable is registered. It is up to
   the caller to free the original value if necessary. The new value will be
   freed by the mca_base_var system and must not be freed by the user.
 - Variables with constant scope may not be settable.
 - Variable groups (and all associated variables) are deregistered when the
   component is closed or the component repository item is freed. This
   prevents a segmentation fault from accessing a variable after its component
   is unloaded.
 - After some discussion we decided we should remove the automatic registration
   of component priority variables. Few component actually made use of this
   feature.
 - The enumerator interface was updated to be general enough to handle
   future uses of the interface.
 - The code to generate ompi_info output has been moved into the MCA variable
   system. See mca_base_var_dump().

opal: update core and components to mca_base_var system
orte: update core and components to mca_base_var system
ompi: update core and components to mca_base_var system

This commit also modifies the rmaps framework. The following variables were
moved from ppr and lama: rmaps_base_pernode, rmaps_base_n_pernode,
rmaps_base_n_persocket. Both lama and ppr create synonyms for these variables.

This commit was SVN r28236.
2013-03-27 21:09:41 +00:00

319 строки
12 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_BTL_VADER_H
#define MCA_BTL_VADER_H
#include "ompi_config.h"
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_STDINT_H
# include <stdint.h>
#endif /* HAVE_STDINT_H */
#ifdef HAVE_SCHED_H
# include <sched.h>
#endif /* HAVE_SCHED_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
/* xpmem is required by vader atm */
#include <xpmem.h>
#include "opal/include/opal/align.h"
#include "opal/class/opal_free_list.h"
#include "opal/sys/atomic.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/sm/mpool_sm.h"
#include "ompi/mca/common/sm/common_sm.h"
#include "ompi/mca/rcache/rcache.h"
#include "ompi/mca/rcache/base/base.h"
BEGIN_C_DECLS
#define min(a,b) ((a) < (b) ? (a) : (b))
extern int mca_btl_vader_memcpy_limit;
extern int mca_btl_vader_log_align;
extern int mca_btl_vader_max_inline_send;
/* We can't use opal_cache_line_size here because we need a
compile-time constant for padding the struct. We can't really have
a compile-time constant that is portable, either (e.g., compile on
one machine and run on another). So just use a big enough cache
line that should hopefully be good in most places. */
#define VADER_CACHE_LINE_PAD 128
/* largest address we can attach to using xpmem */
#define VADER_MAX_ADDRESS ((uintptr_t)0x7ffffffff000)
/*
* Shared Memory resource managment
*/
struct vader_fifo_t;
/**
* Shared Memory (VADER) BTL module.
*/
struct mca_btl_vader_component_t {
mca_btl_base_component_2_0_0_t super; /**< base BTL component */
int vader_free_list_num; /**< initial size of free lists */
int vader_free_list_max; /**< maximum size of free lists */
int vader_free_list_inc; /**< number of elements to alloc
* when growing free lists */
mca_mpool_base_module_t *vader_mpool; /**< mpool on local node */
void *vader_mpool_base; /**< base address of shared memory pool */
size_t eager_limit; /**< send fragment size */
mca_common_sm_module_t *vader_seg; /**< description of shared memory segment */
volatile struct vader_fifo_t **shm_fifo;/**< pointer to fifo 2D array in
* shared memory */
char **shm_bases; /**< pointer to base pointers in
* shared memory */
xpmem_segid_t my_seg_id; /* this rank's xpmem segment id */
xpmem_segid_t *shm_seg_ids; /* xpmem segment ids */
struct vader_fifo_t **fifo; /**< cached copy of the pointer to
* the 2D fifo array. */
struct mca_rcache_base_module_t **xpmem_rcaches;
xpmem_apid_t *apids; /* xpmem apids */
int32_t num_smp_procs; /**< current number of smp procs on this host */
int32_t my_smp_rank; /**< My SMP process rank. Used for accessing
* SMP specfic data structures. */
ompi_free_list_t vader_frags_eager; /**< free list of vader send frags */
ompi_free_list_t vader_frags_user; /**< free list of vader put/get frags */
opal_list_t active_sends; /**< list of outstanding fragments */
unsigned char **vader_fboxes_in; /**< incomming fast boxes (memory belongs to this process) */
unsigned char **vader_fboxes_out; /**< outgoing fast boxes (memory belongs to remote peers) */
unsigned char *vader_next_fbox_in; /**< indices of fast boxes to poll */
unsigned char *vader_next_fbox_out; /**< indices of fast boxes to write */
struct mca_btl_base_endpoint_t **vader_peers;
};
typedef struct mca_btl_vader_component_t mca_btl_vader_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_vader_component_t mca_btl_vader_component;
/**
* VADER BTL Interface
*/
struct mca_btl_vader_t {
mca_btl_base_module_t super; /**< base BTL interface */
bool btl_inited; /**< flag indicating if btl has been inited */
mca_btl_base_module_error_cb_fn_t error_cb;
};
typedef struct mca_btl_vader_t mca_btl_vader_t;
OMPI_MODULE_DECLSPEC extern mca_btl_vader_t mca_btl_vader;
/***
* One or more FIFO components may be a pointer that must be
* accessed by multiple processes. Since the shared region may
* be mmapped differently into each process's address space,
* these pointers will be relative to some base address. Here,
* we define macros to translate between relative addresses and
* virtual addresses.
*/
#define VIRTUAL2RELATIVE(VADDR ) ((intptr_t)(VADDR) - (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
#define RELATIVE2VIRTUAL(OFFSET) ((intptr_t)(OFFSET) + (intptr_t)mca_btl_vader_component.shm_bases[mca_btl_vader_component.my_smp_rank])
/* look up the remote pointer in the peer rcache and attach if
* necessary */
static inline mca_mpool_base_registration_t *vader_get_registation (int peer_smp_rank, void *rem_ptr,
size_t size, int flags)
{
struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
mca_mpool_base_registration_t *regs[10], *reg = NULL;
struct xpmem_addr xpmem_addr;
uintptr_t base, bound;
int rc, i;
if (OPAL_UNLIKELY(peer_smp_rank == mca_btl_vader_component.my_smp_rank)) {
return rem_ptr;
}
base = (uintptr_t) down_align_addr(rem_ptr, mca_btl_vader_log_align);
bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1),
mca_btl_vader_log_align) + 1;
if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
bound = VADER_MAX_ADDRESS;
}
/* several segments may match the base pointer */
rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10);
for (i = 0 ; i < rc ; ++i) {
if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
opal_atomic_add (&regs[i]->ref_count, 1);
return regs[i];
}
if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) {
continue;
}
/* remove this pointer from the rcache and decrement its reference count
(so it is detached later) */
rc = rcache->rcache_delete (rcache, regs[i]);
if (OPAL_UNLIKELY(0 != rc)) {
/* someone beat us to it? */
break;
}
/* start the new segment from the lower of the two bases */
base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
opal_atomic_add (&regs[i]->ref_count, -1);
if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
/* this pointer is not in use */
(void) xpmem_detach (regs[i]->alloc_base);
OBJ_RELEASE(regs[i]);
}
break;
}
reg = OBJ_NEW(mca_mpool_base_registration_t);
if (OPAL_LIKELY(NULL != reg)) {
/* stick around for awhile */
reg->ref_count = 2;
reg->base = (unsigned char *) base;
reg->bound = (unsigned char *) bound;
reg->flags = flags;
xpmem_addr.apid = mca_btl_vader_component.apids[peer_smp_rank];
xpmem_addr.offset = base;
reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL);
if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) {
OBJ_RELEASE(reg);
reg = NULL;
} else {
rcache->rcache_insert (rcache, reg, 0);
}
}
return reg;
}
static inline void vader_return_registration (mca_mpool_base_registration_t *reg, int peer_smp_rank)
{
struct mca_rcache_base_module_t *rcache = mca_btl_vader_component.xpmem_rcaches[peer_smp_rank];
opal_atomic_add (&reg->ref_count, -1);
if (OPAL_UNLIKELY(0 == reg->ref_count && !(reg->flags & MCA_MPOOL_FLAGS_PERSIST))) {
rcache->rcache_delete (rcache, reg);
(void)xpmem_detach (reg->alloc_base);
OBJ_RELEASE (reg);
}
}
static inline void *vader_reg_to_ptr (mca_mpool_base_registration_t *reg, void *rem_ptr)
{
return (void *) ((uintptr_t) reg->alloc_base +
(ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
}
/* memcpy is faster at larger sizes but is undefined if the
pointers are aliased (TODO -- readd alias check) */
static inline void vader_memmove (void *dst, void *src, size_t size)
{
if (size >= (size_t) mca_btl_vader_memcpy_limit) {
memcpy (dst, src, size);
} else {
memmove (dst, src, size);
}
}
/**
* Initiate a send to the peer.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
int mca_btl_vader_send(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *descriptor,
mca_btl_base_tag_t tag);
/**
* Initiate an inline send to the peer.
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
void *header, size_t header_size,
size_t payload_size, uint8_t order,
uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor);
/**
* Initiate an synchronous put.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
/**
* Initiate an synchronous get.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
/**
* Allocate a segment.
*
* @param btl (IN) BTL module
* @param size (IN) Request segment size.
*/
mca_btl_base_descriptor_t* mca_btl_vader_alloc (struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
uint8_t order, size_t size, uint32_t flags);
END_C_DECLS
#endif