
btl/vader: add support for traditional shared memory.

This commit adds support for placing the send memory segment in a
traditional shared memory segment when XPMEM is not available. The
current default is to reserve 4MB for shared memory on each process.
The latest benchmarks show vader performing better than sm on both
Intel and AMD CPUs.
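
As a rough illustration of the non-XPMEM path, a minimal sketch built from the
opal_shmem calls introduced in this commit (the segment file name, size, and
error handling are simplified placeholders):

    opal_shmem_ds_t seg_ds;
    char *base;

    /* create a backing file and map a 4MB segment (the new default size) */
    if (OPAL_SUCCESS != opal_shmem_segment_create (&seg_ds, "vader_segment.example",
                                                   1 << 22)) {
        return OMPI_ERROR;
    }

    /* attach to the segment; peers attach to the same seg_ds received via the modex */
    base = opal_shmem_segment_attach (&seg_ds);
    if (NULL == base) {
        return OMPI_ERROR;
    }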

For large messages vader will now use CMA if it is available (and
XPMEM is not).
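
The CMA path boils down to a single process_vm_readv()/process_vm_writev()
call between the two ranks. A hedged sketch of the read side (peer_pid,
peer_ptr, dst, and size are illustrative placeholders):

    #include <sys/uio.h>

    /* copy size bytes directly out of the peer's address space into dst */
    struct iovec local  = { .iov_base = dst,      .iov_len = size };
    struct iovec remote = { .iov_base = peer_ptr, .iov_len = size };

    ssize_t ret = process_vm_readv (peer_pid, &local, 1, &remote, 1, 0);
    if (ret != (ssize_t) size) {
        /* partial read or error; errno describes the failure */
    }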

cmr=v1.7.5:reviewer=jsquyres

This commit was SVN r30123.
This commit is contained in:
Nathan Hjelm 2014-01-06 19:51:44 +00:00
parent 5c8ea3a251
commit e627c91227
18 changed files with 620 additions and 260 deletions

config/ompi_check_cma.m4 Normal file
View file

@ -0,0 +1,42 @@
# -*- shell-script -*-
#
# Copyright (c) 2009 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
# Copyright (c) 2013 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# OMPI_CHECK_CMA(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# check if cma support is wanted.
AC_DEFUN([OMPI_CHECK_CMA],[
OPAL_VAR_SCOPE_PUSH([ompi_check_cma_happy ompi_check_cma_need_defs])
ompi_check_cma_happy="no"
AC_ARG_WITH([cma],
[AC_HELP_STRING([--with-cma],
[Build Cross Memory Attach support (default: no)])])
AC_MSG_CHECKING([if user requested CMA build])
if test "$with_cma" = "yes" ; then
ompi_check_cma_happy="yes"
AC_CHECK_FUNC(process_vm_readv, [ompi_check_cma_need_defs=0],
[ompi_check_cma_need_defs=1])
AC_DEFINE_UNQUOTED([OMPI_CMA_NEED_SYSCALL_DEFS],
[$ompi_check_cma_need_defs],
[Need CMA syscalls defined])
fi
AS_IF([test "$ompi_check_cma_happy" = "yes"],
[$2],
[$3])
OPAL_VAR_SCOPE_POP
])dnl

View file

@ -35,9 +35,9 @@
#include <sys/mman.h>
#endif /* HAVE_SYS_MMAN_H */
#if OMPI_BTL_SM_HAVE_CMA && defined(OMPI_BTL_SM_CMA_NEED_SYSCALL_DEFS)
#if OMPI_BTL_SM_HAVE_CMA && defined(OMPI_CMA_NEED_SYSCALL_DEFS)
#include "opal/sys/cma.h"
#endif /* OMPI_BTL_SM_CMA_NEED_SYSCALL_DEFS */
#endif /* OMPI_CMA_NEED_SYSCALL_DEFS */
#include "opal/sys/atomic.h"
#include "opal/class/opal_bitmap.h"

View file

@ -12,30 +12,6 @@
# $HEADER$
#
# OMPI_CHECK_CMA(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# check if cma support is wanted.
AC_DEFUN([OMPI_CHECK_CMA],[
AC_ARG_WITH([cma],
[AC_HELP_STRING([--with-cma],
[Build Cross Memory Attach support (default: no)])])
AC_MSG_CHECKING([if user requested CMA build])
if test "$with_cma" = "yes" ; then
btl_sm_cma_happy="yes"
AC_CHECK_FUNC(process_vm_readv, [btl_sm_cma_need_defs=0],
[btl_sm_cma_need_defs=1])
AC_DEFINE_UNQUOTED([OMPI_BTL_SM_CMA_NEED_SYSCALL_DEFS],
[$btl_sm_cma_need_defs],
[Need CMA syscalls defined])
fi
AS_IF([test "$btl_sm_cma_happy" = "yes"],
[$2],
[$3])
])dnl
# OMPI_CHECK_KNEM(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# check if knem support can be found. sets prefix_{CPPFLAGS,

View file

@ -42,8 +42,12 @@
# include <unistd.h>
#endif /* HAVE_UNISTD_H */
#if OMPI_BTL_VADER_HAVE_XPMEM
/* xpmem is required by vader atm */
#include <xpmem.h>
#else
#include "opal/mca/shmem/base/base.h"
#endif
#include "opal/class/opal_free_list.h"
#include "opal/sys/atomic.h"
@ -77,8 +81,12 @@ struct vader_fifo_t;
* Modex data
*/
struct vader_modex_t {
#if OMPI_BTL_VADER_HAVE_XPMEM
xpmem_segid_t seg_id;
void *segment_base;
#else
opal_shmem_ds_t seg_ds;
#endif
};
/**
@ -90,19 +98,28 @@ struct mca_btl_vader_component_t {
int vader_free_list_max; /**< maximum size of free lists */
int vader_free_list_inc; /**< number of elements to alloc
* when growing free lists */
#if OMPI_BTL_VADER_HAVE_XPMEM
xpmem_segid_t my_seg_id; /* this rank's xpmem segment id */
#else
opal_shmem_ds_t seg_ds; /* this rank's shared memory segment */
#endif
char *my_segment; /* this rank's base pointer */
size_t segment_size; /* size of my_segment */
size_t segment_offset; /* start of unused portion of my_segment */
int32_t num_smp_procs; /**< current number of smp procs on this host */
ompi_free_list_t vader_frags_eager; /**< free list of vader send frags */
#if !OMPI_BTL_VADER_HAVE_XPMEM
ompi_free_list_t vader_frags_max_send;
#endif
ompi_free_list_t vader_frags_user; /**< free list of vader put/get frags */
int memcpy_limit; /** Limit where we switch from memmove to memcpy */
int log_attach_align; /** Log of the alignment for xpmem segments */
int max_inline_send; /** Limit for copy-in-copy-out fragments */
unsigned int max_inline_send; /** Limit for copy-in-copy-out fragments */
struct mca_btl_base_endpoint_t *endpoints;
struct vader_fifo_t *my_fifo;
};
typedef struct mca_btl_vader_component_t mca_btl_vader_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_vader_component_t mca_btl_vader_component;

View file

@ -59,7 +59,6 @@ mca_btl_vader_component_t mca_btl_vader_component = {
.mca_component_release_version = OMPI_RELEASE_VERSION,
.mca_open_component = mca_btl_vader_component_open,
.mca_close_component = mca_btl_vader_component_close,
.mca_query_component = NULL,
.mca_register_component_params = mca_btl_vader_component_register,
},
.btl_data = {
@ -119,10 +118,18 @@ static int mca_btl_vader_component_register (void)
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_component.log_attach_align);
#if OMPI_BTL_VADER_HAVE_XPMEM
mca_btl_vader_component.segment_size = 1 << 24;
#else
mca_btl_vader_component.segment_size = 1 << 22;
#endif
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"segment_size", "Maximum size of all shared "
#if OMPI_BTL_VADER_HAVE_XPMEM
"memory buffers (default: 16M)",
#else
"memory buffers (default: 4M)",
#endif
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_LOCAL,
@ -132,22 +139,39 @@ static int mca_btl_vader_component_register (void)
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
"max_inline_send", "Maximum size to transfer "
"using copy-in copy-out semantics",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_vader_component.max_inline_send);
mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
#if OMPI_BTL_VADER_HAVE_XPMEM
mca_btl_vader.super.btl_eager_limit = 32 * 1024;
mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
#else
mca_btl_vader.super.btl_eager_limit = 4 * 1024;
mca_btl_vader.super.btl_rndv_eager_limit = 32 * 1024;
mca_btl_vader.super.btl_max_send_size = 32 * 1024;
mca_btl_vader.super.btl_min_rdma_pipeline_size = 32 * 1024;
#endif
mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit;
mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
#if OMPI_BTL_VADER_HAVE_XPMEM || OMPI_BTL_VADER_HAVE_CMA
mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
#else
mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE;
#endif
mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
#if OMPI_BTL_VADER_HAVE_XPMEM || OMPI_BTL_VADER_HAVE_CMA
mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */
#else
mca_btl_vader.super.btl_bandwidth = 10000; /* Mbs */
#endif
mca_btl_vader.super.btl_latency = 1; /* Microsecs */
/* Call the BTL based to register its MCA params */
@ -167,6 +191,9 @@ static int mca_btl_vader_component_open(void)
/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t);
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t);
#if !OMPI_BTL_VADER_HAVE_XPMEM
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_max_send, ompi_free_list_t);
#endif
return OMPI_SUCCESS;
}
@ -180,6 +207,9 @@ static int mca_btl_vader_component_close(void)
{
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_user);
#if !OMPI_BTL_VADER_HAVE_XPMEM
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_max_send);
#endif
if (NULL != mca_btl_vader_component.my_segment) {
munmap (mca_btl_vader_component.my_segment, mca_btl_vader_component.segment_size);
@ -191,12 +221,19 @@ static int mca_btl_vader_component_close(void)
static int mca_btl_base_vader_modex_send (void)
{
struct vader_modex_t modex;
int modex_size;
#if OMPI_BTL_VADER_HAVE_XPMEM
modex.seg_id = mca_btl_vader_component.my_seg_id;
modex.segment_base = mca_btl_vader_component.my_segment;
return ompi_modex_send(&mca_btl_vader_component.super.btl_version,
&modex, sizeof (modex));
modex_size = sizeof (modex);
#else
modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
#endif
return ompi_modex_send(&mca_btl_vader_component.super.btl_version, &modex, modex_size);
}
/*
@ -230,6 +267,12 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
return NULL;
}
/* ensure a sane segment size */
if (mca_btl_vader_component.segment_size < (2 << 20)) {
mca_btl_vader_component.segment_size = (2 << 20);
}
#if OMPI_BTL_VADER_HAVE_XPMEM
/* create an xpmem segment for the entire memory space */
component->my_seg_id = xpmem_make (0, 0xffffffffffffffffll, XPMEM_PERMIT_MODE,
(void *)0666);
@ -238,36 +281,51 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
return NULL;
}
/* ensure a sane segment size */
if (mca_btl_vader_component.segment_size < (2 << 20)) {
mca_btl_vader_component.segment_size = (2 << 20);
}
component->my_segment = mmap (NULL, mca_btl_vader_component.segment_size, PROT_READ |
PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
if ((void *)-1 == component->my_segment) {
free (btls);
return NULL;
}
#else
{
char *sm_file;
rc = asprintf(&sm_file, "%s" OPAL_PATH_SEP "vader_segment.%s.%d", ompi_process_info.job_session_dir,
ompi_process_info.nodename, MCA_BTL_VADER_LOCAL_RANK);
if (0 > rc) {
free (btls);
return NULL;
}
rc = opal_shmem_segment_create (&mca_btl_vader_component.seg_ds, sm_file, mca_btl_vader_component.segment_size);
free (sm_file);
if (OPAL_SUCCESS != rc) {
free (btls);
return NULL;
}
component->my_segment = opal_shmem_segment_attach (&mca_btl_vader_component.seg_ds);
if (NULL == component->my_segment) {
goto failed;
}
}
#endif
component->segment_offset = 0;
memset (component->my_segment + 4096, MCA_BTL_VADER_FBOX_FREE, MCA_BTL_VADER_NUM_LOCAL_PEERS *
memset (component->my_segment + MCA_BTL_VADER_FIFO_SIZE, 0, MCA_BTL_VADER_NUM_LOCAL_PEERS *
MCA_BTL_VADER_FBOX_PEER_SIZE);
/* initialize my fifo */
rc = vader_fifo_init ((struct vader_fifo_t *) component->my_segment);
if (OMPI_SUCCESS != rc) {
free (btls);
munmap (component->my_segment, mca_btl_vader_component.segment_size);
return NULL;
goto failed;
}
rc = mca_btl_base_vader_modex_send ();
if (OMPI_SUCCESS != rc) {
free (btls);
munmap (component->my_segment, mca_btl_vader_component.segment_size);
return NULL;
goto failed;
}
*num_btls = 1;
@ -279,24 +337,31 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
mca_btl_vader.btl_inited = false;
return btls;
failed:
#if OMPI_BTL_VADER_HAVE_XPMEM
munmap (component->my_segment, mca_btl_vader_component.segment_size);
#else
opal_shmem_unlink (&mca_btl_vader_component.seg_ds);
#endif
if (btls) {
free (btls);
}
return NULL;
}
static int mca_btl_vader_component_progress (void)
static int mca_btl_vader_poll_fifo (void)
{
mca_btl_vader_frag_t frag = {.base = {.des_dst = frag.segments, .des_dst_cnt = 1}};
const int my_smp_rank = MCA_BTL_VADER_LOCAL_RANK;
mca_mpool_base_registration_t *xpmem_reg = NULL;
const mca_btl_active_message_callback_t *reg;
struct mca_btl_base_endpoint_t *endpoint;
mca_btl_vader_hdr_t *hdr;
int fifo_count;
/* check for messages in fast boxes */
mca_btl_vader_check_fboxes ();
mca_btl_vader_hdr_t *hdr;
/* poll the fifo until it is empty or a limit has been hit (8 is arbitrary) */
for (fifo_count = 0 ; fifo_count < 8 ; ++fifo_count) {
hdr = vader_fifo_read (mca_btl_vader_component.endpoints[my_smp_rank].fifo);
for (int fifo_count = 0 ; fifo_count < 8 ; ++fifo_count) {
mca_btl_vader_frag_t frag = {.base = {.des_dst = frag.segments, .des_dst_cnt = 1}};
hdr = vader_fifo_read (mca_btl_vader_component.my_fifo);
if (NULL == hdr) {
return fifo_count;
}
@ -313,6 +378,8 @@ static int mca_btl_vader_component_progress (void)
endpoint = mca_btl_vader_component.endpoints + hdr->src_smp_rank;
if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) {
mca_mpool_base_registration_t *xpmem_reg;
xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base,
hdr->sc_iov.iov_len, 0,
&frag.segments[1].seg_addr.pval);
@ -322,7 +389,6 @@ static int mca_btl_vader_component_progress (void)
/* recv upcall */
frag.base.des_dst_cnt = 2;
reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
frag.base.des_dst_cnt = 1;
vader_return_registration (xpmem_reg, endpoint);
} else {
reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
@ -333,5 +399,24 @@ static int mca_btl_vader_component_progress (void)
vader_fifo_write_back (hdr, endpoint);
}
return fifo_count;
return 1;
}
static int mca_btl_vader_component_progress (void)
{
bool fboxed;
/* check for messages in fast boxes */
for (int spin_count = 5 ; spin_count ; --spin_count) {
fboxed = (int) mca_btl_vader_check_fboxes ();
if (fboxed) {
break;
}
}
if (VADER_FIFO_FREE == mca_btl_vader_component.my_fifo->fifo_head) {
return (int) fboxed;
}
return mca_btl_vader_poll_fifo () + (int) fboxed;
}

View file

@ -24,7 +24,11 @@
#ifndef MCA_BTL_VADER_ENDPOINT_H
#define MCA_BTL_VADER_ENDPOINT_H
#if OMPI_BTL_VADER_HAVE_XPMEM
#include <xpmem.h>
#else
#include "opal/mca/shmem/base/base.h"
#endif
struct vader_fifo_t;
@ -34,17 +38,26 @@ struct vader_fifo_t;
* and BTL pair at startup.
*/
struct mca_btl_vader_fbox_t;
struct mca_btl_base_endpoint_t {
int peer_smp_rank; /**< My peer's SMP process rank. Used for accessing
* SMP specific data structures. */
char *segment_base;
struct vader_fifo_t *fifo;
xpmem_apid_t apid;
char *fbox_out;
char *fbox_in;
#if OMPI_BTL_VADER_HAVE_XPMEM
xpmem_apid_t apid;
#else
pid_t pid;
opal_shmem_ds_t seg_ds;
#endif
struct mca_btl_vader_fbox_t * restrict fbox_out;
struct mca_btl_vader_fbox_t * restrict fbox_in;
int next_fbox_out;
int next_fbox_in;
#if OMPI_BTL_VADER_HAVE_XPMEM
struct mca_rcache_base_module_t *rcache;
#endif
};
#endif /* MCA_BTL_VADER_ENDPOINT_H */

View file

@ -16,6 +16,8 @@
#include "btl_vader_endpoint.h"
#include "btl_vader_xpmem.h"
#include <string.h>
/* these hard-coded settings are based on the ideal setup for an Opteron 61xx chip and
* may need to be adjusted for other systems. adding an MCA variable is possible but
* can cost 20-40 ns on the fast path. this size is limited to 256 maximum bytes */
@ -23,66 +25,76 @@
/* there should be a power of two number of fast boxes to simplify the math in the
* critical path */
#define MCA_BTL_VADER_LAST_FBOX 63
/* two bytes are reserved for tag and size */
#define MCA_BTL_VADER_FBOX_MAX_SIZE (MCA_BTL_VADER_FBOX_SIZE - 2)
/* two bytes are reserved for tag and size (update if the header is modified) */
#define MCA_BTL_VADER_FBOX_HDR_SIZE 2
#define MCA_BTL_VADER_FBOX_MAX_SIZE (MCA_BTL_VADER_FBOX_SIZE - MCA_BTL_VADER_FBOX_HDR_SIZE)
/* total size of all the fast boxes assigned to a particular peer */
#define MCA_BTL_VADER_FBOX_PEER_SIZE (MCA_BTL_VADER_FBOX_SIZE * (MCA_BTL_VADER_LAST_FBOX + 1))
enum {MCA_BTL_VADER_FBOX_FREE = 0, MCA_BTL_VADER_FBOX_RESERVED = 0x80};
typedef struct mca_btl_vader_fbox_t {
union {
struct {
uint8_t size;
uint8_t tag;
} hdr_data;
uint16_t ival;
} hdr;
#define MCA_BTL_VADER_FBOX_OUT_PTR(ep, fbox) ((ep)->fbox_out + MCA_BTL_VADER_FBOX_SIZE * (fbox))
#define MCA_BTL_VADER_FBOX_IN_PTR(ep, fbox) ((ep)->fbox_in + MCA_BTL_VADER_FBOX_SIZE * (fbox))
uint8_t data[MCA_BTL_VADER_FBOX_MAX_SIZE];
} mca_btl_vader_fbox_t;
#define MCA_BTL_VADER_FBOX_OUT_PTR(ep, fbox) ((ep)->fbox_out + (fbox))
#define MCA_BTL_VADER_FBOX_IN_PTR(ep, fbox) ((ep)->fbox_in + (fbox))
#define MCA_BTL_VADER_NEXT_FBOX(fbox) (((fbox) + 1) & MCA_BTL_VADER_LAST_FBOX)
static inline unsigned char * restrict mca_btl_vader_reserve_fbox (struct mca_btl_base_endpoint_t *ep, const size_t size)
static inline mca_btl_vader_fbox_t * restrict mca_btl_vader_reserve_fbox (struct mca_btl_base_endpoint_t *ep, const size_t size)
{
const int next_fbox = ep->next_fbox_out;
unsigned char * restrict fbox = (unsigned char * restrict) MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
mca_btl_vader_fbox_t * restrict fbox = MCA_BTL_VADER_FBOX_OUT_PTR(ep, next_fbox);
/* todo -- need thread locks/atomics here for the multi-threaded case */
if (OPAL_LIKELY(size <= MCA_BTL_VADER_FBOX_MAX_SIZE && fbox[0] == MCA_BTL_VADER_FBOX_FREE)) {
if (OPAL_LIKELY(size <= MCA_BTL_VADER_FBOX_MAX_SIZE && 0 == fbox->hdr.ival)) {
/* mark this fast box as in use */
fbox[0] = MCA_BTL_VADER_FBOX_RESERVED;
fbox->hdr.hdr_data.size = size;
ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox);
return fbox + 2;
return fbox;
} else if (OPAL_LIKELY(size <= (MCA_BTL_VADER_FBOX_MAX_SIZE + MCA_BTL_VADER_FBOX_SIZE) && MCA_BTL_VADER_LAST_FBOX != next_fbox &&
MCA_BTL_VADER_FBOX_FREE == fbox[0] && MCA_BTL_VADER_FBOX_FREE == fbox[MCA_BTL_VADER_FBOX_SIZE])) {
0 == fbox->hdr.ival && 0 == fbox[1].hdr.ival)) {
/* aggregate two fast boxes */
fbox[0] = MCA_BTL_VADER_FBOX_RESERVED;
fbox->hdr.hdr_data.size = size;
ep->next_fbox_out = MCA_BTL_VADER_NEXT_FBOX(next_fbox + 1);
return fbox + 2;
return fbox;
}
return NULL;
}
static inline void mca_btl_vader_fbox_send (unsigned char * restrict fbox, unsigned char tag,
static inline void mca_btl_vader_fbox_send (mca_btl_vader_fbox_t * restrict fbox, unsigned char tag,
size_t size)
{
fbox[-1] = tag;
/* ensure data writes have completed before we mark the data as available */
opal_atomic_wmb ();
fbox[-2] = size;
fbox->hdr.hdr_data.tag = tag;
}
static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endpoint, char tag,
void * restrict header, const size_t header_size,
void * restrict payload, const size_t payload_size)
{
unsigned char * restrict fbox;
mca_btl_vader_fbox_t * restrict fbox;
fbox = mca_btl_vader_reserve_fbox(endpoint, header_size + payload_size);
if (OPAL_UNLIKELY(NULL == fbox)) {
return 0;
}
memmove (fbox, header, header_size);
if (OPAL_UNLIKELY(payload)) {
memcpy (fbox->data, header, header_size);
if (payload) {
/* inline sends are typically just pml headers (due to MCA_BTL_FLAGS_SEND_INPLACE) */
memmove (fbox + header_size, payload, payload_size);
memcpy (fbox->data + header_size, payload, payload_size);
}
/* mark the fbox as sent */
@ -92,55 +104,54 @@ static inline int mca_btl_vader_fbox_sendi (struct mca_btl_base_endpoint_t *endp
return 1;
}
static inline void mca_btl_vader_check_fboxes (void)
static inline bool mca_btl_vader_check_fboxes (void)
{
mca_btl_vader_frag_t frag = {.base = {.des_dst = frag.segments, .des_dst_cnt = 1}};
const int num_smp_procs = MCA_BTL_VADER_NUM_LOCAL_PEERS + 1;
const mca_btl_active_message_callback_t *reg;
struct mca_btl_base_endpoint_t *endpoint;
unsigned char * restrict fbox;
int i, next_fbox;
mca_btl_vader_fbox_t * restrict fbox;
mca_btl_base_segment_t segment;
mca_btl_base_descriptor_t desc;
bool processed = false;
int next_fbox;
for (i = 0, endpoint = mca_btl_vader_component.endpoints ; i < num_smp_procs ; ++i, ++endpoint) {
if (NULL == endpoint->fbox_in) {
for (endpoint = mca_btl_vader_component.endpoints ; endpoint->peer_smp_rank != -1 ; ++endpoint) {
next_fbox = endpoint->next_fbox_in;
fbox = MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
if (NULL == endpoint->fbox_in || 0 == fbox->hdr.hdr_data.tag) {
continue;
}
next_fbox = endpoint->next_fbox_in;
fbox = (unsigned char *) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
desc.des_dst = &segment;
desc.des_dst_cnt = 1;
processed = true;
/* process all fast-box messages */
while ((frag.segments[0].seg_len = fbox[0]) & 0x7f) {
const unsigned char tag = fbox[1];
while (0 != fbox->hdr.hdr_data.tag) {
opal_atomic_rmb ();
reg = mca_btl_base_active_message_trigger + tag;
reg = mca_btl_base_active_message_trigger + fbox->hdr.hdr_data.tag;
frag.segments[0].seg_addr.pval = fbox + 2;
segment.seg_addr.pval = fbox->data;
segment.seg_len = fbox->hdr.hdr_data.size;
reg->cbfunc(&mca_btl_vader.super, tag, &(frag.base), reg->cbdata);
reg->cbfunc(&mca_btl_vader.super, fbox->hdr.hdr_data.tag, &desc, reg->cbdata);
if (fbox[0] > MCA_BTL_VADER_FBOX_MAX_SIZE) {
fbox[MCA_BTL_VADER_FBOX_SIZE] = MCA_BTL_VADER_FBOX_FREE;
if (fbox->hdr.hdr_data.size > MCA_BTL_VADER_FBOX_MAX_SIZE) {
fbox[1].hdr.ival = 0;
++next_fbox;
}
fbox[0] = MCA_BTL_VADER_FBOX_FREE;
fbox->hdr.ival = 0;
next_fbox = MCA_BTL_VADER_NEXT_FBOX(next_fbox);
fbox = (unsigned char *) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
fbox = (mca_btl_vader_fbox_t * restrict) MCA_BTL_VADER_FBOX_IN_PTR(endpoint, next_fbox);
}
endpoint->next_fbox_in = next_fbox;
}
return processed;
}
#endif /* !defined(MCA_BTL_VADER_FBOX_H) */

View file

@ -50,50 +50,10 @@
typedef struct vader_fifo_t {
volatile int64_t fifo_head;
volatile int64_t fifo_tail;
/* pad out to fill a cache line (64 or 128 bytes) */
char pad[128 - 2 * sizeof (int64_t)];
} vader_fifo_t;
static inline int vader_fifo_init (vader_fifo_t *fifo)
{
fifo->fifo_head = fifo->fifo_tail = VADER_FIFO_FREE;
return OMPI_SUCCESS;
}
static inline void _vader_fifo_write (vader_fifo_t *fifo, int64_t value)
{
int64_t prev;
opal_atomic_wmb ();
prev = opal_atomic_swap_64 (&fifo->fifo_tail, value);
opal_atomic_rmb ();
assert (prev != value);
if (OPAL_LIKELY(VADER_FIFO_FREE != prev)) {
mca_btl_vader_hdr_t *hdr = (mca_btl_vader_hdr_t *) relative2virtual (prev);
hdr->next = value;
} else {
fifo->fifo_head = value;
}
opal_atomic_wmb ();
}
/* write a frag (relative to this process' base) to another rank's fifo */
static inline void vader_fifo_write (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
{
hdr->next = VADER_FIFO_FREE;
_vader_fifo_write (ep->fifo, virtual2relative ((char *) hdr));
}
/* write a frag (relative to the remote process' base) to the remote fifo. note the remote peer must own hdr */
static inline void vader_fifo_write_back (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
{
hdr->next = VADER_FIFO_FREE;
_vader_fifo_write(ep->fifo, virtual2relativepeer (ep, (char *) hdr));
}
/* large enough to ensure the fifo is on its own cache line */
#define MCA_BTL_VADER_FIFO_SIZE 128
static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
{
@ -132,4 +92,46 @@ static inline mca_btl_vader_hdr_t *vader_fifo_read (vader_fifo_t *fifo)
return hdr;
}
static inline int vader_fifo_init (vader_fifo_t *fifo)
{
fifo->fifo_head = fifo->fifo_tail = VADER_FIFO_FREE;
mca_btl_vader_component.my_fifo = fifo;
return OMPI_SUCCESS;
}
static inline void vader_fifo_write (vader_fifo_t *fifo, int64_t value)
{
int64_t prev;
opal_atomic_wmb ();
prev = opal_atomic_swap_64 (&fifo->fifo_tail, value);
opal_atomic_rmb ();
assert (prev != value);
if (OPAL_LIKELY(VADER_FIFO_FREE != prev)) {
mca_btl_vader_hdr_t *hdr = (mca_btl_vader_hdr_t *) relative2virtual (prev);
hdr->next = value;
} else {
fifo->fifo_head = value;
}
opal_atomic_wmb ();
}
/* write a frag (relative to this process' base) to another rank's fifo */
static inline void vader_fifo_write_ep (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
{
hdr->next = VADER_FIFO_FREE;
vader_fifo_write (ep->fifo, virtual2relative ((char *) hdr));
}
/* write a frag (relative to the remote process' base) to the remote fifo. note the remote peer must own hdr */
static inline void vader_fifo_write_back (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *ep)
{
hdr->next = VADER_FIFO_FREE;
vader_fifo_write(ep->fifo, virtual2relativepeer (ep, (char *) hdr));
}
#endif /* MCA_BTL_VADER_FIFO_H */

View file

@ -31,6 +31,7 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
if(frag->hdr != NULL) {
frag->hdr->src_smp_rank = MCA_BTL_VADER_LOCAL_RANK;
frag->hdr->frag = frag;
frag->hdr->flags = 0;
frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);
}
@ -38,34 +39,41 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag)
frag->base.des_src_cnt = 1;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
frag->fbox = NULL;
}
void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx)
{
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) item;
unsigned int frag_size = (unsigned int)(uintptr_t) ctx;
unsigned int data_size = frag_size - sizeof (mca_btl_vader_hdr_t);
assert (data_size > 0);
if (mca_btl_vader_component.segment_size < mca_btl_vader_component.segment_offset + frag_size) {
item->ptr = NULL;
return;
}
/* Set the list element here so we don't have to set it on the critical path. This only
* works if each free list has its own unique fragment size and ALL free lists are initialized
* with ompi_free_list_init_ex_new. */
if (mca_btl_vader_component.max_inline_send == data_size) {
frag->my_list = &mca_btl_vader_component.vader_frags_user;
} else if (mca_btl_vader.super.btl_eager_limit == data_size) {
frag->my_list = &mca_btl_vader_component.vader_frags_eager;
}
#if !OMPI_BTL_VADER_HAVE_XPMEM
else if (mca_btl_vader.super.btl_max_send_size == data_size) {
frag->my_list = &mca_btl_vader_component.vader_frags_max_send;
}
#endif
item->ptr = mca_btl_vader_component.my_segment + mca_btl_vader_component.segment_offset;
mca_btl_vader_component.segment_offset += frag_size;
mca_btl_vader_frag_constructor ((mca_btl_vader_frag_t *) item);
}
void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag)
{
frag->base.des_src = frag->segments;
frag->base.des_src_cnt = 1;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
frag->hdr->flags = 0;
frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);
OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *)frag);
}
OBJ_CLASS_INSTANCE(mca_btl_vader_frag_t, mca_btl_base_descriptor_t,
mca_btl_vader_frag_constructor, NULL);

View file

@ -26,20 +26,22 @@
#include "ompi_config.h"
#define MCA_BTL_VADER_FLAG_INLINE 0
#define MCA_BTL_VADER_FLAG_SINGLE_COPY 1
#define MCA_BTL_VADER_FLAG_FBOX 2
#define MCA_BTL_VADER_FLAG_COMPLETE 4
enum {
MCA_BTL_VADER_FLAG_INLINE = 0,
MCA_BTL_VADER_FLAG_SINGLE_COPY = 1,
MCA_BTL_VADER_FLAG_COMPLETE = 2,
};
struct mca_btl_vader_frag_t;
struct mca_btl_vader_fbox_t;
struct mca_btl_vader_hdr_t {
volatile intptr_t next; /* next item in fifo. many peers may touch this */
struct mca_btl_vader_frag_t *frag;
mca_btl_base_tag_t tag; /* tag associated with this fragment (used to lookup callback) */
uint8_t flags; /* vader send flags */
int src_smp_rank; /* smp rank of owning process */
size_t len; /* length of data following this header */
uint16_t src_smp_rank; /* smp rank of owning process */
int32_t len; /* length of data following this header */
struct iovec sc_iov; /* io vector containing pointer to single-copy data */
};
typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t;
@ -51,15 +53,15 @@ struct mca_btl_vader_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segments[2];
struct mca_btl_base_endpoint_t *endpoint;
struct mca_btl_vader_fbox_t *fbox;
mca_btl_vader_hdr_t *hdr; /* in the shared memory region */
ompi_free_list_t *my_list;
};
typedef struct mca_btl_vader_frag_t mca_btl_vader_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t);
static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_free_list_t *list) {
static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_free_list_t *list,
struct mca_btl_base_endpoint_t *endpoint) {
ompi_free_list_item_t *item;
OMPI_FREE_LIST_GET_MT(list, item);
@ -71,19 +73,38 @@ static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_fr
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
}
(*frag)->hdr->flags = MCA_BTL_VADER_FLAG_INLINE;
(*frag)->my_list = list;
(*frag)->endpoint = endpoint;
}
return OMPI_SUCCESS;
}
void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag);
static inline void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag)
{
frag->hdr->flags = 0;
frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1);
frag->base.des_src = frag->segments;
frag->base.des_src_cnt = 1;
frag->base.des_dst = frag->segments;
frag->base.des_dst_cnt = 1;
frag->fbox = NULL;
OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *)frag);
}
OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t);
#define MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint) \
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_eager, endpoint)
#if !OMPI_BTL_VADER_HAVE_XPMEM
#define MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint) \
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_max_send, endpoint)
#endif
#define MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint) \
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_user, endpoint)
#define MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag) \
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_eager)
#define MCA_BTL_VADER_FRAG_ALLOC_USER(frag) \
mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_user)
#define MCA_BTL_VADER_FRAG_RETURN(frag) mca_btl_vader_frag_return(frag)

View file

@ -16,6 +16,15 @@
#include "btl_vader_endpoint.h"
#include "btl_vader_xpmem.h"
#if OMPI_BTL_VADER_HAVE_CMA
#include <sys/uio.h>
#if OMPI_CMA_NEED_SYSCALL_DEFS
#include "opal/sys/cma.h"
#endif /* OMPI_CMA_NEED_SYSCALL_DEFS */
#endif
/**
* Initiate a synchronous get.
*
@ -23,6 +32,7 @@
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
#if OMPI_BTL_VADER_HAVE_XPMEM
int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
@ -47,3 +57,27 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
return OMPI_SUCCESS;
}
#elif OMPI_BTL_VADER_HAVE_CMA
int mca_btl_vader_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
{
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des;
mca_btl_base_segment_t *src = des->des_src;
mca_btl_base_segment_t *dst = des->des_dst;
const size_t size = min(dst->seg_len, src->seg_len);
struct iovec src_iov = {.iov_base = src->seg_addr.pval, .iov_len = size};
struct iovec dst_iov = {.iov_base = dst->seg_addr.pval, .iov_len = size};
ssize_t ret;
ret = process_vm_readv (endpoint->seg_ds.seg_cpid, &dst_iov, 1, &src_iov, 1, 0);
if (ret != (ssize_t) size) {
fprintf (stderr, "Read %zd, expected %zu, errno = %d\n", ret, size, errno);
return OMPI_ERROR;
}
mca_btl_vader_frag_complete (frag);
return OMPI_SUCCESS;
}
#endif

View file

@ -28,6 +28,8 @@
#include "btl_vader_fifo.h"
#include "btl_vader_fbox.h"
#include <string.h>
static int vader_del_procs (struct mca_btl_base_module_t *btl,
size_t nprocs, struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t **peers);
@ -70,20 +72,8 @@ static int vader_ft_event (int state);
mca_btl_vader_t mca_btl_vader = {
{
&mca_btl_vader_component.super,
.btl_eager_limit = 0,
.btl_rndv_eager_limit = 0,
.btl_max_send_size = 0,
.btl_rdma_pipeline_send_length = 0,
.btl_rdma_pipeline_frag_size = 0,
.btl_min_rdma_pipeline_size = 0,
.btl_exclusivity = 0,
.btl_latency = 0,
.btl_bandwidth = 0,
.btl_flags = 0,
.btl_seg_size = 0,
.btl_add_procs = vader_add_procs,
.btl_del_procs = vader_del_procs,
.btl_register = NULL,
.btl_finalize = vader_finalize,
.btl_alloc = mca_btl_vader_alloc,
.btl_free = vader_free,
@ -91,10 +81,14 @@ mca_btl_vader_t mca_btl_vader = {
.btl_prepare_dst = vader_prepare_dst,
.btl_send = mca_btl_vader_send,
.btl_sendi = mca_btl_vader_sendi,
/* only support RDMA if we have CMA or XPMEM */
#if OMPI_BTL_VADER_HAVE_XPMEM || OMPI_BTL_VADER_HAVE_CMA
.btl_put = mca_btl_vader_put,
.btl_get = mca_btl_vader_get,
#endif
.btl_dump = mca_btl_base_dump,
.btl_mpool = NULL,
.btl_register_error = vader_register_error_cb,
.btl_ft_event = vader_ft_event
}
@ -106,9 +100,10 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
int rc;
/* generate the endpoints */
component->endpoints = (struct mca_btl_base_endpoint_t *) calloc (n, sizeof (struct mca_btl_base_endpoint_t));
component->endpoints = (struct mca_btl_base_endpoint_t *) calloc (n + 1, sizeof (struct mca_btl_base_endpoint_t));
component->endpoints[n].peer_smp_rank = -1;
component->segment_offset = (n - 1) * MCA_BTL_VADER_FBOX_PEER_SIZE + 4096;
component->segment_offset = (n - 1) * MCA_BTL_VADER_FBOX_PEER_SIZE + MCA_BTL_VADER_FIFO_SIZE;
/* initialize fragment descriptor free lists */
/* initialize free list for put/get/single copy/inline fragments */
@ -141,6 +136,23 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
return rc;
}
#if !OMPI_BTL_VADER_HAVE_XPMEM
/* initialize free list for buffered send fragments */
rc = ompi_free_list_init_ex_new(&component->vader_frags_max_send,
sizeof (mca_btl_vader_frag_t),
opal_cache_line_size, OBJ_CLASS(mca_btl_vader_frag_t),
0, opal_cache_line_size,
component->vader_free_list_num,
component->vader_free_list_max,
component->vader_free_list_inc,
NULL, mca_btl_vader_frag_init,
(void *) (sizeof (mca_btl_vader_hdr_t) +
mca_btl_vader.super.btl_max_send_size));
if (OMPI_SUCCESS != rc) {
return rc;
}
#endif
/* set flag indicating btl has been inited */
vader_btl->btl_inited = true;
@ -164,24 +176,53 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct ompi_
return rc;
}
/* attach to the remote segment */
#if OMPI_BTL_VADER_HAVE_XPMEM
/* always use xpmem if it is available */
ep->apid = xpmem_get (modex->seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
ep->rcache = mca_rcache_base_module_create("vma");
(void) vader_get_registation (ep, modex->segment_base, mca_btl_vader_component.segment_size,
MCA_MPOOL_FLAGS_PERSIST, (void **) &ep->segment_base);
#else
opal_shmem_ds_copy (&modex->seg_ds, &ep->seg_ds);
ep->segment_base = opal_shmem_segment_attach (&ep->seg_ds);
if (NULL == ep->segment_base) {
return rc;
}
#endif
ep->next_fbox_out = 0;
ep->next_fbox_in = 0;
/* attach to the remote segment */
(void) vader_get_registation (ep, modex->segment_base, mca_btl_vader_component.segment_size,
MCA_MPOOL_FLAGS_PERSIST, (void **) &ep->segment_base);
ep->fifo = (struct vader_fifo_t *) ep->segment_base;
ep->fbox_in = ep->segment_base + 4096 + fbox_in_offset * MCA_BTL_VADER_FBOX_PEER_SIZE;
ep->fbox_out = component->my_segment + 4096 + fbox_out_offset * MCA_BTL_VADER_FBOX_PEER_SIZE;
ep->fbox_in = (struct mca_btl_vader_fbox_t * restrict) (ep->segment_base + MCA_BTL_VADER_FIFO_SIZE +
fbox_in_offset * MCA_BTL_VADER_FBOX_PEER_SIZE);
ep->fbox_out = (struct mca_btl_vader_fbox_t * restrict) (component->my_segment + MCA_BTL_VADER_FIFO_SIZE +
fbox_out_offset * MCA_BTL_VADER_FBOX_PEER_SIZE);
} else {
/* set up the segment base so we can do virtual-to-real translation for local pointers */
ep->segment_base = component->my_segment;
ep->fifo = (struct vader_fifo_t *) ep->segment_base;
}
ep->fifo = (struct vader_fifo_t *) ep->segment_base;
return OMPI_SUCCESS;
}
static int fini_vader_endpoint (struct mca_btl_base_endpoint_t *ep)
{
if (ep->peer_smp_rank != MCA_BTL_VADER_LOCAL_RANK) {
#if OMPI_BTL_VADER_HAVE_XPMEM
xpmem_release (ep->apid);
OBJ_RELEASE(ep->rcache);
#else
opal_shmem_segment_detach (&ep->seg_ds);
#endif
}
ep->fbox_in = ep->fbox_out = NULL;
ep->segment_base = NULL;
return OMPI_SUCCESS;
}
@ -206,7 +247,6 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
{
mca_btl_vader_component_t *component = &mca_btl_vader_component;
mca_btl_vader_t *vader_btl = (mca_btl_vader_t *) btl;
int32_t proc, local_rank;
ompi_proc_t *my_proc;
int rc;
@ -234,7 +274,7 @@ static int vader_add_procs (struct mca_btl_base_module_t* btl,
}
}
for (proc = 0, local_rank = 0 ; proc < (int32_t) nprocs ; ++proc) {
for (int32_t proc = 0, local_rank = 0 ; proc < (int32_t) nprocs ; ++proc) {
/* check to see if this proc can be reached via shmem (i.e.,
if they're on my local host and in my job) */
if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
@ -273,6 +313,11 @@ static int vader_del_procs(struct mca_btl_base_module_t *btl,
size_t nprocs, struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t **peers)
{
for (int i = 0 ; i < nprocs ; ++i) {
fini_vader_endpoint (peers[i]);
peers[i] = NULL;
}
return OMPI_SUCCESS;
}
@ -292,6 +337,24 @@ static int vader_del_procs(struct mca_btl_base_module_t *btl,
static int vader_finalize(struct mca_btl_base_module_t *btl)
{
mca_btl_vader_component_t *component = &mca_btl_vader_component;
mca_btl_vader_t *vader_btl = (mca_btl_vader_t *) btl;
if (!vader_btl->btl_inited) {
return OMPI_SUCCESS;
}
for (int i = 0 ; i < 1 + MCA_BTL_VADER_NUM_LOCAL_PEERS ; ++i) {
fini_vader_endpoint (component->endpoints + i);
}
free (component->endpoints);
vader_btl->btl_inited = false;
#if !OMPI_BTL_VADER_HAVE_XPMEM
opal_shmem_segment_detach (&mca_btl_vader_component.seg_ds);
#endif
return OMPI_SUCCESS;
}
@ -323,14 +386,18 @@ mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl
mca_btl_vader_frag_t *frag = NULL;
if (size <= (size_t) mca_btl_vader_component.max_inline_send) {
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);
} else if (size <= mca_btl_vader.super.btl_eager_limit) {
(void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
(void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
}
#if !OMPI_BTL_VADER_HAVE_XPMEM
else if (size <= mca_btl_vader.super.btl_max_send_size) {
(void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
}
#endif
if (OPAL_LIKELY(frag != NULL)) {
frag->segments[0].seg_len = size;
frag->endpoint = endpoint;
frag->base.des_flags = flags;
frag->base.order = order;
@ -362,7 +429,7 @@ struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t
mca_btl_vader_frag_t *frag;
void *data_ptr;
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
@ -375,8 +442,6 @@ struct mca_btl_base_descriptor_t *vader_prepare_dst(struct mca_btl_base_module_t
frag->base.order = order;
frag->base.des_flags = flags;
frag->endpoint = endpoint;
return &frag->base;
}
@ -393,10 +458,9 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
uint32_t flags)
{
const size_t total_size = reserve + *size;
struct iovec iov;
mca_btl_vader_fbox_t *fbox;
mca_btl_vader_frag_t *frag;
uint32_t iov_count = 1;
void *data_ptr, *fbox_ptr;
void *data_ptr;
int rc;
opal_convertor_get_current_pointer (convertor, &data_ptr);
@ -404,8 +468,17 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
if (OPAL_LIKELY(reserve)) {
/* in place send fragment */
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
uint32_t iov_count = 1;
struct iovec iov;
/* non-contiguous data requires the convertor */
(void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag);
#if !OMPI_BTL_VADER_HAVE_XPMEM
if (total_size > mca_btl_vader.super.btl_eager_limit) {
(void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
} else
#endif
(void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
@ -423,12 +496,22 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
frag->segments[0].seg_len = total_size;
} else {
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
#if !OMPI_BTL_VADER_HAVE_XPMEM
if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) {
(void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint);
} else {
(void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint);
}
#else
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);
#endif
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
if (total_size > (size_t) mca_btl_vader_component.max_inline_send) {
#if OMPI_BTL_VADER_HAVE_XPMEM
/* use xpmem to send this segment if it is above the max inline send size */
if (OPAL_UNLIKELY(total_size > (size_t) mca_btl_vader_component.max_inline_send)) {
/* single copy send */
frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY;
@ -441,24 +524,30 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
frag->segments[1].seg_addr.pval = data_ptr;
frag->base.des_src_cnt = 2;
} else {
#endif
/* inline send */
/* try to reserve a fast box for this transfer */
fbox_ptr = mca_btl_vader_reserve_fbox (endpoint, total_size);
if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) {
/* try to reserve a fast box for this transfer only if the
* fragment does not belong to the caller */
fbox = mca_btl_vader_reserve_fbox (endpoint, total_size);
if (OPAL_LIKELY(fbox)) {
frag->segments[0].seg_addr.pval = fbox->data;
}
if (fbox_ptr) {
frag->hdr->flags |= MCA_BTL_VADER_FLAG_FBOX;
frag->segments[0].seg_addr.pval = fbox_ptr;
frag->fbox = fbox;
}
/* NTH: the convertor adds some latency so we bypass it here */
vader_memmove ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve),
data_ptr, *size);
frag->segments[0].seg_len = total_size;
#if OMPI_BTL_VADER_HAVE_XPMEM
}
#endif
}
} else {
/* put/get fragment */
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag);
(void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
@ -470,8 +559,6 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_
frag->base.order = order;
frag->base.des_flags = flags;
frag->endpoint = endpoint;
return &frag->base;
}

View file

@ -16,6 +16,15 @@
#include "btl_vader_endpoint.h"
#include "btl_vader_xpmem.h"
#if OMPI_BTL_VADER_HAVE_CMA
#include <sys/uio.h>
#if OMPI_CMA_NEED_SYSCALL_DEFS
#include "opal/sys/cma.h"
#endif /* OMPI_CMA_NEED_SYSCALL_DEFS */
#endif
/**
* Initiate a synchronous put.
*
@ -23,6 +32,7 @@
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
*/
#if OMPI_BTL_VADER_HAVE_XPMEM
int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
@ -50,3 +60,27 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
return OMPI_SUCCESS;
}
#elif OMPI_BTL_VADER_HAVE_CMA
int mca_btl_vader_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des)
{
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des;
mca_btl_base_segment_t *src = des->des_src;
mca_btl_base_segment_t *dst = des->des_dst;
const size_t size = min(dst->seg_len, src->seg_len);
struct iovec src_iov = {.iov_base = src->seg_addr.pval, .iov_len = size};
struct iovec dst_iov = {.iov_base = dst->seg_addr.pval, .iov_len = size};
ssize_t ret;
ret = process_vm_writev (endpoint->seg_ds.seg_cpid, &src_iov, 1, &dst_iov, 1, 0);
if (ret != (ssize_t) size) {
fprintf (stderr, "Wrote %zd, expected %zu\n", ret, size);
return OMPI_ERROR;
}
mca_btl_vader_frag_complete (frag);
return OMPI_SUCCESS;
}
#endif

View file

@ -41,8 +41,8 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
{
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor;
if (OPAL_LIKELY(frag->hdr->flags & MCA_BTL_VADER_FLAG_FBOX)) {
mca_btl_vader_fbox_send (frag->segments[0].seg_addr.pval, tag, frag->segments[0].seg_len);
if (OPAL_LIKELY(frag->fbox)) {
mca_btl_vader_fbox_send (frag->fbox, tag, frag->segments[0].seg_len);
mca_btl_vader_frag_complete (frag);
return 1;
@ -54,7 +54,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl,
frag->hdr->tag = tag;
/* post the relative address of the descriptor into the peer's fifo */
vader_fifo_write (frag->hdr, endpoint);
vader_fifo_write_ep (frag->hdr, endpoint);
if ((frag->hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) ||
!(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {

View file

@ -43,32 +43,28 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor)
{
size_t length = (header_size + payload_size);
mca_btl_vader_frag_t *frag;
uint32_t iov_count = 1;
struct iovec iov;
size_t max_data;
void *data_ptr = NULL;
assert (length < mca_btl_vader.super.btl_eager_limit);
assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK));
size_t length;
if (payload_size) {
opal_convertor_get_current_pointer (convertor, &data_ptr);
}
if (!opal_convertor_need_buffers (convertor) &&
if (!(payload_size && opal_convertor_need_buffers (convertor)) &&
mca_btl_vader_fbox_sendi (endpoint, tag, header, header_size, data_ptr, payload_size)) {
return OMPI_SUCCESS;
}
/* we won't ever return a descriptor */
*descriptor = NULL;
length = header_size + payload_size;
/* allocate a fragment, giving up if we can't get one */
frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, length,
flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (OPAL_UNLIKELY(NULL == frag)) {
*descriptor = NULL;
return OMPI_ERR_OUT_OF_RESOURCE;
}
@ -82,21 +78,21 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl,
/* write the message data if there is any */
/* we can't use single-copy semantics here since the caller will consider the send
complete when we return */
if (OPAL_UNLIKELY(payload_size && opal_convertor_need_buffers (convertor))) {
if (payload_size) {
uint32_t iov_count = 1;
struct iovec iov;
/* pack the data into the supplied buffer */
iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segments[0].seg_addr.pval + header_size);
iov.iov_len = max_data = payload_size;
iov.iov_len = length = payload_size;
(void) opal_convertor_pack (convertor, &iov, &iov_count, &max_data);
(void) opal_convertor_pack (convertor, &iov, &iov_count, &length);
assert (max_data == payload_size);
} else if (payload_size) {
/* bypassing the convertor may speed things up a little */
memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + header_size), data_ptr, payload_size);
assert (length == payload_size);
}
/* write the fragment pointer to the peer's FIFO. the progress function will return the fragment */
vader_fifo_write (frag->hdr, endpoint);
vader_fifo_write_ep (frag->hdr, endpoint);
return OMPI_SUCCESS;
}

View file

@ -14,6 +14,8 @@
#include "btl_vader_xpmem.h"
#include "opal/mca/memchecker/base/base.h"
#if OMPI_BTL_VADER_HAVE_XPMEM
/* largest address we can attach to using xpmem */
#define VADER_MAX_ADDRESS ((uintptr_t)0x7ffffffff000)
@ -115,3 +117,5 @@ void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_b
OBJ_RELEASE (reg);
}
}
#endif /* OMPI_BTL_VADER_HAVE_XPMEM */

View file

@ -14,11 +14,34 @@
#include "btl_vader.h"
#if OMPI_BTL_VADER_HAVE_XPMEM
/* look up the remote pointer in the peer rcache and attach if
* necessary */
mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr,
size_t size, int flags, void **local_ptr);
void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint);
#else
static inline mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr,
size_t size, int flags, void **local_ptr)
{
(void) endpoint;
(void) rem_ptr;
(void) size;
(void) flags;
(void) local_ptr;
return NULL;
}
static inline void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint)
{
(void) reg;
(void) endpoint;
}
#endif /* OMPI_BTL_VADER_HAVE_XPMEM */
#endif

View file

@ -19,8 +19,7 @@
# LDFLAGS, LIBS} as needed and runs action-if-found if there is
# support, otherwise executes action-if-not-found
AC_DEFUN([OMPI_CHECK_XPMEM], [
# OPAL_VAR_SCOPE_PUSH([ompi_check_xpmem_happy ompi_check_xpmem_$1_save_CPPFLAGS ompi_check_xpmem_dir])
OPAL_VAR_SCOPE_PUSH([ompi_check_xpmem_happy])
AC_ARG_WITH([xpmem],
[AC_HELP_STRING([--with-xpmem(=DIR)],
[Build with XPMEM kernel module support, searching for headers in DIR])])
@ -46,7 +45,7 @@ AC_DEFUN([OMPI_CHECK_XPMEM], [
[AS_IF([test ! -z "$with_xpmem" -a "$with_xpmem" != "no"],
[AC_MSG_ERROR([XPMEM support requested but not found. Aborting])])
$3])
# OPAL_VAR_SCOPE_POP
OPAL_VAR_SCOPE_POP
])dnl
# MCA_btl_sm_CONFIG([action-if-can-compile],
@ -55,25 +54,33 @@ AC_DEFUN([OMPI_CHECK_XPMEM], [
AC_DEFUN([MCA_ompi_btl_vader_CONFIG],[
AC_CONFIG_FILES([ompi/mca/btl/vader/Makefile])
OPAL_VAR_SCOPE_PUSH([btl_vader_xpmem_happy])
OMPI_CHECK_XPMEM([btl_vader],
[btl_vader_xpmem_happy=1],
[btl_vader_xpmem_happy=0])
OPAL_VAR_SCOPE_PUSH([btl_vader_xpmem_happy btl_vader_cma_happy])
AC_DEFINE_UNQUOTED([OMPI_BTL_VADER_HAVE_XPMEM],
[$btl_vader_xpmem_happy],
[If XPMEM support can be enabled within vader])
btl_vader_cma_happy=0
btl_vader_xpmem_happy=0
# at this point, we can only build vader if we have XPMEM support
AS_IF([test "$btl_vader_xpmem_happy" = "1"],
[$1],
[$2])
# default to using XPMEM if it is available
OMPI_CHECK_XPMEM([btl_vader], [btl_vader_xpmem_happy=1], [])
AC_DEFINE_UNQUOTED([OMPI_BTL_VADER_HAVE_XPMEM], [$btl_vader_xpmem_happy],
[If XPMEM support can be enabled within vader])
if test $btl_vader_xpmem_happy = 0 ; then
# check for CMA if requested. it won't be used if xpmem was available
OMPI_CHECK_CMA([btl_vader], [btl_vader_cma_happy=1], [])
fi
AC_DEFINE_UNQUOTED([OMPI_BTL_VADER_HAVE_CMA], [$btl_vader_cma_happy],
[If CMA support can be enabled within vader])
OPAL_VAR_SCOPE_POP
# always happy
[$1]
# substitute in the things needed to build with XPMEM support
AC_SUBST([btl_vader_CFLAGS])
AC_SUBST([btl_vader_CPPFLAGS])
AC_SUBST([btl_vader_LDFLAGS])
AC_SUBST([btl_vader_LIBS])
# OPAL_VAR_SCOPE_POP
])dnl