68c4ecbe06
This commit was SVN r32328.
424 строки
17 KiB
C
424 строки
17 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2011 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006-2007 Voltaire. All rights reserved.
|
|
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2010-2014 Los Alamos National Security, LLC.
|
|
* All rights reserved.
|
|
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
#include "opal_config.h"
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/btl/base/btl_base_error.h"
|
|
|
|
#include "btl_vader.h"
|
|
#include "btl_vader_frag.h"
|
|
#include "btl_vader_fifo.h"
|
|
#include "btl_vader_fbox.h"
|
|
|
|
#include <sys/mman.h>
|
|
|
|
static int mca_btl_vader_component_progress (void);
|
|
static int mca_btl_vader_component_open(void);
|
|
static int mca_btl_vader_component_close(void);
|
|
static int mca_btl_vader_component_register(void);
|
|
static mca_btl_base_module_t** mca_btl_vader_component_init(int *num_btls,
|
|
bool enable_progress_threads,
|
|
bool enable_mpi_threads);
|
|
|
|
/*
|
|
* Shared Memory (VADER) component instance.
|
|
*/
|
|
mca_btl_vader_component_t mca_btl_vader_component = {
|
|
.super = {
|
|
/* First, the mca_base_component_t struct containing meta information
|
|
about the component itself */
|
|
.btl_version = {
|
|
MCA_BTL_DEFAULT_VERSION("vader"),
|
|
.mca_open_component = mca_btl_vader_component_open,
|
|
.mca_close_component = mca_btl_vader_component_close,
|
|
.mca_register_component_params = mca_btl_vader_component_register,
|
|
},
|
|
.btl_data = {
|
|
/* The component is checkpoint ready */
|
|
.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
},
|
|
|
|
.btl_init = mca_btl_vader_component_init,
|
|
.btl_progress = mca_btl_vader_component_progress,
|
|
} /* end super */
|
|
};
|
|
|
|
static int mca_btl_vader_component_register (void)
|
|
{
|
|
(void) mca_base_var_group_component_register(&mca_btl_vader_component.super.btl_version,
|
|
"XPMEM shared memory byte transport later");
|
|
|
|
/* register VADER component variables */
|
|
mca_btl_vader_component.vader_free_list_num = 8;
|
|
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
|
|
"free_list_num", "Initial number of fragments "
|
|
"to allocate for shared memory communication.",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_vader_component.vader_free_list_num);
|
|
mca_btl_vader_component.vader_free_list_max = 16384;
|
|
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
|
|
"free_list_max", "Maximum number of fragments "
|
|
"to allocate for shared memory communication.",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_vader_component.vader_free_list_max);
|
|
mca_btl_vader_component.vader_free_list_inc = 64;
|
|
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
|
|
"free_list_inc", "Number of fragments to create "
|
|
"on each allocation.", MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_vader_component.vader_free_list_inc);
|
|
|
|
mca_btl_vader_component.memcpy_limit = 524288;
|
|
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
|
|
"memcpy_limit", "Message size to switch from using "
|
|
"memove to memcpy. The relative speed of these two "
|
|
"routines can vary by size.", MCA_BASE_VAR_TYPE_INT,
|
|
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_vader_component.memcpy_limit);
|
|
mca_btl_vader_component.log_attach_align = 21;
|
|
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
|
|
"log_align", "Log base 2 of the alignment to use for xpmem "
|
|
"segments (default: 21, minimum: 12, maximum: 25)",
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_vader_component.log_attach_align);
|
|
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM && 64 == MCA_BTL_VADER_BITNESS
|
|
mca_btl_vader_component.segment_size = 1 << 24;
|
|
#else
|
|
mca_btl_vader_component.segment_size = 1 << 22;
|
|
#endif
|
|
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
|
|
"segment_size", "Maximum size of all shared "
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM && 64 == MCA_BTL_VADER_BITNESS
|
|
"memory buffers (default: 16M)",
|
|
#else
|
|
"memory buffers (default: 4M)",
|
|
#endif
|
|
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_vader_component.segment_size);
|
|
|
|
mca_btl_vader_component.max_inline_send = 256;
|
|
(void) mca_base_component_var_register(&mca_btl_vader_component.super.btl_version,
|
|
"max_inline_send", "Maximum size to transfer "
|
|
"using copy-in copy-out semantics",
|
|
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
|
|
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
|
|
MCA_BASE_VAR_SCOPE_LOCAL,
|
|
&mca_btl_vader_component.max_inline_send);
|
|
|
|
mca_btl_vader.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM
|
|
mca_btl_vader.super.btl_eager_limit = 32 * 1024;
|
|
mca_btl_vader.super.btl_rndv_eager_limit = mca_btl_vader.super.btl_eager_limit;
|
|
mca_btl_vader.super.btl_max_send_size = mca_btl_vader.super.btl_eager_limit;
|
|
mca_btl_vader.super.btl_min_rdma_pipeline_size = mca_btl_vader.super.btl_eager_limit;
|
|
#else
|
|
mca_btl_vader.super.btl_eager_limit = 4 * 1024;
|
|
mca_btl_vader.super.btl_rndv_eager_limit = 32 * 1024;
|
|
mca_btl_vader.super.btl_max_send_size = 32 * 1024;
|
|
mca_btl_vader.super.btl_min_rdma_pipeline_size = 32 * 1024;
|
|
#endif
|
|
|
|
mca_btl_vader.super.btl_rdma_pipeline_send_length = mca_btl_vader.super.btl_eager_limit;
|
|
mca_btl_vader.super.btl_rdma_pipeline_frag_size = mca_btl_vader.super.btl_eager_limit;
|
|
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM || OPAL_BTL_VADER_HAVE_CMA
|
|
mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
|
|
#else
|
|
mca_btl_vader.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE;
|
|
#endif
|
|
|
|
mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM || OPAL_BTL_VADER_HAVE_CMA
|
|
mca_btl_vader.super.btl_bandwidth = 40000; /* Mbs */
|
|
#else
|
|
mca_btl_vader.super.btl_bandwidth = 10000; /* Mbs */
|
|
#endif
|
|
mca_btl_vader.super.btl_latency = 1; /* Microsecs */
|
|
|
|
/* Call the BTL based to register its MCA params */
|
|
mca_btl_base_param_register(&mca_btl_vader_component.super.btl_version,
|
|
&mca_btl_vader.super);
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Called by MCA framework to open the component, registers
|
|
* component parameters.
|
|
*/
|
|
|
|
static int mca_btl_vader_component_open(void)
|
|
{
|
|
/* initialize objects */
|
|
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t);
|
|
#if !OPAL_BTL_VADER_HAVE_XPMEM
|
|
OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_max_send, ompi_free_list_t);
|
|
#endif
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
|
|
/*
|
|
* component cleanup - sanity checking of queue lengths
|
|
*/
|
|
|
|
static int mca_btl_vader_component_close(void)
|
|
{
|
|
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager);
|
|
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_user);
|
|
#if !OPAL_BTL_VADER_HAVE_XPMEM
|
|
OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_max_send);
|
|
#endif
|
|
|
|
if (NULL != mca_btl_vader_component.my_segment) {
|
|
munmap (mca_btl_vader_component.my_segment, mca_btl_vader_component.segment_size);
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
static int mca_btl_base_vader_modex_send (void)
|
|
{
|
|
struct vader_modex_t modex;
|
|
int modex_size;
|
|
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM
|
|
modex.seg_id = mca_btl_vader_component.my_seg_id;
|
|
modex.segment_base = mca_btl_vader_component.my_segment;
|
|
|
|
modex_size = sizeof (modex);
|
|
#else
|
|
modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
|
|
memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
|
|
#endif
|
|
|
|
return opal_modex_send(&mca_btl_vader_component.super.btl_version, &modex, modex_size);
|
|
}
|
|
|
|
/*
|
|
* VADER component initialization
|
|
*/
|
|
static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls,
|
|
bool enable_progress_threads,
|
|
bool enable_mpi_threads)
|
|
{
|
|
mca_btl_vader_component_t *component = &mca_btl_vader_component;
|
|
mca_btl_base_module_t **btls = NULL;
|
|
int rc;
|
|
|
|
*num_btls = 0;
|
|
|
|
/* disable if there are no local peers */
|
|
if (0 == MCA_BTL_VADER_NUM_LOCAL_PEERS) {
|
|
BTL_VERBOSE(("No peers to communicate with. Disabling vader."));
|
|
return NULL;
|
|
}
|
|
|
|
/* limit segment alignment to be between 4k and 16M */
|
|
|
|
if (mca_btl_vader_component.log_attach_align < 12) {
|
|
mca_btl_vader_component.log_attach_align = 12;
|
|
} else if (mca_btl_vader_component.log_attach_align > 25) {
|
|
mca_btl_vader_component.log_attach_align = 25;
|
|
}
|
|
|
|
btls = (mca_btl_base_module_t **) calloc (1, sizeof (mca_btl_base_module_t *));
|
|
if (NULL == btls) {
|
|
return NULL;
|
|
}
|
|
|
|
/* ensure a sane segment size */
|
|
if (mca_btl_vader_component.segment_size < (2 << 20)) {
|
|
mca_btl_vader_component.segment_size = (2 << 20);
|
|
}
|
|
|
|
if (mca_btl_vader_component.segment_size > (1ul << MCA_BTL_VADER_OFFSET_BITS)) {
|
|
mca_btl_vader_component.segment_size = 2ul << MCA_BTL_VADER_OFFSET_BITS;
|
|
}
|
|
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM
|
|
/* create an xpmem segment for the entire memory space */
|
|
component->my_seg_id = xpmem_make (0, VADER_MAX_ADDRESS, XPMEM_PERMIT_MODE, (void *)0666);
|
|
if (-1 == component->my_seg_id) {
|
|
BTL_VERBOSE(("Could not create xpmem segment"));
|
|
free (btls);
|
|
return NULL;
|
|
}
|
|
|
|
component->my_segment = mmap (NULL, mca_btl_vader_component.segment_size, PROT_READ |
|
|
PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
|
|
if ((void *)-1 == component->my_segment) {
|
|
BTL_VERBOSE(("Could not create anonymous memory segment"));
|
|
free (btls);
|
|
return NULL;
|
|
}
|
|
#else
|
|
{
|
|
char *sm_file;
|
|
|
|
rc = asprintf(&sm_file, "%s" OPAL_PATH_SEP "vader_segment.%s.%d", opal_process_info.proc_session_dir,
|
|
opal_process_info.nodename, MCA_BTL_VADER_LOCAL_RANK);
|
|
if (0 > rc) {
|
|
free (btls);
|
|
return NULL;
|
|
}
|
|
|
|
rc = opal_shmem_segment_create (&mca_btl_vader_component.seg_ds, sm_file, mca_btl_vader_component.segment_size);
|
|
free (sm_file);
|
|
if (OPAL_SUCCESS != rc) {
|
|
BTL_VERBOSE(("Could not create shared memory segment"));
|
|
free (btls);
|
|
return NULL;
|
|
}
|
|
|
|
component->my_segment = opal_shmem_segment_attach (&mca_btl_vader_component.seg_ds);
|
|
if (NULL == component->my_segment) {
|
|
BTL_VERBOSE(("Could not attach to just created shared memory segment"));
|
|
goto failed;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
component->segment_offset = 0;
|
|
|
|
memset (component->my_segment + MCA_BTL_VADER_FIFO_SIZE, 0, MCA_BTL_VADER_NUM_LOCAL_PEERS *
|
|
MCA_BTL_VADER_FBOX_PEER_SIZE);
|
|
|
|
/* initialize my fifo */
|
|
rc = vader_fifo_init ((struct vader_fifo_t *) component->my_segment);
|
|
if (OPAL_SUCCESS != rc) {
|
|
BTL_VERBOSE(("Error initializing FIFO"));
|
|
goto failed;
|
|
}
|
|
|
|
rc = mca_btl_base_vader_modex_send ();
|
|
if (OPAL_SUCCESS != rc) {
|
|
BTL_VERBOSE(("Error sending modex"));
|
|
goto failed;
|
|
}
|
|
|
|
*num_btls = 1;
|
|
|
|
/* get pointer to the btls */
|
|
btls[0] = (mca_btl_base_module_t *) &mca_btl_vader;
|
|
|
|
/* set flag indicating btl not inited */
|
|
mca_btl_vader.btl_inited = false;
|
|
|
|
return btls;
|
|
failed:
|
|
#if OPAL_BTL_VADER_HAVE_XPMEM
|
|
munmap (component->my_segment, mca_btl_vader_component.segment_size);
|
|
#else
|
|
opal_shmem_unlink (&mca_btl_vader_component.seg_ds);
|
|
#endif
|
|
|
|
if (btls) {
|
|
free (btls);
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static inline int mca_btl_vader_poll_fifo (void)
|
|
{
|
|
const mca_btl_active_message_callback_t *reg;
|
|
struct mca_btl_base_endpoint_t *endpoint;
|
|
mca_btl_vader_hdr_t *hdr;
|
|
|
|
/* poll the fifo until it is empty or a limit has been hit (8 is arbitrary) */
|
|
for (int fifo_count = 0 ; fifo_count < 16 ; ++fifo_count) {
|
|
mca_btl_vader_frag_t frag = {.base = {.des_local = frag.segments, .des_local_count = 1}};
|
|
|
|
hdr = vader_fifo_read (mca_btl_vader_component.my_fifo, &endpoint);
|
|
if (NULL == hdr) {
|
|
return fifo_count;
|
|
}
|
|
|
|
if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) {
|
|
mca_btl_vader_frag_complete (hdr->frag);
|
|
continue;
|
|
}
|
|
|
|
reg = mca_btl_base_active_message_trigger + hdr->tag;
|
|
frag.segments[0].seg_addr.pval = (void *) (hdr + 1);
|
|
frag.segments[0].seg_len = hdr->len;
|
|
|
|
if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) {
|
|
mca_mpool_base_registration_t *xpmem_reg;
|
|
|
|
xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base,
|
|
hdr->sc_iov.iov_len, 0,
|
|
&frag.segments[1].seg_addr.pval);
|
|
|
|
frag.segments[1].seg_len = hdr->sc_iov.iov_len;
|
|
|
|
/* recv upcall */
|
|
frag.base.des_local_count = 2;
|
|
reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
|
|
vader_return_registration (xpmem_reg, endpoint);
|
|
} else {
|
|
reg->cbfunc(&mca_btl_vader.super, hdr->tag, &(frag.base), reg->cbdata);
|
|
}
|
|
|
|
/* return the fragment */
|
|
hdr->flags = MCA_BTL_VADER_FLAG_COMPLETE;
|
|
vader_fifo_write_back (hdr, endpoint);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
static int mca_btl_vader_component_progress (void)
|
|
{
|
|
bool fboxed;
|
|
|
|
/* check for messages in fast boxes */
|
|
for (int spin_count = 5 ; spin_count ; --spin_count) {
|
|
fboxed = (int) mca_btl_vader_check_fboxes ();
|
|
if (fboxed) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (VADER_FIFO_FREE == mca_btl_vader_component.my_fifo->fifo_head) {
|
|
return (int) fboxed;
|
|
}
|
|
|
|
return mca_btl_vader_poll_fifo () + (int) fboxed;
|
|
}
|