1
1

Add support for the Intel scif interface.

Depends on #3847.

cmr=v1.7.4:reviewer=rhc

This commit was SVN r29490.
Этот коммит содержится в:
Nathan Hjelm 2013-10-23 15:59:14 +00:00
родитель 2e2794fa15
Коммит cde3b05ed3
14 изменённых файлов: 2482 добавлений и 0 удалений

49
ompi/mca/btl/scif/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,49 @@
# -*- indent-tabs-mode:nil -*-
#
# Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
# reserved.
#
# Additional copyrights may follow
#
# $HEADER$
#
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Preprocessor flags applied to every source file of this component.
# NOTE(review): btl_scif_CPPFLAGS/LIBS/LDFLAGS are not defined in this file;
# presumably they come from the component's configure logic -- confirm.
AM_CPPFLAGS = $(btl_scif_CPPFLAGS)
# Exactly one of component_install / component_noinst names a library,
# depending on whether the component is built as a DSO or statically.
if MCA_BUILD_ompi_btl_scif_DSO
component_noinst =
component_install = mca_btl_scif.la
else
component_noinst = libmca_btl_scif.la
component_install =
endif
# Sources shared by both the DSO and the static flavor of the library.
scif_SOURCES = \
btl_scif_component.c \
btl_scif_module.c \
btl_scif_add_procs.c \
btl_scif_endpoint.h \
btl_scif_endpoint.c \
btl_scif_frag.c \
btl_scif_frag.h \
btl_scif_send.c \
btl_scif_put.c \
btl_scif_get.c \
btl_scif.h
# DSO build: installed into $(pkglibdir).
# NOTE(review): scif_nodist_SOURCES is never assigned in this file, so the
# nodist_* variables below expand to nothing here.
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_btl_scif_la_SOURCES = $(scif_SOURCES)
nodist_mca_btl_scif_la_SOURCES = $(scif_nodist_SOURCES)
mca_btl_scif_la_LIBADD = $(btl_scif_LIBS)
mca_btl_scif_la_LDFLAGS = -module -avoid-version $(btl_scif_LDFLAGS)
# Static build: non-installed library built from the same sources.
noinst_LTLIBRARIES = $(component_noinst)
libmca_btl_scif_la_SOURCES = $(scif_SOURCES)
nodist_libmca_btl_scif_la_SOURCES = $(scif_nodist_SOURCES)
libmca_btl_scif_la_LIBADD = $(btl_scif_LIBS)
libmca_btl_scif_la_LDFLAGS = -module -avoid-version $(btl_scif_LDFLAGS)

237
ompi/mca/btl/scif/btl_scif.h Обычный файл
Просмотреть файл

@ -0,0 +1,237 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_SCIF_H
#define MCA_BTL_SCIF_H
#include "ompi_config.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/grdma/mpool_grdma.h"
#include "opal/util/output.h"
#include "opal_stdint.h"
#include "ompi/proc/proc.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/base.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/class/ompi_free_list.h"
#include <scif.h>
#include <errno.h>
#include <stdint.h>
#include <sys/types.h>
#include <assert.h>
#include <sys/time.h>
/* Turn on timers for debug builds */
#if OPAL_ENABLE_DEBUG
/* #define SCIF_TIMING */
#endif
#if defined(SCIF_TIMING)
#include <sys/time.h>
#include <math.h>
/**
 * Compute *diff = *end - *start for two timespec values.
 *
 * The nanosecond field of the result is normalized into [0, 1e9) by
 * borrowing one second when the raw nanosecond difference is negative.
 *
 * @param end   (IN)  later time stamp
 * @param start (IN)  earlier time stamp
 * @param diff  (OUT) receives end - start
 */
static inline void timerspecsub (struct timespec *end, struct timespec *start,
                                 struct timespec *diff) {
    time_t seconds = end->tv_sec - start->tv_sec;
    long nanoseconds = end->tv_nsec - start->tv_nsec;

    /* borrow from the seconds field when the nanoseconds underflow */
    if (nanoseconds < 0) {
        nanoseconds += 1000000000;
        seconds -= 1;
    }

    diff->tv_nsec = nanoseconds;
    diff->tv_sec = seconds;
}
/*
 * Accumulate the process CPU time elapsed since `start`.
 *
 * Reads CLOCK_PROCESS_CPUTIME_ID, converts (now - start) to seconds as a
 * double, adds it to `agg` and raises `max` to that value if it is larger.
 * `agg` and `max` must be modifiable double lvalues; `start` must be a
 * struct timespec lvalue (its address is taken).
 */
#define SCIF_UPDATE_TIMER(agg, max, start) \
do { \
struct timespec _te, _diff; \
double _tmpd; \
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &_te); \
timerspecsub(&_te, &(start), &_diff); \
_tmpd = (double) _diff.tv_sec + (double) _diff.tv_nsec / 1000000000.0; \
(agg) += _tmpd; \
(max) = fmax ((max), _tmpd); \
} while (0)
#endif
/* Payload published through the modex: the scif port this process listens on
 * (filled from mca_btl_scif_module.port_id when the modex is sent). */
typedef struct mca_btl_scif_modex_t {
struct scif_portID port_id;
} mca_btl_scif_modex_t;
/* State of the scif BTL module. */
typedef struct mca_btl_scif_module_t {
/* base BTL module; must stay first because the module is cast to and from
 * mca_btl_base_module_t */
mca_btl_base_module_t super;
/* listening endpoint */
scif_epd_t scif_fd;
/* listening port */
struct scif_portID port_id;
/* number of entries in endpoints (set by add_procs) */
size_t endpoint_count;
/* array of endpoints, one per peer on the local host (allocated by add_procs) */
struct mca_btl_base_endpoint_t *endpoints;
/* NOTE(review): not referenced in the code visible here; presumably
 * fragments whose transfer failed -- confirm against btl_scif_module.c */
opal_list_t failed_frags;
/* fragments for DMA */
ompi_free_list_t dma_frags;
/* fragments for eager send */
ompi_free_list_t eager_frags;
/* thread that polls scif_fd and accepts incoming connections */
pthread_t listen_thread;
} mca_btl_scif_module_t;
/* State of the scif BTL component; most fields back MCA variables that are
 * registered in btl_scif_component_register(). */
typedef struct mca_btl_scif_component_t {
/* base BTL component */
mca_btl_base_component_2_0_0_t super;
/* DMA free list settings */
/* initial number of fragments (MCA var "free_list_num") */
int scif_free_list_num;
/* maximum number of fragments (MCA var "free_list_max") */
int scif_free_list_max;
/* number of fragments added per growth step (MCA var "free_list_inc") */
int scif_free_list_inc;
/* bytes of memory segment allocated per remote process (MCA var
 * "segment_size"); component init raises it to at least four eager limits
 * and rounds it up to a multiple of 4096 */
unsigned int segment_size;
/* use the CPU instead of DMA for RMA copies (MCA var "rma_use_cpu") */
bool rma_use_cpu;
/* use synchronous RMA instead of an RMA fence (MCA var "rma_sync") */
bool rma_sync;
#if defined(SCIF_TIMING)
/* performance timers */
/* aggregate / maximum time spent acquiring send buffers */
double aquire_buffer_time;
double aquire_buffer_time_max;
/* aggregate / maximum time spent writing to send buffers */
double send_time;
double send_time_max;
/* aggregate / maximum time spent writing to send buffers in sendi */
double sendi_time;
double sendi_time_max;
/* aggregate / maximum time spent in DMA reads, and number of get calls */
double get_time;
double get_time_max;
unsigned long get_count;
/* aggregate / maximum time spent in DMA writes, and number of put calls */
double put_time;
double put_time_max;
unsigned long put_count;
#endif
} mca_btl_scif_component_t;
/**
 * Initialize the scif BTL module.
 *
 * Invoked from the component init function; the definition is not part of
 * this header.
 *
 * @return OMPI_SUCCESS on success; the caller treats any other value as failure.
 */
int mca_btl_scif_module_init (void);
/**
* BML->BTL notification of change in the process list.
*
* location: btl_scif_add_procs.c
*
* @param btl (IN) BTL module
* @param nprocs (IN) Number of processes
* @param procs (IN) Array of processes
* @param peers (OUT) Array of mca_btl_base_endpoint_t pointers filled in by the BTL.
* @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this BTL.
* @return OMPI_SUCCESS or error status on failure.
*/
int
mca_btl_scif_add_procs (struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t **peers,
opal_bitmap_t *reachable);
/**
* Notification of change to the process list.
*
* location: btl_scif_add_procs.c
*
* @param btl (IN) BTL module
* @param nprocs (IN) Number of processes
* @param procs (IN) Set of processes
* @param peers (IN) Set of peer addressing information.
* @return Status indicating if cleanup was successful
*/
int
mca_btl_scif_del_procs (struct mca_btl_base_module_t *btl,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t **peers);
/**
* Initiate an asynchronous send.
*
* location: btl_scif_send.c
*
* @param btl (IN) BTL module
* @param btl_peer (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transferred
* @param tag (IN) The tag value used to notify the peer.
*/
int
mca_btl_scif_send (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *btl_peer,
struct mca_btl_base_descriptor_t *descriptor,
mca_btl_base_tag_t tag);
/**
* Immediate-send entry point of the scif BTL.
*
* NOTE(review): the definition is not visible from this header (presumably
* btl_scif_send.c); the parameter descriptions below are inferred from the
* parameter names only -- confirm against the implementation.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param convertor (IN) Convertor describing the payload
* @param header (IN) Header to send
* @param header_size (IN) Size of the header
* @param payload_size (IN) Size of the payload
* @param order (IN) Ordering value
* @param flags (IN) Send flags
* @param tag (IN) The tag value used to notify the peer.
* @param descriptor (OUT) Descriptor handed back to the caller
*/
int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
void *header, size_t header_size,
size_t payload_size, uint8_t order,
uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor);
/**
* Initiate a get operation.
*
* location: btl_scif_get.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param des (IN) Description of the data to be transferred
*/
int
mca_btl_scif_get (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
/**
* Initiate a put operation.
*
* location: btl_scif_put.c
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param des (IN) Description of the data to be transferred
*/
int
mca_btl_scif_put (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *des);
/**
* Allocate a descriptor for a send.
*
* NOTE(review): the definition is not visible from this header; the
* descriptions below are inferred from the signature -- confirm.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param order (IN) Ordering value
* @param size (IN) Requested size
* @param flags (IN) Allocation flags
* @return Descriptor pointer (presumably NULL on failure)
*/
mca_btl_base_descriptor_t *
mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
uint8_t order, size_t size, uint32_t flags);
/**
* Try to send the fragments queued on an endpoint's wait list.
*
* The component progress path calls this only when the endpoint's
* frag_wait_list is non-empty and adds the return value to its progress count.
*
* @param endpoint (IN) Endpoint whose wait list should be progressed
*/
int mca_btl_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
/* mpool registration record of the scif BTL. */
typedef struct mca_btl_scif_reg_t {
/* base registration; must stay first because the record is cast from
 * mca_mpool_base_registration_t */
mca_mpool_base_registration_t base;
/* heap array with one scif registration offset per endpoint;
 * (off_t) -1 marks an endpoint the memory is not registered with */
off_t *registrations;
} mca_btl_scif_reg_t;
/* Global structures: the single component and the single module instance of
 * the scif BTL. */
OMPI_MODULE_DECLSPEC extern mca_btl_scif_component_t mca_btl_scif_component;
OMPI_MODULE_DECLSPEC extern mca_btl_scif_module_t mca_btl_scif_module;
#endif

243
ompi/mca/btl/scif/btl_scif_add_procs.c Обычный файл
Просмотреть файл

@ -0,0 +1,243 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "btl_scif.h"
#include "btl_scif_frag.h"
/* Creates the grdma mpool and the fragment free lists of the module. */
static int
mca_btl_scif_setup_mpools (mca_btl_scif_module_t *scif_module);
/* Body of the listening thread that accepts incoming scif connections. */
static void *mca_btl_scif_connect_accept (void *arg);
/**
 * BML->BTL notification of change in the process list.
 *
 * Counts the peers that run on the local host, allocates the module's
 * endpoint array and sets up the mpool/free lists the first time it is
 * called, initializes one endpoint per on-board peer and finally starts the
 * thread that accepts incoming scif connections.
 *
 * @param btl (IN) BTL module (the scif module)
 * @param nprocs (IN) Number of entries in procs and peers
 * @param procs (IN) Array of processes
 * @param peers (OUT) Endpoint of every reachable process; NULL for processes
 *                    on other hosts. The entry of the calling process itself
 *                    is left untouched.
 * @param reachable (OUT) Bit i is set when procs[i] is reachable by this BTL
 * @return OMPI_SUCCESS or error status on failure.
 */
int mca_btl_scif_add_procs(struct mca_btl_base_module_t* btl,
                           size_t nprocs,
                           struct ompi_proc_t **procs,
                           struct mca_btl_base_endpoint_t **peers,
                           opal_bitmap_t *reachable) {
    mca_btl_scif_module_t *scif_module = (mca_btl_scif_module_t *) btl;
    size_t procs_on_board, i, board_proc;
    ompi_proc_t *my_proc = ompi_proc_local();
    int rc;

    /* determine how many procs (other than this one) are on this board */
    for (i = 0, procs_on_board = 0 ; i < nprocs ; ++i) {
        struct ompi_proc_t *ompi_proc = procs[i];

        if (my_proc == ompi_proc ||
            !OPAL_PROC_ON_LOCAL_HOST(ompi_proc->proc_flags)) {
            /* scif can only be used with other procs on this board */
            continue;
        }

        procs_on_board++;
    }

    /* allocate space for the detected peers and setup the mpool */
    if (NULL == scif_module->endpoints) {
        scif_module->endpoints = calloc (procs_on_board, sizeof (mca_btl_base_endpoint_t));
        if (OPAL_UNLIKELY(NULL == scif_module->endpoints)) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        rc = mca_btl_scif_setup_mpools (scif_module);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            BTL_ERROR(("btl/scif error setting up mpools/free lists"));
            return rc;
        }
    }

    for (i = 0, board_proc = 0 ; i < nprocs ; ++i) {
        struct ompi_proc_t *ompi_proc = procs[i];

        if (my_proc == ompi_proc) {
            continue;
        }

        if (!OPAL_PROC_ON_LOCAL_HOST(ompi_proc->proc_flags)) {
            /* scif can only be used with procs on this board */
            peers[i] = NULL;
            continue;
        }

        /* Initialize endpoints */
        rc = mca_btl_scif_ep_init (scif_module->endpoints + board_proc, scif_module, ompi_proc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            BTL_ERROR(("btl/scif error initializing endpoint"));
            return rc;
        }

        scif_module->endpoints[board_proc].id = board_proc;

        /* Set the reachable bit; do not advertise a peer we could not mark */
        rc = opal_bitmap_set_bit (reachable, i);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            BTL_ERROR(("btl/scif error setting reachable bit"));
            return rc;
        }

        /* Store a reference to this peer */
        peers[i] = scif_module->endpoints + board_proc;

        board_proc++;
    }

    BTL_VERBOSE(("%lu procs on board\n", (unsigned long) procs_on_board));

    scif_module->endpoint_count = procs_on_board;

    /* start listening thread. pthread_create reports failure with a positive
     * error number, so any non-zero value is an error. */
    rc = pthread_create (&mca_btl_scif_module.listen_thread, NULL, mca_btl_scif_connect_accept, NULL);
    if (0 != rc) {
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
static void *mca_btl_scif_connect_accept (void *arg)
{
struct scif_pollepd pollepd = {.epd = mca_btl_scif_module.scif_fd, .events = SCIF_POLLIN, .revents = 0};
int rc;
BTL_VERBOSE(("btl/scif: listening for new connections"));
/* listen for connections */
while (1) {
pollepd.revents = 0;
rc = scif_poll (&pollepd, 1, -1);
if (1 == rc) {
if (SCIF_POLLIN != pollepd.revents) {
break;
}
rc = mca_btl_scif_ep_connect_start_passive ();
if (OMPI_SUCCESS != rc) {
BTL_VERBOSE(("btl/scif: error accepting scif connection"));
continue;
}
} else {
break;
}
}
BTL_VERBOSE(("btl/scif: stopped listening for new connections"));
return NULL;
}
/**
 * Notification that processes are being removed.
 *
 * Currently a no-op: no per-process state is released here.
 *
 * @return always OMPI_SUCCESS
 */
int mca_btl_scif_del_procs (struct mca_btl_base_module_t *btl,
                            size_t nprocs, struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t **peers) {
    /* nothing to clean up for now; mark the arguments as intentionally unused */
    (void) btl;
    (void) nprocs;
    (void) procs;
    (void) peers;

    return OMPI_SUCCESS;
}
/**
 * mpool deregistration callback.
 *
 * Unregisters the memory region from every endpoint that is currently
 * connected and holds a valid registration offset, then releases the
 * per-endpoint offset array.
 *
 * @param reg_data unused
 * @param reg      registration record (really a mca_btl_scif_reg_t)
 * @return always OMPI_SUCCESS (scif_unregister failures are ignored)
 */
static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
{
    mca_btl_scif_reg_t *scif_reg = (mca_btl_scif_reg_t *)reg;
    /* NOTE(review): if reg->bound is the inclusive last byte of the region
     * this length is one byte short of what was registered -- confirm against
     * the mpool that fills in base/bound. */
    size_t size = (size_t)((uintptr_t) reg->bound - (uintptr_t) reg->base);
    size_t i;

    /* nothing was recorded for this registration */
    if (NULL == scif_reg->registrations) {
        return OMPI_SUCCESS;
    }

    /* unregister the region from all connected endpoints */
    for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
        if ((off_t)-1 != scif_reg->registrations[i] &&
            MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
            (void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd,
                                   scif_reg->registrations[i], size);
        }
    }

    /* do not leave a dangling pointer in a record that may be reused */
    free (scif_reg->registrations);
    scif_reg->registrations = NULL;

    return OMPI_SUCCESS;
}
/**
 * mpool registration callback.
 *
 * Allocates one registration offset per endpoint (initialized to -1, i.e.
 * "not registered") and registers the region with every endpoint that is
 * connected at the time of the call. If any registration fails, everything
 * registered so far is undone.
 *
 * @param reg_data passed through to the cleanup call, otherwise unused
 * @param base     start of the region to register
 * @param size     length of the region in bytes
 * @param reg      registration record (really a mca_btl_scif_reg_t)
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when the offset array
 *         cannot be allocated or a scif registration fails
 */
static int scif_reg_mem (void *reg_data, void *base, size_t size,
                         mca_mpool_base_registration_t *reg)
{
    mca_btl_scif_reg_t *scif_reg = (mca_btl_scif_reg_t *)reg;
    const size_t endpoint_count = mca_btl_scif_module.endpoint_count;
    int rc = OMPI_SUCCESS;
    size_t i;

    scif_reg->registrations = calloc (endpoint_count, sizeof (off_t));
    if (NULL == scif_reg->registrations) {
        /* a zero-sized allocation may legitimately yield NULL; only a
         * non-empty request that fails is an error */
        return (0 == endpoint_count) ? OMPI_SUCCESS : OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* all bits set yields (off_t) -1 in every slot */
    memset (scif_reg->registrations, -1, endpoint_count * sizeof (off_t));

    /* register the pointer with all connected endpoints */
    for (i = 0 ; i < endpoint_count ; ++i) {
        if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
            scif_reg->registrations[i] = scif_register(mca_btl_scif_module.endpoints[i].scif_epd,
                                                       base, size, 0, SCIF_PROT_READ |
                                                       SCIF_PROT_WRITE, 0);
            if (SCIF_REGISTER_FAILED == scif_reg->registrations[i]) {
                /* cleanup: undo the registrations made so far; the array has
                 * been freed by the callee, so drop our pointer to it */
                scif_dereg_mem (reg_data, reg);
                scif_reg->registrations = NULL;
                rc = OMPI_ERR_OUT_OF_RESOURCE;
                break;
            }
        }
    }

    return rc;
}
/**
 * Create the memory pool and the fragment free lists of the module.
 *
 * A "grdma" mpool named "scif" is created with the scif registration
 * callbacks, followed by two free lists: dma fragments (used for rma
 * operations and in-place sends) and eager fragments (used for buffered
 * sends).
 *
 * @param scif_module module to set up
 * @return OMPI_SUCCESS, OMPI_ERROR when the mpool cannot be created, or the
 *         error returned by the free list initialization
 */
static int
mca_btl_scif_setup_mpools (mca_btl_scif_module_t *scif_module)
{
    struct mca_mpool_base_resources_t mpool_resources;
    int rc;

    /* describe the grdma mpool: registrations are handled by this file */
    mpool_resources.register_mem = scif_reg_mem;
    mpool_resources.deregister_mem = scif_dereg_mem;
    mpool_resources.sizeof_reg = sizeof (mca_btl_scif_reg_t);
    mpool_resources.reg_data = (void *) scif_module;
    mpool_resources.pool_name = "scif";

    scif_module->super.btl_mpool =
        mca_mpool_base_module_create("grdma", scif_module, &mpool_resources);
    if (NULL == scif_module->super.btl_mpool) {
        BTL_ERROR(("error creating grdma mpool"));
        return OMPI_ERROR;
    }

    /* free list of dma fragments (rma operations and in-place sends) */
    rc = ompi_free_list_init_new (&scif_module->dma_frags,
                                  sizeof (mca_btl_scif_dma_frag_t), 64,
                                  OBJ_CLASS(mca_btl_scif_dma_frag_t),
                                  128, getpagesize (),
                                  mca_btl_scif_component.scif_free_list_num,
                                  mca_btl_scif_component.scif_free_list_max,
                                  mca_btl_scif_component.scif_free_list_inc,
                                  NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    /* free list of eager fragments (buffered sends); each payload holds
     * 128 bytes plus one eager-limit worth of data */
    rc = ompi_free_list_init_new (&scif_module->eager_frags,
                                  sizeof (mca_btl_scif_eager_frag_t), 8,
                                  OBJ_CLASS(mca_btl_scif_eager_frag_t),
                                  128 + scif_module->super.btl_eager_limit, 64,
                                  mca_btl_scif_component.scif_free_list_num,
                                  mca_btl_scif_component.scif_free_list_max,
                                  mca_btl_scif_component.scif_free_list_inc,
                                  NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        BTL_ERROR(("error creating eager receive fragment free list"));
    }

    /* rc is OMPI_SUCCESS exactly when the branch above was not taken */
    return rc;
}

379
ompi/mca/btl/scif/btl_scif_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,379 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_scif.h"
#include "btl_scif_frag.h"
#include "opal/include/opal/align.h"
#include "opal/memoryhooks/memory.h"
#include "ompi/runtime/params.h"
#include "opal/mca/base/mca_base_pvar.h"
#include <scif.h>
/* Component entry points; they are wired into mca_btl_scif_component below. */
static int btl_scif_component_register(void);
static int btl_scif_component_open(void);
static int btl_scif_component_close(void);
static mca_btl_base_module_t **mca_btl_scif_component_init(int *, bool, bool);
static int mca_btl_scif_component_progress(void);
/* The scif BTL component instance. Only the embedded base component is
 * initialized statically; the remaining fields receive their defaults in
 * btl_scif_component_register(). */
mca_btl_scif_component_t mca_btl_scif_component = {
{
/* First, the mca_base_component_t struct containing meta information
about the component itself */
.btl_version = {
MCA_BTL_BASE_VERSION_2_0_0,
.mca_component_name = "scif",
.mca_component_major_version = OMPI_MAJOR_VERSION,
.mca_component_minor_version = OMPI_MINOR_VERSION,
.mca_component_release_version = OMPI_RELEASE_VERSION,
.mca_open_component = btl_scif_component_open,
.mca_close_component = btl_scif_component_close,
.mca_query_component = NULL,
.mca_register_component_params = btl_scif_component_register,
},
.btl_data = {
.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* BTL-specific entry points: module creation and progress */
.btl_init = mca_btl_scif_component_init,
.btl_progress = mca_btl_scif_component_progress,
}
};
/**
 * Register the MCA variables of the scif component.
 *
 * Every tunable is first set to its default and then registered so that the
 * MCA variable system can override it. When SCIF_TIMING is defined the
 * performance variables backing the timers are registered as well. Finally
 * the defaults of the module are set and handed to the BTL base for
 * registration.
 *
 * @return always OMPI_SUCCESS (registration results are ignored)
 */
static int btl_scif_component_register(void)
{
(void) mca_base_var_group_component_register(&mca_btl_scif_component.super.btl_version,
"SCIF byte transport layer");
/* fragment free list sizing: initial size, maximum size and growth step */
mca_btl_scif_component.scif_free_list_num = 8;
(void) mca_base_component_var_register(&mca_btl_scif_component.super.btl_version,
"free_list_num", "Initial fragment free list size",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_scif_component.scif_free_list_num);
mca_btl_scif_component.scif_free_list_max = 16384;
(void) mca_base_component_var_register(&mca_btl_scif_component.super.btl_version,
"free_list_max", "Maximum fragment free list size",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_scif_component.scif_free_list_max);
mca_btl_scif_component.scif_free_list_inc = 64;
(void) mca_base_component_var_register(&mca_btl_scif_component.super.btl_version,
"free_list_inc", "Fragment free list size increment",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_scif_component.scif_free_list_inc);
/* per-peer memory segment; component init may enlarge and round this up */
mca_btl_scif_component.segment_size = 8 * 1024;
(void) mca_base_component_var_register(&mca_btl_scif_component.super.btl_version,
"segment_size", "Size of memory segment to "
"allocate for each remote process (default: "
"8k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_scif_component.segment_size);
/* RMA behavior switches */
mca_btl_scif_component.rma_use_cpu = false;
(void) mca_base_component_var_register(&mca_btl_scif_component.super.btl_version,
"rma_use_cpu", "Use CPU instead of DMA "
"for RMA copies (default: false)", MCA_BASE_VAR_TYPE_BOOL,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_scif_component.rma_use_cpu);
mca_btl_scif_component.rma_sync = true;
(void) mca_base_component_var_register(&mca_btl_scif_component.super.btl_version,
"rma_sync", "Use synchronous RMA instead of "
"an RMA fence (default: true)", MCA_BASE_VAR_TYPE_BOOL,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_btl_scif_component.rma_sync);
#if defined(SCIF_TIMING)
/* read-only, continuous performance variables backing the debug timers.
 * Only the aggregate times and the counters are registered here; the *_max
 * fields of the component are not. */
mca_btl_scif_component.aquire_buffer_time = 0.0;
(void) mca_base_component_pvar_register(&mca_btl_scif_component.super.btl_version,
"aquire_buffer_time", "Aggregate time spent "
"aquiring send buffers", OPAL_INFO_LVL_9,
MCA_BASE_PVAR_CLASS_AGGREGATE, MCA_BASE_VAR_TYPE_DOUBLE,
NULL, MCA_BASE_VAR_BIND_NO_OBJECT, MCA_BASE_PVAR_FLAG_READONLY |
MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL, NULL, NULL,
&mca_btl_scif_component.aquire_buffer_time);
mca_btl_scif_component.send_time = 0.0;
(void) mca_base_component_pvar_register(&mca_btl_scif_component.super.btl_version,
"send_time", "Aggregate time spent writing to "
"send buffers", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_AGGREGATE,
MCA_BASE_VAR_TYPE_DOUBLE, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
NULL, NULL, NULL, &mca_btl_scif_component.send_time);
mca_btl_scif_component.sendi_time = 0.0;
(void) mca_base_component_pvar_register(&mca_btl_scif_component.super.btl_version,
"sendi_time", "Aggregate time spent writing to "
"send buffers in sendi", OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_AGGREGATE,
MCA_BASE_VAR_TYPE_DOUBLE, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
NULL, NULL, NULL, &mca_btl_scif_component.sendi_time);
mca_btl_scif_component.get_time = 0.0;
(void) mca_base_component_pvar_register(&mca_btl_scif_component.super.btl_version,
"get_time", "Aggregate time spent in DMA read (scif_readfrom)",
OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_AGGREGATE,
MCA_BASE_VAR_TYPE_DOUBLE, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
NULL, NULL, NULL, &mca_btl_scif_component.get_time);
mca_btl_scif_component.get_count = 0;
(void) mca_base_component_pvar_register(&mca_btl_scif_component.super.btl_version,
"get_count", "Number of times btl_scif_get was called",
OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
NULL, NULL, NULL, &mca_btl_scif_component.get_count);
mca_btl_scif_component.put_time = 0.0;
(void) mca_base_component_pvar_register(&mca_btl_scif_component.super.btl_version,
"put_time", "Aggregate time spent in DMA write (scif_writeto)",
OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_AGGREGATE,
MCA_BASE_VAR_TYPE_DOUBLE, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
NULL, NULL, NULL, &mca_btl_scif_component.put_time);
mca_btl_scif_component.put_count = 0;
(void) mca_base_component_pvar_register(&mca_btl_scif_component.super.btl_version,
"put_count", "Number of times btl_scif_put was called",
OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, MCA_BASE_VAR_BIND_NO_OBJECT,
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
NULL, NULL, NULL, &mca_btl_scif_component.put_count);
#endif
/* module defaults; these must be in place before the base registration
 * call at the end of this function, which receives the module */
mca_btl_scif_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
mca_btl_scif_module.super.btl_eager_limit = 1 * 1024;
mca_btl_scif_module.super.btl_rndv_eager_limit = 1 * 1024;
mca_btl_scif_module.super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
mca_btl_scif_module.super.btl_max_send_size = 1 * 1024;
mca_btl_scif_module.super.btl_rdma_pipeline_send_length = 1 * 1024;
/* threshold for put */
mca_btl_scif_module.super.btl_min_rdma_pipeline_size = 1 * 1024;
mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND |
MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE;
mca_btl_scif_module.super.btl_seg_size = sizeof (mca_btl_scif_segment_t);
mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */
mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */
/* Call the BTL base to register its MCA params */
mca_btl_base_param_register(&mca_btl_scif_component.super.btl_version,
&mca_btl_scif_module.super);
return OMPI_SUCCESS;
}
/* Component open hook: nothing to do, always reports OMPI_SUCCESS. */
static int btl_scif_component_open(void)
{
return OMPI_SUCCESS;
}
/* Component close hook: nothing to do, always reports OMPI_SUCCESS. */
static int btl_scif_component_close(void)
{
return OMPI_SUCCESS;
}
/**
 * Choose the leave-pinned settings based on the available memory hooks.
 *
 * Without both the free and the munmap hooks, leave pinned and leave pinned
 * pipeline are forced off. With both hooks, leave pinned is enabled unless
 * it was set explicitly (anything other than -1) or leave pinned pipeline is
 * on.
 */
static void mca_btl_scif_autoset_leave_pinned (void) {
    const int needed = OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT;
    const int available = opal_mem_hooks_support_level();

    if (needed != (needed & available)) {
        /* registrations cannot be invalidated reliably without the hooks */
        ompi_mpi_leave_pinned = 0;
        ompi_mpi_leave_pinned_pipeline = 0;
        return;
    }

    /* Set leave pinned to 1 if leave pinned pipeline is not set */
    if (-1 == ompi_mpi_leave_pinned) {
        ompi_mpi_leave_pinned = !ompi_mpi_leave_pinned_pipeline;
    }
}
/**
 * Publish this process's scif listening port through the modex.
 *
 * @return the result of ompi_modex_send()
 */
static int mca_btl_scif_modex_send (void)
{
    mca_btl_scif_modex_t local_info = {
        .port_id = mca_btl_scif_module.port_id
    };

    return ompi_modex_send (&mca_btl_scif_component.super.btl_version,
                            &local_info, sizeof (local_info));
}
/**
 * Component init: create the list of scif BTL modules.
 *
 * Clamps the eager limit to 32768 bytes, makes the per-peer segment at least
 * four eager limits large and a multiple of 4096 bytes, initializes the
 * single scif module and publishes its port through the modex.
 *
 * @param num_btl_modules (OUT) set to 1 on success, untouched on failure
 * @param enable_progress_threads unused
 * @param enable_mpi_threads unused
 * @return heap-allocated one-element module array, or NULL on failure
 */
static mca_btl_base_module_t **mca_btl_scif_component_init (int *num_btl_modules,
                                                            bool enable_progress_threads,
                                                            bool enable_mpi_threads)
{
    struct mca_btl_base_module_t **module_list;
    int status;

    BTL_VERBOSE(("btl/scif initializing"));

    /* NOTE(review): this resets the process-wide SIGSEGV disposition to the
     * default; the reason is not documented here -- confirm it is intended. */
    signal (SIGSEGV, SIG_DFL);

    /* we currently need the memory hooks to determine when
     * registrations are no longer valid. */
    mca_btl_scif_autoset_leave_pinned ();

    /* never allow an eager limit above 32768 bytes */
    if (mca_btl_scif_module.super.btl_eager_limit > 32768) {
        mca_btl_scif_module.super.btl_eager_limit = 32768;
    }

    /* the segment must hold at least four eager-limit sized packets */
    if (mca_btl_scif_component.segment_size < 4 * mca_btl_scif_module.super.btl_eager_limit) {
        mca_btl_scif_component.segment_size = 4 * mca_btl_scif_module.super.btl_eager_limit;
    }

    /* round up to a multiple of 4096 */
    mca_btl_scif_component.segment_size += 0xfff;
    mca_btl_scif_component.segment_size &= ~0xfff;

    module_list = (struct mca_btl_base_module_t **) calloc (1, sizeof (*module_list));
    if (OPAL_UNLIKELY(NULL == module_list)) {
        BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__));
        return NULL;
    }

    /* initialize the module */
    status = mca_btl_scif_module_init ();
    if (OMPI_SUCCESS != status) {
        BTL_VERBOSE(("btl/scif error initializing module"));
        free (module_list);
        return NULL;
    }

    module_list[0] = &mca_btl_scif_module.super;

    /* let the peers know which port we are listening on */
    status = mca_btl_scif_modex_send ();
    if (OMPI_SUCCESS != status) {
        BTL_VERBOSE(("btl/scif error sending modex"));
        free (module_list);
        return NULL;
    }

    *num_btl_modules = 1;

    BTL_VERBOSE(("btl/scif done initializing modules"));

    return module_list;
}
/**
 * Deliver fragments that arrived in an endpoint's receive buffer.
 *
 * The receive buffer is a ring. The low 31 bits of the start/end words are
 * byte offsets into it, and bit 31 of the start word is flipped each time
 * the reader wraps back to offset 64. At most five fragments are handled per
 * call; a fragment whose tag is 0xff carries no payload for a callback and
 * only advances the read position. The final read position is published to
 * the sender through recv_buffer.startp[0].
 *
 * NOTE(review): the wrap mask is built as an unsigned constant because
 * shifting 1 into the sign bit of an int is undefined behaviour. This
 * assumes recv_buffer.start is a 32-bit unsigned word (it is copied to a
 * uint32_t slot below) -- confirm against btl_scif_endpoint.h.
 *
 * @param ep endpoint to progress (must be connected)
 * @return number of fragments processed
 */
static int mca_btl_scif_progress_recvs (mca_btl_base_endpoint_t *ep)
{
    /* bit 31 of the start/end words is the wrap flag */
    const unsigned int wrap_bit = 1u << 31;
    /* changing this value does not appear to have a significant impact
     * on performance */
    const int max_frags_per_call = 5;
    const mca_btl_active_message_callback_t *reg;
    unsigned int start = ep->recv_buffer.start;
    unsigned int end = ep->recv_buffer.endp[0];
    mca_btl_scif_base_frag_t frag;
    mca_btl_scif_frag_hdr_t *hdr;
    int frags_per_loop = max_frags_per_call;

    /* nothing new has been written */
    if (end == start) {
        return 0;
    }

    /* strip the wrap flag to obtain plain byte offsets */
    end &= ~wrap_bit;
    start &= ~wrap_bit;

    /* force all prior reads to complete before continuing */
    opal_atomic_rmb ();

    do {
        hdr = (mca_btl_scif_frag_hdr_t *) (ep->recv_buffer.buffer + start);

        /* force all prior reads to complete before continuing */
        MB();

        BTL_VERBOSE(("got frag with header {.tag = %d, .size = %d} from offset %u",
                     hdr->tag, hdr->size, start));
#if defined(SCIF_USE_SEQ)
        if (hdr->seq != ep->seq_expected) {
            break;
        }

        ep->seq_expected++;
#endif

        /* tag 0xff is the message to skip the rest of the buffer; every
         * other tag is dispatched to its registered callback */
        if (0xff != hdr->tag) {
            reg = mca_btl_base_active_message_trigger + hdr->tag;

            /* fragment fits entirely in the remaining buffer space. some
             * btl users do not handle fragmented data so we can't split
             * the fragment without introducing another copy here. this
             * limitation has not appeared to cause any performance
             * problems. */
            frag.base.des_dst_cnt = 1;
            frag.segments[0].base.seg_len = hdr->size;
            frag.segments[0].base.seg_addr.pval = (void *) (hdr + 1);
            frag.base.des_dst = &frag.segments[0].base;

            /* call the registered callback function */
            reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata);
        }

        /* advance past header and payload, rounded up to 64 bytes */
        start = (start + hdr->size + sizeof (*hdr) + 63) & ~63;

        /* skip unusable space at the end of the buffer */
        if (mca_btl_scif_component.segment_size == start) {
            /* wrapped: restart at offset 64 and flip the wrap flag */
            start = 64;
            ep->recv_buffer.start = ((ep->recv_buffer.start & wrap_bit) ^ wrap_bit) | 64;
        } else {
            ep->recv_buffer.start = (ep->recv_buffer.start & wrap_bit) | start;
        }
    } while (start != end && --frags_per_loop);

    /* let the sender know where we stopped */
    ep->recv_buffer.startp[0] = ep->recv_buffer.start;

    /* return the number of fragments processed */
    return max_frags_per_call - frags_per_loop;
}
/**
 * Retry fragments queued on an endpoint's wait list, if there are any.
 *
 * @param ep endpoint to progress
 * @return 0 when the wait list is empty, otherwise the result of
 *         mca_btl_progress_send_wait_list()
 */
static int mca_btl_scif_progress_sends (mca_btl_base_endpoint_t *ep)
{
    int progressed = 0;

    /* the wait list is expected to be empty most of the time */
    if (OPAL_UNLIKELY(0 != opal_list_get_size (&ep->frag_wait_list))) {
        progressed = mca_btl_progress_send_wait_list (ep);
    }

    return progressed;
}
static int mca_btl_scif_component_progress (void)
{
unsigned int i;
int count = 0;
/* progress all connected endpoints */
for (i = 0, count = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) {
if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
/* poll all connected endpoints */
count += mca_btl_scif_progress_recvs (mca_btl_scif_module.endpoints + i);
/* if any fragments are waiting try to send them now */
count += mca_btl_scif_progress_sends (mca_btl_scif_module.endpoints + i);
}
}
return count;
}

297
ompi/mca/btl/scif/btl_scif_endpoint.c Обычный файл
Просмотреть файл

@ -0,0 +1,297 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_scif.h"
#include "btl_scif_endpoint.h"
#include "opal/mca/memchecker/base/base.h"
/**
 * Endpoint constructor.
 *
 * Zeroes every byte of the endpoint that follows the `super` member and then
 * constructs the embedded lock and fragment wait list.
 * (assumes `super` is the first member of the endpoint -- TODO confirm)
 */
static void mca_btl_scif_ep_construct (mca_btl_base_endpoint_t *ep) {
    char *own_fields = (char *) ep + sizeof(ep->super);
    size_t own_size = sizeof (*ep) - sizeof (ep->super);

    /* leave the parent object alone, clear everything that belongs to us */
    memset (own_fields, 0, own_size);

    OBJ_CONSTRUCT(&ep->lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ep->frag_wait_list, opal_list_t);
}
/**
 * Endpoint destructor.
 *
 * Unmaps the peer's segment, unregisters and frees the local receive buffer,
 * closes the scif endpoint and destructs the embedded lock and wait list.
 */
static void mca_btl_scif_ep_destruct (mca_btl_base_endpoint_t *ep) {
    if (ep->send_buffer.buffer) {
        scif_munmap (ep->send_buffer.buffer, mca_btl_scif_component.segment_size);
    }

    if (ep->recv_buffer.buffer) {
        scif_unregister (ep->scif_epd, ep->recv_buffer.scif_offset, mca_btl_scif_component.segment_size);
        free (ep->recv_buffer.buffer);
    }

    /* the constructor leaves scif_epd at 0 and a failed passive connect
     * resets it to -1; neither value refers to an open endpoint */
    if (0 < ep->scif_epd) {
        scif_close (ep->scif_epd);
    }

    OBJ_DESTRUCT(&ep->lock);
    OBJ_DESTRUCT(&ep->frag_wait_list);
}
/* Class descriptor of the endpoint: derived from opal_list_item_t, with the
 * constructor and destructor defined above. */
OBJ_CLASS_INSTANCE(mca_btl_base_endpoint_t, opal_list_item_t,
mca_btl_scif_ep_construct, mca_btl_scif_ep_destruct);
/**
 * Release an endpoint's receive buffer, if it has one.
 *
 * The buffer is unregistered from scif and freed; afterwards the buffer
 * pointer is NULL and the stored registration offset is (off_t) -1.
 */
static void mca_btl_scif_ep_free_buffer (mca_btl_base_endpoint_t *ep) {
    if (!ep->recv_buffer.buffer) {
        /* nothing was allocated */
        return;
    }

    scif_unregister (ep->scif_epd, ep->recv_buffer.scif_offset, mca_btl_scif_component.segment_size);
    free (ep->recv_buffer.buffer);

    /* mark the buffer as gone so that it is not released twice */
    ep->recv_buffer.scif_offset = (off_t) -1;
    ep->recv_buffer.buffer = NULL;
}
/**
 * Allocate and register the receive buffer of an endpoint.
 *
 * A page-aligned, zero-filled buffer of segment_size bytes is allocated and
 * registered with the endpoint's scif connection for reading and writing.
 * The first two 32-bit words of the buffer hold the start and end words of
 * the ring; both are initialized to 64.
 *
 * @param ep endpoint with a valid scif_epd
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when the allocation fails,
 *         or OMPI_ERROR when the scif registration fails
 */
static inline int mca_btl_scif_ep_get_buffer (mca_btl_base_endpoint_t *ep) {
    int rc;

    /* posix_memalign reports failure with a positive error number (it does
     * not set errno and never returns a negative value) */
    rc = posix_memalign ((void **) &ep->recv_buffer.buffer, getpagesize(), mca_btl_scif_component.segment_size);
    if (0 != rc) {
        ep->recv_buffer.buffer = NULL;
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    memset (ep->recv_buffer.buffer, 0, mca_btl_scif_component.segment_size);

    ep->recv_buffer.scif_offset = scif_register (ep->scif_epd, ep->recv_buffer.buffer,
                                                 mca_btl_scif_component.segment_size, 0,
                                                 SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
    if (SCIF_REGISTER_FAILED == ep->recv_buffer.scif_offset) {
        /* segment_size is an unsigned int, hence %u */
        BTL_VERBOSE(("failed to register a scif buffer of size %u. errno = %d",
                     mca_btl_scif_component.segment_size, errno));
        free (ep->recv_buffer.buffer);
        ep->recv_buffer.buffer = NULL;
        return OMPI_ERROR;
    }

    /* the ring's start/end words live at the very beginning of the buffer */
    ep->recv_buffer.startp = (uint32_t *) ep->recv_buffer.buffer;
    ep->recv_buffer.endp = ep->recv_buffer.startp + 1;

    ep->recv_buffer.startp[0] = ep->recv_buffer.endp[0] = 64;

    BTL_VERBOSE(("allocated buffer of size %u bytes. with scif registration %lu",
                 mca_btl_scif_component.segment_size, (unsigned long) ep->recv_buffer.scif_offset));

    return OMPI_SUCCESS;
}
/**
 * Complete a connection once the scif endpoint is connected.
 *
 * Allocates the local receive buffer, exchanges the registration offsets of
 * the receive buffers with the peer (the passive side receives first, the
 * active side sends first), maps the peer's buffer for sending and marks the
 * endpoint as connected.
 *
 * must be called with the endpoint lock held
 *
 * @param ep endpoint whose scif_epd is already connected
 * @param passive true when this side accepted the connection
 * @return OMPI_SUCCESS or an error; on error the receive buffer is released
 */
static int mca_btl_scif_ep_connect_finish (mca_btl_base_endpoint_t *ep, bool passive) {
    int rc;

    rc = mca_btl_scif_ep_get_buffer (ep);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        BTL_VERBOSE(("error allocating buffer for scif peer"));
        return rc;
    }

    /* the two sides use opposite orders so that the blocking calls pair up */
    if (passive) {
        rc = scif_recv (ep->scif_epd, &ep->send_buffer.scif_offset,
                        sizeof (ep->send_buffer.scif_offset), SCIF_RECV_BLOCK);
        if (OPAL_LIKELY(-1 != rc)) {
            rc = scif_send (ep->scif_epd, &ep->recv_buffer.scif_offset,
                            sizeof (ep->recv_buffer.scif_offset), SCIF_SEND_BLOCK);
        }
    } else {
        rc = scif_send (ep->scif_epd, &ep->recv_buffer.scif_offset,
                        sizeof (ep->recv_buffer.scif_offset), SCIF_SEND_BLOCK);
        if (OPAL_LIKELY(-1 != rc)) {
            rc = scif_recv (ep->scif_epd, &ep->send_buffer.scif_offset,
                            sizeof (ep->send_buffer.scif_offset), SCIF_RECV_BLOCK);
        }
    }

    if (OPAL_UNLIKELY(-1 == rc)) {
        BTL_VERBOSE(("error exchanging connection data with peer %d", ep->peer_proc->proc_name.vpid));
        mca_btl_scif_ep_free_buffer (ep);
        return OMPI_ERROR;
    }

    BTL_VERBOSE(("remote peer %d has scif offset %lu", ep->peer_proc->proc_name.vpid,
                 (unsigned long) ep->send_buffer.scif_offset));

    ep->send_buffer.buffer = scif_mmap (0, mca_btl_scif_component.segment_size,
                                        SCIF_PROT_READ | SCIF_PROT_WRITE,
                                        0, ep->scif_epd, ep->send_buffer.scif_offset);
    /* scif_mmap signals failure with SCIF_MMAP_FAILED rather than NULL; the
     * NULL test is kept as a safety net */
    if (OPAL_UNLIKELY(SCIF_MMAP_FAILED == (void *) ep->send_buffer.buffer ||
                      NULL == ep->send_buffer.buffer)) {
        BTL_VERBOSE(("error in scif_mmap"));
        /* make sure nobody unmaps or writes through the failure marker */
        ep->send_buffer.buffer = NULL;
        mca_btl_scif_ep_free_buffer (ep);
        return OMPI_ERROR;
    }

    opal_memchecker_base_mem_defined (ep->send_buffer.buffer, mca_btl_scif_component.segment_size);

    BTL_VERBOSE(("remote peer %d buffer mapped to local pointer %p", ep->peer_proc->proc_name.vpid,
                 ep->send_buffer.buffer));

    /* setup the circular send buffers */
    ep->send_buffer.start = ep->send_buffer.end = 64;

    ep->send_buffer.startp = (uint32_t *) ep->send_buffer.buffer;
    ep->send_buffer.endp = ep->send_buffer.startp + 1;

    ep->recv_buffer.start = 64;

    /* connection complete */
    ep->state = MCA_BTL_SCIF_EP_STATE_CONNECTED;

    BTL_VERBOSE(("btl/scif connection to remote peer %d established", ep->peer_proc->proc_name.vpid));

    return OMPI_SUCCESS;
}
/**
 * Accept one pending connection on the module's listening endpoint.
 *
 * The peer identifies itself by sending its process name right after the
 * connection is accepted. The request is dropped (and OMPI_SUCCESS returned)
 * when the endpoint is already connected, or when it is connecting actively
 * and its stored port is lower than the local listening port. Otherwise the
 * connection is completed under the endpoint lock.
 *
 * @return OMPI_SUCCESS when the connection was established or deliberately
 *         rejected, an error code otherwise
 */
int mca_btl_scif_ep_connect_start_passive (void) {
    mca_btl_base_endpoint_t *ep = NULL;
    orte_process_name_t remote_name;
    struct scif_portID port_id;
    unsigned int idx;
    scif_epd_t epd;
    int rc;

    /* accept the connection request. if the endpoint is already connecting we
     * may close this endpoint and allow mca_btl_scif_ep_connect_start_active
     * to finish the connection. */
    rc = scif_accept (mca_btl_scif_module.scif_fd, &port_id, &epd, SCIF_ACCEPT_SYNC);
    if (OPAL_UNLIKELY(0 > rc)) {
        BTL_VERBOSE(("error accepting connecton from scif peer. %d", errno));
        return OMPI_ERROR;
    }

    /* determine which peer sent the connection request */
    rc = scif_recv (epd, &remote_name, sizeof (remote_name), SCIF_RECV_BLOCK);
    if (OPAL_UNLIKELY(-1 == rc)) {
        BTL_VERBOSE(("error in scif_recv"));
        scif_close (epd);
        return OMPI_ERROR;
    }

    BTL_VERBOSE(("got connection request from vpid %d on port %u on node %u",
                 remote_name.vpid, port_id.port, port_id.node));

    /* look up the endpoint that belongs to the announced process */
    for (idx = 0 ; NULL == ep && idx < mca_btl_scif_module.endpoint_count ; ++idx) {
        if (mca_btl_scif_module.endpoints[idx].peer_proc->proc_name.vpid == remote_name.vpid) {
            ep = mca_btl_scif_module.endpoints + idx;
        }
    }

    /* peer not found */
    if (NULL == ep) {
        BTL_VERBOSE(("remote peer %d unknown", remote_name.vpid));
        scif_close (epd);
        return OMPI_ERROR;
    }

    /* simultaneous connections (active side) */
    if ((MCA_BTL_SCIF_EP_STATE_CONNECTING == ep->state &&
         ep->port_id.port < mca_btl_scif_module.port_id.port) ||
        MCA_BTL_SCIF_EP_STATE_CONNECTED == ep->state) {
        BTL_VERBOSE(("active connection in progress. connection request from peer %d rejected", remote_name.vpid));
        scif_close (epd);
        return OMPI_SUCCESS;
    }

    opal_mutex_lock (&ep->lock);

    /* the endpoint may have become connected while we waited for the lock */
    if (MCA_BTL_SCIF_EP_STATE_CONNECTED == ep->state) {
        opal_mutex_unlock (&ep->lock);
        scif_close (epd);
        return OMPI_SUCCESS;
    }

    BTL_VERBOSE(("accepted connection from port %d", ep->port_id.port));

    ep->state = MCA_BTL_SCIF_EP_STATE_CONNECTING;
    ep->scif_epd = epd;

    rc = mca_btl_scif_ep_connect_finish (ep, true);
    if (OMPI_SUCCESS != rc) {
        /* roll the endpoint back so that a later attempt can start over */
        scif_close (ep->scif_epd);
        ep->scif_epd = -1;
        ep->state = MCA_BTL_SCIF_EP_STATE_INIT;
    }

    opal_mutex_unlock (&ep->lock);

    return rc;
}
/**
 * Active side of connection establishment: open a new scif endpoint, connect
 * it to the peer's advertised port, identify this process to the peer and
 * finish the handshake.
 *
 * The whole attempt runs with ep->lock held. If the endpoint is no longer in
 * the INIT state once the lock is acquired, nothing is done and OMPI_SUCCESS
 * is returned.
 *
 * @param ep  endpoint to connect
 *
 * @returns OMPI_SUCCESS on success (or when no attempt was needed),
 *          OMPI_ERR_RESOURCE_BUSY if scif_connect() failed (the caller may
 *          retry later), OMPI_ERROR on other failures, or the error returned
 *          by mca_btl_scif_ep_connect_finish(). On any failure the endpoint
 *          is returned to the INIT state.
 */
static inline int mca_btl_scif_ep_connect_start_active (mca_btl_base_endpoint_t *ep) {
    int rc = OMPI_SUCCESS;

    BTL_VERBOSE(("initiaiting connection to remote peer %d with port: %u on local scif node: %u",
                 ep->peer_proc->proc_name.vpid, ep->port_id.port, ep->port_id.node));

    opal_mutex_lock (&ep->lock);

    do {
        if (MCA_BTL_SCIF_EP_STATE_INIT != ep->state) {
            /* the accept thread has already finished this connection */
            rc = OMPI_SUCCESS;
            break;
        }

        ep->state = MCA_BTL_SCIF_EP_STATE_CONNECTING;

        ep->scif_epd = scif_open ();
        if (OPAL_UNLIKELY(SCIF_OPEN_FAILED == ep->scif_epd)) {
            BTL_VERBOSE(("error creating new scif endpoint"));
            rc = OMPI_ERROR;
            break;
        }

        rc = scif_connect (ep->scif_epd, &ep->port_id);
        if (OPAL_UNLIKELY(-1 == rc)) {
            /* the connection attempt failed. this could mean the peer is currently
             * processing connections. we will try again later. */
            BTL_VERBOSE(("error connecting to scif peer. %d", errno));
            rc = OMPI_ERR_RESOURCE_BUSY;
            break;
        }

        /* the first message on the new connection tells the peer who we are */
        rc = scif_send (ep->scif_epd, OMPI_PROC_MY_NAME, sizeof (*OMPI_PROC_MY_NAME), SCIF_SEND_BLOCK);
        if (OPAL_UNLIKELY(-1 == rc)) {
            BTL_VERBOSE(("error in scif_send"));
            rc = OMPI_ERROR;
            break;
        }

        /* build connection data */
        rc = mca_btl_scif_ep_connect_finish (ep, false);
    } while (0);

    if (OMPI_SUCCESS != rc) {
        /* only close a descriptor that was actually opened; when scif_open()
         * itself failed there is nothing to close */
        if (SCIF_OPEN_FAILED != ep->scif_epd) {
            scif_close (ep->scif_epd);
        }
        ep->scif_epd = -1;
        ep->state = MCA_BTL_SCIF_EP_STATE_INIT;
    }

    opal_mutex_unlock (&ep->lock);

    return rc;
}
/**
 * Make sure a connection to the endpoint's peer exists or is being made.
 *
 * @param ep  endpoint to connect
 *
 * @returns OMPI_SUCCESS if the endpoint is already connected or the active
 *          connection attempt succeeded, OMPI_ERR_RESOURCE_BUSY if a
 *          connection is currently in progress, or the error reported by the
 *          active connection attempt.
 */
int mca_btl_scif_ep_connect (mca_btl_base_endpoint_t *ep) {
    if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED == ep->state)) {
        return OMPI_SUCCESS;
    }

    if (MCA_BTL_SCIF_EP_STATE_CONNECTING == ep->state) {
        return OMPI_ERR_RESOURCE_BUSY;
    }

    if (MCA_BTL_SCIF_EP_STATE_INIT != ep->state) {
        /* nothing to start */
        return OMPI_SUCCESS;
    }

    /* not connected and nobody is connecting: start the active side */
    return mca_btl_scif_ep_connect_start_active (ep);
}

101
ompi/mca/btl/scif/btl_scif_endpoint.h Обычный файл
Просмотреть файл

@ -0,0 +1,101 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_SCIF_ENDPOINT_H
#define MCA_BTL_SCIF_ENDPOINT_H
#include "btl_scif.h"
/* Connection state of an endpoint. */
typedef enum mca_btl_scif_endpoint_state_t {
/* no connection and no attempt in progress; the state set at init time and
 * restored after a failed connection attempt */
MCA_BTL_SCIF_EP_STATE_INIT,
/* an active or passive connection attempt is in progress */
MCA_BTL_SCIF_EP_STATE_CONNECTING,
/* the connection handshake completed successfully */
MCA_BTL_SCIF_EP_STATE_CONNECTED
} mca_btl_scif_endpoint_state_t;
/* Bookkeeping for one direction (send or receive) of an endpoint's circular
 * message buffer. */
typedef struct mca_btl_scif_endpoint_buffer_t {
/* base address of the buffer in this process' address space */
unsigned char *buffer;
/* scif offset of the buffer; the two sides exchange these when connecting */
off_t scif_offset;
/* local copies of the circular buffer's start and end positions */
unsigned int start, end;
/* pointers to the start/end words kept inside the buffer itself (for the
 * send buffer these are the first two 32-bit words of the mapped region) */
uint32_t *startp, *endp;
} mca_btl_scif_endpoint_buffer_t;
/* Per-peer state of the scif BTL: the scif connection to the peer, the
 * circular buffers used to exchange messages and the connection state. */
typedef struct mca_btl_base_endpoint_t {
opal_list_item_t super;
/* module this endpoint belongs to */
mca_btl_scif_module_t *btl;
/* location in the module endpoints array */
int id;
/* held while a connection attempt (active or passive) is in progress */
opal_mutex_t lock;
/* scif endpoint */
scif_epd_t scif_epd;
/* connection information (the peer's node/port, taken from its modex data) */
struct scif_portID port_id;
/* buffer information */
mca_btl_scif_endpoint_buffer_t send_buffer;
mca_btl_scif_endpoint_buffer_t recv_buffer;
/* current connect state */
mca_btl_scif_endpoint_state_t state;
/* frags waiting for resources */
opal_list_t frag_wait_list;
/* associated process */
ompi_proc_t *peer_proc;
#if defined(SCIF_USE_SEQ)
/* sequence number for the next outgoing fragment and the one expected next
 * from the peer (only present when SCIF_USE_SEQ is defined) */
uint32_t seq_next;
uint32_t seq_expected;
#endif
} mca_btl_base_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_base_endpoint_t);
/* Ensure a connection to the endpoint's peer exists or is being made.
 * Returns OMPI_SUCCESS, OMPI_ERR_RESOURCE_BUSY while a connection is in
 * progress, or an error from the connection attempt. */
int mca_btl_scif_ep_connect (mca_btl_base_endpoint_t *ep);
/* Accept one pending connection on the module's listening endpoint and
 * complete the passive side of the handshake. */
int mca_btl_scif_ep_connect_start_passive (void);
/**
 * Construct an endpoint and fill it in from the peer's modex data.
 *
 * @param endpoint   (IN/OUT) storage for the endpoint; constructed here
 * @param btl        (IN)     module the endpoint belongs to
 * @param peer_proc  (IN)     process this endpoint talks to
 *
 * @returns OMPI_SUCCESS, or the error returned by ompi_modex_recv() when the
 *          peer's connection information is not available. In the error case
 *          the endpoint has been constructed and is left in the INIT state,
 *          but its port_id, peer_proc and btl fields are not set.
 */
static inline int mca_btl_scif_ep_init (mca_btl_base_endpoint_t *endpoint,
                                        mca_btl_scif_module_t *btl,
                                        ompi_proc_t *peer_proc) {
    mca_btl_scif_modex_t *modex = NULL;
    size_t msg_size = 0;
    int rc;

    OBJ_CONSTRUCT(endpoint, mca_btl_base_endpoint_t);
    endpoint->state = MCA_BTL_SCIF_EP_STATE_INIT;

    rc = ompi_modex_recv (&mca_btl_scif_component.super.btl_version, peer_proc,
                          (void **) &modex, &msg_size);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc || NULL == modex)) {
        /* without modex data there is no port to connect to; do not touch
         * the (unset) modex pointer */
        return (OMPI_SUCCESS != rc) ? rc : OMPI_ERROR;
    }

    assert (msg_size == sizeof (endpoint->port_id));

    endpoint->port_id = modex->port_id;
    endpoint->peer_proc = peer_proc;
    endpoint->btl = btl;

#if defined(SCIF_USE_SEQ)
    /* both directions start from the same arbitrary sequence number */
    endpoint->seq_next = 0x00001010;
    endpoint->seq_expected = 0x00001010;
#endif

    return OMPI_SUCCESS;
}
/**
 * Tear down an endpoint previously set up with mca_btl_scif_ep_init() by
 * running its class destructor. The storage itself is not freed here.
 *
 * @returns OMPI_SUCCESS always
 */
static inline int mca_btl_scif_ep_release (mca_btl_base_endpoint_t *ep)
{
OBJ_DESTRUCT(ep);
return OMPI_SUCCESS;
}
#endif /* MCA_BTL_SCIF_ENDPOINT_H */

31
ompi/mca/btl/scif/btl_scif_frag.c Обычный файл
Просмотреть файл

@ -0,0 +1,31 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_scif.h"
#include "btl_scif_frag.h"
/* Class constructor for scif fragments: zero every member that follows the
 * embedded base descriptor, then point the first segment at the buffer
 * attached to the free-list item (frag->base.super.ptr). */
static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag)
{
/* skip over frag->base so the descriptor/free-list state is preserved */
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
}
/* Constructor intended for eager fragments. Its body is identical to
 * mca_btl_scif_base_frag_constructor.
 * NOTE(review): neither OBJ_CLASS_INSTANCE in this file references this
 * function (both use the base constructor), so it appears to be unused. */
static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag)
{
/* zero everything after the embedded base descriptor */
memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base));
frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
}
/* Class definitions for the two fragment flavours. Both derive from the base
 * descriptor class, share the same constructor and have no destructor. */
OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t,
mca_btl_scif_base_frag_constructor, NULL);
OBJ_CLASS_INSTANCE(mca_btl_scif_dma_frag_t, mca_btl_base_descriptor_t,
mca_btl_scif_base_frag_constructor, NULL);

108
ompi/mca/btl/scif/btl_scif_frag.h Обычный файл
Просмотреть файл

@ -0,0 +1,108 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(MCA_BTL_SCIF_FRAG_H)
#define MCA_BTL_SCIF_FRAG_H
#include "btl_scif.h"
#include "btl_scif_endpoint.h"
/* BTL segment extended with the information needed to address the memory
 * through scif RMA operations. */
typedef struct mca_btl_scif_segment_t {
/* generic segment (address and length); must be the first member so the
 * structure can be used where a base segment is expected */
mca_btl_base_segment_t base;
/* scif offset */
off_t scif_offset;
/* original pointer (the address the segment had when it was prepared) */
uint64_t orig_ptr;
} mca_btl_scif_segment_t;
/* Header written in front of every fragment placed in a peer's circular
 * buffer. It is 4 bytes, or 8 bytes when SCIF_USE_SEQ is defined. */
typedef struct mca_btl_scif_frag_hdr_t {
#if defined(SCIF_USE_SEQ)
/* sequence number of the fragment */
uint32_t seq;
#endif
/* message tag (the send path uses 0xff for padding at the end of the buffer) */
uint8_t tag;
uint8_t flags;
/* size of the data following the header */
uint16_t size;
} mca_btl_scif_frag_hdr_t;
struct mca_btl_scif_base_frag_t;
/* signature of a fragment callback: the fragment and a status code */
typedef void (*frag_cb_t) (struct mca_btl_scif_base_frag_t *, int);
/* Fragment (descriptor) used by the scif BTL for both eager and DMA
 * transfers. */
typedef struct mca_btl_scif_base_frag_t {
/* base descriptor; must stay first (the constructors rely on it) */
mca_btl_base_descriptor_t base;
/* header sent in front of the fragment data */
mca_btl_scif_frag_hdr_t hdr;
/* segment 0 and, for in-place sends, segment 1 holding the user data */
mca_btl_scif_segment_t segments[2];
/* endpoint the fragment was allocated for */
mca_btl_base_endpoint_t *endpoint;
/* registration owned by this fragment (deregistered when it is returned),
 * or NULL */
mca_btl_scif_reg_t *registration;
/* free list the fragment is returned to */
ompi_free_list_t *my_list;
} mca_btl_scif_base_frag_t;
typedef mca_btl_scif_base_frag_t mca_btl_scif_dma_frag_t;
typedef mca_btl_scif_base_frag_t mca_btl_scif_eager_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_scif_dma_frag_t);
OBJ_CLASS_DECLARATION(mca_btl_scif_eager_frag_t);
/**
 * Take a fragment from a free list and bind it to an endpoint.
 *
 * @param ep    (IN)  endpoint the fragment will be used with
 * @param list  (IN)  free list to allocate from
 * @param frag  (OUT) the fragment, or NULL if the list is exhausted
 *
 * @returns OMPI_SUCCESS or OMPI_ERR_OUT_OF_RESOURCE
 */
static inline int mca_btl_scif_frag_alloc (mca_btl_base_endpoint_t *ep,
                                           ompi_free_list_t *list,
                                           mca_btl_scif_base_frag_t **frag)
{
    ompi_free_list_item_t *entry = NULL;

    OMPI_FREE_LIST_GET_MT(list, entry);
    *frag = (mca_btl_scif_base_frag_t *) entry;

    if (OPAL_UNLIKELY(NULL == entry)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* remember where the fragment came from so it can be returned later */
    (*frag)->my_list = list;
    (*frag)->endpoint = ep;

    return OMPI_SUCCESS;
}
/**
 * Give a fragment back to the free list it was allocated from.
 *
 * Any registration owned by the fragment is released through the module's
 * mpool first, and the segments are reset to their freshly-constructed state.
 *
 * @returns OMPI_SUCCESS always
 */
static inline int mca_btl_scif_frag_return (mca_btl_scif_base_frag_t *frag)
{
    if (frag->registration) {
        struct mca_btl_base_module_t *base_btl = &frag->endpoint->btl->super;

        base_btl->btl_mpool->mpool_deregister(base_btl->btl_mpool,
                                              &frag->registration->base);
        frag->registration = NULL;
    }

    /* segment 0 points back at the fragment's own buffer; both are empty */
    frag->segments[0].base.seg_addr.pval = frag->base.super.ptr;
    frag->segments[0].base.seg_len = 0;
    frag->segments[1].base.seg_len = 0;

    OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag);

    return OMPI_SUCCESS;
}
/**
 * Finish a fragment: run the completion callback when the descriptor asks for
 * it, then return the fragment if the BTL owns it.
 *
 * @param frag  fragment that completed
 * @param rc    completion status handed to the callback
 */
static inline void mca_btl_scif_frag_complete (mca_btl_scif_base_frag_t *frag, int rc) {
    mca_btl_base_descriptor_t *des = &frag->base;

    BTL_VERBOSE(("frag complete. flags = %d", des->des_flags));

    /* call callback if specified */
    if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
        des->des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, des, rc);
    }

    /* the flags are read again after the callback has run */
    if (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) {
        mca_btl_scif_frag_return (frag);
    }
}
/* Allocate a fragment for endpoint `ep` from its module's eager free list.
 * `frag` must be an lvalue; it is set to NULL if the list is exhausted. */
#define MCA_BTL_SCIF_FRAG_ALLOC_EAGER(ep, frag) \
mca_btl_scif_frag_alloc((ep), &(ep)->btl->eager_frags, &(frag))
/* Same as above, but allocating from the module's DMA free list. */
#define MCA_BTL_SCIF_FRAG_ALLOC_DMA(ep, frag) \
mca_btl_scif_frag_alloc((ep), &(ep)->btl->dma_frags, &(frag))
#endif /* MCA_BTL_SCIF_FRAG_H */

82
ompi/mca/btl/scif/btl_scif_get.c Обычный файл
Просмотреть файл

@ -0,0 +1,82 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "btl_scif_frag.h"
#include <sys/time.h>
/* Smaller of two values. Each argument may be evaluated twice, so do not
 * pass expressions with side effects. */
#define lmin(a,b) ((a) < (b) ? (a) : (b))
/**
 * Initiate a get operation.
 *
 * The transfer is performed with scif_readfrom() and is complete when this
 * function returns successfully: unless SCIF_RMA_SYNC was requested the
 * function waits on a fence before invoking the descriptor's callback.
 *
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL addressing information
 * @param des (IN) Description of the data to be transferred
 *
 * @returns OMPI_SUCCESS if the data was transferred and the descriptor was
 *          completed, OMPI_ERROR if the read could not be started or its
 *          completion could not be confirmed (the callback is not invoked).
 */
int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
                      struct mca_btl_base_endpoint_t *endpoint,
                      struct mca_btl_base_descriptor_t *des) {
    mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_src;
    mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_dst;
    size_t len = lmin (src->base.seg_len, dst->base.seg_len);
    int rc, mark, flags = 0;
    off_t roffset, loffset;
#if defined(SCIF_TIMING)
    struct timespec ts;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);

    mca_btl_scif_component.get_count++;
#endif

    BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des,
                 (unsigned long) src->scif_offset));

    /* adjust the registered offsets by the distance between the original
     * pointer and the segment's current address */
    roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
    loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);

    if (mca_btl_scif_component.rma_use_cpu) {
        flags = SCIF_RMA_USECPU;
    }

    if (mca_btl_scif_component.rma_sync) {
        flags |= SCIF_RMA_SYNC;
    }

    /* start the read */
    rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, flags);
    if (OPAL_UNLIKELY(-1 == rc)) {
        return OMPI_ERROR;
    }

    if (!(flags & SCIF_RMA_SYNC)) {
        /* according to the scif documentation it is better to use a fence rather
         * than using the SCIF_RMA_SYNC flag with scif_readfrom */
        rc = scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
        if (OPAL_LIKELY(-1 != rc)) {
            /* mark is only valid when scif_fence_mark succeeded */
            rc = scif_fence_wait (endpoint->scif_epd, mark);
        }

        if (OPAL_UNLIKELY(-1 == rc)) {
            /* completion is unknown: do not report the descriptor as done */
            BTL_VERBOSE(("error waiting for scif_readfrom to complete"));
            return OMPI_ERROR;
        }
    }

    /* always call the callback function */
    des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

#if defined(SCIF_TIMING)
    SCIF_UPDATE_TIMER(mca_btl_scif_component.get_time,
                      mca_btl_scif_component.get_time_max, ts);
#endif

    /* since we completed the fence the RMA operation is complete */
    mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OMPI_SUCCESS);

    return OMPI_SUCCESS;
}

368
ompi/mca/btl/scif/btl_scif_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,368 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "btl_scif.h"
#include "btl_scif_frag.h"
#include "btl_scif_endpoint.h"
/* Forward declarations of the file-local entry points referenced by the
 * module structure below. */
static int
mca_btl_scif_free (struct mca_btl_base_module_t *btl,
mca_btl_base_descriptor_t *des);
static int
mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl);
static mca_btl_base_descriptor_t *
mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
opal_convertor_t *convertor, uint8_t order,
size_t reserve, size_t *size, uint32_t flags);
static struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
mca_mpool_base_registration_t *registration,
struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags);
/* The single scif BTL module instance. Only the function table is filled in
 * statically; members not named here are zero-initialized. The optional
 * entry points (register, dump, error registration, ft_event) are not
 * provided and the mpool pointer starts out NULL. */
mca_btl_scif_module_t mca_btl_scif_module = {
.super = {
.btl_component = &mca_btl_scif_component.super,
.btl_add_procs = mca_btl_scif_add_procs,
.btl_del_procs = mca_btl_scif_del_procs,
.btl_register = NULL,
.btl_finalize = mca_btl_scif_module_finalize,
.btl_alloc = mca_btl_scif_alloc,
.btl_free = mca_btl_scif_free,
.btl_prepare_src = mca_btl_scif_prepare_src,
.btl_prepare_dst = mca_btl_scif_prepare_dst,
.btl_send = mca_btl_scif_send,
.btl_sendi = mca_btl_scif_sendi,
.btl_put = mca_btl_scif_put,
.btl_get = mca_btl_scif_get,
.btl_dump = NULL,
.btl_mpool = NULL,
.btl_register_error = NULL,
.btl_ft_event = NULL,
}
};
/**
 * Initialize the scif module: open the listening endpoint, bind it to a
 * port chosen by scif, look up the local node id, start listening and
 * construct the fragment free lists.
 *
 * @returns OMPI_SUCCESS, or OMPI_ERROR if any scif call fails. On failure the
 *          listening endpoint is closed and scif_fd is reset to -1.
 */
int mca_btl_scif_module_init (void)
{
    int rc;

    /* create an endpoint to listen for connections */
    mca_btl_scif_module.scif_fd = scif_open ();
    if (-1 == mca_btl_scif_module.scif_fd) {
        BTL_VERBOSE(("scif_open failed. errno = %d", errno));
        return OMPI_ERROR;
    }

    /* bind the endpoint to a port. the result is checked as an int before it
     * is stored: the port field of struct scif_portID is an unsigned 16-bit
     * value, so comparing it against -1 after the assignment can never
     * detect a failure. */
    rc = scif_bind (mca_btl_scif_module.scif_fd, 0);
    if (-1 == rc) {
        BTL_VERBOSE(("scif_bind failed. errno = %d", errno));
        goto fail;
    }
    mca_btl_scif_module.port_id.port = rc;

    /* determine this processes node id */
    rc = scif_get_nodeIDs (NULL, 0, &mca_btl_scif_module.port_id.node);
    if (-1 == rc) {
        BTL_VERBOSE(("btl/scif error getting node id of this node"));
        goto fail;
    }

    /* Listen for connections */
    /* TODO - base the maximum backlog off something */
    rc = scif_listen (mca_btl_scif_module.scif_fd, 64);
    if (-1 == rc) {
        BTL_VERBOSE(("scif_listen failed. errno = %d", errno));
        goto fail;
    }

    BTL_VERBOSE(("btl/scif: listening @ port %u on node %u\n",
                 mca_btl_scif_module.port_id.port, mca_btl_scif_module.port_id.node));

    OBJ_CONSTRUCT(&mca_btl_scif_module.dma_frags, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_scif_module.eager_frags, ompi_free_list_t);

    return OMPI_SUCCESS;

fail:
    /* every failure after scif_open must release the listening endpoint */
    scif_close (mca_btl_scif_module.scif_fd);
    mca_btl_scif_module.scif_fd = -1;

    return OMPI_ERROR;
}
/**
 * Shut the module down: destroy the fragment free lists, release every
 * endpoint and close the listening scif endpoint.
 *
 * @param btl  the scif module being finalized
 *
 * @returns OMPI_SUCCESS always
 */
static int
mca_btl_scif_module_finalize (struct mca_btl_base_module_t *btl)
{
    mca_btl_scif_module_t *module = (mca_btl_scif_module_t *) btl;

    OBJ_DESTRUCT(&mca_btl_scif_module.dma_frags);
    OBJ_DESTRUCT(&mca_btl_scif_module.eager_frags);

    /* close all open connections and release endpoints */
    if (NULL != module->endpoints) {
        unsigned int idx = 0;

        while (idx < module->endpoint_count) {
            mca_btl_scif_ep_release (&module->endpoints[idx]);
            ++idx;
        }

        free (module->endpoints);

        module->endpoint_count = 0;
        module->endpoints = NULL;
    }

    /* close the listening endpoint */
    if (-1 != mca_btl_scif_module.scif_fd) {
        scif_close (mca_btl_scif_module.scif_fd);
    }

    mca_btl_scif_module.scif_fd = -1;

    return OMPI_SUCCESS;
}
/**
 * Allocate a descriptor with a single segment of the requested size.
 *
 * Only sizes up to the module's eager limit can be satisfied; the descriptor
 * comes from the eager free list.
 *
 * @param btl       (IN) BTL module
 * @param endpoint  (IN) endpoint the descriptor will be used with
 * @param order     (IN) ordering tag stored in the descriptor
 * @param size      (IN) number of bytes requested
 * @param flags     (IN) descriptor flags
 *
 * @returns the descriptor, or NULL if size exceeds the eager limit or no
 *          fragment is available
 */
mca_btl_base_descriptor_t *
mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
                   struct mca_btl_base_endpoint_t *endpoint,
                   uint8_t order, size_t size, uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag = NULL;

    BTL_VERBOSE(("allocating fragment of size: %u", (unsigned int)size));

    if (size > mca_btl_scif_module.super.btl_eager_limit) {
        /* too large for an eager fragment */
        return NULL;
    }

    (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    BTL_VERBOSE(("btl/scif_module allocated frag of size: %u, flags: %x. frag = %p",
                 (unsigned int)size, flags, (void *) frag));

    frag->segments[0].base.seg_len = size;

    /* the one segment serves as both source and destination */
    frag->base.des_src = &frag->segments[0].base;
    frag->base.des_src_cnt = 1;
    frag->base.des_dst = &frag->segments[0].base;
    frag->base.des_dst_cnt = 1;

    frag->base.des_flags = flags;
    frag->base.order = order;

    return &frag->base;
}
/**
 * Release a descriptor obtained from mca_btl_scif_alloc() or one of the
 * prepare functions by returning its fragment to the owning free list.
 *
 * @returns the result of mca_btl_scif_frag_return()
 */
static int
mca_btl_scif_free (struct mca_btl_base_module_t *btl,
mca_btl_base_descriptor_t *des)
{
return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des);
}
/**
 * Common part of preparing a descriptor for an RMA (put/get) transfer.
 *
 * Makes sure the endpoint is connected, allocates a DMA fragment, obtains a
 * memory registration for the user buffer (creating one through the mpool if
 * the caller did not supply one) and registers that memory with the peer's
 * scif endpoint if that has not been done yet.
 *
 * @param btl           (IN)  BTL module
 * @param endpoint      (IN)  peer endpoint
 * @param data_ptr      (IN)  start of the user data
 * @param size          (IN)  length of the user data
 * @param registration  (IN)  existing registration covering the data, or NULL
 * @param order         (IN)  ordering tag stored in the descriptor
 * @param flags         (IN)  descriptor flags
 * @param frag_out      (OUT) the prepared fragment (set only on success)
 *
 * @returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the endpoint is not
 *          connected yet, no fragment is available or a registration fails
 */
static inline int mca_btl_scif_prepare_dma (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
void *data_ptr, size_t size,
mca_mpool_base_registration_t *registration,
uint8_t order, uint32_t flags,
mca_btl_scif_base_frag_t **frag_out)
{
mca_btl_scif_base_frag_t *frag;
mca_btl_scif_reg_t *scif_reg;
int rc;
if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
/* the endpoint needs to be connected before the fragment can be
 * registered. */
rc = mca_btl_scif_ep_connect (endpoint);
/* the return code is not inspected; the endpoint state decides */
if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
/* not yet connected */
return OMPI_ERR_OUT_OF_RESOURCE;
}
}
(void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
if (NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, size, 0,
(mca_mpool_base_registration_t **) &registration);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
mca_btl_scif_frag_return (frag);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* the fragment owns registrations created here: they are released
 * again by mca_btl_scif_frag_return() */
frag->registration = (mca_btl_scif_reg_t *) registration;
}
scif_reg = (mca_btl_scif_reg_t *) registration;
/* register the memory location with this peer if it isn't already */
if ((off_t) -1 == scif_reg->registrations[endpoint->id]) {
/* register the whole [base, bound] range of the registration */
size_t seg_size = (size_t)((uintptr_t) registration->bound - (uintptr_t) registration->base) + 1;
scif_reg->registrations[endpoint->id] = scif_register (endpoint->scif_epd, registration->base,
seg_size, 0, SCIF_PROT_READ |
SCIF_PROT_WRITE, 0);
BTL_VERBOSE(("registered fragment for scif DMA transaction. offset = %lu",
(unsigned long) scif_reg->registrations[endpoint->id]));
}
/* still -1: scif_register failed */
if (OPAL_UNLIKELY((off_t) -1 == scif_reg->registrations[endpoint->id])) {
mca_btl_scif_frag_return (frag);
return OMPI_ERR_OUT_OF_RESOURCE;
}
frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr;
frag->segments[0].base.seg_len = size;
/* scif offset of the data = offset of the registration + distance of the
 * data from the start of the registered range */
frag->segments[0].scif_offset = scif_reg->registrations[endpoint->id] +
(off_t) ((ptrdiff_t) data_ptr - (ptrdiff_t) registration->base);
/* save the original pointer so the offset can be adjusted if needed (this is
 * required for osc/rdma) */
frag->segments[0].orig_ptr = (uint64_t)(uintptr_t) data_ptr;
frag->base.order = order;
frag->base.des_flags = flags;
*frag_out = frag;
return OMPI_SUCCESS;
}
/**
 * Prepare a source descriptor for an RMA transfer of the data at the
 * convertor's current position.
 *
 * @returns the descriptor with a single source segment, or NULL if the
 *          common DMA preparation failed
 */
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src_dma (struct mca_btl_base_module_t *btl,
                                                                mca_btl_base_endpoint_t *endpoint,
                                                                mca_mpool_base_registration_t *registration,
                                                                struct opal_convertor_t *convertor,
                                                                uint8_t order, size_t *size,
                                                                uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag;
    void *user_data;

    opal_convertor_get_current_pointer (convertor, &user_data);

    if (OMPI_SUCCESS != mca_btl_scif_prepare_dma (btl, endpoint, user_data, *size,
                                                  registration, order, flags, &frag)) {
        return NULL;
    }

    /* the prepared memory is the source of the transfer */
    frag->base.des_src_cnt = 1;
    frag->base.des_src = &frag->segments[0].base;

    return &frag->base;
}
/**
 * Prepare a source descriptor for a send (a message with a header of
 * `reserve` bytes in front of the user data).
 *
 * If in-place sends are enabled, the data is contiguous and the header is at
 * most 128 bytes, the user data is referenced directly as a second segment.
 * Otherwise the data is packed into an eager fragment right behind the space
 * reserved for the header.
 *
 * @param btl        (IN)     BTL module
 * @param endpoint   (IN)     peer endpoint
 * @param convertor  (IN)     describes the user data
 * @param order      (IN)     ordering tag stored in the descriptor
 * @param reserve    (IN)     bytes to reserve for the upper layer's header
 * @param size       (IN/OUT) bytes requested; on return from the buffered
 *                            path, the number of bytes actually packed
 * @param flags      (IN)     descriptor flags
 *
 * @returns the descriptor, or NULL if no fragment is available or packing
 *          failed
 */
static inline struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
                               mca_btl_base_endpoint_t *endpoint,
                               struct opal_convertor_t *convertor,
                               uint8_t order, size_t reserve, size_t *size,
                               uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag = NULL;
    uint32_t iov_count = 1;
    struct iovec iov;
    size_t max_size = *size;
    int rc;

    if (OPAL_LIKELY((mca_btl_scif_module.super.btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) &&
                    !opal_convertor_need_buffers (convertor) &&
                    reserve <= 128)) {
        /* inplace send */
        void *data_ptr;
        opal_convertor_get_current_pointer (convertor, &data_ptr);

        (void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        /* segment 0 holds the header, segment 1 references the user data */
        frag->segments[0].base.seg_len = reserve;
        frag->segments[1].base.seg_addr.pval = data_ptr;
        frag->segments[1].base.seg_len = *size;
        frag->base.des_src_cnt = 2;
    } else {
        /* buffered send */
        (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
        if (OPAL_UNLIKELY(NULL == frag)) {
            return NULL;
        }

        if (*size) {
            iov.iov_len = *size;
            iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].base.seg_addr.pval + reserve);

            rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size);
            if (OPAL_UNLIKELY(rc < 0)) {
                mca_btl_scif_frag_return (frag);
                return NULL;
            }

            /* the convertor reports how much it really packed; hand that
             * back to the caller and use it for the segment length */
            *size = max_size;
        }

        frag->segments[0].base.seg_len = reserve + *size;
        frag->base.des_src_cnt = 1;
    }

    frag->base.des_src = &frag->segments->base;
    frag->base.order = order;
    frag->base.des_flags = flags;

    return &frag->base;
}
/**
 * BTL prepare_src entry point. A request with a non-zero header reservation
 * is prepared as a send; a request without one is prepared as the source of
 * an RMA transfer.
 */
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl,
                                                            mca_btl_base_endpoint_t *endpoint,
                                                            mca_mpool_base_registration_t *registration,
                                                            struct opal_convertor_t *convertor,
                                                            uint8_t order, size_t reserve, size_t *size,
                                                            uint32_t flags)
{
    if (OPAL_UNLIKELY(0 == reserve)) {
        /* no header space requested: RMA source */
        return mca_btl_scif_prepare_src_dma (btl, endpoint, registration,
                                             convertor, order, size, flags);
    }

    return mca_btl_scif_prepare_src_send (btl, endpoint, convertor,
                                          order, reserve, size, flags);
}
/**
 * BTL prepare_dst entry point: prepare the memory at the convertor's current
 * position as the destination of an RMA transfer. The reserve argument is
 * not used.
 *
 * @returns the descriptor with a single destination segment, or NULL if the
 *          common DMA preparation failed
 */
static mca_btl_base_descriptor_t *mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl,
                                                            mca_btl_base_endpoint_t *endpoint,
                                                            mca_mpool_base_registration_t *registration,
                                                            opal_convertor_t *convertor, uint8_t order,
                                                            size_t reserve, size_t *size, uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag;
    void *user_data;

    opal_convertor_get_current_pointer (convertor, &user_data);

    if (OMPI_SUCCESS != mca_btl_scif_prepare_dma (btl, endpoint, user_data, *size,
                                                  registration, order, flags, &frag)) {
        return NULL;
    }

    /* the prepared memory is the destination of the transfer */
    frag->base.des_dst_cnt = 1;
    frag->base.des_dst = &frag->segments[0].base;

    return &frag->base;
}

168
ompi/mca/btl/scif/btl_scif_prepare.h Обычный файл
Просмотреть файл

@ -0,0 +1,168 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(MCA_BTL_SCIF_PREPARE_H)
#define MCA_BTL_SCIF_PREPARE_H
#include "ompi_config.h"
#include "btl_scif.h"
#include "btl_scif_frag.h"
/**
 * Prepare a send descriptor that references the user data in place (header in
 * segment 0, user data in segment 1).
 *
 * NOTE(review): this function uses names that are not declared in the
 * btl_scif_frag.h shown alongside it (MCA_BTL_SCIF_FRAG_ALLOC_RDMA,
 * frag->flags, frag->hdr_size, hdr.eager/hdr.send, segment memory_handle,
 * component smsg_max_data), and this header is not listed in the component's
 * Makefile.am sources. It looks like a leftover from another BTL - confirm
 * whether this file is meant to be compiled at all.
 */
static inline struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src_send_inplace (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags)
{
/* messages larger than smsg_max_data take the "eager get" path */
bool use_eager_get = (*size + reserve) > mca_btl_scif_component.smsg_max_data;
mca_btl_scif_base_frag_t *frag = NULL;
mca_btl_scif_reg_t *registration = NULL;
void *data_ptr;
int rc;
opal_convertor_get_current_pointer (convertor, &data_ptr);
(void) MCA_BTL_SCIF_FRAG_ALLOC_RDMA(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
BTL_VERBOSE(("preparing src for send fragment. size = %u",
(unsigned int)(*size + reserve)));
if (OPAL_UNLIKELY(true == use_eager_get)) {
/* the user buffer is registered so it can be referenced through its
 * memory handle */
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, *size, 0,
(mca_mpool_base_registration_t **)&registration);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
mca_btl_scif_frag_return (frag);
return NULL;
}
frag->flags = MCA_BTL_SCIF_FRAG_EAGER | MCA_BTL_SCIF_FRAG_IGNORE;
frag->registration = registration;
frag->segments[1].memory_handle = registration->memory_hdl;
frag->hdr_size = reserve + sizeof (frag->hdr.eager);
frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header;
} else {
frag->hdr_size = reserve + sizeof (frag->hdr.send);
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
}
frag->segments[0].base.seg_len = reserve;
frag->segments[1].base.seg_addr.pval = data_ptr;
frag->segments[1].base.seg_len = *size;
frag->base.des_src = &frag->segments->base;
frag->base.des_src_cnt = 2;
frag->base.order = order;
frag->base.des_flags = flags;
return &frag->base;
}
/**
 * Prepare a send descriptor whose user data is packed into the fragment's own
 * buffer (header in segment 0, packed data in segment 1).
 *
 * NOTE(review): like the in-place variant above, this function relies on
 * names that the scif fragment header shown with it does not declare
 * (MCA_BTL_SCIF_FRAG_ALLOC_EAGER_SEND, MCA_BTL_SCIF_FRAG_ALLOC_SMSG,
 * frag->flags, frag->hdr_size, ...) - confirm whether it is live code.
 */
static inline struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src_send_buffered (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags)
{
bool use_eager_get = (*size + reserve) > mca_btl_scif_component.smsg_max_data;
mca_btl_scif_reg_t *registration = NULL;
mca_btl_scif_base_frag_t *frag = NULL;
uint32_t iov_count = 1;
struct iovec iov;
size_t max_size = *size;
int rc;
if (OPAL_UNLIKELY(true == use_eager_get)) {
(void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER_SEND(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
frag->flags = MCA_BTL_SCIF_FRAG_EAGER | MCA_BTL_SCIF_FRAG_IGNORE;
/* use the registration that comes with the free-list item */
registration = (mca_btl_scif_reg_t *) frag->base.super.registration;
frag->segments[1].memory_handle = registration->memory_hdl;
frag->hdr_size = reserve + sizeof (frag->hdr.eager);
frag->segments[0].base.seg_addr.pval = frag->hdr.eager_ex.pml_header;
} else {
(void) MCA_BTL_SCIF_FRAG_ALLOC_SMSG(endpoint, frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
}
frag->hdr_size = reserve + sizeof (frag->hdr.send);
frag->segments[0].base.seg_addr.pval = frag->hdr.send_ex.pml_header;
}
frag->flags |= MCA_BTL_SCIF_FRAG_BUFFERED;
if (*size) {
/* pack the user data into the fragment's buffer */
iov.iov_len = *size;
iov.iov_base = (IOVBASE_TYPE *) frag->base.super.ptr;
rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size);
if (OPAL_UNLIKELY(rc < 0)) {
mca_btl_scif_frag_return (frag);
return NULL;
}
}
frag->segments[0].base.seg_len = reserve;
frag->segments[1].base.seg_addr.pval = frag->base.super.ptr;
frag->segments[1].base.seg_len = *size;
frag->base.des_src = &frag->segments->base;
frag->base.des_src_cnt = 2;
frag->base.order = order;
frag->base.des_flags = flags;
return &frag->base;
}
/**
 * Choose between the in-place and the buffered send preparation.
 *
 * The data is sent in place unless the convertor needs intermediate buffers,
 * or the message takes the "eager get" path and the user pointer is not
 * 4-byte aligned.
 *
 * NOTE(review): btl_scif_module.c defines a static function with this same
 * name; confirm the two are never visible in the same translation unit.
 */
static inline struct mca_btl_base_descriptor_t *
mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl,
mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
uint8_t order, size_t reserve, size_t *size,
uint32_t flags)
{
bool use_eager_get = (*size + reserve) > mca_btl_scif_component.smsg_max_data;
bool send_in_place;
void *data_ptr;
opal_convertor_get_current_pointer (convertor, &data_ptr);
/* (uintptr_t)data_ptr & 3 is non-zero for pointers that are not 4-byte aligned */
send_in_place = !(opal_convertor_need_buffers(convertor) ||
(use_eager_get && ((uintptr_t)data_ptr & 3)));
if (send_in_place) {
return mca_btl_scif_prepare_src_send_inplace (btl, endpoint, convertor, order,
reserve, size, flags);
} else {
return mca_btl_scif_prepare_src_send_buffered (btl, endpoint, convertor, order,
reserve, size, flags);
}
}
#endif

78
ompi/mca/btl/scif/btl_scif_put.c Обычный файл
Просмотреть файл

@ -0,0 +1,78 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "btl_scif_frag.h"
/* Smaller of two values. Each argument may be evaluated twice, so do not
 * pass expressions with side effects. */
#define lmin(a,b) ((a) < (b) ? (a) : (b))
/**
 * Initiate a put operation.
 *
 * The transfer is performed with scif_writeto() and is complete when this
 * function returns successfully: unless SCIF_RMA_SYNC was requested the
 * function waits on a fence before invoking the descriptor's callback.
 *
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL addressing information
 * @param des (IN) Description of the data to be transferred
 *
 * @returns OMPI_SUCCESS if the data was transferred and the descriptor was
 *          completed, OMPI_ERROR if the write could not be started or its
 *          completion could not be confirmed (the callback is not invoked).
 */
int mca_btl_scif_put (struct mca_btl_base_module_t *btl,
                      struct mca_btl_base_endpoint_t *endpoint,
                      struct mca_btl_base_descriptor_t *des) {
    mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_src;
    mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_dst;
    size_t len = lmin (src->base.seg_len, dst->base.seg_len);
    int rc, mark, flags = 0;
    off_t roffset, loffset;
#if defined(SCIF_TIMING)
    struct timespec ts;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);

    mca_btl_scif_component.put_count++;
#endif

    BTL_VERBOSE(("Using DMA Put for frag %p", (void *) des));

    /* adjust the registered offsets by the distance between the original
     * pointer and the segment's current address */
    roffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
    loffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);

    if (mca_btl_scif_component.rma_use_cpu) {
        flags = SCIF_RMA_USECPU;
    }

    if (mca_btl_scif_component.rma_sync) {
        flags |= SCIF_RMA_SYNC;
    }

    /* start the write */
    rc = scif_writeto (endpoint->scif_epd, loffset, len, roffset, flags);
    if (OPAL_UNLIKELY(-1 == rc)) {
        return OMPI_ERROR;
    }

    /* according to the scif documentation it is better to use a fence rather
     * than using the SCIF_RMA_SYNC flag with scif_writeto */
    if (!(flags & SCIF_RMA_SYNC)) {
        rc = scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
        if (OPAL_LIKELY(-1 != rc)) {
            /* mark is only valid when scif_fence_mark succeeded */
            rc = scif_fence_wait (endpoint->scif_epd, mark);
        }

        if (OPAL_UNLIKELY(-1 == rc)) {
            /* completion is unknown: do not report the descriptor as done */
            BTL_VERBOSE(("error waiting for scif_writeto to complete"));
            return OMPI_ERROR;
        }
    }

    /* always call the callback function */
    des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

#if defined(SCIF_TIMING)
    SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time,
                      mca_btl_scif_component.put_time_max, ts);
#endif

    /* since we completed the fence the RMA operation is complete */
    mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OMPI_SUCCESS);

    return OMPI_SUCCESS;
}

301
ompi/mca/btl/scif/btl_scif_send.c Обычный файл
Просмотреть файл

@ -0,0 +1,301 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_scif.h"
#include "btl_scif_frag.h"
/* Contiguous free space, in bytes, at position `e` of the circular buffer.
 *   s   - start offset (high bit already masked off)
 *   e   - end offset (high bit already masked off)
 *   hbm - non-zero when the high bits of start and end match
 * When start is ahead of end, or both are equal with differing high bits,
 * the result is s - e (0 in the latter case); otherwise it is the space up
 * to the end of the segment. Arguments may be evaluated more than once. */
#define BUFFER_FREE(s,e,hbm) (((s) > (e) || ((s) == (e) && !(hbm))) ? (s) - (e) : (mca_btl_scif_component.segment_size - (e)))
/* attempt to reserve a contiguous segment from the remote endpoint */
/**
 * Reserve a contiguous region of the peer's circular receive buffer for one
 * fragment.
 *
 * Bit 31 of the start/end words is a wrap marker: it is toggled each time
 * the corresponding position wraps around, which makes it possible to tell
 * an empty buffer (positions equal, markers equal) from a full one
 * (positions equal, markers differ). If the space left before the end of the
 * segment is too small, a padding header with tag 0xff is written there and
 * the end position wraps to offset 64.
 *
 * @param endpoint  (IN)  peer endpoint
 * @param size      (IN)  payload size; the fragment header is added here
 * @param dst       (OUT) where the fragment (header first) must be written
 *
 * @returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the peer's buffer
 *          does not currently have enough room
 */
static inline int mca_btl_scif_send_get_buffer (mca_btl_base_endpoint_t *endpoint, size_t size, unsigned char * restrict *dst)
{
    /* unsigned constant: shifting a signed 1 into bit 31 is not defined */
    const unsigned int high_bit = 1u << 31;
    /* the high bit helps determine if the buffer is empty or full */
    bool hbm = (endpoint->send_buffer.start >> 31) == (endpoint->send_buffer.end >> 31);
    const unsigned int segment_size = mca_btl_scif_component.segment_size;
    unsigned int start = endpoint->send_buffer.start & ~high_bit;
    unsigned int end = endpoint->send_buffer.end & ~high_bit;
    unsigned int buffer_free = BUFFER_FREE(start, end, hbm);
#if defined(SCIF_TIMING)
    struct timespec ts;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif

    /* need space for the fragment + the header */
    size += sizeof (mca_btl_scif_frag_hdr_t);

    /* check if we need to free up space for this fragment */
    if (OPAL_UNLIKELY(buffer_free < size)) {
        BTL_VERBOSE(("not enough room for a fragment of size %u. in use buffer segment: {start: %x, end: %x, high bit matches: %d}\n",
                     (unsigned) size, start, end, (int) hbm));

        /* read the current start pointer from the remote peer */
        start = endpoint->send_buffer.start = endpoint->send_buffer.startp[0];
        start &= ~high_bit;
        hbm = (endpoint->send_buffer.start >> 31) == (endpoint->send_buffer.end >> 31);
        buffer_free = BUFFER_FREE(start, end, hbm);

        opal_atomic_rmb ();

        /* if this is the end of the buffer. does the fragment fit? */
        if (OPAL_UNLIKELY(buffer_free > 0 && buffer_free < size && start <= end)) {
            mca_btl_scif_frag_hdr_t hdr;

            /* padding fragment covering the unusable tail of the segment */
            hdr.size = buffer_free - sizeof (mca_btl_scif_frag_hdr_t);
            hdr.tag = 0xff;
            /* every header byte is copied to the peer below; do not publish
             * an indeterminate flags value */
            hdr.flags = 0;
#if defined(SCIF_USE_SEQ)
            hdr.seq = endpoint->seq_next++;
            ((uint64_t *) (endpoint->send_buffer.buffer + end))[0] = *((uint64_t *) &hdr);
#else
            ((uint32_t *) (endpoint->send_buffer.buffer + end))[0] = *((uint32_t *) &hdr);
#endif

            /* toggle the high bit */
            end = 64;
            endpoint->send_buffer.end = ((endpoint->send_buffer.end & high_bit) ^ high_bit) | end;
            hbm = (endpoint->send_buffer.start >> 31) == (endpoint->send_buffer.end >> 31);
            buffer_free = BUFFER_FREE(start, end, hbm);
        }

        if (OPAL_UNLIKELY(buffer_free < size)) {
#if defined(SCIF_TIMING)
            SCIF_UPDATE_TIMER(mca_btl_scif_component.aquire_buffer_time, mca_btl_scif_component.aquire_buffer_time_max, ts);
#endif
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    BTL_VERBOSE(("writing fragment of size %u to offset %u {start: %x, end: %x} of peer's buffer. free = %u",
                 (unsigned int) size, end, start, end, buffer_free));

    *dst = endpoint->send_buffer.buffer + end;

    /* align the buffer on a 64 byte boundary */
    end = (end + size + 63) & ~63;

    if (OPAL_UNLIKELY(segment_size == end)) {
        /* exactly at the end of the segment: wrap and toggle the high bit */
        endpoint->send_buffer.end = ((endpoint->send_buffer.end & high_bit) ^ high_bit) | 64;
    } else {
        endpoint->send_buffer.end = (endpoint->send_buffer.end & high_bit) | end;
    }

#if defined(SCIF_TIMING)
    SCIF_UPDATE_TIMER(mca_btl_scif_component.aquire_buffer_time, mca_btl_scif_component.aquire_buffer_time_max, ts);
#endif

    return OMPI_SUCCESS;
}
/**
 * Publish the locally tracked end value of the send buffer to the peer by
 * storing it through send_buffer.endp.
 *
 * The path taken depends on whether the peer's port_id.node equals this
 * module's port_id.node.
 *
 * @param endpoint  endpoint whose send_buffer.end is to be published
 */
static void mark_buffer (struct mca_btl_base_endpoint_t *endpoint)
{
    const bool same_node = (endpoint->port_id.node == mca_btl_scif_module.port_id.node);

    if (same_node) {
        /* same node: a memory barrier before the store is used */
        MB();
        endpoint->send_buffer.endp[0] = endpoint->send_buffer.end;
    } else {
        /* force the PCIe bus to flush by reading from the remote node. the
         * value read is deliberately unused; the local is volatile so the
         * store into it is not discarded */
        volatile uint32_t start = endpoint->send_buffer.startp[0];

        endpoint->send_buffer.endp[0] = endpoint->send_buffer.end;

        /* read the peer's start again and cache it locally */
        endpoint->send_buffer.start = endpoint->send_buffer.startp[0];
    }
}
/**
 * Copy one fragment (both segments plus its header) into the peer's buffer.
 *
 * This function does not call mark_buffer (); callers in this file do that
 * after a successful send.
 *
 * @param endpoint  destination endpoint
 * @param frag      fragment to send; frag->hdr.size and frag->hdr.tag must
 *                  already be filled in
 *
 * @returns 1 (not OMPI_SUCCESS) when the fragment was written and completed
 *          via mca_btl_scif_frag_complete (), or OMPI_ERR_OUT_OF_RESOURCE when
 *          no buffer space could be reserved (the fragment is left untouched).
 */
static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint,
                                   mca_btl_scif_base_frag_t *frag)
{
    size_t size = frag->hdr.size;
    unsigned char * restrict dst;

    BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag,
                 OMPI_PROC_MY_NAME->vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].base.seg_len));

    if (OPAL_LIKELY(OMPI_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) {
        unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].base.seg_addr.pval;
#if defined(SCIF_TIMING)
        struct timespec ts;

        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif

        /* payload first; the header slot at dst is filled in last */
        memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].base.seg_len);

        if (frag->segments[1].base.seg_len) {
            memcpy (dst + sizeof (frag->hdr) + frag->segments[0].base.seg_len,
                    frag->segments[1].base.seg_addr.pval,
                    frag->segments[1].base.seg_len);
        }

#if defined(SCIF_USE_SEQ)
        frag->hdr.seq = endpoint->seq_next++;
        /* write the tag to signal the fragment is available */
        ((uint64_t *) dst)[0] = *((uint64_t *) &frag->hdr);
#else
        /* the whole header is stored with a single 32-bit write */
        ((uint32_t *) dst)[0] = *((uint32_t *) &frag->hdr);
#endif

        opal_atomic_wmb ();

#if defined(SCIF_TIMING)
        SCIF_UPDATE_TIMER(mca_btl_scif_component.send_time, mca_btl_scif_component.send_time_max, ts);
#endif

        /* fragment is gone */
        mca_btl_scif_frag_complete (frag, OMPI_SUCCESS);

        return 1;
    }

    return OMPI_ERR_OUT_OF_RESOURCE;
}
/**
 * Send a descriptor to a peer, queueing it on the endpoint when it cannot be
 * written immediately.
 *
 * @param btl         btl module (unused here)
 * @param endpoint    destination endpoint; a connection attempt is made if it
 *                    is not yet in the connected state
 * @param descriptor  descriptor to send (a mca_btl_scif_base_frag_t)
 * @param tag         tag stored in the fragment header
 *
 * @returns 1 when the fragment was written to the peer's buffer,
 *          OMPI_ERR_RESOURCE_BUSY when the endpoint is still in the INIT
 *          state after the connection attempt, and OMPI_SUCCESS when the
 *          fragment was appended to endpoint->frag_wait_list (with
 *          MCA_BTL_DES_SEND_ALWAYS_CALLBACK set on the descriptor).
 */
int mca_btl_scif_send (struct mca_btl_base_module_t *btl,
                       struct mca_btl_base_endpoint_t *endpoint,
                       struct mca_btl_base_descriptor_t *descriptor,
                       mca_btl_base_tag_t tag)
{
    mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor;
    size_t total_len = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len;

    frag->hdr.tag = tag;
    frag->hdr.size = total_len;

    if (OPAL_UNLIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
        /* the outcome is judged from endpoint->state, not the return code */
        (void) mca_btl_scif_ep_connect (endpoint);
    }

    if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED == endpoint->state)) {
        if (OPAL_LIKELY(1 == mca_btl_scif_send_frag (endpoint, frag))) {
            mark_buffer (endpoint);
            return 1;
        }
    } else if (MCA_BTL_SCIF_EP_STATE_INIT == endpoint->state) {
        /* something went wrong. have the pml try again later. */
        return OMPI_ERR_RESOURCE_BUSY;
    }

    /* the receiver was not ready to handle the fragment. queue up the fragment. */
    descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) descriptor);

    return OMPI_SUCCESS;
}
/**
 * Immediate send: pack a header and optional payload straight into the
 * peer's buffer without allocating a descriptor.
 *
 * @param btl           btl module (unused here)
 * @param endpoint      destination endpoint; a connection attempt is made if
 *                      it is not yet in the connected state
 * @param convertor     convertor used to pack payload_size bytes of user data
 * @param header        header bytes copied in front of the payload
 * @param header_size   size of header in bytes
 * @param payload_size  number of payload bytes to pack (may be 0)
 * @param order         unused here
 * @param flags         descriptor flags; must not contain
 *                      MCA_BTL_DES_SEND_ALWAYS_CALLBACK
 * @param tag           tag stored in the fragment header
 * @param descriptor    if non-NULL, set to NULL on every failure path (this
 *                      function never hands back a descriptor)
 *
 * @returns OMPI_SUCCESS when the data was written and published,
 *          OMPI_ERR_RESOURCE_BUSY when the endpoint is not connected, or
 *          OMPI_ERR_OUT_OF_RESOURCE when no buffer space could be reserved.
 */
int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl,
                        struct mca_btl_base_endpoint_t *endpoint,
                        struct opal_convertor_t *convertor,
                        void *header, size_t header_size,
                        size_t payload_size, uint8_t order,
                        uint32_t flags, mca_btl_base_tag_t tag,
                        mca_btl_base_descriptor_t **descriptor)
{
    size_t length = (header_size + payload_size);
    unsigned char * restrict base;
    mca_btl_scif_frag_hdr_t hdr;
    size_t max_data;
    int rc;
#if defined(SCIF_TIMING)
    struct timespec ts;
#endif

    assert (length < mca_btl_scif_module.super.btl_eager_limit);
    assert (0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK));

    if (OPAL_UNLIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
        rc = mca_btl_scif_ep_connect (endpoint);
        if (OPAL_UNLIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) {
            /* report "no descriptor" the same way as the out-of-resource
             * path below so the caller never reads a stale pointer */
            if (NULL != descriptor) {
                *descriptor = NULL;
            }
            return OMPI_ERR_RESOURCE_BUSY;
        }
    }

    rc = mca_btl_scif_send_get_buffer (endpoint, length, &base);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        if (NULL != descriptor) {
            *descriptor = NULL;
        }
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

#if defined(SCIF_TIMING)
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
#endif

    /* fill in the fragment header locally; it is stored to the peer's buffer
     * only after the data has been copied */
    hdr.size = length;
    hdr.tag = tag;
#if defined(SCIF_USE_SEQ)
    hdr.seq = endpoint->seq_next++;
#endif

    /* write the match header (with MPI comm/tag/etc. info) */
    memcpy (base + sizeof (hdr), header, header_size);

    if (payload_size) {
        uint32_t iov_count = 1;
        struct iovec iov[1];

        iov[0].iov_base = base + sizeof (hdr) + header_size;
        iov[0].iov_len = payload_size;

        /* move the data */
        opal_convertor_pack (convertor, iov, &iov_count, &max_data);

        assert (max_data == payload_size);
    }

#if defined(SCIF_USE_SEQ)
    /* signal the remote side that this fragment is available */
    ((uint64_t *)base)[0] = *((uint64_t *) &hdr);
#else
    ((uint32_t *)base)[0] = *((uint32_t *) &hdr);
#endif

    opal_atomic_wmb ();

    mark_buffer (endpoint);

#if defined(SCIF_TIMING)
    SCIF_UPDATE_TIMER(mca_btl_scif_component.sendi_time, mca_btl_scif_component.sendi_time_max, ts);
#endif

    return OMPI_SUCCESS;
}
/**
 * Drain the endpoint's list of queued fragments.
 *
 * Fragments are removed from the front of endpoint->frag_wait_list and sent
 * one at a time. On the first failure the loop stops: a fragment that failed
 * with OMPI_ERR_OUT_OF_RESOURCE is put back at the front of the list, any
 * other failure completes the fragment with that error code. mark_buffer ()
 * is always called before returning.
 *
 * @param endpoint  endpoint whose wait list is processed
 *
 * @returns OMPI_SUCCESS in all cases
 */
int mca_btl_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
{
    for (;;) {
        mca_btl_scif_base_frag_t *pending =
            (mca_btl_scif_base_frag_t *) opal_list_remove_first (&endpoint->frag_wait_list);
        int status;

        if (NULL == pending) {
            /* list is empty */
            break;
        }

        status = mca_btl_scif_send_frag (endpoint, pending);
        if (OPAL_LIKELY(OMPI_SUCCESS <= status)) {
            /* sent; move on to the next queued fragment */
            continue;
        }

        if (OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE == status)) {
            /* no room yet: keep the fragment first in line for the next pass */
            opal_list_prepend (&endpoint->frag_wait_list, (opal_list_item_t *) pending);
        } else {
            mca_btl_scif_frag_complete (pending, status);
        }

        break;
    }

    mark_buffer (endpoint);

    return OMPI_SUCCESS;
}

40
ompi/mca/btl/scif/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_btl_scif_CONFIG([action-if-found], [action-if-not-found])
# ------------------------------------------------------------------
# Check whether the SCIF header and library are usable. Runs the first
# argument when they are and the second when they are not. Configure is
# aborted only when the user explicitly asked for SCIF with --with-scif
# and it could not be found.
AC_DEFUN([MCA_ompi_btl_scif_CONFIG],[
    AC_CONFIG_FILES([ompi/mca/btl/scif/Makefile])

    AC_ARG_WITH([scif], [AC_HELP_STRING([--with-scif(=DIR)],
                [Build with SCIF, searching for headers in DIR])])
    OMPI_CHECK_WITHDIR([scif], [$with_scif], [include/scif.h])

    btl_scif_happy="no"

    if test "$with_scif" != "no" ; then
        # an explicit directory was given: search there
        if test -n "$with_scif" -a "$with_scif" != "yes" ; then
            ompi_check_scif_dir=$with_scif
        fi

        OMPI_CHECK_PACKAGE([btl_scif], [scif.h], [scif], [scif_open], [],
                           [$ompi_check_scif_dir], [], [btl_scif_happy="yes"], [])

        # a missing SCIF is fatal only if it was requested on the command
        # line. with no --with-scif option the component is simply skipped
        if test "$btl_scif_happy" != "yes" -a -n "$with_scif" ; then
            AC_MSG_ERROR([SCIF support requested but not found. Aborting])
        fi
    fi

    AS_IF([test "$btl_scif_happy" = "yes"], [$1], [$2])

    # substitute in the things needed to build scif
    AC_SUBST([btl_scif_CPPFLAGS])
    AC_SUBST([btl_scif_LDFLAGS])
    AC_SUBST([btl_scif_LIBS])
])dnl