1
1

osc/rdma: add true RDMA one-sided component

This commit adds support for performing one-sided operations over
supported hardware (currently InfiniBand and Cray Gemini/Aries). This
component is still undergoing active development.

Current features:

 - Use network atomic operations (fadd, cswap) for implementing
   locking and PSCW synchronization.

 - Aggregate small contiguous puts.

 - Reduce the memory footprint by storing window data (pointer, keys,
   etc.) at the lowest rank on each node. The data is fetched as each
   process needs to communicate with a new peer. This is a trade-off
   between the performance of the first operation on a peer and the
   memory utilization of a window.

TODO:

 - Add support for the accumulate_ops info key. If it is known that
   the same op or same op/no op is used it may be possible to use
   hardware atomics for fetch-and-op and compare-and-swap.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2015-09-16 12:54:36 -06:00
родитель 131681acc6
Коммит d8df9d414d
26 изменённых файлов: 7162 добавлений и 1 удалений

64
ompi/mca/osc/rdma/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,64 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Header and source files that make up the osc/rdma component. The same
# list feeds both the DSO and the static variant of the library below.
rdma_sources = \
osc_rdma.h \
osc_rdma_module.c \
osc_rdma_comm.h \
osc_rdma_comm.c \
osc_rdma_accumulate.c \
osc_rdma_accumulate.h \
osc_rdma_component.c \
osc_rdma_frag.h \
osc_rdma_frag.c \
osc_rdma_request.h \
osc_rdma_request.c \
osc_rdma_active_target.h \
osc_rdma_active_target.c \
osc_rdma_passive_target.h \
osc_rdma_passive_target.c \
osc_rdma_lock.h \
osc_rdma_peer.h \
osc_rdma_peer.c \
osc_rdma_dynamic.h \
osc_rdma_dynamic.c \
osc_rdma_sync.h \
osc_rdma_sync.c \
osc_rdma_types.h
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty, so only
# one of the two libraries declared below is actually built.
if MCA_BUILD_ompi_osc_rdma_DSO
component_noinst =
component_install = mca_osc_rdma.la
else
component_noinst = libmca_osc_rdma.la
component_install =
endif
# DSO variant: installed into $(ompilibdir).
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_osc_rdma_la_SOURCES = $(rdma_sources)
mca_osc_rdma_la_LDFLAGS = -module -avoid-version
# Static variant: listed under noinst_, so it is built but not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_osc_rdma_la_SOURCES = $(rdma_sources)
libmca_osc_rdma_la_LDFLAGS = -module -avoid-version

26
ompi/mca/osc/rdma/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,26 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_osc_rdma_POST_CONFIG(will_build)
# ----------------------------------------
# will_build is "1" when this component is going to be built.
# Only require the BML endpoint tag if we're actually going to be built,
# since bml is one of the ones frequently disabled for large installs.
AC_DEFUN([MCA_ompi_osc_rdma_POST_CONFIG], [
AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
])dnl
# MCA_ompi_osc_rdma_CONFIG(action-if-can-compile,
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled. The macro
# registers the component Makefile and then runs action-if-can-compile
# unconditionally; action-if-cant-compile is never used here.
AC_DEFUN([MCA_ompi_osc_rdma_CONFIG],[
AC_CONFIG_FILES([ompi/mca/osc/rdma/Makefile])
[$1]
])dnl

506
ompi/mca/osc/rdma/osc_rdma.h Обычный файл
Просмотреть файл

@ -0,0 +1,506 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_H
#define OMPI_OSC_RDMA_H
#include "ompi_config.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/threads.h"
#include "opal/util/output.h"
#include "opal/mca/shmem/shmem.h"
#include "opal/mca/shmem/base/base.h"
#include "ompi/win/win.h"
#include "ompi/communicator/communicator.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/request/request.h"
#include "ompi/mca/osc/osc.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/mca/btl/btl.h"
#include "ompi/memchecker.h"
#include "ompi/op/op.h"
#include "opal/align.h"
#include "osc_rdma_types.h"
#include "osc_rdma_sync.h"
#include "osc_rdma_peer.h"
#include "opal_stdint.h"
/**
* @brief osc rdma component structure
*
* Component-wide state for osc/rdma. The instance declared in this header
* is mca_osc_rdma_component.
*/
struct ompi_osc_rdma_component_t {
/** Extend the basic osc component interface */
ompi_osc_base_component_t super;
/** lock access to modules */
opal_mutex_t lock;
/** cid -> module mapping */
opal_hash_table_t modules;
/** free list of ompi_osc_rdma_frag_t structures */
opal_free_list_t frags;
/** Free list of requests */
opal_free_list_t requests;
/** RDMA component buffer size. osc_rdma_accumulate.c limits a single
* accumulate transfer to 1/8 of this value. */
unsigned int buffer_size;
/** aggregation limit */
unsigned int aggregation_limit;
/** List of requests that need to be freed */
opal_list_t request_gc;
/** List of buffers that need to be freed */
opal_list_t buffer_gc;
/** Maximum number of segments that can be attached to a dynamic window */
unsigned int max_attach;
/** Default value of the no_locks info key for new windows */
bool no_locks;
/** Priority of the osc/rdma component */
unsigned int priority;
/** free list of ompi_osc_rdma_aggregation_t items (items are handed back
* by ompi_osc_rdma_aggregation_return()) */
opal_free_list_t aggregate;
};
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
struct ompi_osc_rdma_frag_t;
/**
* @brief osc rdma module structure
*
* Each MPI window is associated with a single osc module. This struct
* stores the data relevant to the osc/rdma component.
*/
struct ompi_osc_rdma_module_t {
/** Extend the basic osc module interface */
ompi_osc_base_module_t super;
/** pointer back to MPI window */
struct ompi_win_t *win;
/** Mutex lock protecting module data */
opal_mutex_t lock;
/* ***************** window configuration ***************** */
/** value of same_disp_unit info key for this window */
bool same_disp_unit;
/** value of same_size info key for this window */
bool same_size;
/** window should have accumulate ordering... */
bool accumulate_ordering;
/** passive-target synchronization will not be used in this window */
bool no_locks;
/** flavor of this window */
int flavor;
/** size of local window */
size_t size;
/** Local displacement unit. */
int disp_unit;
/** global leader */
ompi_osc_rdma_peer_t *leader;
/** pointer to free on cleanup (may be NULL) */
void *free_after;
/** local state structure (shared memory) */
ompi_osc_rdma_state_t *state;
/** node-level communication data (shared memory) */
unsigned char *node_comm_info;
/** only relevant on the lowest rank on each node (shared memory) */
ompi_osc_rdma_rank_data_t *rank_array;
/** communicator created with this window. This is the cid used
* in the component's modules mapping. */
ompi_communicator_t *comm;
/** temporary communicators for window initialization */
ompi_communicator_t *local_leaders;
ompi_communicator_t *shared_comm;
/** node id of this rank */
int node_id;
/** number of nodes */
int node_count;
/** handle valid for local state (valid for local data for MPI_Win_allocate) */
mca_btl_base_registration_handle_t *state_handle;
/** registration handle for the window base (only used for MPI_Win_create) */
mca_btl_base_registration_handle_t *base_handle;
/** size of a region */
size_t region_size;
/** size of the state structure */
size_t state_size;
/** offset in the shared memory segment where the state array starts */
size_t state_offset;
/* ********************* sync data ************************ */
/** global sync object (PSCW, fence, lock all) */
ompi_osc_rdma_sync_t all_sync;
/** current group associated with the pscw exposure epoch */
struct ompi_group_t *pw_group;
/** list of unmatched post messages */
opal_list_t pending_posts;
/* ********************* LOCK data ************************ */
/** number of outstanding locks. non-zero means this process is in a
* passive target access epoch (see ompi_osc_rdma_in_passive_epoch()) */
osc_rdma_counter_t passive_target_access_epoch;
/** origin side list of locks currently outstanding. only consulted when
* outstanding_lock_array is NULL */
opal_hash_table_t outstanding_locks;
/** array of locks indexed by target rank (small jobs). NULL when the
* hash table is used instead */
ompi_osc_rdma_sync_t **outstanding_lock_array;
/* ******************* peer storage *********************** */
/** hash table of allocated peers. only consulted when peer_array is NULL */
opal_hash_table_t peer_hash;
/** array of allocated peers indexed by rank (small jobs). NULL when the
* hash table is used instead */
ompi_osc_rdma_peer_t **peer_array;
/** lock for peer hash table/array */
opal_mutex_t peer_lock;
/** BTL in use */
struct mca_btl_base_module_t *selected_btl;
/** registered fragment used for locally buffered RDMA transfers */
struct ompi_osc_rdma_frag_t *rdma_frag;
/** registration handles for dynamically attached regions. These are not stored
* in the state structure as it is entirely local. */
ompi_osc_rdma_handle_t *dynamic_handles;
/** shared memory segment. this segment holds this node's portion of the rank -> node
* mapping array, node communication data (node_comm_info), state for all local ranks,
* and data for all local ranks (MPI_Win_allocate only) */
void *segment_base;
/** opal shared memory structure for the shared memory segment */
opal_shmem_ds_t seg_ds;
/* ****************** performance values ****************** */
/** number of times a put had to be retried */
unsigned long put_retry_count;
/** number of times a get had to be retried */
unsigned long get_retry_count;
};
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
/** Get the osc/rdma module attached to an MPI window. The argument is
 * parenthesized so that any pointer-valued expression can be passed. */
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) (win)->w_osc_module)
int ompi_osc_rdma_free (struct ompi_win_t *win);
/* peer functions */
/**
* @brief cache a peer object
*
* @param[in] module osc rdma module
* @param[in] peer peer object to cache
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_OUT_OF_RESOURCE on failure
*
* Counterpart of ompi_osc_module_get_peer(), which reads the cache back.
* Only the prototype lives in this header.
*/
int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);
/**
 * @brief look up the cached peer object for a remote rank
 *
 * @param[in] module   osc rdma module
 * @param[in] peer_id  remote peer rank
 *
 * @returns the cached peer object if there is one
 * @returns NULL if no peer object is cached for this rank
 *
 * The cache is the rank-indexed peer_array when it is allocated and the
 * peer_hash hash table otherwise.
 */
static inline ompi_osc_rdma_peer_t *ompi_osc_module_get_peer (ompi_osc_rdma_module_t *module, int peer_id)
{
    ompi_osc_rdma_peer_t *cached = NULL;

    if (NULL != module->peer_array) {
        return module->peer_array[peer_id];
    }

    /* the return code is ignored; cached stays NULL unless the lookup stores a value */
    (void) opal_hash_table_get_value_uint32 (&module->peer_hash, peer_id, (void **) &cached);

    return cached;
}
/**
 * @brief get the peer object for a remote rank
 *
 * @param[in] module   osc rdma module
 * @param[in] peer_id  remote peer rank
 *
 * @returns the cached peer object when one exists, otherwise the result of
 *          ompi_osc_rdma_peer_lookup()
 */
static inline ompi_osc_rdma_peer_t *ompi_osc_rdma_module_peer (ompi_osc_rdma_module_t *module, int peer_id)
{
    ompi_osc_rdma_peer_t *cached = ompi_osc_module_get_peer (module, peer_id);

    /* fall back to a full lookup only when nothing is cached */
    return (NULL != cached) ? cached : ompi_osc_rdma_peer_lookup (module, peer_id);
}
/**
 * @brief check if this process is in a passive target access epoch
 *
 * @param[in] module osc rdma module
 *
 * @returns true if the module's passive_target_access_epoch counter is non-zero
 * @returns false otherwise
 */
static inline bool ompi_osc_rdma_in_passive_epoch (ompi_osc_rdma_module_t *module)
{
    if (0 == module->passive_target_access_epoch) {
        return false;
    }

    return true;
}
/**
 * @brief register a memory region with the selected BTL
 *
 * @param[in]  module    osc rdma module
 * @param[in]  endpoint  BTL endpoint passed through to btl_register_mem
 * @param[in]  ptr       base of the region
 * @param[in]  size      size of the region
 * @param[in]  flags     registration flags passed through to btl_register_mem
 * @param[out] handle    registration handle. set to NULL when the BTL has no
 *                       btl_register_mem function
 * @param[in]  line      caller's line number (only used in the failure message)
 * @param[in]  file      caller's file name (only used in the failure message)
 *
 * @returns OMPI_SUCCESS on success or when no registration function exists
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the BTL returned a NULL handle
 *
 * Use the ompi_osc_rdma_register() wrapper, which supplies line and file.
 */
static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, void *ptr,
                                           size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
{
    if (NULL == module->selected_btl->btl_register_mem) {
        /* this BTL does not register memory: report success with no handle */
        *handle = NULL;
        return OMPI_SUCCESS;
    }

    *handle = module->selected_btl->btl_register_mem (module->selected_btl, endpoint, ptr, size, flags);
    if (OPAL_LIKELY(NULL != *handle)) {
        return OMPI_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "failed to register pointer with selected BTL. base: %p, "
                         "size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line));

    return OMPI_ERR_OUT_OF_RESOURCE;
}
#define ompi_osc_rdma_register(...) _ompi_osc_rdma_register(__VA_ARGS__, __LINE__, __FILE__)
/**
 * @brief release a registration obtained from ompi_osc_rdma_register()
 *
 * @param[in] module  osc rdma module
 * @param[in] handle  registration handle (NULL is a no-op)
 * @param[in] line    caller's line number (unused)
 * @param[in] file    caller's file name (unused)
 *
 * Use the ompi_osc_rdma_deregister() wrapper, which supplies line and file.
 */
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
{
    (void) line;
    (void) file;

    if (NULL == handle) {
        return;
    }

    module->selected_btl->btl_deregister_mem (module->selected_btl, handle);
}
#define ompi_osc_rdma_deregister(...) _ompi_osc_rdma_deregister(__VA_ARGS__, __LINE__, __FILE__)
/**
 * @brief call the progress function of the selected BTL's component once
 *
 * @param[in] module osc rdma module
 */
static inline void ompi_osc_rdma_progress (ompi_osc_rdma_module_t *module)
{
    struct mca_btl_base_module_t *btl = module->selected_btl;

    btl->btl_component->btl_progress ();
}
/**
 * Find the outstanding lock for a target.
 *
 * @param[in]  module  osc rdma module
 * @param[in]  target  target rank
 * @param[out] peer    peer object associated with the target (may be NULL;
 *                     only written when a lock is found)
 *
 * @returns the outstanding lock for the target
 * @returns NULL if no lock is outstanding for the target
 *
 * The lock is read from outstanding_lock_array when that array exists and
 * from the outstanding_locks hash table otherwise.
 */
static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_lock_find (ompi_osc_rdma_module_t *module, int target,
                                                                    ompi_osc_rdma_peer_t **peer)
{
    ompi_osc_rdma_sync_t *found = NULL;

    if (OPAL_UNLIKELY(NULL == module->outstanding_lock_array)) {
        (void) opal_hash_table_get_value_uint32 (&module->outstanding_locks, (uint32_t) target, (void **) &found);
    } else {
        found = module->outstanding_lock_array[target];
    }

    if (NULL == found || !peer) {
        return found;
    }

    *peer = found->peer_list.peer;

    return found;
}
/**
 * Record an outstanding lock
 *
 * @param[in] module osc rdma module
 * @param[in] lock   lock object
 *
 * Stores the lock object under its target rank (lock->sync.lock.target),
 * either in outstanding_lock_array or, when no array exists, in the
 * outstanding_locks hash table. The caller must be holding the module lock.
 */
static inline void ompi_osc_rdma_module_lock_insert (struct ompi_osc_rdma_module_t *module, ompi_osc_rdma_sync_t *lock)
{
    if (OPAL_UNLIKELY(NULL == module->outstanding_lock_array)) {
        (void) opal_hash_table_set_value_uint32 (&module->outstanding_locks, (uint32_t) lock->sync.lock.target, (void *) lock);
        return;
    }

    module->outstanding_lock_array[lock->sync.lock.target] = lock;
}
/**
 * Forget an outstanding lock
 *
 * @param[in] module osc rdma module
 * @param[in] lock   lock object
 *
 * Removes the lock object from the set of outstanding locks, using its
 * target rank (lock->sync.lock.target) as the key. The caller must be
 * holding the module lock.
 */
static inline void ompi_osc_rdma_module_lock_remove (struct ompi_osc_rdma_module_t *module, ompi_osc_rdma_sync_t *lock)
{
    if (OPAL_UNLIKELY(NULL == module->outstanding_lock_array)) {
        (void) opal_hash_table_remove_value_uint32 (&module->outstanding_locks, (uint32_t) lock->sync.lock.target);
        return;
    }

    module->outstanding_lock_array[lock->sync.lock.target] = NULL;
}
/**
 * Look up the synchronization object that covers a target
 *
 * @param[in]  module  osc rdma module
 * @param[in]  target  target rank
 * @param[out] peer    peer object
 *
 * @returns synchronization object on success
 * @returns NULL if the target is not locked, fenced, or part of a pscw sync
 *
 * Which object is returned depends on the type of module->all_sync:
 *  - none:            the per-target lock, unless the window was created with no_locks
 *  - fence/lock_all:  module->all_sync (the epoch is marked active as a side effect)
 *  - pscw:            module->all_sync if the target is part of the pscw group
 */
static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer)
{
    ompi_osc_rdma_sync_t *found = NULL;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc/rdma: looking for synchronization object for target %d", target));

    switch (module->all_sync.type) {
    case OMPI_OSC_RDMA_SYNC_TYPE_NONE:
        if (!module->no_locks) {
            found = ompi_osc_rdma_module_lock_find (module, target, peer);
        }
        break;
    case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
    case OMPI_OSC_RDMA_SYNC_TYPE_LOCK:
        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                             "osc/rdma: found fence/lock_all access epoch for target %d", target));

        /* fence epoch is now active */
        module->all_sync.epoch_active = true;
        *peer = ompi_osc_rdma_module_peer (module, target);
        found = &module->all_sync;
        break;
    case OMPI_OSC_RDMA_SYNC_TYPE_PSCW:
        if (ompi_osc_rdma_sync_pscw_peer (module, target, peer)) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "osc/rdma: found PSCW access epoch target for %d", target));
            found = &module->all_sync;
        }
        break;
    }

    return found;
}
/**
* @brief complete all outstanding rdma operations on a synchronization object
*
* @param[in] sync synchronization object
*
* Flushes every aggregation queued on the sync object and then calls the
* BTL progress function until sync->outstanding_rdma reaches zero.
*/
static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
{
ompi_osc_rdma_aggregation_t *aggregation, *next;
ompi_osc_rdma_module_t *module = sync->module;
/* the list size is checked before sync->lock is taken */
if (opal_list_get_size (&sync->aggregations)) {
OPAL_THREAD_SCOPED_LOCK(&sync->lock,
OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) {
ompi_osc_rdma_peer_aggregate_flush (aggregation->peer);
});
}
/* progress is called at least once, even if nothing is outstanding */
do {
module->selected_btl->btl_component->btl_progress ();
} while (sync->outstanding_rdma);
}
/**
 * @brief check if an access epoch is active
 *
 * @param[in] module osc rdma module
 *
 * @returns true if the global sync object's epoch is active or this process
 *          is in a passive target access epoch
 * @returns false otherwise
 *
 * This function is used to check for conflicting access epochs.
 */
static inline bool ompi_osc_rdma_access_epoch_active (ompi_osc_rdma_module_t *module)
{
    if (module->all_sync.epoch_active) {
        return true;
    }

    return ompi_osc_rdma_in_passive_epoch (module);
}
/**
 * @brief hand an aggregation object back to the component free list
 *
 * @param[in] aggregation aggregation object
 *
 * If the aggregation is still attached to a sync object it is first taken
 * off that object's aggregations list.
 */
static inline void ompi_osc_rdma_aggregation_return (ompi_osc_rdma_aggregation_t *aggregation)
{
    opal_free_list_item_t *free_item = (opal_free_list_item_t *) aggregation;

    if (aggregation->sync) {
        opal_list_item_t *list_item = (opal_list_item_t *) aggregation;

        opal_list_remove_item (&aggregation->sync->aggregations, list_item);
    }

    opal_free_list_return(&mca_osc_rdma_component.aggregate, free_item);
}
#endif /* OMPI_OSC_RDMA_H */

907
ompi/mca/osc/rdma/osc_rdma_accumulate.c Обычный файл
Просмотреть файл

@ -0,0 +1,907 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_rdma_accumulate.h"
#include "osc_rdma_request.h"
#include "osc_rdma_comm.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
/**
 * @brief accumulate / get-accumulate on a target buffer this process can address directly
 *
 * @param[in]  source_buffer    origin data (not read when op is the no-op operation)
 * @param[in]  source_count     number of source_datatype elements in source_buffer
 * @param[in]  source_datatype  datatype of the origin data
 * @param[out] result_buffer    receives the previous target contents. NULL for a plain accumulate
 * @param[in]  result_count     number of result_datatype elements in result_buffer
 * @param[in]  result_datatype  datatype of the result buffer
 * @param[in]  peer             peer object of the target
 * @param[in]  target_address   address of the target data, used directly as a local pointer
 * @param[in]  target_handle    registration handle of the target (unused here)
 * @param[in]  target_count     number of target_datatype elements at the target
 * @param[in]  target_datatype  datatype of the target data
 * @param[in]  op               accumulate operation
 * @param[in]  module           osc rdma module
 * @param[in]  request          request to complete on success (may be NULL)
 *
 * @returns OMPI_SUCCESS on success
 * @returns the error from the datatype copy/reduction otherwise. the request is not
 *          completed in that case
 *
 * Unless the peer is held exclusively, the target's accumulate lock is held for the
 * duration of the operation. The lock is released on every path, including when the
 * copy into the result buffer fails.
 */
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
                                     void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
                                     ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                                     mca_btl_base_registration_handle_t *target_handle, int target_count,
                                     ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module,
                                     ompi_osc_rdma_request_t *request)
{
    /* decide once so that the release below always matches the acquire */
    const bool need_lock = !ompi_osc_rdma_peer_is_exclusive (peer);
    int ret = OMPI_SUCCESS;

    if (need_lock) {
        (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
    }

    if (NULL != result_buffer) {
        /* get accumulate: save the current target contents first */
        ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype,
                                    result_buffer, result_count, result_datatype);
    }

    /* do not touch the target if the result copy failed */
    if (OPAL_LIKELY(OMPI_SUCCESS == ret) && &ompi_mpi_op_no_op.op != op) {
        if (&ompi_mpi_op_replace.op != op) {
            ret = ompi_osc_base_sndrcv_op (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address,
                                           target_count, target_datatype, op);
        } else {
            ret = ompi_datatype_sndrcv (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address,
                                        target_count, target_datatype);
        }
    }

    if (need_lock) {
        (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_gacc_local: failed performing accumulate operation. ret = %d", ret));
        return ret;
    }

    if (request) {
        /* NTH: is it ok to use an ompi error code here? */
        ompi_osc_rdma_request_complete (request, ret);
    }

    return ret;
}
/**
 * @brief compare-and-swap on a target buffer this process can address directly
 *
 * @param[in]  source_buffer   value to store if the comparison succeeds
 * @param[in]  compare_buffer  value to compare the target against
 * @param[out] result_buffer   receives the target value read before any swap
 * @param[in]  datatype        datatype of all three buffers (its size is the number of bytes compared/copied)
 * @param[in]  peer            peer object of the target
 * @param[in]  target_address  address of the target data, used directly as a local pointer
 * @param[in]  target_handle   registration handle of the target (unused here)
 * @param[in]  module          osc rdma module
 *
 * @returns OMPI_SUCCESS always
 *
 * The target's accumulate lock is held around the compare and the swap.
 */
static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void *compare_buffer, void *result_buffer,
                                           ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer,
                                           uint64_t target_address, mca_btl_base_registration_handle_t *target_handle,
                                           ompi_osc_rdma_module_t *module)
{
    void *target = (void *) (uintptr_t) target_address;

    ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));

    /* the old value is always reported, whether or not the swap happens */
    memcpy (result_buffer, target, datatype->super.size);

    if (0 == memcmp (compare_buffer, result_buffer, datatype->super.size)) {
        memcpy (target, source_buffer, datatype->super.size);
    }

    ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));

    return OMPI_SUCCESS;
}
/**
* @brief completion of an accumulate put
*
* @param[in] context  the ompi_osc_rdma_request_t of the accumulate
* @param[in] status   status handed to ompi_osc_rdma_request_complete()
*
* BTL completion callback for the put that finishes an accumulate. It is also
* called directly by ompi_osc_rdma_acc_get_complete() for a no-op get-accumulate.
* Releases the fragment, completes the request, drops the accumulate lock (unless
* the peer is held exclusively), decrements the sync's outstanding rdma count and
* clears the peer's ACCUMULATING flag. The other callback arguments are unused.
*/
static void ompi_osc_rdma_acc_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *data, int status)
{
ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
/* sync and peer are read out of the request before the request is completed;
* the request is not touched afterwards */
ompi_osc_rdma_sync_t *sync = request->sync;
ompi_osc_rdma_peer_t *peer = request->peer;
ompi_osc_rdma_frag_complete (request->frag);
ompi_osc_rdma_request_complete (request, status);
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
(void) ompi_osc_rdma_lock_release_exclusive (sync->module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
}
ompi_osc_rdma_sync_rdma_dec (sync);
/* lets the next accumulate waiting in ompi_osc_rdma_gacc_contig() proceed */
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
}
/**
* @brief completion of an accumulate get operation
*
* @param[in] local_address  start of the local buffer the get landed in
* @param[in] local_handle   registration handle of the local buffer (reused for the put)
* @param[in] context        the ompi_osc_rdma_request_t of the accumulate
* @param[in] status         status of the get
*
* BTL completion callback for the get issued by ompi_osc_rdma_gacc_contig(). For a
* get-accumulate the fetched data is first copied to the result buffer (or unpacked
* through the request's convertor when no result address is set). Unless the op is
* the no-op operation, the origin data is then combined with the fetched data and
* the result is put back to the target; ompi_osc_rdma_acc_put_complete() finishes
* the operation.
*/
static void ompi_osc_rdma_acc_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *data, int status)
{
ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
/* the get started at the aligned target address; request->offset skips the
* leading bytes that were only fetched for alignment */
intptr_t source = (intptr_t) local_address + request->offset;
ompi_osc_rdma_sync_t *sync = request->sync;
ompi_osc_rdma_module_t *module = sync->module;
/* NOTE(review): a failed get is only caught by this assert; with asserts
* compiled out the code below still runs on the buffer contents */
assert (OMPI_SUCCESS == status);
if (OMPI_SUCCESS == status && OMPI_OSC_RDMA_TYPE_GET_ACC == request->type) {
if (NULL == request->result_addr) {
/* result buffer is not necessarily contiguous. use the opal datatype engine to
* copy the data over in this case */
struct iovec iov = {.iov_base = (void *) source, request->len};
uint32_t iov_count = 1;
size_t size = request->len;
opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size);
opal_convertor_cleanup (&request->convertor);
} else {
/* copy contiguous data to the result buffer */
ompi_datatype_sndrcv ((void *) source, request->len, MPI_BYTE, request->result_addr,
request->result_count, request->result_dt);
}
if (&ompi_mpi_op_no_op.op == request->op) {
/* this is a no-op. nothing more to do except release resources and the accumulate lock */
ompi_osc_rdma_acc_put_complete (btl, endpoint, local_address, local_handle, context, data, status);
return;
}
}
/* accumulate the data. for the replace operation no reduction is performed here */
if (&ompi_mpi_op_replace.op != request->op) {
ompi_op_reduce (request->op, request->origin_addr, (void *) source, request->origin_count, request->origin_dt);
}
/* initiate the put of the accumulated data. request->ctx carries the target's
* registration handle (stored by ompi_osc_rdma_gacc_contig()) */
status = module->selected_btl->btl_put (module->selected_btl, endpoint, (void *) source,
request->target_address, local_handle,
(mca_btl_base_registration_handle_t *) request->ctx,
request->len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete,
request, NULL);
/* TODO -- we can do better. probably should queue up the next step and handle it in progress */
/* NOTE(review): a failed put is only caught by this assert; nothing releases the
* fragment, the accumulate lock or the ACCUMULATING flag in that case */
assert (OPAL_SUCCESS == status);
}
/**
 * @brief start an accumulate / get-accumulate on a contiguous region of a remote target
 *
 * @param[in]  sync             synchronization object of the access epoch
 * @param[in]  source           origin data (contiguous)
 * @param[in]  source_count     number of source_datatype elements
 * @param[in]  source_datatype  datatype of the origin data
 * @param[out] result           contiguous result buffer (NULL if there is none)
 * @param[in]  result_count     number of result_datatype elements
 * @param[in]  result_datatype  datatype of the result buffer
 * @param[in]  peer             peer object of the target
 * @param[in]  target_address   remote address of the target data
 * @param[in]  target_handle    registration handle of the target memory
 * @param[in]  target_count     number of target_datatype elements
 * @param[in]  target_datatype  datatype of the target data
 * @param[in]  op               accumulate operation
 * @param[in]  request          request that tracks the operation (filled in here)
 *
 * @returns OMPI_SUCCESS if the BTL operation was started
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no fragment could be allocated
 * @returns the BTL error code if the get/put could not be started
 *
 * The target data is fetched into a registered fragment with a BTL get whose
 * completion callback (ompi_osc_rdma_acc_get_complete) applies the op and puts
 * the result back. When the op is replace and no result is requested the get
 * is skipped and the origin data is put directly.
 *
 * Accumulates to a peer are serialized through the peer's ACCUMULATING flag, and
 * the target's accumulate lock is held unless the peer is held exclusively. Both
 * are normally released by ompi_osc_rdma_acc_put_complete(). If the BTL operation
 * cannot be started no callback will ever run, so both are released here before
 * returning the error. Otherwise a caller that retries would wait on the flag
 * forever.
 */
static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count, ompi_datatype_t *source_datatype,
                                             void *result, int result_count, ompi_datatype_t *result_datatype,
                                             ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                                             mca_btl_base_registration_handle_t *target_handle, int target_count,
                                             ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
{
    ompi_osc_rdma_module_t *module = sync->module;
    const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
    unsigned long len = target_count * target_datatype->super.size;
    ompi_osc_rdma_frag_t *frag = NULL;
    unsigned long aligned_len, offset;
    char *ptr = NULL;
    int ret;

    /* the get must start at an aligned remote address: offset is the number of
     * extra leading bytes and aligned_len the padded length of the transfer */
    offset = target_address & btl_alignment_mask;
    aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;

    ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                             "Could not allocate an rdma fragment for get accumulate"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    OPAL_THREAD_LOCK(&module->lock);
    /* to ensure order wait until the previous accumulate completes */
    while (ompi_osc_rdma_peer_is_accumulating (peer)) {
        OPAL_THREAD_UNLOCK(&module->lock);
        ompi_osc_rdma_progress (module);
        OPAL_THREAD_LOCK(&module->lock);
    }

    peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
    OPAL_THREAD_UNLOCK(&module->lock);

    if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
        (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
    }

    /* set up the request */
    request->frag = frag;
    request->origin_addr = (void *) source;
    request->origin_dt = source_datatype;
    request->origin_count = source_count;
    request->ctx = (void *) target_handle;
    request->result_addr = result;
    request->result_count = result_count;
    request->result_dt = result_datatype;
    request->offset = (ptrdiff_t) offset;
    request->target_address = target_address;
    request->len = len;
    request->op = op;
    request->sync = sync;

    ompi_osc_rdma_sync_rdma_inc (sync);

    if (&ompi_mpi_op_replace.op != op || result) {
        /* align the target address */
        target_address = target_address & ~btl_alignment_mask;

        OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
                             "initiating btl get local: {%p, %p}, remote: {0x%" PRIx64 ", %p}...",
                             ptr, (void *) frag->handle, target_address, (void *) target_handle));

        ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr,
                                             target_address, frag->handle, target_handle, aligned_len,
                                             0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_get_complete,
                                             request, NULL);
    } else {
        /* copy the put accumulate data */
        memcpy (ptr, source, len);

        OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
                             "initiating btl put..."));

        ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr,
                                             target_address, frag->handle, target_handle, len, 0,
                                             MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete,
                                             request, NULL);
    }

    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        return OMPI_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "btl operation failed with ret = %d", ret));

    /* the operation never started, so no completion callback will undo the setup
     * above. release everything in the same order the put completion uses */
    if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
        (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
    }

    ompi_osc_rdma_cleanup_rdma (sync, frag, NULL, NULL);

    peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;

    return ret;
}
/**
* @brief drive an accumulate / get-accumulate to a remote target
*
* @param[in]  sync             synchronization object of the access epoch
* @param[in]  source_buffer    origin data
* @param[in]  source_count     number of source_datatype elements
* @param[in]  source_datatype  datatype of the origin data (NULL when there is no origin data)
* @param[out] result_buffer    result buffer
* @param[in]  result_count     number of result_datatype elements
* @param[in]  result_datatype  datatype of the result (NULL for a plain accumulate)
* @param[in]  peer             peer object of the target
* @param[in]  target_address   remote address of the target buffer
* @param[in]  target_handle    registration handle of the target memory
* @param[in]  target_count     number of target_datatype elements
* @param[in]  target_datatype  datatype of the target data
* @param[in]  op               accumulate operation
* @param[in]  request          user-visible request (may be NULL)
*
* @returns OMPI_SUCCESS once every piece of the operation has been started
* @returns an error code otherwise
*
* Small transfers that only involve predefined datatypes are handed to
* ompi_osc_rdma_gacc_contig() in one piece. Everything else (and a failed fast
* path) is split into contiguous segments with convertors, and one sub-request
* plus one ompi_osc_rdma_gacc_contig() call is issued per segment, each limited to
* acc_limit bytes.
*/
static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_buffer, int source_count,
ompi_datatype_t *source_datatype, void *result_buffer, int result_count,
ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
mca_btl_base_registration_handle_t *target_handle, int target_count,
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
struct iovec source_iovec[OMPI_OSC_RDMA_DECODE_MAX], target_iovec[OMPI_OSC_RDMA_DECODE_MAX];
/* largest number of bytes handled by a single contiguous operation */
const size_t acc_limit = (mca_osc_rdma_component.buffer_size >> 3);
uint32_t source_primitive_count, target_primitive_count;
opal_convertor_t source_convertor, target_convertor;
uint32_t source_iov_count, target_iov_count;
uint32_t source_iov_index, target_iov_index;
ompi_datatype_t *source_primitive, *target_primitive;
/* needed for opal_convertor_raw but not used */
size_t source_size, target_size;
ompi_osc_rdma_request_t *subreq;
size_t result_position;
ptrdiff_t lb, extent;
int ret, acc_len;
bool done;
/* the fast path bypasses the convertors, so apply the lower bound by hand */
(void) ompi_datatype_get_extent (target_datatype, &lb, &extent);
target_address += lb;
/* fast path for accumulate on built-in types */
if (OPAL_LIKELY((!source_count || ompi_datatype_is_predefined (source_datatype)) &&
ompi_datatype_is_predefined (target_datatype) &&
(!result_count || ompi_datatype_is_predefined (result_datatype)) &&
(target_datatype->super.size * target_count <= acc_limit))) {
if (NULL == request) {
/* no user request: allocate an internal one to track the operation */
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
if (NULL == request) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
request->internal = true;
request->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
}
if (source_datatype) {
(void) ompi_datatype_get_extent (source_datatype, &lb, &extent);
source_buffer = (void *)((intptr_t) source_buffer + lb);
}
if (result_datatype) {
(void) ompi_datatype_get_extent (result_datatype, &lb, &extent);
result_buffer = (void *)((intptr_t) result_buffer + lb);
}
ret = ompi_osc_rdma_gacc_contig (sync, source_buffer, source_count, source_datatype, result_buffer,
result_count, result_datatype, peer, target_address,
target_handle, target_count, target_datatype, op,
request);
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
return OMPI_SUCCESS;
}
/* the fast path failed: undo the lower bound adjustments and fall through to
* the segmented path. NOTE(review): a request allocated above is kept and
* becomes the parent request below -- confirm this is intended */
if (source_datatype) {
/* the convertors will handle the lb */
(void) ompi_datatype_get_extent (source_datatype, &lb, &extent);
source_buffer = (void *)((intptr_t) source_buffer - lb);
}
if (result_datatype) {
(void) ompi_datatype_get_extent (result_datatype, &lb, &extent);
result_buffer = (void *)((intptr_t) result_buffer - lb);
}
}
/* the convertor will handle lb from here */
(void) ompi_datatype_get_extent (target_datatype, &lb, &extent);
target_address -= lb;
/* get the primitive datatype info */
ret = ompi_osc_base_get_primitive_type_info (target_datatype, &target_primitive, &target_primitive_count);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
/* target datatype is not made up of a single basic datatype */
return ret;
}
if (source_datatype) {
ret = ompi_osc_base_get_primitive_type_info (source_datatype, &source_primitive, &source_primitive_count);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
/* source datatype is not made up of a single basic datatype */
return ret;
}
/* source and target must be built from the same basic datatype */
if (OPAL_UNLIKELY(source_primitive != target_primitive)) {
return MPI_ERR_TYPE;
}
}
/* prepare convertors for the source and target. these convertors will be used to determine the
* contiguous segments within the source and target. */
/* the source may be NULL if using MPI_OP_NO_OP with MPI_Get_accumulate */
if (source_datatype) {
OBJ_CONSTRUCT(&source_convertor, opal_convertor_t);
ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_buffer,
0, &source_convertor);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
/* NOTE(review): source_convertor was constructed but is not destructed on this path */
return ret;
}
}
/* target_datatype can never be NULL */
OBJ_CONSTRUCT(&target_convertor, opal_convertor_t);
ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &target_datatype->super, target_count,
(void *) (intptr_t) target_address, 0, &target_convertor);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
/* NOTE(review): neither convertor is cleaned up or destructed on this path */
return ret;
}
if (request) {
/* keep the request from completing until all the transfers have started */
request->outstanding_requests = 1;
}
target_iov_index = 0;
target_iov_count = 0;
result_position = 0;
do {
/* decode segments of the source data */
source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
source_iov_index = 0;
/* opal_convertor_raw returns done when it has reached the end of the data */
if (!source_datatype) {
/* no origin data: use one unbounded dummy segment so that the target
* segments alone determine the length of each operation */
done = true;
source_iovec[0].iov_len = (size_t) -1;
source_iovec[0].iov_base = NULL;
source_iov_count = 1;
} else {
done = opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
}
/* loop on the target segments until we have exhausted the decoded source data */
while (source_iov_index != source_iov_count) {
if (target_iov_index == target_iov_count) {
/* decode segments of the target buffer */
target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
target_iov_index = 0;
(void) opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
}
/* we already checked that the target was large enough. this should be impossible */
assert (0 != target_iov_count);
/* determine how much to put in this operation */
acc_len = min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len);
acc_len = min((size_t) acc_len, acc_limit);
/* execute the get */
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
if (NULL == subreq) {
/* out of requests: progress the BTL and retry this segment */
ompi_osc_rdma_progress (module);
continue;
}
subreq->internal = true;
subreq->parent_request = request;
if (request) {
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
}
if (result_datatype) {
/* prepare a convertor for this part of the result */
opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count,
result_buffer, 0, &subreq->convertor);
opal_convertor_set_position (&subreq->convertor, &result_position);
subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC;
} else {
subreq->type = OMPI_OSC_RDMA_TYPE_ACC;
}
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
"target index = %d, target = {%p, %lu}, source_index = %d, source = {%p, %lu}, result = %p, result position = %lu, "
"acc_len = %d, count = %lu",
target_iov_index, target_iovec[target_iov_index].iov_base, (unsigned long) target_iovec[target_iov_index].iov_len,
source_iov_index, source_iovec[source_iov_index].iov_base, (unsigned long) source_iovec[source_iov_index].iov_len,
result_buffer, (unsigned long) result_position, acc_len, (unsigned long)(acc_len / target_primitive->super.size)));
/* no result buffer is passed here: for a get-accumulate the fetched data is
* unpacked through subreq->convertor by the get completion callback */
ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size,
target_primitive, NULL, 0, NULL, peer, (uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base,
target_handle, acc_len / target_primitive->super.size, target_primitive, op, subreq);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
/* something bad happened. need to figure out how to handle these errors */
/* NOTE(review): the convertors and subreq are not released on this path */
return ret;
}
/* progress and try again */
/* NOTE(review): the retry allocates a new subreq and increments
* request->outstanding_requests again; the subreq and the increment from this
* attempt do not appear to be undone -- verify */
ompi_osc_rdma_progress (module);
continue;
}
/* adjust io vectors */
target_iovec[target_iov_index].iov_len -= acc_len;
source_iovec[source_iov_index].iov_len -= acc_len;
target_iovec[target_iov_index].iov_base = (void *)((intptr_t) target_iovec[target_iov_index].iov_base + acc_len);
source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
result_position += acc_len;
/* move to the next segment once the current one is used up. without origin
* data the single dummy source segment is consumed by every operation */
source_iov_index += !source_datatype || (0 == source_iovec[source_iov_index].iov_len);
target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
}
} while (!done);
if (request) {
/* release our reference so the request can complete */
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
}
if (source_datatype) {
opal_convertor_cleanup (&source_convertor);
OBJ_DESTRUCT(&source_convertor);
}
opal_convertor_cleanup (&target_convertor);
OBJ_DESTRUCT(&target_convertor);
return OMPI_SUCCESS;
}
#if 0
/**
 * ompi_osc_rdma_cas_atomic_complete:
 *
 * @param[in] btl            BTL module (unused)
 * @param[in] endpoint       BTL endpoint (unused)
 * @param[in] local_address  16 byte scratch buffer: the first 8 bytes hold the value
 *                           returned by the atomic, the second 8 bytes hold the
 *                           address of the user's result buffer
 * @param[in] local_handle   registration handle for the scratch buffer (unused)
 * @param[in] context        synchronization object (ompi_osc_rdma_sync_t *)
 * @param[in] data           fragment that owns the scratch buffer (ompi_osc_rdma_frag_t *)
 * @param[in] status         completion status (not checked)
 *
 * Completion callback for a network compare-and-swap. Delivers the fetched
 * value to the user and drops the outstanding-rdma count and the fragment.
 */
static void ompi_osc_rdma_cas_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                               void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                               void *context, void *data, int status)
{
    const int64_t *scratch = (const int64_t *) local_address;
    /* the address of the user's result buffer was stashed in the second slot */
    void *destination = (void *)(intptr_t) scratch[1];

    /* hand the fetched value back to the user */
    memcpy (destination, local_address, 8);

    ompi_osc_rdma_sync_rdma_dec ((ompi_osc_rdma_sync_t *) context);
    ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
}
/**
 * ompi_osc_rdma_cas_atomic:
 *
 * @param[in]  sync            synchronization object for this access
 * @param[in]  source_buffer   value to store at the target if the compare matches
 * @param[in]  compare_buffer  value to compare the target against
 * @param[out] result_buffer   receives the value read from the target
 * @param[in]  datatype        datatype of the operands (only 8 byte types are supported)
 * @param[in]  peer            target peer
 * @param[in]  target_address  remote address of the target value
 * @param[in]  target_handle   registration handle for the target memory
 *
 * @returns OMPI_SUCCESS if the operation was started or completed
 * @returns OMPI_ERR_NOT_SUPPORTED if the datatype is not 8 bytes
 * @returns the fragment allocation or btl error code otherwise
 *
 * Perform the compare-and-swap with the btl's atomic cswap. A 16 byte
 * fragment is used as scratch space: the btl writes the fetched value into
 * the first 8 bytes and the second 8 bytes carry the address of the result
 * buffer to the completion callback.
 */
static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer,
                                            void *result_buffer, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer,
                                            uint64_t target_address, mca_btl_base_registration_handle_t *target_handle)
{
    ompi_osc_rdma_module_t *module = sync->module;
    ompi_osc_rdma_frag_t *frag = NULL;
    char *ptr;
    int ret;

    /* XXX -- TODO -- Update the BTL interface to allow for other CAS sizes */
    if (datatype->super.size != 8) {
        return OMPI_ERR_NOT_SUPPORTED;
    }

    ret = ompi_osc_rdma_frag_alloc (module, 16, &frag, &ptr);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* store the destination in the temporary buffer */
    ((int64_t *) ptr)[1] = (intptr_t) result_buffer;

    /* the completion callback interprets its context as the sync object (it calls
     * ompi_osc_rdma_sync_rdma_dec on it) so the sync, not the module, must be passed */
    ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address,
                                                  frag->handle, target_handle, ((int64_t *)compare_buffer)[0],
                                                  *((int64_t *) source_buffer), 0, MCA_BTL_NO_ORDER,
                                                  ompi_osc_rdma_cas_atomic_complete, sync, frag);
    if (OPAL_UNLIKELY(0 > ret)) {
        /* the callback will never run so the scratch fragment has to be released here */
        ompi_osc_rdma_frag_complete (frag);
        return ret;
    }

    if (1 != ret) {
        /* operation is in flight. the callback will decrement the count */
        ompi_osc_rdma_sync_rdma_inc (sync);
    } else {
        /* a return of 1 is treated as immediate completion: finish up inline */
        memcpy (result_buffer, ptr, 8);
        ompi_osc_rdma_frag_complete (frag);
    }

    return OMPI_SUCCESS;
}
#endif
/**
 * ompi_osc_rdma_cas_get_complete:
 *
 * @param[in] btl            BTL module the get was issued on (unused)
 * @param[in] endpoint       BTL endpoint (unused)
 * @param[in] local_address  start of the local fragment the target data was read into
 * @param[in] local_handle   registration handle for local_address
 * @param[in] context        compare-and-swap request (ompi_osc_rdma_request_t *)
 * @param[in] data           unused
 * @param[in] status         completion status of the get
 *
 * Completion callback for the get phase of an emulated compare-and-swap. The
 * value read from the target is copied to the result buffer and compared with
 * the compare buffer. On a match the origin value is written back to the target
 * with a put (completed by ompi_osc_rdma_acc_put_complete). Otherwise the
 * operation is finished here: the fragment and accumulate lock are released
 * and the request is completed.
 *
 * Note: This function will not work as is in a heterogeneous environment.
 *
 * NOTE(review): a failed get (status != OMPI_SUCCESS) is not handled; nothing
 * is released or completed in that case.
 * NOTE(review): the value is read at local_address + request->offset but the
 * put sends from local_address. These only agree when the offset is 0 -- confirm.
 */
static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                            void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                            void *context, void *data, int status)
{
    ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
    ompi_osc_rdma_sync_t *sync = request->sync;
    ompi_osc_rdma_module_t *module = sync->module;
    intptr_t source = (intptr_t) local_address + request->offset;
    ompi_osc_rdma_frag_t *frag = request->frag;
    ompi_osc_rdma_peer_t *peer = request->peer;
    int ret;

    if (OMPI_SUCCESS == status) {
        /* copy data to the user buffer (for gacc) */
        memcpy (request->result_addr, (void *) source, request->len);

        /* compare against the value fetched from the target. this must happen before
         * the fetched value is overwritten with the origin data */
        if (0 == memcmp ((void *) source, request->compare_addr, request->len)) {
            /* the target and compare buffers match so write the source to the target */
            memcpy ((void *) source, request->origin_addr, request->len);

            ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address,
                                                 request->target_address, local_handle,
                                                 (mca_btl_base_registration_handle_t *) request->ctx,
                                                 request->len, 0, MCA_BTL_NO_ORDER,
                                                 ompi_osc_rdma_acc_put_complete, request, NULL);
            if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
                OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, "could not start put to complete accumulate "
                                     "operation. opal return code: %d", ret));
            }

            /* TODO -- we can do better. probably should queue up the next step and handle it in progress */
            assert (OPAL_SUCCESS == ret);
        } else {
            /* this is a no-op. nothing more to do except release the accumulate lock */
            ompi_osc_rdma_frag_complete (frag);

            if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
                (void) ompi_osc_rdma_lock_release_exclusive (module, request->peer,
                                                             offsetof (ompi_osc_rdma_state_t, accumulate_lock));
            }

            /* the request is now complete and the outstanding rdma operation is complete */
            ompi_osc_rdma_request_complete (request, status);

            ompi_osc_rdma_sync_rdma_dec (sync);
            peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
        }
    }
}
/**
 * cas_rdma:
 *
 * @param[in]  sync            synchronization object for this access
 * @param[in]  source_buffer   value to store at the target if the compare matches
 * @param[in]  compare_buffer  value to compare the target against
 * @param[out] result_buffer   receives the value read from the target
 * @param[in]  datatype        datatype of the operands
 * @param[in]  peer            target peer
 * @param[in]  target_address  remote address of the target value
 * @param[in]  target_handle   registration handle for the target memory
 *
 * @returns OMPI_SUCCESS if the get was started
 * @returns OMPI_ERR_OUT_OF_RESOURCE if a request or fragment could not be allocated
 * @returns the btl error code if the get could not be started
 *
 * Emulate compare-and-swap with get + compare + put. This function waits for
 * any previous accumulate on the peer to finish, marks the peer as
 * accumulating, takes the accumulate lock (unless the peer is held
 * exclusively) and starts a get of the target value into a temporary
 * fragment. The operation continues in ompi_osc_rdma_cas_get_complete.
 *
 * On any failure everything acquired so far (request, fragment, accumulate
 * lock, accumulating flag) is released before returning.
 */
static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, void *result_buffer,
                            ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                            mca_btl_base_registration_handle_t *target_handle)
{
    ompi_osc_rdma_module_t *module = sync->module;
    const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
    unsigned long offset, aligned_len, len = datatype->super.size;
    ompi_osc_rdma_frag_t *frag = NULL;
    ompi_osc_rdma_request_t *request;
    char *ptr = NULL;
    int ret;

    OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
    if (NULL == request) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    request->internal = true;
    request->type = OMPI_OSC_RDMA_TYPE_CSWAP;
    request->sync = sync;

    OPAL_THREAD_LOCK(&module->lock);
    /* to ensure order wait until the previous accumulate completes */
    while (ompi_osc_rdma_peer_is_accumulating (peer)) {
        OPAL_THREAD_UNLOCK(&module->lock);
        ompi_osc_rdma_progress (module);
        OPAL_THREAD_LOCK(&module->lock);
    }

    peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
    OPAL_THREAD_UNLOCK(&module->lock);

    /* pad the transfer length so it satisfies the btl's get alignment */
    offset = target_address & btl_alignment_mask;
    aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;

    ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
        /* the request was never handed to the btl so nothing else will return it */
        OMPI_OSC_RDMA_REQUEST_RETURN(request);

        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                             "Could not allocate an rdma fragment for compare-and-swap"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
        (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
    }

    /* set up the request */
    request->frag = frag;
    request->origin_addr = (void *) source_buffer;
    request->ctx = (void *) target_handle;
    request->result_addr = result_buffer;
    request->compare_addr = compare_buffer;
    request->result_dt = datatype;
    request->offset = (ptrdiff_t) offset;
    request->target_address = target_address;
    request->len = len;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl get..."));
    ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr,
                                         target_address, frag->handle, target_handle,
                                         aligned_len, 0, MCA_BTL_NO_ORDER,
                                         ompi_osc_rdma_cas_get_complete, request, NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* the completion callback will not run. undo everything acquired above so the
         * peer is not left locked and marked as accumulating */
        ompi_osc_rdma_frag_complete (frag);

        if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
            (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
        }

        peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
        OMPI_OSC_RDMA_REQUEST_RETURN(request);
        return ret;
    }

    ompi_osc_rdma_sync_rdma_inc (sync);

    return OMPI_SUCCESS;
}
/**
 * ompi_osc_rdma_compare_and_swap:
 *
 * @param[in]  origin_addr   value to store at the target if the compare matches
 * @param[in]  compare_addr  value to compare the target against
 * @param[out] result_addr   receives the value read from the target
 * @param[in]  dt            datatype of the single element operated on
 * @param[in]  target_rank   rank of the target in the window's communicator
 * @param[in]  target_disp   displacement of the target element
 * @param[in]  win           window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns the error from the remote segment lookup or the underlying implementation
 *
 * Peers whose memory is locally addressable are handled by
 * ompi_osc_rdma_cas_local. All others use the get/compare/put sequence in
 * cas_rdma.
 */
int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr,
                                    struct ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp,
                                    struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    mca_btl_base_registration_handle_t *target_handle;
    ompi_osc_rdma_sync_t *sync;
    uint64_t target_address;
    int ret;

    sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "cswap: 0x%lx, 0x%lx, 0x%lx, %s, %d, %d, %s",
                         (unsigned long) origin_addr, (unsigned long) compare_addr, (unsigned long) result_addr,
                         dt->name, target_rank, (int) target_disp, win->w_name));

    /* look up the segment using the real element size (not a fixed 8 bytes) so the
     * range that is resolved matches the number of bytes actually accessed */
    ret = osc_rdma_get_remote_segment (module, peer, target_disp, dt->super.size, &target_address, &target_handle);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
        return ret;
    }

#if 0
    if (MCA_OSC_RDMA_SAME_OP <= module->accumulate_ops) {
        /* the user has indicated that they will only use the same op (or same op and no op)
         * for operations on overlapping memory ranges. that indicates it is safe to go ahead
         * and use network atomic operations. */
        ret = ompi_osc_rdma_cas_atomic (sync, origin_addr, compare_addr, result_addr, dt,
                                        peer, target_address, target_handle);
        if (OMPI_SUCCESS == ret) {
            return OMPI_SUCCESS;
        }
    } else
#endif
    if (ompi_osc_rdma_peer_local_base (peer)) {
        return ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt,
                                        peer, target_address, target_handle, module);
    }

    return cas_rdma (sync, origin_addr, compare_addr, result_addr, dt, peer, target_address,
                     target_handle);
}
/**
 * ompi_osc_rdma_rget_accumulate_internal:
 *
 * Common back end for accumulate, get-accumulate and fetch-and-op (and their
 * request-based variants). Empty operations complete immediately. Otherwise
 * the remote segment is resolved and the work is handed to the local or
 * network implementation depending on whether the peer's memory is locally
 * addressable.
 *
 * @param[in] request  request to complete when the operation finishes (may be NULL)
 *
 * @returns OMPI_SUCCESS or the error reported by the segment lookup or implementation
 */
static inline
int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
                                            struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count,
                                            struct ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer,
                                            int target_rank, MPI_Aint target_disp, int target_count,
                                            struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
                                            ompi_osc_rdma_request_t *request)
{
    mca_btl_base_registration_handle_t *remote_handle;
    uint64_t remote_address;
    int rc;

    /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */
    const bool empty_result = (NULL != result_addr) && (0 == result_count);
    if (empty_result || 0 == target_count) {
        if (NULL != request) {
            ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
        }

        return OMPI_SUCCESS;
    }

    rc = osc_rdma_get_remote_segment (sync->module, peer, target_disp, target_datatype->super.size * target_count,
                                      &remote_address, &remote_handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    if (!ompi_osc_rdma_peer_local_base (peer)) {
        /* the target memory has to be reached through the network */
        return ompi_osc_rdma_gacc_master (sync, origin_addr, origin_count, origin_datatype, result_addr, result_count,
                                          result_datatype, peer, remote_address, remote_handle, target_count,
                                          target_datatype, op, request);
    }

    /* local/self optimization */
    return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count,
                                     result_datatype, peer, remote_address, remote_handle, target_count,
                                     target_datatype, op, sync->module, request);
}
/**
 * ompi_osc_rdma_get_accumulate:
 *
 * Blocking-interface get-accumulate. Finds the synchronization object and peer
 * for the target rank, then forwards the operation to the shared
 * get-accumulate path without a request.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns the result of ompi_osc_rdma_rget_accumulate_internal otherwise
 */
int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count,
                                  struct ompi_datatype_t *origin_datatype,
                                  void *result_addr, int result_count,
                                  struct ompi_datatype_t *result_datatype,
                                  int target_rank, MPI_Aint target_disp,
                                  int target_count, struct ompi_datatype_t *target_datatype,
                                  struct ompi_op_t *op, struct ompi_win_t *win)
{
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *epoch = ompi_osc_rdma_module_sync_lookup (GET_MODULE(win), target_rank, &target_peer);

    if (OPAL_UNLIKELY(NULL == epoch)) {
        /* the target is not part of any active access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "get_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name,
                         (unsigned long) result_addr, result_count, result_datatype->name, target_rank,
                         (unsigned long) target_disp, target_count, target_datatype->name, op->o_name,
                         win->w_name));

    return ompi_osc_rdma_rget_accumulate_internal (epoch, origin_addr, origin_count, origin_datatype,
                                                   result_addr, result_count, result_datatype,
                                                   target_peer, target_rank, target_disp, target_count,
                                                   target_datatype, op, NULL);
}
/**
 * ompi_osc_rdma_rget_accumulate:
 *
 * Request-based get-accumulate. Allocates a request, runs the shared
 * get-accumulate path with it and, on success, hands the request back through
 * {request}. On failure the request is returned and {request} is not written.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 * @returns the result of ompi_osc_rdma_rget_accumulate_internal otherwise
 */
int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count,
                                   struct ompi_datatype_t *origin_datatype,
                                   void *result_addr, int result_count,
                                   struct ompi_datatype_t *result_datatype,
                                   int target_rank, MPI_Aint target_disp,
                                   int target_count, struct ompi_datatype_t *target_datatype,
                                   struct ompi_op_t *op, struct ompi_win_t *win,
                                   ompi_request_t **request)
{
    ompi_osc_rdma_module_t *osc_module = GET_MODULE(win);
    ompi_osc_rdma_request_t *new_request;
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *epoch;
    int rc;

    epoch = ompi_osc_rdma_module_sync_lookup (osc_module, target_rank, &target_peer);
    if (OPAL_UNLIKELY(NULL == epoch)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name,
                         (unsigned long) result_addr, result_count, result_datatype->name, target_rank,
                         (unsigned long) target_disp, target_count, target_datatype->name, op->o_name,
                         win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(osc_module, target_peer, new_request);
    if (OPAL_UNLIKELY(NULL == new_request)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rc = ompi_osc_rdma_rget_accumulate_internal (epoch, origin_addr, origin_count, origin_datatype, result_addr,
                                                 result_count, result_datatype, target_peer, target_rank, target_disp,
                                                 target_count, target_datatype, op, new_request);
    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        /* the caller now owns the request */
        *request = &new_request->super;
        return OMPI_SUCCESS;
    }

    /* nothing was started so give the request back */
    OMPI_OSC_RDMA_REQUEST_RETURN(new_request);
    return rc;
}
/**
 * ompi_osc_rdma_fetch_and_op:
 *
 * Fetch-and-op on a single element. Implemented as a get-accumulate where the
 * origin, result and target all have a count of 1 and the same datatype.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns the result of ompi_osc_rdma_rget_accumulate_internal otherwise
 */
int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target_rank,
                                OPAL_PTRDIFF_TYPE target_disp, struct ompi_op_t *op, struct ompi_win_t *win)
{
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *epoch = ompi_osc_rdma_module_sync_lookup (GET_MODULE(win), target_rank, &target_peer);

    if (OPAL_UNLIKELY(NULL == epoch)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "fop: %p, %s, %d, %lu, %s, %s",
                         result_addr, dt->name, target_rank, (unsigned long) target_disp, op->o_name, win->w_name));

    /* one element in, one element out, one element at the target */
    return ompi_osc_rdma_rget_accumulate_internal (epoch, origin_addr, 1, dt, result_addr, 1, dt, target_peer,
                                                   target_rank, target_disp, 1, dt, op, NULL);
}
/**
 * ompi_osc_rdma_raccumulate:
 *
 * Request-based accumulate. Runs the shared get-accumulate path with no
 * result buffer (NULL address, count 0, NULL datatype) and a freshly
 * allocated request. The request is handed back through {request} only on
 * success.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 * @returns the result of ompi_osc_rdma_rget_accumulate_internal otherwise
 */
int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count,
                               struct ompi_datatype_t *origin_datatype, int target_rank,
                               OPAL_PTRDIFF_TYPE target_disp, int target_count,
                               struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
                               struct ompi_win_t *win, struct ompi_request_t **request)
{
    ompi_osc_rdma_module_t *osc_module = GET_MODULE(win);
    ompi_osc_rdma_request_t *new_request;
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *epoch;
    int rc;

    epoch = ompi_osc_rdma_module_sync_lookup (osc_module, target_rank, &target_peer);
    if (OPAL_UNLIKELY(NULL == epoch)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "racc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
                         (unsigned long) target_disp, target_count, target_datatype->name, op->o_name, win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(osc_module, target_peer, new_request);
    if (OPAL_UNLIKELY(NULL == new_request)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rc = ompi_osc_rdma_rget_accumulate_internal (epoch, origin_addr, origin_count, origin_datatype, NULL, 0,
                                                 NULL, target_peer, target_rank, target_disp, target_count,
                                                 target_datatype, op, new_request);
    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        /* the caller now owns the request */
        *request = &new_request->super;
        return OMPI_SUCCESS;
    }

    /* nothing was started so give the request back */
    OMPI_OSC_RDMA_REQUEST_RETURN(new_request);
    return rc;
}
/**
 * ompi_osc_rdma_accumulate:
 *
 * Blocking-interface accumulate. Runs the shared get-accumulate path with no
 * result buffer (NULL address, count 0, NULL datatype) and no request.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns the result of ompi_osc_rdma_rget_accumulate_internal otherwise
 */
int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count,
                              struct ompi_datatype_t *origin_datatype, int target_rank,
                              OPAL_PTRDIFF_TYPE target_disp, int target_count,
                              struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
                              struct ompi_win_t *win)
{
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *epoch = ompi_osc_rdma_module_sync_lookup (GET_MODULE(win), target_rank, &target_peer);

    if (OPAL_UNLIKELY(NULL == epoch)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "acc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
                         (unsigned long) target_disp, target_count, target_datatype->name, op->o_name, win->w_name));

    return ompi_osc_rdma_rget_accumulate_internal (epoch, origin_addr, origin_count, origin_datatype, NULL, 0,
                                                   NULL, target_peer, target_rank, target_disp, target_count,
                                                   target_datatype, op, NULL);
}

57
ompi/mca/osc/rdma/osc_rdma_accumulate.h Обычный файл
Просмотреть файл

@ -0,0 +1,57 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OSC_RDMA_ACCUMULATE_H)
#define OSC_RDMA_ACCUMULATE_H
#include "osc_rdma.h"
/**
 * Compare-and-swap on a single element of type {dt} located at
 * {target_disp} in the window of rank {target}. The value read from the
 * target is stored in {result_addr}.
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 */
int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr,
void *result_addr, struct ompi_datatype_t *dt,
int target, OPAL_PTRDIFF_TYPE target_disp,
struct ompi_win_t *win);
/**
 * Accumulate the origin data into the target using {op}. No result is
 * returned to the caller.
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 */
int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count,
struct ompi_datatype_t *origin_dt,
int target, OPAL_PTRDIFF_TYPE target_disp,
int target_count, struct ompi_datatype_t *target_dt,
struct ompi_op_t *op, struct ompi_win_t *win);
/**
 * Fetch-and-op on a single element of type {dt}. Implemented as a
 * get-accumulate with origin, result and target counts of 1.
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 */
int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr,
struct ompi_datatype_t *dt, int target,
OPAL_PTRDIFF_TYPE target_disp,
struct ompi_op_t *op, struct ompi_win_t *win);
/**
 * Get-accumulate: apply {op} with the origin data at the target and return
 * data from the target in {result_addr}.
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 */
int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count,
struct ompi_datatype_t *origin_datatype,
void *result_addr, int result_count,
struct ompi_datatype_t *result_datatype,
int target_rank, MPI_Aint target_disp,
int target_count, struct ompi_datatype_t *target_datatype,
struct ompi_op_t *op, struct ompi_win_t *win);
/**
 * Request-based variant of ompi_osc_rdma_accumulate. On success {request}
 * is set to a request the caller can use to track the operation.
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 */
int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count,
struct ompi_datatype_t *origin_dt,
int target, OPAL_PTRDIFF_TYPE target_disp,
int target_count, struct ompi_datatype_t *target_dt,
struct ompi_op_t *op, struct ompi_win_t *win,
struct ompi_request_t **request);
/**
 * Request-based variant of ompi_osc_rdma_get_accumulate. On success
 * {request} is set to a request the caller can use to track the operation.
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers the target
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 */
int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count,
struct ompi_datatype_t *origin_datatype,
void *result_addr, int result_count,
struct ompi_datatype_t *result_datatype,
int target_rank, MPI_Aint target_disp,
int target_count, struct ompi_datatype_t *target_datatype,
struct ompi_op_t *op, struct ompi_win_t *win,
struct ompi_request_t **request);
#endif /* OSC_RDMA_ACCUMULATE_H */

652
ompi/mca/osc/rdma/osc_rdma_active_target.c Обычный файл
Просмотреть файл

@ -0,0 +1,652 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_rdma_frag.h"
#include "osc_rdma_active_target.h"
#include "mpi.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/base/base.h"
/**
 * ompi_osc_rdma_pending_post_t:
 *
 * Describes a post operation that was encountered outside its
 * matching start operation. Instances are queued on the module's
 * pending_posts list by ompi_osc_rdma_handle_post and consumed by
 * ompi_osc_rdma_start_atomic.
 */
struct ompi_osc_rdma_pending_post_t {
/** list item base so the object can be placed on an opal list */
opal_list_item_t super;
/** rank of the process that posted */
int rank;
};
typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;
/* class definition: derives from opal_list_item_t */
static OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);
/**
 * ompi_osc_rdma_atomic_complete:
 *
 * @param[in] context  optional pointer to a (volatile bool) completion flag
 *
 * Generic completion callback for atomic operations. If the caller supplied
 * a flag through {context} it is set to true; all other arguments are
 * ignored.
 */
void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                    void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                    void *context, void *data, int status)
{
    if (NULL == context) {
        /* the caller is not waiting on this operation */
        return;
    }

    *((volatile bool *) context) = true;
}
/**
 * compare_ranks:
 *
 * @param[in] ptra  Pointer to the first integer
 * @param[in] ptrb  Pointer to the second integer
 *
 * @returns -1 if *ptra < *ptrb
 * @returns 0 if *ptra == *ptrb
 * @returns 1 if *ptra > *ptrb
 *
 * qsort comparison callback used to order the translated rank list. It
 * can be removed if groups are always in order.
 */
static int compare_ranks (const void *ptra, const void *ptrb)
{
    const int lhs = *(const int *) ptra;
    const int rhs = *(const int *) ptrb;

    /* each comparison yields 0 or 1 so the difference is -1, 0, or 1 */
    return (lhs > rhs) - (lhs < rhs);
}
/**
 * ompi_osc_rdma_get_peers:
 *
 * @param[in] module    - OSC RDMA module
 * @param[in] sub_group - Group with ranks to translate
 *
 * @returns an array of retained peer objects on success or NULL on failure
 *
 * Translate the ranks given in {sub_group} into ranks in the
 * communicator used to create {module}, sort them, and look up the peer
 * object for each rank. Every peer in the returned array has been retained;
 * release the array with ompi_osc_rdma_release_peers. On failure no
 * references or memory are left behind.
 *
 * Callers in this file only invoke this function for non-empty groups.
 */
static ompi_osc_rdma_peer_t **ompi_osc_rdma_get_peers (ompi_osc_rdma_module_t *module, ompi_group_t *sub_group)
{
    int size = ompi_group_size(sub_group);
    ompi_osc_rdma_peer_t **peers;
    int *ranks1, *ranks2;
    int ret;

    ranks1 = malloc (sizeof(int) * size);
    ranks2 = malloc (sizeof(int) * size);
    peers = malloc (sizeof (ompi_osc_rdma_peer_t *) * size);
    if (NULL == ranks1 || NULL == ranks2 || NULL == peers) {
        free (ranks1);
        free (ranks2);
        free (peers);
        /* bail out. continuing would use the buffers that were just freed */
        return NULL;
    }

    /* identity list: translate every rank of the sub-group */
    for (int i = 0 ; i < size ; ++i) {
        ranks1[i] = i;
    }

    ret = ompi_group_translate_ranks (sub_group, size, ranks1, module->comm->c_local_group,
                                      ranks2);
    free (ranks1);
    if (OMPI_SUCCESS != ret) {
        free (ranks2);
        free (peers);
        return NULL;
    }

    qsort (ranks2, size, sizeof (int), compare_ranks);

    for (int i = 0 ; i < size ; ++i) {
        peers[i] = ompi_osc_rdma_module_peer (module, ranks2[i]);
        if (NULL == peers[i]) {
            /* drop the references taken on the peers found so far */
            for (int j = 0 ; j < i ; ++j) {
                OBJ_RELEASE(peers[j]);
            }

            free (peers);
            peers = NULL;
            break;
        }

        OBJ_RETAIN(peers[i]);
    }

    free (ranks2);

    return peers;
}
/**
 * ompi_osc_rdma_release_peers:
 *
 * @param[in] peers   array of retained peer objects
 * @param[in] npeers  number of entries in {peers}
 *
 * Release the reference held on each peer and free the array itself.
 */
static void ompi_osc_rdma_release_peers (ompi_osc_rdma_peer_t **peers, int npeers)
{
    int idx = 0;

    while (idx < npeers) {
        OBJ_RELEASE(peers[idx]);
        ++idx;
    }

    free (peers);
}
/**
 * ompi_osc_rdma_handle_post:
 *
 * @param[in] module  OSC RDMA module
 * @param[in] rank    rank of the process that posted
 * @param[in] peers   peers of the current start epoch (not read when npeers is 0)
 * @param[in] npeers  number of entries in {peers}
 *
 * Process a post message. If the posting rank is in {peers} the count of
 * received post messages is incremented. Otherwise the post is queued on the
 * module's pending_posts list for a later start.
 */
static void ompi_osc_rdma_handle_post (ompi_osc_rdma_module_t *module, int rank, ompi_osc_rdma_peer_t **peers, int npeers) {
    ompi_osc_rdma_state_t *state = module->state;
    ompi_osc_rdma_pending_post_t *queued;
    int idx = 0;

    /* look for the posting peer in the group */
    while (idx < npeers && rank != peers[idx]->rank) {
        ++idx;
    }

    if (idx < npeers) {
        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                             "got expected post from %d. still expecting posts from %d processes",
                             rank, (int) (npeers - state->num_post_msgs - 1)));
        ++state->num_post_msgs;
        return;
    }

    /* post does not belong to this start epoch. save it for later */
    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "got unexpected post from %d "
                         ". queueing for later", rank));

    queued = OBJ_NEW(ompi_osc_rdma_pending_post_t);
    queued->rank = rank;
    OPAL_THREAD_SCOPED_LOCK(&module->lock, opal_list_append (&module->pending_posts, &queued->super));
}
/**
 * ompi_osc_rdma_post_atomic:
 *
 * @param[in] group   group of processes that will access this window
 * @param[in] assert  assertion flags. with MPI_MODE_NOCHECK no post messages are sent
 * @param[in] win     window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if a post epoch is already active on this window
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the scratch fragment or peer list could not be allocated
 *
 * Start an exposure epoch. The group is retained and saved in the module,
 * then a post message is delivered to every process in the group. A post is
 * delivered by atomically reserving an index in the peer's post_peers array
 * (fetch-and-add on post_index) and then writing this rank + 1 into that slot
 * with a compare-and-swap against 0. While waiting for a slot to free up,
 * incoming posts are drained to avoid a circular wait.
 */
int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t **peers;
    int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_rdma_state_t *state = module->state;
    volatile bool atomic_complete;
    ompi_osc_rdma_frag_t *frag;
    osc_rdma_counter_t *temp;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_post_atomic entering..."));

    /* check if we are already in a post epoch */
    if (module->pw_group) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* ensure we're not already in a post */
    if (NULL != module->pw_group) {
        OPAL_THREAD_UNLOCK(&(module->lock));
        return OMPI_ERR_RMA_SYNC;
    }

    /* save the group. the reference is only taken once it is certain the group will
     * be stored so the error return above does not leak it */
    OBJ_RETAIN(group);
    ompi_group_increment_proc_count(group);
    module->pw_group = group;

    /* Update completion counter.  Can't have received any completion
       messages yet; complete won't send a completion header until
       we've sent a post header. */
    state->num_complete_msgs = 0;
    OPAL_THREAD_UNLOCK(&module->lock);

    if ((assert & MPI_MODE_NOCHECK) || 0 == ompi_group_size (group)) {
        /* no post messages to send. return before allocating the scratch fragment so
         * it is not leaked */
        return OMPI_SUCCESS;
    }

    /* allocate a temporary buffer for atomic response */
    ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* translate group ranks into the communicator */
    peers = ompi_osc_rdma_get_peers (module, module->pw_group);
    if (OPAL_UNLIKELY(NULL == peers)) {
        ompi_osc_rdma_frag_complete (frag);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "sending post messages"));

    /* send a hello counter to everyone in group */
    for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) {
        ompi_osc_rdma_peer_t *peer = peers[i];
        uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index);
        int post_index;

        if (peer->rank == my_rank) {
            /* posting to ourselves needs no communication */
            ompi_osc_rdma_handle_post (module, my_rank, NULL, 0);
            continue;
        }

        /* get a post index */
        atomic_complete = false;
        if (!ompi_osc_rdma_peer_local_state (peer)) {
            do {
                ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, target, frag->handle,
                                                            peer->state_handle, MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
                                                            ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
                assert (OPAL_SUCCESS >= ret);

                if (OMPI_SUCCESS == ret) {
                    while (!atomic_complete) {
                        ompi_osc_rdma_progress (module);
                    }

                    break;
                }

                ompi_osc_rdma_progress (module);
            } while (1);
        } else {
            /* the peer's state is directly addressable. subtract 1 to get the value before the add */
            *temp = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
        }

        /* the post array is used as a ring: wrap the reserved index */
        post_index = (*temp) & (OMPI_OSC_RDMA_POST_PEER_MAX - 1);

        target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) +
            sizeof (osc_rdma_counter_t) * post_index;

        do {
            OPAL_OUTPUT_VERBOSE((80, ompi_osc_base_framework.framework_output,
                                 "Attempting to post to index %d @ rank %d", post_index, peer->rank));

            /* try to post. if the value isn't 0 then another rank is occupying this index */
            if (!ompi_osc_rdma_peer_local_state (peer)) {
                atomic_complete = false;
                ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, target, frag->handle, peer->state_handle,
                                                              0, 1 + (int64_t) my_rank, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
                                                              (void *) &atomic_complete, NULL);
                assert (OPAL_SUCCESS >= ret);

                if (OMPI_SUCCESS == ret) {
                    while (!atomic_complete) {
                        ompi_osc_rdma_progress (module);
                    }
                } else {
                    ompi_osc_rdma_progress (module);
                    continue;
                }

            } else {
                /* store 0 on success so both paths can be checked the same way below */
                *temp = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
            }

            if (OPAL_LIKELY(0 == *temp)) {
                /* the slot was free and now holds our post */
                break;
            }

            /* prevent circular wait by checking for post messages received */
            for (int j = 0 ; j < OMPI_OSC_RDMA_POST_PEER_MAX ; ++j) {
                /* no post at this index (yet) */
                if (0 == state->post_peers[j]) {
                    continue;
                }

                ompi_osc_rdma_handle_post (module, state->post_peers[j] - 1, NULL, 0);
                state->post_peers[j] = 0;
            }

            usleep (100);
        } while (1);
    }

    ompi_osc_rdma_frag_complete (frag);

    ompi_osc_rdma_release_peers (peers, ompi_group_size(module->pw_group));

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "post complete"));

    return OMPI_SUCCESS;
}
/**
 * ompi_osc_rdma_start_atomic:
 *
 * @param[in] group   group of processes this process will access
 * @param[in] assert  assertion flags. with MPI_MODE_NOCHECK the wait for posts is skipped
 * @param[in] win     window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if an access epoch is already active
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the peer list could not be built
 *
 * Start a PSCW access epoch. The module's all_sync object is set up for the
 * group, previously queued posts from group members are consumed, and (unless
 * MPI_MODE_NOCHECK is given) the function waits until a post has been seen
 * from every process in the group. If the peer list cannot be built the
 * sync type and active flag are restored so no epoch is left open.
 */
int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_pending_post_t *pending_post, *next;
    ompi_osc_rdma_state_t *state = module->state;
    ompi_osc_rdma_sync_t *sync = &module->all_sync;
    int group_size = ompi_group_size (group);
    bool prev_epoch_active;
    int prev_type;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_start entering..."));

    OPAL_THREAD_LOCK(&module->lock);

    /* check if we are already in an access epoch */
    if (ompi_osc_rdma_access_epoch_active (module)) {
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* mark all procs in this group as being in an access epoch */
    sync->num_peers = ompi_group_size (group);
    sync->sync.pscw.group = group;

    /* haven't processed any post messages yet */
    state->num_post_msgs = 0;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_start entering with group size %d...",
                         sync->num_peers));

    if (0 == ompi_group_size (group)) {
        /* nothing more to do. this is an empty start epoch */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_SUCCESS;
    }

    opal_atomic_wmb ();

    /* remember the current state so it can be restored if the epoch can not be started */
    prev_type = (int) sync->type;
    prev_epoch_active = sync->epoch_active;

    sync->type = OMPI_OSC_RDMA_SYNC_TYPE_PSCW;

    /* prevent us from entering a passive-target, fence, or another pscw access epoch until
     * the matching complete is called */
    sync->epoch_active = true;

    /* translate the group ranks into the communicator */
    sync->peer_list.peers = ompi_osc_rdma_get_peers (module, group);
    if (NULL == sync->peer_list.peers) {
        /* the epoch was not started. undo the changes above so a later complete does
         * not release a group that was never retained */
        sync->type = prev_type;
        sync->epoch_active = prev_epoch_active;
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* save the group */
    OBJ_RETAIN(group);
    ompi_group_increment_proc_count(group);

    if (!(assert & MPI_MODE_NOCHECK)) {
        /* look through list of pending posts */
        OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_rdma_pending_post_t) {
            for (int i = 0 ; i < group_size ; ++i) {
                ompi_osc_rdma_peer_t *peer = sync->peer_list.peers[i];

                if (pending_post->rank == peer->rank) {
                    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                         "found queued post from %d. still expecting posts from %d processes",
                                         peer->rank, (int) (group_size - state->num_post_msgs - 1)));
                    opal_list_remove_item (&module->pending_posts, &pending_post->super);
                    OBJ_RELEASE(pending_post);
                    /* only one thread can process post messages so there is no need of atomics here */
                    ++state->num_post_msgs;
                    break;
                }
            }
        }

        /* wait for all post messages to arrive */
        while (state->num_post_msgs != group_size) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "Waiting for post messages. Have %d of %d",
                                 (int) state->num_post_msgs, group_size));
            for (int i = 0 ; i < OMPI_OSC_RDMA_POST_PEER_MAX ; ++i) {
                /* no post at this index (yet) */
                if (0 == state->post_peers[i]) {
                    continue;
                }

                /* slots hold rank + 1 so that 0 can mean empty */
                ompi_osc_rdma_handle_post (module, state->post_peers[i] - 1, sync->peer_list.peers, group_size);
                state->post_peers[i] = 0;
            }

            ompi_osc_rdma_progress (module);
        }
    } else {
        /* the user asserted the matching posts have already happened */
        state->num_post_msgs = group_size;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_start complete"));

    OPAL_THREAD_UNLOCK(&module->lock);
    return OMPI_SUCCESS;
}
/**
 * ompi_osc_rdma_complete_atomic:
 *
 * @param[in] win  window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no PSCW access epoch is active
 *
 * End a PSCW access epoch. The sync object is reset, the reference on the
 * group is released (exactly once), outstanding rdma operations are
 * completed, and the num_complete_msgs counter of every peer in the group is
 * incremented by one.
 */
int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *sync = &module->all_sync;
    ompi_osc_rdma_peer_t **peers;
    ompi_group_t *group;
    int group_size;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete entering..."));

    OPAL_THREAD_LOCK(&module->lock);
    if (OMPI_OSC_RDMA_SYNC_TYPE_PSCW != sync->type) {
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* phase 1 cleanup sync object */
    group = sync->sync.pscw.group;
    group_size = sync->num_peers;
    sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    sync->epoch_active = false;

    /* phase 2 cleanup group */
    ompi_group_decrement_proc_count(group);
    OBJ_RELEASE(group);

    peers = sync->peer_list.peers;
    if (NULL == peers) {
        /* empty peer list. the group reference was already dropped above and must not
         * be released a second time */
        OPAL_THREAD_UNLOCK(&(module->lock));
        return OMPI_SUCCESS;
    }

    sync->peer_list.peers = NULL;

    OPAL_THREAD_UNLOCK(&(module->lock));

    ompi_osc_rdma_sync_rdma_complete (sync);

    /* for each process in the group increment their number of complete messages */
    for (int i = 0 ; i < group_size ; ++i) {
        ompi_osc_rdma_peer_t *peer = peers[i];
        intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_complete_msgs);
        int ret;

        if (!ompi_osc_rdma_peer_local_state (peer)) {
            do {
                /* the atomic operation is passed before the operand, matching the
                 * fetch-and-add issued when posting */
                if (MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags) {
                    ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, target, peer->state_handle,
                                                               MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
                                                               ompi_osc_rdma_atomic_complete, NULL, NULL);
                } else {
                    /* don't care about the read value so use the scratch lock */
                    ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, &module->state->scratch_lock,
                                                                target, module->state_handle, peer->state_handle, MCA_BTL_ATOMIC_ADD, 1,
                                                                0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, NULL, NULL);
                }

                if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
                    break;
                }

                /* drive progress before retrying so the btl can free up resources */
                ompi_osc_rdma_progress (module);
            } while (1);
        } else {
            /* the peer's state is directly addressable */
            (void) ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) target, 1);
        }
    }

    /* release our reference to peers in this group */
    ompi_osc_rdma_release_peers (peers, group_size);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_complete complete"));

    return OMPI_SUCCESS;
}
/**
 * ompi_osc_rdma_wait_atomic:
 *
 * @param[in] win  window
 *
 * @returns OMPI_SUCCESS once all complete messages have arrived
 * @returns OMPI_ERR_RMA_SYNC if there is no active post group
 *
 * End an exposure epoch. Progress is driven until the number of complete
 * messages equals the size of the post group, then the counter is reset and
 * the reference on the post group is released.
 */
int ompi_osc_rdma_wait_atomic (ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_state_t *state = module->state;
    ompi_group_t *post_group;
    int expected;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_wait entering..."));

    OPAL_THREAD_LOCK(&module->lock);
    if (NULL == module->pw_group) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_wait_atomic no post group"));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* one complete message is expected from each process in the post group */
    expected = ompi_group_size (module->pw_group);
    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_wait_atomic group size %d, complete messages %d",
                         expected, (int) state->num_complete_msgs));

    for (;;) {
        if (expected == state->num_complete_msgs) {
            break;
        }

        ompi_osc_rdma_progress (module);
        opal_atomic_mb ();
    }

    /* reset the counter and detach the post group from the module */
    OPAL_THREAD_LOCK(&module->lock);
    state->num_complete_msgs = 0;
    post_group = module->pw_group;
    module->pw_group = NULL;
    OPAL_THREAD_UNLOCK(&module->lock);

    ompi_group_decrement_proc_count(post_group);
    OBJ_RELEASE(post_group);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_wait complete"));

    return OMPI_SUCCESS;
}
/**
 * @brief non-blocking check for completion of the current post group
 *
 * @param[in]  win   window whose osc rdma module is tested
 * @param[out] flag  set to non-zero if all completion messages have arrived
 *
 * @returns OMPI_ERR_RMA_SYNC if the module has no post group
 * @returns OMPI_SUCCESS otherwise (whether or not the epoch completed)
 *
 * When the completion counter matches the size of the post group the counter
 * is reset and the group is detached and released. Otherwise progress is
 * driven once and the function returns with *flag == 0.
 */
int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_state_t *state = module->state;
    ompi_group_t *group = NULL;
    int group_size;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_test_atomic entering..."));

    OPAL_THREAD_LOCK(&module->lock);

    if (NULL == module->pw_group) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_test_atomic no post group"));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    group_size = ompi_group_size (module->pw_group);
    *flag = (group_size == state->num_complete_msgs);

    if (*flag) {
        /* check and teardown happen in the same critical section so that two
         * concurrent callers can not both observe completion and release the
         * group twice. the counter reset is also done under the lock, matching
         * ompi_osc_rdma_wait_atomic(). */
        state->num_complete_msgs = 0;
        group = module->pw_group;
        module->pw_group = NULL;
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_test_atomic flag %d", *flag));

    if (!*flag) {
        /* not done yet. give the network a chance to make progress */
        ompi_osc_rdma_progress (module);
        return OMPI_SUCCESS;
    }

    /* drop the references held on the group */
    ompi_group_decrement_proc_count(group);
    OBJ_RELEASE(group);

    return OMPI_SUCCESS;
}
/**
 * @brief fence synchronization for the osc rdma module
 *
 * @param[in] assert  assertion flags (MPI_MODE_NOSUCCEED and MPI_MODE_NOPRECEDE are inspected)
 * @param[in] win     window to synchronize
 *
 * @returns OMPI_ERR_RMA_SYNC if the module is in a passive target epoch
 * @returns the return code of the communicator barrier otherwise
 */
int ompi_osc_rdma_fence_atomic (int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    const bool no_succeed = (0 != (assert & MPI_MODE_NOSUCCEED));
    const bool no_precede = (0 != (assert & MPI_MODE_NOPRECEDE));
    int rc = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: fence start"));

    /* a fence can not be started while a lock (passive target) epoch is active */
    if (ompi_osc_rdma_in_passive_epoch (module)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: could not enter fence. already in an access epoch"));
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    if (!no_succeed) {
        /* a new fence epoch covering the whole communicator starts here. with
         * NOSUCCEED the epoch is closed further down instead. */
        module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_FENCE;
        module->all_sync.num_peers = ompi_comm_size (module->comm);
        /* NTH: should add a fast access array for peers here later. for now just use the
         * hash table. */
    }

    /* the epoch is not marked active until communication occurs: a lock epoch may
     * still be entered (closing the fence epoch) if there was none. the flag is set
     * on the next put, get, accumulate, etc. */
    module->all_sync.epoch_active = false;

    if (no_precede) {
        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                             "osc rdma: fence end (short circuit)"));

        /* nothing to flush, but peers must have entered the same fence epoch before
         * communication can occur. a barrier is used to guarantee that for now. */
        rc = module->comm->c_coll.coll_barrier(module->comm, module->comm->c_coll.coll_barrier_module);
        OPAL_THREAD_UNLOCK(&module->lock);
        return rc;
    }

    /* complete all outstanding rdma operations of the fence sync object */
    ompi_osc_rdma_sync_rdma_complete (&module->all_sync);

    /* ensure all writes to my memory are complete */
    rc = module->comm->c_coll.coll_barrier(module->comm, module->comm->c_coll.coll_barrier_module);

    if (no_succeed) {
        /* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
         * stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */
        module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: fence end: %d", rc));

    OPAL_THREAD_UNLOCK(&module->lock);

    return rc;
}

42
ompi/mca/osc/rdma/osc_rdma_active_target.h Обычный файл
Просмотреть файл

@ -0,0 +1,42 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OSC_RDMA_ACTIVE_TARGET_H)
#define OSC_RDMA_ACTIVE_TARGET_H
#include "osc_rdma.h"
#include "osc_rdma_sync.h"
#include "osc_rdma_lock.h"
/*
 * Active target synchronization entry points of the osc rdma component. All of
 * them take the window to operate on and return an OMPI error code.
 * NOTE(review): the names suggest a one-to-one mapping to MPI_Win_fence, MPI_Win_start,
 * MPI_Win_complete, MPI_Win_post, MPI_Win_wait and MPI_Win_test -- confirm against the
 * module function table.
 */

/* fence synchronization; assert carries the MPI_MODE_* assertion flags */
int ompi_osc_rdma_fence_atomic (int assert, struct ompi_win_t *win);
/* start: takes the group to work with and assertion flags */
int ompi_osc_rdma_start_atomic (struct ompi_group_t *group,
int assert, struct ompi_win_t *win);
/* complete: counterpart of start (takes only the window) */
int ompi_osc_rdma_complete_atomic (struct ompi_win_t *win);
/* post: takes the group to work with and assertion flags */
int ompi_osc_rdma_post_atomic (struct ompi_group_t *group,
int assert, struct ompi_win_t *win);
/* wait: blocks until the completion counter matches the size of the post group */
int ompi_osc_rdma_wait_atomic (struct ompi_win_t *win);
/* test: non-blocking variant of wait; *flag reports whether the counter matched */
int ompi_osc_rdma_test_atomic (struct ompi_win_t *win, int *flag);
#endif /* OSC_RDMA_ACTIVE_TARGET_H */

874
ompi/mca/osc/rdma/osc_rdma_comm.c Обычный файл
Просмотреть файл

@ -0,0 +1,874 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_rdma_comm.h"
#include "osc_rdma_sync.h"
#include "osc_rdma_request.h"
#include "osc_rdma_dynamic.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
/**
 * @brief btl completion callback used by ompi_osc_get_data_blocking()
 *
 * @param[in] context  pointer to the boolean completion flag of the waiting caller
 * @param[in] status   completion status of the get (asserted to be OPAL_SUCCESS)
 *
 * The remaining arguments are part of the btl callback signature and are not used.
 */
static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                        void *context, void *data, int status)
{
    bool *completed = (bool *) context;

    assert (OPAL_SUCCESS == status);

    /* wake up the caller spinning on the flag */
    *completed = true;
}
/**
 * @brief read remote data with a btl get and block until it has arrived
 *
 * @param[in]  module          osc rdma module
 * @param[in]  endpoint        btl endpoint of the peer to read from
 * @param[in]  source_address  remote address to read (asserted to satisfy the btl get alignment)
 * @param[in]  source_handle   btl registration handle of the remote region
 * @param[out] data            local buffer that receives the data
 * @param[in]  len             number of bytes to read
 *
 * @returns OMPI_SUCCESS on success
 * @returns error code of the fragment allocation or of the btl get on failure
 *
 * If the btl has a registration function and len reaches the btl's local
 * registration threshold for get, the data is staged through a fragment and
 * copied into data once the get has completed.
 */
int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
                                uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
                                void *data, size_t len)
{
    mca_btl_base_registration_handle_t *local_handle = NULL;
    ompi_osc_rdma_frag_t *frag = NULL;
    volatile bool read_complete = false;
    char *ptr = data;
    int ret;

    if (module->selected_btl->btl_register_mem && len >= module->selected_btl->btl_get_local_registration_threshold) {
        /* stage the transfer through a fragment */
        ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "error allocating a fragment!"));
            return ret;
        }

        local_handle = frag->handle;
    }

    assert (!(source_address & (module->selected_btl->btl_get_alignment - 1)));

    do {
        ret = module->selected_btl->btl_get (module->selected_btl, endpoint, ptr, source_address,
                                             local_handle, source_handle, len, 0, MCA_BTL_NO_ORDER,
                                             ompi_osc_get_data_complete, (void *) &read_complete, NULL);
        /* both out-of-resource codes are transient, as in the put and get paths of
         * this file. anything else (success or a fatal error) ends the retry loop. */
        if (OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
            break;
        }

        ompi_osc_rdma_progress (module);
    } while (1);

    if (OPAL_UNLIKELY(OMPI_SUCCESS > ret)) {
        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "btl get failed with ret = %d", ret));

        if (frag) {
            ompi_osc_rdma_frag_complete (frag);
        }

        return ret;
    }

    /* block until the callback is called */
    while (!read_complete) {
        ompi_osc_rdma_progress (module);
    }

    opal_memchecker_base_mem_defined (ptr, len);

    if (frag) {
        /* copy the data out of the staging buffer */
        memcpy (data, ptr, len);

        /* done with the fragment */
        ompi_osc_rdma_frag_complete (frag);
    }

    return OMPI_SUCCESS;
}
/**
 * @brief function signature for the rdma transfer function used by ompi_osc_rdma_master_noncontig()
 *
 * @param[in] sync            synchronization object the transfer belongs to
 * @param[in] peer            peer object for remote peer
 * @param[in] remote_address  base of remote region (destination for put, source for get)
 * @param[in] remote_handle   btl registration handle for remote region (must be valid for the entire region)
 * @param[in] local_address   base of local region (source for put, destination for get)
 * @param[in] size            number of bytes to transfer
 * @param[in] request         osc rdma request if used (can be NULL)
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_OUT_OF_RESOURCE on temporary error
 * @returns other OMPI error on fatal error
 *
 * This function does the work of scheduling a contiguous transfer between the local and remote regions.
 */
typedef int (*ompi_osc_rdma_fn_t) (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
mca_btl_base_registration_handle_t *remote_handle, void *local_address, size_t size,
ompi_osc_rdma_request_t *request);
/**
 * @brief break down rdma transaction into contiguous regions
 *
 * @param[in] sync             synchronization object the transfer belongs to (provides the osc rdma module)
 * @param[in] local_address    base of local region (source for put, destination for get)
 * @param[in] local_count      number of elements in local region
 * @param[in] local_datatype   datatype of local region
 * @param[in] peer             peer object for remote peer
 * @param[in] remote_address   base of remote region (destination for put, source for get)
 * @param[in] remote_handle    btl registration handle for remote region (must be valid for the entire region)
 * @param[in] remote_count     number of elements in remote region
 * @param[in] remote_datatype  datatype of remote region
 * @param[in] request          osc rdma request if used (can be NULL)
 * @param[in] max_rdma_len     maximum length of an rdma request (usually btl limitation)
 * @param[in] rdma_fn          function to use for contiguous rdma operations
 * @param[in] alloc_reqs       true if rdma_fn requires a valid request object (any allocated objects will be marked internal)
 *
 * @returns OMPI_SUCCESS once all contiguous transfers have been started
 * @returns error code of the convertor setup or of rdma_fn on fatal error
 *
 * This function does the work of breaking a non-contiguous rdma transfer into contiguous components. It will
 * continue to submit rdma transfers until the entire region is transferred or a fatal error occurs. Both
 * convertors are released on every exit path.
 */
static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count, ompi_datatype_t *local_datatype,
                                           ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
                                           mca_btl_base_registration_handle_t *remote_handle, int remote_count,
                                           ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
                                           const ompi_osc_rdma_fn_t rdma_fn,const bool alloc_reqs)
{
    ompi_osc_rdma_module_t *module = sync->module;
    struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX];
    opal_convertor_t local_convertor, remote_convertor;
    uint32_t local_iov_count, remote_iov_count;
    uint32_t local_iov_index, remote_iov_index;
    /* needed for opal_convertor_raw but not used */
    size_t local_size, remote_size, rdma_len;
    ompi_osc_rdma_request_t *subreq;
    int ret;
    bool done;

    subreq = NULL;

    /* construct both convertors up front so that the common cleanup path below is
     * valid no matter where an error occurs */
    OBJ_CONSTRUCT(&remote_convertor, opal_convertor_t);
    OBJ_CONSTRUCT(&local_convertor, opal_convertor_t);

    /* prepare convertors for the source and target. these convertors will be used to determine the
     * contiguous segments within the source and target. */
    ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &remote_datatype->super, remote_count,
                                                    (void *) (intptr_t) remote_address, 0, &remote_convertor);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        goto cleanup;
    }

    ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &local_datatype->super, local_count,
                                                    local_address, 0, &local_convertor);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        goto cleanup;
    }

    if (request) {
        /* keep the request from completing until all the transfers have started */
        request->outstanding_requests = 1;
    }

    local_iov_index = 0;
    local_iov_count = 0;

    do {
        /* decode segments of the remote data */
        remote_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
        remote_iov_index = 0;

        /* opal_convertor_raw returns done when it has reached the end of the data */
        done = opal_convertor_raw (&remote_convertor, remote_iovec, &remote_iov_count, &remote_size);

        /* loop on the remote segments until we have exhausted the decoded remote data */
        while (remote_iov_index != remote_iov_count) {
            if (local_iov_index == local_iov_count) {
                /* decode the next batch of segments of the local buffer */
                local_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
                local_iov_index = 0;
                (void) opal_convertor_raw (&local_convertor, local_iovec, &local_iov_count, &local_size);
            }

            /* we already checked that the target was large enough. this should be impossible */
            assert (0 != local_iov_count);

            OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, "local index = %d, local = {%p, %lu}, "
                                 "source_index = %d, source = {%p, %lu}", local_iov_index, local_iovec[local_iov_index].iov_base,
                                 (unsigned long) local_iovec[local_iov_index].iov_len, remote_iov_index, remote_iovec[remote_iov_index].iov_base,
                                 (unsigned long) remote_iovec[remote_iov_index].iov_len));

            /* determine how much to transfer in this operation */
            rdma_len = min(min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len);

            /* get a request object for this piece if rdma_fn needs one */
            if (!subreq && alloc_reqs) {
                OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
                if (NULL == subreq) {
                    ompi_osc_rdma_progress (module);
                    continue;
                }

                subreq->internal = true;
                subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
                subreq->parent_request = request;

                if (request) {
                    (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
                }
            } else if (!alloc_reqs) {
                subreq = request;
            }

            /* execute the contiguous transfer */
            ret = rdma_fn (sync, peer, (uint64_t) (intptr_t) remote_iovec[remote_iov_index].iov_base, remote_handle,
                           local_iovec[local_iov_index].iov_base, rdma_len, subreq);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
                    if (request) {
                        (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
                    }

                    if (alloc_reqs) {
                        OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
                    }

                    /* something bad happened. need to figure out best way to handle rma errors */
                    goto cleanup;
                }

                /* progress and try again */
                ompi_osc_rdma_progress (module);
                continue;
            }

            /* the sub-request (if any) now belongs to the transfer in flight */
            subreq = NULL;

            /* adjust io vectors */
            local_iovec[local_iov_index].iov_len -= rdma_len;
            remote_iovec[remote_iov_index].iov_len -= rdma_len;
            local_iovec[local_iov_index].iov_base = (void *)((intptr_t) local_iovec[local_iov_index].iov_base + rdma_len);
            remote_iovec[remote_iov_index].iov_base = (void *)((intptr_t) remote_iovec[remote_iov_index].iov_base + rdma_len);

            /* move on to the next segment once the current one is used up */
            local_iov_index += (0 == local_iovec[local_iov_index].iov_len);
            remote_iov_index += (0 == remote_iovec[remote_iov_index].iov_len);
        }
    } while (!done);

    if (request) {
        /* release our reference so the request can complete */
        if (1 == request->outstanding_requests) {
            ompi_osc_rdma_request_complete (request, OMPI_SUCCESS);
        }

        (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
    }

    ret = OMPI_SUCCESS;

 cleanup:
    /* clean up convertors */
    opal_convertor_cleanup (&local_convertor);
    OBJ_DESTRUCT(&local_convertor);
    opal_convertor_cleanup (&remote_convertor);
    OBJ_DESTRUCT(&remote_convertor);

    return ret;
}
/**
 * @brief start an rdma transfer between a local and a remote region
 *
 * @param[in] sync             synchronization object the transfer belongs to
 * @param[in] local_address    base of local region
 * @param[in] local_count      number of elements in local region
 * @param[in] local_datatype   datatype of local region
 * @param[in] peer             peer object for remote peer
 * @param[in] remote_address   base of remote region
 * @param[in] remote_handle    btl registration handle for remote region
 * @param[in] remote_count     number of elements in remote region
 * @param[in] remote_datatype  datatype of remote region
 * @param[in] request          osc rdma request if used (can be NULL)
 * @param[in] max_rdma_len     maximum length of a single rdma operation
 * @param[in] rdma_fn          function to use for contiguous rdma operations
 * @param[in] alloc_reqs       true if rdma_fn requires a valid request object
 *
 * @returns OMPI_SUCCESS if the transfer was started
 * @returns OMPI_ERR_OUT_OF_RESOURCE if an internal request could not be allocated
 * @returns other error code if rdma_fn failed with a non-transient error
 *
 * If both layouts are contiguous and the data fits in a single rdma operation the
 * transfer is handed directly to rdma_fn. Everything else is delegated to
 * ompi_osc_rdma_master_noncontig().
 */
static inline int ompi_osc_rdma_master (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count,
                                        ompi_datatype_t *local_datatype, ompi_osc_rdma_peer_t *peer,
                                        uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
                                        int remote_count, ompi_datatype_t *remote_datatype,
                                        ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
                                        const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
{
    size_t rdma_len;
    ptrdiff_t lb, extent;
    bool request_is_internal = false;
    int ret;

    rdma_len = local_datatype->super.size * local_count;

    /* fast path for contiguous rdma */
    if (OPAL_LIKELY(ompi_datatype_is_contiguous_memory_layout (local_datatype, local_count) &&
                    ompi_datatype_is_contiguous_memory_layout (remote_datatype, remote_count) &&
                    rdma_len <= max_rdma_len)) {
        if (NULL == request && alloc_reqs) {
            ompi_osc_rdma_module_t *module = sync->module;
            OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
            if (NULL == request) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            request->internal = true;
            request->type = OMPI_OSC_RDMA_TYPE_RDMA;
            request_is_internal = true;
        }

        /* shift both addresses by the lower bound of their datatype. ignore failure here */
        (void) ompi_datatype_get_extent (local_datatype, &lb, &extent);
        local_address = (void *)((intptr_t) local_address + lb);

        (void) ompi_datatype_get_extent (remote_datatype, &lb, &extent);
        remote_address += lb;

        do {
            ret = rdma_fn (sync, peer, remote_address, remote_handle, local_address, rdma_len, request);
            if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
                return OMPI_SUCCESS;
            }

            if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
                /* fatal error. retrying would spin forever so report it to the caller, as
                 * ompi_osc_rdma_master_noncontig() does. a request allocated above is not
                 * visible to the caller and has to be returned here. */
                if (request_is_internal) {
                    OMPI_OSC_RDMA_REQUEST_RETURN(request);
                }

                return ret;
            }

            /* temporary resource shortage. progress and try again */
            ompi_osc_rdma_progress (sync->module);
        } while (1);
    }

    return ompi_osc_rdma_master_noncontig (sync, local_address, local_count, local_datatype, peer, remote_address,
                                           remote_handle, remote_count, remote_datatype, request,
                                           max_rdma_len, rdma_fn, alloc_reqs);
}
/**
 * @brief copy data between two locally accessible buffers
 *
 * @param[in]  source           source buffer
 * @param[in]  source_count     number of elements in the source buffer
 * @param[in]  source_datatype  datatype of the source buffer
 * @param[out] target           target buffer
 * @param[in]  target_count     number of elements in the target buffer
 * @param[in]  target_datatype  datatype of the target buffer
 * @param[in]  request          osc rdma request to complete with the result (can be NULL)
 *
 * @returns the return code of ompi_datatype_sndrcv()
 */
static int ompi_osc_rdma_copy_local (const void *source, int source_count, ompi_datatype_t *source_datatype,
                                     void *target, int target_count, ompi_datatype_t *target_datatype,
                                     ompi_osc_rdma_request_t *request)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "performing local copy from %p -> %p", source, target));

    /* full memory barrier before touching the buffers */
    opal_atomic_mb ();

    rc = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype);

    if (NULL != request) {
        /* the copy is synchronous so the request is done (with the copy's status) */
        ompi_osc_rdma_request_complete (request, rc);
    }

    return rc;
}
/**
 * @brief btl completion callback for puts started by ompi_osc_rdma_put_contig()
 *
 * @param[in] local_handle  local registration handle (deregistered when no fragment was used)
 * @param[in] context       sync object, or a request pointer with the lowest bit set
 * @param[in] data          fragment used as staging buffer (NULL if none)
 * @param[in] status        completion status (asserted to be OPAL_SUCCESS)
 */
static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                        void *context, void *data, int status)
{
    ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data;
    const intptr_t tagged_context = (intptr_t) context;
    ompi_osc_rdma_sync_t *sync;

    assert (OPAL_SUCCESS == status);

    /* the lowest bit is used as a flag indicating this put operation has a request */
    if (tagged_context & 0x1) {
        ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) (tagged_context & ~1);

        sync = request->sync;

        /* NTH -- TODO: better error handling */
        ompi_osc_rdma_request_complete (request, status);
    } else {
        /* no request: the context is the sync object itself */
        sync = (ompi_osc_rdma_sync_t *) context;
    }

    /* release the local resource used for the put */
    if (NULL == frag) {
        ompi_osc_rdma_deregister (sync->module, local_handle);
    } else {
        ompi_osc_rdma_frag_complete (frag);
    }

    /* one less outstanding rdma operation on this sync object */
    ompi_osc_rdma_sync_rdma_dec (sync);
}
/**
 * @brief btl completion callback for an aggregated put
 *
 * @param[in] context  aggregation object describing the put
 * @param[in] status   completion status (asserted to be OPAL_SUCCESS)
 *
 * Completes the aggregation's fragment and every request attached to it, returns
 * the aggregation object and finally marks the rdma operation as complete on the
 * sync object.
 */
static void ompi_osc_rdma_aggregate_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                                  void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                                  void *context, void *data, int status)
{
    ompi_osc_rdma_aggregation_t *aggregation = (ompi_osc_rdma_aggregation_t *) context;
    /* read the sync pointer now. the aggregation is returned before it is used */
    ompi_osc_rdma_sync_t *owner = aggregation->sync;
    ompi_osc_rdma_request_t *current = NULL, *following;

    assert (OPAL_SUCCESS == status);

    ompi_osc_rdma_frag_complete (aggregation->frag);

    /* complete every request that was folded into this aggregation */
    OPAL_LIST_FOREACH_SAFE(current, following, &aggregation->requests, ompi_osc_rdma_request_t) {
        opal_list_remove_item (&aggregation->requests, (opal_list_item_t *) current);
        ompi_osc_rdma_request_complete (current, status);
    }

    ompi_osc_rdma_aggregation_return (aggregation);

    /* make sure the aggregation is returned before marking the operation as complete */
    opal_atomic_wmb ();

    ompi_osc_rdma_sync_rdma_dec (owner);
}
/**
 * @brief issue a btl put, retrying while the btl is out of resources
 *
 * @param[in] sync            synchronization object the put is accounted to
 * @param[in] peer            peer object for remote peer (its data endpoint is used)
 * @param[in] target_address  remote address to write to
 * @param[in] target_handle   btl registration handle for the remote region
 * @param[in] ptr             local source buffer
 * @param[in] local_handle    btl registration handle for the local buffer (may be NULL)
 * @param[in] size            number of bytes to put
 * @param[in] cb              btl completion callback
 * @param[in] context         context passed to the completion callback
 * @param[in] cbdata          data passed to the completion callback
 *
 * @returns OMPI_SUCCESS if the put was started
 * @returns error code of the btl put on a non-transient failure
 *
 * The outstanding rdma counter of the sync object is incremented before the
 * put is attempted and is not decremented here on failure.
 */
static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                                   mca_btl_base_registration_handle_t *target_handle, void *ptr,
                                   mca_btl_base_registration_handle_t *local_handle, size_t size,
                                   mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) {
    ompi_osc_rdma_module_t *module = sync->module;
    int ret;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating the btl put of %llu bytes to remote "
                         "address %" PRIx64 ", sync object %p...", (unsigned long long) size, target_address, (void *) sync));

    /* flag outstanding rma requests */
    ompi_osc_rdma_sync_rdma_inc (sync);

    do {
        ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address,
                                             local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER,
                                             cb, context, cbdata);
        /* success is the expected outcome (the hint used to be inverted) */
        if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
            return OMPI_SUCCESS;
        }

        ++module->put_retry_count;

        /* only the out-of-resource codes are worth retrying */
        if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) {
            break;
        }

        /* spin a bit on progress */
        for (int i = 0 ; i < 10 ; ++i) {
            ompi_osc_rdma_progress (module);
        }
    } while (1);

    OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "btl put failed with code %d", ret));

    return ret;
}
/**
 * @brief append data to an aggregation buffer
 *
 * @param[in] aggregation    aggregation object to append to
 * @param[in] request        request to attach to the aggregation (can be NULL)
 * @param[in] source_buffer  data to append
 * @param[in] size           number of bytes to append
 *
 * No bounds check is done here. The caller must make sure the data fits.
 */
static void ompi_osc_rdma_aggregate_append (ompi_osc_rdma_aggregation_t *aggregation, ompi_osc_rdma_request_t *request,
                                            void *source_buffer, size_t size)
{
    /* copy the new data behind what is already in the buffer */
    memcpy (aggregation->buffer + aggregation->buffer_used, source_buffer, size);
    aggregation->buffer_used += size;

    if (NULL != request) {
        /* keep track of the request so it can be completed with the aggregated put */
        opal_list_append (&aggregation->requests, (opal_list_item_t *) request);
    }
}
/**
 * @brief start a new aggregation for a peer and store the first piece of data in it
 *
 * @param[in] sync            synchronization object the aggregation belongs to
 * @param[in] peer            peer object the aggregation is attached to
 * @param[in] target_address  remote address of the first byte of the aggregation
 * @param[in] target_handle   btl registration handle for the remote region
 * @param[in] source_buffer   first piece of data
 * @param[in] size            size of the first piece of data
 * @param[in] request         request to attach to the aggregation (can be NULL)
 * @param[in] type            operation type stored in the aggregation
 *
 * @returns OMPI_SUCCESS on success
 * @returns OPAL_ERR_OUT_OF_RESOURCE if no aggregation object is available
 * @returns error code of the fragment allocation on failure
 */
static int ompi_osc_rdma_aggregate_alloc (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                                          mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
                                          ompi_osc_rdma_request_t *request, int type)
{
    ompi_osc_rdma_module_t *module = sync->module;
    ompi_osc_rdma_aggregation_t *new_aggregation;
    int rc;

    new_aggregation = (ompi_osc_rdma_aggregation_t *) opal_free_list_get (&mca_osc_rdma_component.aggregate);
    if (OPAL_UNLIKELY(NULL == new_aggregation)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* the aggregation buffer is a fragment of aggregation_limit bytes */
    rc = ompi_osc_rdma_frag_alloc (module, mca_osc_rdma_component.aggregation_limit, &new_aggregation->frag,
                                   &new_aggregation->buffer);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        opal_free_list_return(&mca_osc_rdma_component.aggregate, (opal_free_list_item_t *) new_aggregation);
        return rc;
    }

    /* describe the (still empty) aggregation */
    new_aggregation->sync = sync;
    new_aggregation->peer = peer;
    new_aggregation->type = type;
    new_aggregation->target_address = target_address;
    new_aggregation->target_handle = target_handle;
    new_aggregation->buffer_size = mca_osc_rdma_component.aggregation_limit;
    new_aggregation->buffer_used = 0;

    /* later contiguous puts to this peer can be appended from now on */
    peer->aggregate = new_aggregation;

    ompi_osc_rdma_aggregate_append (new_aggregation, request, source_buffer, size);

    /* register the aggregation with the sync object */
    OPAL_THREAD_SCOPED_LOCK(&sync->lock, opal_list_append (&sync->aggregations, (opal_list_item_t *) new_aggregation));

    return OMPI_SUCCESS;
}
/**
 * @brief put a contiguous region to a peer
 *
 * @param[in] sync            synchronization object the put belongs to
 * @param[in] peer            peer object for remote peer
 * @param[in] target_address  remote address to write to
 * @param[in] target_handle   btl registration handle for the remote region
 * @param[in] source_buffer   local source buffer
 * @param[in] size            number of bytes to put
 * @param[in] request         osc rdma request if used (can be NULL)
 *
 * @returns OMPI_SUCCESS if the data was aggregated or the put was started
 * @returns error code of the aggregation flush, the registration or the btl put on failure
 *
 * The data is appended to the peer's current aggregation if it continues it
 * exactly and fits. A put that cannot be appended flushes the current
 * aggregation; if it is no larger than a quarter of the aggregation limit it
 * starts a new one. Everything else is put directly, staged through a fragment
 * or a temporary registration if the btl requires local registration for this size.
 */
static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
                                     mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
                                     ompi_osc_rdma_request_t *request)
{
    ompi_osc_rdma_module_t *module = sync->module;
    ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate;
    mca_btl_base_registration_handle_t *local_handle = NULL;
    ompi_osc_rdma_frag_t *frag = NULL;
    char *ptr = source_buffer;
    void *cbcontext;
    int ret;

    if (aggregation) {
        /* the put can be appended if it fits and directly follows the aggregated data */
        if (size <= (aggregation->buffer_size - aggregation->buffer_used) && (target_handle == aggregation->target_handle) &&
            (target_address == aggregation->target_address + aggregation->buffer_used)) {
            assert (OMPI_OSC_RDMA_TYPE_PUT == aggregation->type);

            ompi_osc_rdma_aggregate_append (aggregation, request, source_buffer, size);
            return OMPI_SUCCESS;
        }

        /* can't aggregate this operation. flush the previous segment */
        ret = ompi_osc_rdma_peer_aggregate_flush (peer);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }
    }

    /* small puts start a new aggregation. fall back on a direct put if that fails */
    if (size <= (mca_osc_rdma_component.aggregation_limit >> 2)) {
        ret = ompi_osc_rdma_aggregate_alloc (sync, peer, target_address, target_handle, source_buffer, size, request,
                                             OMPI_OSC_RDMA_TYPE_PUT);
        if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
            return ret;
        }
    }

    if (module->selected_btl->btl_register_mem && size > module->selected_btl->btl_put_local_registration_threshold) {
        /* prefer a copy into a fragment. register the user buffer if none is available */
        ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        } else {
            memcpy (ptr, source_buffer, size);
            local_handle = frag->handle;
        }
    }

    /* increment the outstanding request counter in the request object */
    if (request) {
        (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
        /* the lowest bit tells the completion callback that the context is a request */
        cbcontext = (void *) ((intptr_t) request | 1);
        request->sync = sync;
    } else {
        cbcontext = (void *) sync;
    }

    ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, ompi_osc_rdma_put_complete,
                                  cbcontext, frag);
    /* success is the expected outcome (the hint used to be inverted) */
    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        return OMPI_SUCCESS;
    }

    /* the put was not started. undo the setup done above */
    ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request);

    return ret;
}
/**
 * @brief btl completion callback for gets started by ompi_osc_rdma_get_contig()
 *
 * @param[in] local_address  local buffer the btl wrote to
 * @param[in] local_handle   local registration handle (deregistered when no fragment was used)
 * @param[in] context        osc rdma request describing the get
 * @param[in] data           fragment used as staging buffer (NULL if none)
 * @param[in] status         completion status (asserted to be OPAL_SUCCESS)
 *
 * If a fragment was used the requested bytes are copied from the fragment (at
 * request->offset) to request->origin_addr before the fragment is released.
 */
static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                        void *context, void *data, int status)
{
    ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
    ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data;
    ompi_osc_rdma_sync_t *sync = request->sync;
    void *origin_addr = request->origin_addr;
    /* start of the requested data within the local buffer */
    void *staged = (void *) ((intptr_t) local_address + request->offset);

    OPAL_OUTPUT_VERBOSE((status ? 10 : 60, ompi_osc_base_framework.framework_output, "btl get operation complete with status %d",
                         status));

    assert (OPAL_SUCCESS == status);

    if (NULL == frag) {
        /* the data landed directly in the user buffer */
        ompi_osc_rdma_deregister (sync->module, local_handle);
    } else {
        if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
            OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "copying %lu bytes from temporary buffer %p to destination %p",
                                 request->len, staged, origin_addr));

            memcpy (origin_addr, staged, request->len);
        }

        ompi_osc_rdma_frag_complete (frag);
    }

    ompi_osc_rdma_sync_rdma_dec (sync);

    ompi_osc_rdma_request_complete (request, status);
}
/**
 * @brief flush the current aggregation of a peer (if any)
 *
 * @param[in] peer  peer object whose aggregation should be written out
 *
 * @returns OMPI_SUCCESS if there was nothing to flush or the put was started
 * @returns error code of the put on failure
 *
 * The peer's aggregation pointer is cleared whether or not the put could be
 * started. On failure the aggregation object is returned here.
 */
int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer)
{
    ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate;
    int ret;

    if (NULL == aggregation) {
        return OMPI_SUCCESS;
    }

    assert (OMPI_OSC_RDMA_TYPE_PUT == aggregation->type);

    /* write out everything that was accumulated in the aggregation buffer */
    ret = ompi_osc_rdma_put_real (aggregation->sync, peer, aggregation->target_address, aggregation->target_handle,
                                  aggregation->buffer, aggregation->frag->handle, aggregation->buffer_used,
                                  ompi_osc_rdma_aggregate_put_complete, (void *) aggregation, NULL);

    peer->aggregate = NULL;

    /* success is the expected outcome (the hint used to be inverted) */
    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        return OMPI_SUCCESS;
    }

    /* the put was not started so the completion callback will never run */
    ompi_osc_rdma_cleanup_rdma (aggregation->sync, aggregation->frag, NULL, NULL);

    ompi_osc_rdma_aggregation_return (aggregation);

    return ret;
}
/**
 * @brief get a contiguous region from a peer
 *
 * @param[in]  sync            synchronization object the get belongs to
 * @param[in]  peer            peer object for remote peer
 * @param[in]  source_address  remote address to read from
 * @param[in]  source_handle   btl registration handle for the remote region
 * @param[out] target_buffer   local destination buffer
 * @param[in]  size            number of bytes to get
 * @param[in]  request         osc rdma request describing the get (dereferenced, must not be NULL)
 *
 * @returns OMPI_SUCCESS if the get was started
 * @returns OMPI_ERR_OUT_OF_RESOURCE if neither a fragment nor a registration could be obtained
 * @returns error code of the btl get on a non-transient failure
 *
 * The remote range is widened to the get alignment of the btl. If the btl
 * requires local registration for this size, or the buffer, size or source
 * address is not aligned, the data is staged through a fragment and copied to
 * target_buffer by the completion callback.
 */
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
                                     mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
                                     ompi_osc_rdma_request_t *request)
{
    ompi_osc_rdma_module_t *module = sync->module;
    const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
    mca_btl_base_registration_handle_t *local_handle = NULL;
    ompi_osc_rdma_frag_t *frag = NULL;
    osc_rdma_size_t aligned_len;
    osc_rdma_base_t aligned_source_base, aligned_source_bound;
    char *ptr = target_buffer;
    int ret;

    /* widen the remote range so that both ends satisfy the btl alignment */
    aligned_source_base = source_address & ~btl_alignment_mask;
    aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask;
    aligned_len = aligned_source_bound - aligned_source_base;

    /* information needed by the completion callback */
    request->offset = source_address - aligned_source_base;
    request->len = size;
    request->origin_addr = target_buffer;
    request->sync = sync;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating get from remote ptr %" PRIx64 " to local ptr %p",
                         source_address, target_buffer));

    if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) ||
        (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
        ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* check for alignment. the user buffer can only be used directly if
             * everything is aligned */
            if (!(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
                (void) ompi_osc_rdma_register (module, peer->data_endpoint, target_buffer, size, MCA_BTL_REG_FLAG_LOCAL_WRITE,
                                               &local_handle);
            }

            if (OPAL_UNLIKELY(NULL == local_handle)) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        } else {
            local_handle = frag->handle;
        }
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl get... source: %" PRIx64
                         " (handle 0x%llx, 0x%llx), %" PRIu64 ", destination: %p, %" PRIu64, source_address,
                         ((unsigned long long *) source_handle)[0], ((unsigned long long *) source_handle)[1],
                         aligned_len, ptr, aligned_len));

    /* flag the outstanding rdma operation */
    ompi_osc_rdma_sync_rdma_inc (sync);

    do {
        ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, aligned_source_base, local_handle,
                                             source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
                                             request, frag);
        /* success is the expected outcome (the hint used to be inverted) */
        if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
            return OMPI_SUCCESS;
        }

        ++module->get_retry_count;

        /* only the out-of-resource codes are worth retrying */
        if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) {
            break;
        }

        /* spin a bit on progress */
        for (int i = 0 ; i < 10 ; ++i) {
            ompi_osc_rdma_progress (module);
        }
    } while (1);

    OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "btl get failed with ret = %d", ret));

    /* the get was not started. undo the setup done above */
    ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request);

    return ret;
}
/**
 * @brief put with an optional request object
 *
 * @param[in] sync             synchronization object the put belongs to
 * @param[in] origin_addr      local source buffer
 * @param[in] origin_count     number of elements in the source buffer
 * @param[in] origin_datatype  datatype of the source buffer
 * @param[in] peer             peer object for the target
 * @param[in] target_disp      displacement passed to osc_rdma_get_remote_segment()
 * @param[in] target_count     number of elements at the target
 * @param[in] target_datatype  datatype at the target
 * @param[in] request          osc rdma request if used (can be NULL)
 *
 * @returns OMPI_SUCCESS on success, an OMPI error code otherwise
 */
static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
                                           struct ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer,
                                           OPAL_PTRDIFF_TYPE target_disp, int target_count,
                                           struct ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request)
{
    ompi_osc_rdma_module_t *module = sync->module;
    mca_btl_base_registration_handle_t *remote_handle;
    uint64_t remote_address;
    int rc;

    /* nothing to transfer: complete the request (if any) right away */
    if (0 == target_count || 0 == origin_count) {
        if (NULL != request) {
            ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
        }

        return OMPI_SUCCESS;
    }

    /* translate the displacement into a remote address and registration handle */
    rc = osc_rdma_get_remote_segment (module, peer, target_disp, target_datatype->super.size * target_count,
                                      &remote_address, &remote_handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    if (!ompi_osc_rdma_peer_local_base (peer)) {
        /* go through the btl */
        return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, remote_address, remote_handle,
                                     target_count, target_datatype, request, module->selected_btl->btl_put_limit,
                                     ompi_osc_rdma_put_contig, false);
    }

    /* optimize communication with peers that we can do direct load and store operations on */
    return ompi_osc_rdma_copy_local (origin_addr, origin_count, origin_datatype, (void *) (intptr_t) remote_address,
                                     target_count, target_datatype, request);
}
/**
 * @brief get with an optional request object
 *
 * @param[in]  sync             synchronization object the get belongs to
 * @param[out] origin_addr      local destination buffer
 * @param[in]  origin_count     number of elements in the destination buffer
 * @param[in]  origin_datatype  datatype of the destination buffer
 * @param[in]  peer             peer object for the source
 * @param[in]  source_disp      displacement passed to osc_rdma_get_remote_segment()
 * @param[in]  source_count     number of elements at the source
 * @param[in]  source_datatype  datatype at the source
 * @param[in]  request          osc rdma request if used (can be NULL)
 *
 * @returns OMPI_SUCCESS on success, an OMPI error code otherwise
 */
static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                                           ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE source_disp, int source_count,
                                           struct ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request)
{
    ompi_osc_rdma_module_t *module = sync->module;
    mca_btl_base_registration_handle_t *remote_handle;
    uint64_t remote_address;
    int rc;

    /* nothing to transfer: complete the request (if any) right away */
    if (0 == source_count || 0 == origin_count) {
        if (NULL != request) {
            ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
        }

        return OMPI_SUCCESS;
    }

    /* translate the displacement into a remote address and registration handle */
    rc = osc_rdma_get_remote_segment (module, peer, source_disp, source_datatype->super.size * source_count,
                                      &remote_address, &remote_handle);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    if (!ompi_osc_rdma_peer_local_base (peer)) {
        /* go through the btl. the get path needs a request object for every transfer */
        return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, remote_address,
                                     remote_handle, source_count, source_datatype, request,
                                     module->selected_btl->btl_get_limit, ompi_osc_rdma_get_contig, true);
    }

    /* optimize self/local communication */
    return ompi_osc_rdma_copy_local ((void *) (intptr_t) remote_address, source_count, source_datatype,
                                     origin_addr, origin_count, origin_datatype, request);
}
/**
 * @brief put without a user-visible request
 *
 * @param[in] origin_addr      local source buffer
 * @param[in] origin_count     number of elements in the source buffer
 * @param[in] origin_datatype  datatype of the source buffer
 * @param[in] target_rank      rank of the target
 * @param[in] target_disp      displacement at the target
 * @param[in] target_count     number of elements at the target
 * @param[in] target_datatype  datatype at the target
 * @param[in] win              window to operate on
 *
 * @returns OMPI_ERR_RMA_SYNC if no sync object is found for the target
 * @returns the return code of ompi_osc_rdma_put_w_req() otherwise
 */
int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                       int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                       struct ompi_datatype_t *target_datatype, ompi_win_t *win)
{
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync = ompi_osc_rdma_module_sync_lookup (GET_MODULE(win), target_rank, &peer);

    /* the target must be covered by a sync object */
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "put: 0x%lx, %d, %s, %d, %d, %d, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
                         (int) target_disp, target_count, target_datatype->name, win->w_name));

    /* no request object is needed for a plain put */
    return ompi_osc_rdma_put_w_req (sync, origin_addr, origin_count, origin_datatype, peer, target_disp,
                                    target_count, target_datatype, NULL);
}
/**
 * MPI_Rput implementation for the rdma one-sided component.
 *
 * Same as ompi_osc_rdma_put() but the operation is tied to a request object
 * that is handed back to the caller through {request}.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object is found for the target
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 * @returns the result of ompi_osc_rdma_put_w_req() otherwise; {request} is
 *          only written on success
 */
int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                        int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                        struct ompi_datatype_t *target_datatype, struct ompi_win_t *win,
                        struct ompi_request_t **request)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_request_t *rdma_request;
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync;
    int rc;

    sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "rput: 0x%lx, %d, %s, %d, %d, "
                         "%d, %s, %s", (unsigned long) origin_addr, origin_count,
                         origin_datatype->name, target_rank, (int) target_disp, target_count,
                         target_datatype->name, win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
    if (NULL == rdma_request) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rdma_request->type = OMPI_OSC_RDMA_TYPE_PUT;

    rc = ompi_osc_rdma_put_w_req (sync, origin_addr, origin_count, origin_datatype, peer, target_disp,
                                  target_count, target_datatype, rdma_request);
    if (OMPI_SUCCESS == rc) {
        /* hand the request to the caller only once the operation has started */
        *request = (ompi_request_t *) rdma_request;
    } else {
        /* the operation never started: give the request back */
        OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
    }

    return rc;
}
/**
 * MPI_Get implementation for the rdma one-sided component.
 *
 * Looks up the synchronization object and peer for source_rank and hands the
 * operation to ompi_osc_rdma_get_w_req() without a request.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object is found for the source
 * @returns the result of ompi_osc_rdma_get_w_req() otherwise
 */
int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                       int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count,
                       struct ompi_datatype_t *source_datatype, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &peer);

    /* the source has to be covered by an active synchronization */
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "get: 0x%lx, %d, %s, %d, %d, "
                         "%d, %s, %s", (unsigned long) origin_addr, origin_count,
                         origin_datatype->name, source_rank, (int) source_disp, source_count,
                         source_datatype->name, win->w_name));

    /* no request object: completion is tracked through the synchronization only */
    return ompi_osc_rdma_get_w_req (sync, origin_addr, origin_count, origin_datatype, peer,
                                    source_disp, source_count, source_datatype, NULL);
}
/**
 * MPI_Rget implementation for the rdma one-sided component.
 *
 * Same as ompi_osc_rdma_get() but the operation is tied to a request object
 * that is handed back to the caller through {request}.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object is found for the source
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 * @returns the result of ompi_osc_rdma_get_w_req() otherwise; {request} is
 *          only written on success
 */
int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                        int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count,
                        struct ompi_datatype_t *source_datatype, struct ompi_win_t *win,
                        struct ompi_request_t **request)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_request_t *rdma_request;
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync;
    int rc;

    sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "rget: 0x%lx, %d, %s, %d, %d, "
                         "%d, %s, %s", (unsigned long) origin_addr, origin_count,
                         origin_datatype->name, source_rank, (int) source_disp, source_count,
                         source_datatype->name, win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
    if (NULL == rdma_request) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rdma_request->type = OMPI_OSC_RDMA_TYPE_GET;

    rc = ompi_osc_rdma_get_w_req (sync, origin_addr, origin_count, origin_datatype, peer,
                                  source_disp, source_count, source_datatype, rdma_request);
    if (OMPI_SUCCESS == rc) {
        /* hand the request to the caller only once the operation has started */
        *request = (ompi_request_t *) rdma_request;
    } else {
        /* the operation never started: give the request back */
        OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
    }

    return rc;
}

136
ompi/mca/osc/rdma/osc_rdma_comm.h Обычный файл
Просмотреть файл

@ -0,0 +1,136 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OMPI_OSC_RDMA_COMM_H)
#define OMPI_OSC_RDMA_COMM_H
#include "osc_rdma_dynamic.h"
#include "osc_rdma_request.h"
#include "osc_rdma_sync.h"
#include "osc_rdma_lock.h"
/* NOTE(review): the users of this limit are outside this part of the file; presumably
 * it bounds how many datatype segments are decoded at a time -- confirm. */
#define OMPI_OSC_RDMA_DECODE_MAX 64

/* smaller of two values. function-like macro: each argument may be evaluated twice,
 * so do not pass expressions with side effects. */
#define min(a,b) ((a) < (b) ? (a) : (b))

/* x - 1, or 0 when x is 0. this is a mask of the low bits when x is a power of two. */
#define ALIGNMENT_MASK(x) ((x) ? (x) - 1 : 0)
/* helper functions */
/**
 * @brief release the resources attached to a single rdma operation
 *
 * @param[in] sync     synchronization object the operation was started on
 * @param[in] frag     fragment used as the local buffer (may be NULL)
 * @param[in] handle   registration handle to release when no fragment was used
 * @param[in] request  request tracking the operation (may be NULL)
 *
 * Either the fragment reference or the registration handle is released, the
 * request's outstanding operation count is decremented (if a request is
 * attached), and the synchronization object's rdma counter is decremented.
 */
static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_frag_t *frag,
                                               mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request)
{
    if (NULL == frag) {
        /* no bounce buffer was involved so the handle is released directly */
        ompi_osc_rdma_deregister (sync->module, handle);
    } else {
        ompi_osc_rdma_frag_complete (frag);
    }

    if (NULL != request) {
        (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
    }

    ompi_osc_rdma_sync_rdma_dec (sync);
}
/**
 * @brief find the remote segment associated with a memory region
 *
 * @param[in] module           osc rdma module
 * @param[in] peer             peer object for remote peer
 * @param[in] target_disp      displacement in remote region
 * @param[in] length           length of remote region
 * @param[out] remote_address  remote address
 * @param[out] remote_handle   btl handle for remote region (valid over entire region)
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_RANGE if the address range is not valid at the remote window
 * @returns other OMPI error on error
 *
 * For dynamic windows the displacement is used as the remote address and the
 * matching attached region is looked up with ompi_osc_rdma_find_dynamic_region().
 * For all other flavors the address is computed from the peer's base and
 * displacement unit and checked against both ends of the peer's window.
 */
static inline int osc_rdma_get_remote_segment (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE target_disp,
                                               size_t length, uint64_t *remote_address, mca_btl_base_registration_handle_t **remote_handle)
{
    ompi_osc_rdma_region_t *region;
    int ret;

    OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "getting remote address for peer %d target_disp %lu",
                         peer->rank, (unsigned long) target_disp));

    if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
        ret = ompi_osc_rdma_find_dynamic_region (module, peer, (uint64_t) target_disp, length, &region);
        if (OMPI_SUCCESS != ret) {
            OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                                 "could not retrieve region for %" PRIx64 " from window rank %d", (uint64_t) target_disp, peer->rank));
            return ret;
        }

        /* for dynamic windows the displacement is the remote address itself */
        *remote_address = (uint64_t) target_disp;
        *remote_handle = (mca_btl_base_registration_handle_t *) region->btl_handle_data;
    } else {
        ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
        /* per-peer values are only consulted when they are not the same on all ranks */
        int disp_unit = (module->same_disp_unit) ? module->disp_unit : ex_peer->disp_unit;
        size_t size = (module->same_size) ? module->size : (size_t) ex_peer->size;

        *remote_address = ex_peer->super.base + disp_unit * target_disp;

        /* reject accesses that start before the window (negative displacement) as well
         * as accesses that run past its end */
        if (OPAL_UNLIKELY(*remote_address < ex_peer->super.base ||
                          *remote_address + length > (ex_peer->super.base + size))) {
            OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "remote address range 0x%" PRIx64 " - 0x%" PRIx64
                                 " is out of range. Valid address range is 0x%" PRIx64 " - 0x%" PRIx64 " (%" PRIu64 " bytes)",
                                 *remote_address, *remote_address + length, ex_peer->super.base, ex_peer->super.base + size,
                                 (uint64_t) size));
            return OMPI_ERR_RMA_RANGE;
        }

        *remote_handle = ex_peer->super.base_handle;
    }

    OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output,
                         "remote address: 0x%" PRIx64 ", handle: %p", *remote_address, (void *) *remote_handle));

    return OMPI_SUCCESS;
}
/* prototypes for implementations of MPI RMA window functions. these will be called from the
 * mpi interface (ompi/mpi/c) */

/* put without a request. returns OMPI_ERR_RMA_SYNC when no synchronization object
 * is found for the target. */
int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
                       int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                       struct ompi_datatype_t *target_dt, struct ompi_win_t *win);

/* get without a request. returns OMPI_ERR_RMA_SYNC when no synchronization object
 * is found for the target. */
int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
                       int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                       struct ompi_datatype_t *target_dt, struct ompi_win_t *win);

/* request-based put. on success the new request is stored in *request. */
int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
                        int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                        struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
                        struct ompi_request_t **request);

/* request-based get. on success the new request is stored in *request. */
int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
                        int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                        struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
                        struct ompi_request_t **request);
/**
 * @brief read data from a remote memory region (blocking)
 *
 * @param[in] module          osc rdma module
 * @param[in] endpoint        btl endpoint
 * @param[in] source_address  remote address to read from
 * @param[in] source_handle   btl registration handle for remote region (must be valid for the entire region)
 * @param[in] data            local buffer to store to
 * @param[in] len             number of bytes to read
 *
 * @returns OMPI_SUCCESS on success. Callers treat any other value as a failure;
 *          the specific error codes are defined by the implementation, which is
 *          not part of this header.
 *
 * This is an internal function for reading data from a remote peer. It is used to read peer and state
 * data that is stored on the remote peer. The peer object does not have to be fully initialized to
 * work. Only the btl endpoint is needed.
 */
int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
                                uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
                                void *data, size_t len);
#endif /* OMPI_OSC_RDMA_COMM_H */

1196
ompi/mca/osc/rdma/osc_rdma_component.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

372
ompi/mca/osc/rdma/osc_rdma_dynamic.c Обычный файл
Просмотреть файл

@ -0,0 +1,372 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_rdma_comm.h"
#include "osc_rdma_lock.h"
#include "mpi.h"
/**
 * ompi_osc_rdma_find_region_containing:
 *
 * @param[in]  regions      list of regions sorted by base address
 * @param[in]  min_index    minimum index to search (call with 0)
 * @param[in]  max_index    maximum index to search (call with length - 1)
 * @param[in]  base         first address of the range to look up
 * @param[in]  bound        end address of the range to look up
 * @param[in]  region_size  size of an ompi_osc_rdma_region_t object
 * @param[out] region_index index of region if found (may be NULL)
 *
 * @returns a pointer to the matching region on success or NULL if no region matches
 *
 * Binary search over {regions} for an entry that starts at or before {base} and
 * whose end (base + len) is at or after {bound}. {region_index} is only written
 * when a region is found.
 */
static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi_osc_rdma_region_t *regions, int min_index,
                                                                            int max_index, intptr_t base, intptr_t bound,
                                                                            size_t region_size, int *region_index)
{
    while (min_index <= max_index) {
        const int mid_index = (max_index + min_index) >> 1;
        /* entries are region_size bytes apart, which may differ from sizeof (*regions) */
        ompi_osc_rdma_region_t *candidate = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
        const intptr_t candidate_bound = (intptr_t) (candidate->base + candidate->len);

        OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
                             (void *) base, (void *) bound, (void *) candidate->base, (void *)(candidate->base + candidate->len), mid_index,
                             min_index, max_index));

        if (candidate->base > base) {
            /* candidate starts after the range: continue in the lower half */
            max_index = mid_index - 1;
            continue;
        }

        if (bound <= candidate_bound) {
            if (region_index) {
                *region_index = mid_index;
            }

            return candidate;
        }

        /* candidate ends before the range does: continue in the upper half */
        min_index = mid_index + 1;
    }

    return NULL;
}
/**
 * @brief binary search for the slot at which a new region has to be inserted
 *
 * @param[in]  regions      list of regions sorted by base address
 * @param[in]  min_index    minimum index to search (call with 0)
 * @param[in]  max_index    maximum index to search (call with length - 1)
 * @param[in]  base         base address of the region to insert
 * @param[in]  region_size  size of an ompi_osc_rdma_region_t object
 * @param[out] region_index insertion index (min_index ... max_index + 1)
 *
 * @returns a pointer to the slot at the insertion index
 *
 * The insertion index is the position after every entry in the searched range
 * whose base is less than or equal to {base}. It is always inside
 * [min_index, max_index + 1], so it can refer to the slot one past the last
 * entry but never to a slot before the first one.
 */
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
                                                     size_t region_size, int *region_index)
{
    while (min_index <= max_index) {
        const int mid_index = (max_index + min_index) >> 1;
        ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);

        if (region->base > base) {
            max_index = mid_index - 1;
        } else {
            min_index = mid_index + 1;
        }
    }

    /* the search range is empty. min_index (not the midpoint of the empty range) is the
     * insertion position: the midpoint is one too small when inserting after the last
     * entry and negative when inserting before the first one. */
    *region_index = min_index;

    return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size);
}
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
const int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
ompi_osc_rdma_region_t *region;
osc_rdma_counter_t region_count;
osc_rdma_counter_t region_id;
intptr_t page_size = getpagesize ();
int region_index;
int ret;
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
return OMPI_ERR_RMA_FLAVOR;
}
if (0 == len) {
/* shot-circuit 0-byte case */
return OMPI_SUCCESS;
}
OPAL_THREAD_LOCK(&module->lock);
region_count = module->state->region_count & 0xffffffffL;
region_id = module->state->region_count >> 32;
if (region_count == mca_osc_rdma_component.max_attach) {
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_RMA_ATTACH;
}
/* see if a matching region already exists */
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
(intptr_t) base + len, module->region_size, &region_index);
if (NULL != region) {
++module->dynamic_handles[region_index].refcnt;
OPAL_THREAD_UNLOCK(&module->lock);
/* no need to invalidate remote caches */
return OMPI_SUCCESS;
}
/* region is in flux */
module->state->region_count = -1;
opal_atomic_wmb ();
ompi_osc_rdma_lock_acquire_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
/* do a binary seach for where the region should be inserted */
if (region_count) {
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
module->region_size, &region_index);
if (region_index < region_count) {
memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);
if (module->selected_btl->btl_register_mem) {
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
}
}
} else {
region_index = 0;
region = (ompi_osc_rdma_region_t *) module->state->regions;
}
/* it is wasteful to register less than a page. this may allow the remote side to access more
* memory but the MPI standard covers this with calling the calling behavior erroneous */
region->base = OPAL_ALIGN((intptr_t) base - page_size + 1, page_size, intptr_t);
region->len = OPAL_ALIGN(len, page_size, size_t);
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Attaching memory region %p-%p at index %d",
base, (void *)((intptr_t) base + len), region_index));
if (module->selected_btl->btl_register_mem) {
mca_btl_base_registration_handle_t *handle;
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
&handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_RMA_ATTACH;
}
memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
module->dynamic_handles[region_index].btl_handle = handle;
} else {
module->dynamic_handles[region_index].btl_handle = NULL;
}
module->dynamic_handles[region_index].refcnt = 1;
for (int i = 0 ; i < region_count + 1 ; ++i) {
region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + i * module->region_size);
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Cache[%d] = {%p, %lu}",
i, (void *) region->base, (unsigned long) region->len));
}
opal_atomic_mb ();
/* the region state has changed */
module->state->region_count = ((region_id + 1) << 32) | (region_count + 1);
ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
}
/**
 * @brief detach a memory region from a dynamic window
 *
 * @param[in] win   mpi window
 * @param[in] base  address inside a region previously attached with ompi_osc_rdma_attach()
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_WIN if the window is not a dynamic window
 * @returns OMPI_ERROR if no attached region contains {base}
 *
 * The region's reference count is decremented. Only when it drops to zero is the
 * region deregistered and removed from the region list in the local state.
 *
 * NOTE(review): ompi_osc_rdma_attach() reports OMPI_ERR_RMA_FLAVOR for a non-dynamic
 * window while this function reports OMPI_ERR_WIN; confirm which one is intended.
 */
int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    const int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
    osc_rdma_counter_t region_count, region_id;
    ompi_osc_rdma_region_t *region;
    int region_index;

    if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
        return OMPI_ERR_WIN;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* the upper 4 bytes of the region count are an instance counter */
    region_count = module->state->region_count & 0xffffffffL;
    region_id = module->state->region_count >> 32;

    /* look for the region that contains the first byte at base */
    region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
                                                   region_count - 1, (intptr_t) base, (intptr_t) base + 1,
                                                   module->region_size, &region_index);
    if (NULL == region) {
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERROR;
    }

    if (--module->dynamic_handles[region_index].refcnt > 0) {
        /* the region is still attached through another attach call */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_SUCCESS;
    }

    /* lock the region so it can't change while a peer is reading it */
    ompi_osc_rdma_lock_acquire_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));

    OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Detaching memory region %p-%p at index %d",
                         base, (void *)((intptr_t) base + region->len), region_index));

    if (module->selected_btl && module->selected_btl->btl_register_mem) {
        ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
    }

    if (region_index < region_count - 1) {
        const size_t trailing = (size_t) (region_count - region_index - 1);

        /* close the gap in both arrays. the handle array also carries the reference counts so it
         * is moved even when the btl does not register memory, and it is moved by whole elements
         * (not by sizeof (void *)) so the entries stay intact. */
        memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
                 trailing * sizeof (module->dynamic_handles[0]));
        memmove (region, (void *)((intptr_t) region + module->region_size),
                 trailing * module->region_size);
    }

    /* the last slot is no longer in use */
    memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));

    /* bump the instance counter so peers notice that their cached copy is stale */
    module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);

    ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));

    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/**
 * @brief refresh the local view of the dynamic memory region
 *
 * @param[in] module  osc rdma module
 * @param[in] peer    peer object to refresh
 *
 * @returns OMPI_SUCCESS if the cached region list is up to date on return
 * @returns OMPI_ERR_RMA_RANGE if the peer has no regions attached
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the cache could not be resized
 * @returns the error from ompi_osc_get_data_blocking() if a remote read fails
 *
 * This function does the work of keeping the local view of a remote peer in sync with what is attached
 * to the remote window. It is called on every address translation since there is no way (currently) to
 * detect that the attached regions have changed. To reduce the amount of data read we first read the
 * region count (which contains an id). If that hasn't changed the region data is not updated. If the
 * list of attached regions has changed then all valid regions are read from the peer while holding
 * their region lock. The region lock is released again on every path, including a failed read.
 */
static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_dynamic_t *peer) {
    osc_rdma_counter_t region_count, region_id;
    uint64_t source_address;
    int ret;

    /* this loop is meant to prevent us from reading data while the remote side is in attach */
    do {
        osc_rdma_counter_t remote_value;

        source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, region_count);
        ret = ompi_osc_get_data_blocking (module, peer->super.state_endpoint, source_address, peer->super.state_handle,
                                          &remote_value, sizeof (remote_value));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }

        /* the upper 4 bytes are an instance counter, the lower 4 bytes the number of regions */
        region_id = remote_value >> 32;
        region_count = remote_value & 0xffffffffl;

        OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "remote memory region: 0x%lx, 0x%lx",
                             (unsigned long) region_id, (unsigned long) region_count));
        /* check if the region is changing */
    } while (0xffffffffl == region_count);

    OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "target has region_id 0x%lx, region_count 0x%lx "
                         "(cached: 0x%x, 0x%x)", (unsigned long) region_id, (unsigned long) region_count, peer->region_id,
                         peer->region_count));

    if (0 == region_count) {
        return OMPI_ERR_RMA_RANGE;
    }

    /* check if the cached copy is out of date */
    OPAL_THREAD_LOCK(&module->lock);

    if (peer->region_id != region_id) {
        unsigned region_len = module->region_size * region_count;
        void *temp;

        OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "dynamic memory cache is out of data. reloading from peer"));

        /* allocate only enough space for the remote regions */
        temp = realloc (peer->regions, region_len);
        if (NULL == temp) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        peer->regions = temp;

        /* lock the region */
        ompi_osc_rdma_lock_acquire_shared (module, &peer->super, 1, offsetof (ompi_osc_rdma_state_t, regions_lock),
                                           OMPI_OSC_RDMA_LOCK_EXCLUSIVE);

        source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, regions);
        ret = ompi_osc_get_data_blocking (module, peer->super.state_endpoint, source_address, peer->super.state_handle,
                                          peer->regions, region_len);

        /* release the region lock before looking at the result. leaving it held after a failed
         * read would keep a shared hold on the peer's regions lock forever. */
        ompi_osc_rdma_lock_release_shared (module, &peer->super, -1, offsetof (ompi_osc_rdma_state_t, regions_lock));

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* the cached id is left unchanged so the next call reloads the list */
            OPAL_THREAD_UNLOCK(&module->lock);
            return ret;
        }

        /* update cached region ids */
        peer->region_id = region_id;
        peer->region_count = region_count;
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/**
 * Find the region attached at a peer that contains [base, base + len).
 *
 * The cached copy of the peer's region list is refreshed first and then
 * searched. {region} is written with the search result (NULL when nothing
 * matches) whenever the refresh succeeds.
 *
 * @returns OMPI_SUCCESS if a region was found
 * @returns OMPI_ERR_RMA_RANGE if no cached region contains the range
 * @returns the error from the refresh if the region list could not be updated
 */
int ompi_osc_rdma_find_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t base, size_t len,
                                       ompi_osc_rdma_region_t **region)
{
    ompi_osc_rdma_peer_dynamic_t *dy_peer = (ompi_osc_rdma_peer_dynamic_t *) peer;
    const intptr_t bound = (intptr_t) base + len;
    int region_count;
    int rc;

    OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "locating dynamic memory region matching: %"
                         PRIx64 "-%" PRIx64 " (len %lu)", base, base + len, (unsigned long) len));

    /* make sure the cached region list matches what the peer currently has attached */
    rc = ompi_osc_rdma_refresh_dynamic_region (module, dy_peer);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    region_count = dy_peer->region_count;

    *region = ompi_osc_rdma_find_region_containing (dy_peer->regions, 0, region_count - 1, (intptr_t) base, bound,
                                                    module->region_size, NULL);

    /* a non-NULL result means a matching region was found */
    return (NULL != *region) ? OMPI_SUCCESS : OMPI_ERR_RMA_RANGE;
}

60
ompi/mca/osc/rdma/osc_rdma_dynamic.h Обычный файл
Просмотреть файл

@ -0,0 +1,60 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OMPI_OSC_RDMA_DYNAMIC_H)
#define OMPI_OSC_RDMA_DYNAMIC_H

#include "osc_rdma.h"

/**
 * @brief attach a region to a window
 *
 * @param[in] win   mpi window
 * @param[in] base  base pointer of region
 * @param[in] len   region size
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_FLAVOR if the window is not a dynamic window
 * @returns OMPI_ERR_RMA_ATTACH if the region could not be attached
 *
 * This function attaches a region to the local window. After this call
 * completes the region will be available for RMA access by all peers in
 * the window.
 */
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len);

/**
 * @brief detach a region from a window
 *
 * @param[in] win   mpi window
 * @param[in] base  base pointer of region specified to ompi_osc_rdma_attach()
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_WIN if the window is not a dynamic window
 * @returns OMPI_ERROR if the region is not attached
 *
 * This function requires that a region containing {base} has been attached
 * using the ompi_osc_rdma_attach() function.
 */
int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base);

/**
 * @brief find dynamic region associated with a peer, base, and len
 *
 * @param[in]  module  osc rdma module
 * @param[in]  peer    peer object for remote peer
 * @param[in]  base    base pointer for region
 * @param[in]  len     length of region
 * @param[out] region  region structure for the region
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_OUT_OF_RESOURCE on resource failure
 * @returns OMPI_ERR_RMA_RANGE if no region matches
 * @returns other OMPI error if the peer's region list could not be read
 */
int ompi_osc_rdma_find_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t base, size_t len,
                                       ompi_osc_rdma_region_t **region);

#endif /* OMPI_OSC_RDMA_DYNAMIC_H */

16
ompi/mca/osc/rdma/osc_rdma_frag.c Обычный файл
Просмотреть файл

@ -0,0 +1,16 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_rdma.h"
#include "osc_rdma_frag.h"
/* class instance for ompi_osc_rdma_frag_t. the parent class is opal_free_list_item_t and
 * both trailing arguments (the constructor and destructor slots of OBJ_CLASS_INSTANCE)
 * are NULL, so no per-object setup or teardown runs here. */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_frag_t, opal_free_list_item_t, NULL, NULL);

125
ompi/mca/osc/rdma/osc_rdma_frag.h Обычный файл
Просмотреть файл

@ -0,0 +1,125 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OSC_RDMA_FRAG_H
#define OSC_RDMA_FRAG_H
#include "osc_rdma.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
struct ompi_osc_rdma_frag_t {
    /* free list item this fragment is built on. super.ptr is used as the start of the buffer */
    opal_free_list_item_t super;

    /* start of unused space */
    unsigned char *top;

    /* space remaining in buffer */
    uint32_t remain_len;
    /* Number of operations which have started writing into the frag, but not yet completed doing so.
     * ompi_osc_rdma_frag_alloc() starts a new fragment at 1 (the module's own reference) and adds
     * one per allocation; ompi_osc_rdma_frag_complete() releases the fragment when this reaches 0 */
    int32_t pending;

    /* module that allocated the fragment. used to deregister the buffer on completion */
    ompi_osc_rdma_module_t *module;
    /* registration handle for the buffer. NULL until ompi_osc_rdma_frag_alloc() registers it */
    mca_btl_base_registration_handle_t *handle;
};
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);
/**
 * Drop one reference on a fragment.
 *
 * When the last reference is dropped the buffer is deregistered, its handle is
 * cleared and the fragment is returned to the component's fragment free list.
 */
static inline void ompi_osc_rdma_frag_complete (ompi_osc_rdma_frag_t *frag)
{
    /* other users are still holding the fragment: nothing more to do */
    if (0 != OPAL_THREAD_ADD32(&frag->pending, -1)) {
        return;
    }

    opal_atomic_rmb ();

    ompi_osc_rdma_deregister (frag->module, frag->handle);
    frag->handle = NULL;

    opal_free_list_return (&mca_osc_rdma_component.frags, (opal_free_list_item_t *) frag);
}
/**
 * @brief reserve space in the module's current communication fragment
 *
 * @param[in]  module       osc rdma module
 * @param[in]  request_len  number of bytes needed (rounded up to a multiple of 8)
 * @param[out] buffer       fragment the space was taken from
 * @param[out] ptr          start of the reserved space
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the request is larger than half the fragment
 *          buffer size or no fragment is available on the free list
 * @returns OMPI_ERR_TEMP_OUT_OF_RESOURCE if a reset fragment is still too small
 * @returns the error from ompi_osc_rdma_register() if the buffer could not be registered
 *
 * On success the fragment's pending count has been incremented for the caller; the
 * reference is dropped with ompi_osc_rdma_frag_complete().
 *
 * Note: this function takes and releases module->lock itself. An earlier note here
 * said the lock must already be held, which does not match the code below.
 * NOTE(review): whether a caller may hold module->lock depends on whether that lock
 * is recursive -- confirm.
 */
static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size_t request_len,
                                            ompi_osc_rdma_frag_t **buffer, char **ptr)
{
    ompi_osc_rdma_frag_t *curr;
    int ret;

    /* ensure all buffers are 8-byte aligned */
    request_len = OPAL_ALIGN(request_len, 8, size_t);

    /* requests larger than half a fragment are not served from fragments */
    if (request_len > (mca_osc_rdma_component.buffer_size >> 1)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    OPAL_THREAD_LOCK(&module->lock);
    curr = module->rdma_frag;
    if (OPAL_UNLIKELY(NULL == curr || curr->remain_len < request_len)) {
        /* a new fragment is needed if there is none or if the current one is still referenced
         * by something other than the module (pending > 1). otherwise the current fragment is
         * simply rewound and reused below. */
        if (NULL == curr || (NULL != curr && curr->pending > 1)) {
            opal_free_list_item_t *item = NULL;

            /* release the initial reference to the buffer */
            module->rdma_frag = NULL;

            if (curr) {
                /* the module lock is dropped around the completion call */
                OPAL_THREAD_UNLOCK(&module->lock);
                ompi_osc_rdma_frag_complete (curr);
                OPAL_THREAD_LOCK(&module->lock);
            }

            item = opal_free_list_get (&mca_osc_rdma_component.frags);
            if (OPAL_UNLIKELY(NULL == item)) {
                OPAL_THREAD_UNLOCK(&module->lock);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            curr = module->rdma_frag = (ompi_osc_rdma_frag_t *) item;

            curr->handle = NULL;
            /* the module itself holds the first reference */
            curr->pending = 1;
            curr->module = module;
        }

        /* start over at the beginning of the buffer */
        curr->top = curr->super.ptr;
        curr->remain_len = mca_osc_rdma_component.buffer_size;

        if (curr->remain_len < request_len) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
    }

    /* register the whole buffer the first time it is used (only if the btl registers memory) */
    if (!curr->handle && module->selected_btl->btl_register_mem) {
        ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, curr->super.ptr, mca_osc_rdma_component.buffer_size,
                                      MCA_BTL_REG_FLAG_ACCESS_ANY, &curr->handle);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return ret;
        }
    }

    /* hand out the space and account for the new user */
    *ptr = (char *) curr->top;
    *buffer = curr;

    curr->top += request_len;
    curr->remain_len -= request_len;
    OPAL_THREAD_ADD32(&curr->pending, 1);

    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
#endif

302
ompi/mca/osc/rdma/osc_rdma_lock.h Обычный файл
Просмотреть файл

@ -0,0 +1,302 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OMPI_OSC_RDMA_LOCK_H)
#define OMPI_OSC_RDMA_LOCK_H
#include "osc_rdma_types.h"
#include "osc_rdma_frag.h"
/**
 * Try to take a local lock exclusively without blocking.
 *
 * Attempts to change {lock} from 0 to OMPI_OSC_RDMA_LOCK_EXCLUSIVE with
 * ompi_osc_rdma_lock_cmpset() and returns the logical negation of that call's result.
 * NOTE(review): presumably this means 0 is returned when the lock was acquired and
 * 1 otherwise -- confirm against the return convention of ompi_osc_rdma_lock_cmpset().
 */
static inline int ompi_osc_rdma_trylock_local (volatile ompi_osc_rdma_lock_t *lock)
{
    return !ompi_osc_rdma_lock_cmpset (lock, 0, OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
}
/**
 * Release a local lock taken with ompi_osc_rdma_trylock_local().
 *
 * Adds -OMPI_OSC_RDMA_LOCK_EXCLUSIVE to {lock}, undoing the value stored by the
 * trylock. The result of the add is ignored.
 */
static inline void ompi_osc_rdma_unlock_local (volatile ompi_osc_rdma_lock_t *lock)
{
    (void) ompi_osc_rdma_lock_add (lock, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
}
/**
 * Dummy completion function for atomic operations
 *
 * The lock functions in this header pass it as the completion callback to
 * btl_atomic_op/btl_atomic_fop together with a pointer to a volatile bool as
 * the callback context and then spin on that flag.
 * NOTE(review): the implementation is not in this header; presumably it sets the
 * flag passed as {context} to true -- confirm.
 */
void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                    void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                    void *context, void *data, int status);
/**
 * ompi_osc_rdma_lock_release_shared:
 *
 * @param[in] module - osc rdma module
 * @param[in] peer   - owner of lock
 * @param[in] value  - value to add to the lock (negative of the acquire value)
 * @param[in] offset - offset of lock in remote peer's state segment
 *
 * @returns OMPI_SUCCESS on success and another ompi error code on failure
 *
 * This function releases a shared lock by adding {value} to it. The value provided
 * in {value} should be the negative of the one used for ompi_osc_rdma_lock_acquire_shared.
 * It is erroneous to release a shared lock not held by the calling process.
 *
 * If the peer's state is local the add is done directly on the lock. Otherwise a btl
 * atomic add is issued and the function waits for it to complete.
 */
static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                     ompi_osc_rdma_lock_t value, ptrdiff_t offset)
{
    uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
    /* landing zone for the old value when the btl only provides fetching atomics */
    void *temp = &module->state->scratch_lock;
    volatile bool atomic_complete = false;
    int ret;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "decrementing shared lock %" PRIx64 " by %lx\n", lock,
                         (unsigned long) value));

    if (ompi_osc_rdma_peer_local_state (peer)) {
        /* the lock is directly addressable */
        (void) ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, value);
        return OMPI_SUCCESS;
    }

    if (module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS) {
        ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, (intptr_t) lock, peer->state_handle,
                                                   MCA_BTL_ATOMIC_ADD, value, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
                                                   (void *) &atomic_complete, NULL);
    } else {
        ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, (intptr_t) lock, module->state_handle,
                                                    peer->state_handle, MCA_BTL_ATOMIC_ADD, value, 0, MCA_BTL_NO_ORDER,
                                                    ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
    }

    if (OPAL_SUCCESS == ret) {
        /* wait for completion of the atomic operation */
        while (!atomic_complete) {
            ompi_osc_rdma_progress (module);
        }
    } else if (1 == ret) {
        /* a return of 1 means the operation already completed (the acquire path skips its
         * wait in the same case), so report success instead of passing the 1 on */
        ret = OMPI_SUCCESS;
    }

    return ret;
}
/**
 * ompi_osc_rdma_lock_acquire_shared:
 *
 * @param[in] module   - osc rdma module
 * @param[in] peer     - owner of lock
 * @param[in] value    - increment value
 * @param[in] offset   - offset of lock in remote peer's state segment
 * @param[in] check    - check value for success
 *
 * @returns OMPI_SUCCESS on success and another ompi error code on failure
 *
 * This function increments a shared lock and checks the prior counter value
 * against {check}. If any of the bits in the prior value match those in
 * {check} the function undoes the increment and tries again. For a remote
 * peer a temporary fragment is allocated to receive the prior value; it is
 * returned on every exit path, including btl failures.
 */
static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                     ompi_osc_rdma_lock_t value, ptrdiff_t offset,
                                                     ompi_osc_rdma_lock_t check)
{
    intptr_t lock = (intptr_t) peer->state + offset;
    volatile bool atomic_complete;
    ompi_osc_rdma_lock_t *temp;
    int ret;

    /* spin until the lock has been acquired */
    if (!ompi_osc_rdma_peer_local_state (peer)) {
        ompi_osc_rdma_frag_t *frag;

        /* 8 bytes of fragment space to receive the fetched (prior) lock value */
        ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
            return ret;
        }

        do {
            atomic_complete = false;
            ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, (void *) temp, lock, frag->handle,
                                                        peer->state_handle, MCA_BTL_ATOMIC_ADD, value, 0, MCA_BTL_NO_ORDER,
                                                        ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
            if (OPAL_UNLIKELY(OPAL_SUCCESS > ret)) {
                OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "failed to increment shared lock. ret: %d", ret));
                /* do not leak the temporary fragment on the error path */
                ompi_osc_rdma_frag_complete (frag);
                return ret;
            }

            if (1 != ret) {
                /* wait for completion of the atomic operation */
                while (!atomic_complete) {
                    ompi_osc_rdma_progress (module);
                }
            }

            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "incremented shared lock 0x%lx by 0x%lx. Old value 0x%lx",
                                 (unsigned long) lock, (unsigned long) value, (unsigned long) *temp));

            if (!(*temp & check)) {
                /* none of the conflicting bits were set in the prior value: lock is ours */
                break;
            }

            /* NTH: i think this is correct. backoff! */
            ompi_osc_rdma_lock_release_shared (module, peer, -value, offset);
            ompi_osc_rdma_progress (module);
        } while (1);

        ompi_osc_rdma_frag_complete (frag);
    } else {
        ompi_osc_rdma_lock_t lock_state;

        do {
            lock_state = ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, value);
            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "incremented local shared lock by 0x%lx. Old value 0x%lx",
                                 (unsigned long) value, (unsigned long) lock_state));
            if (!(lock_state & check)) {
                break;
            }

            /* conflict: undo the increment and retry after a round of progress */
            (void) ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, -value);
            ompi_osc_rdma_progress (module);
        } while (1);
    }

    return OMPI_SUCCESS;
}
/**
 * ompi_osc_rdma_lock_try_acquire_exclusive:
 *
 * @param[in] module   - osc rdma module
 * @param[in] peer     - peer to lock
 * @param[in] offset   - offset into the remote peer's state segment
 *
 * @returns 0 if the lock was acquired
 * @returns 1 if the lock word was not 0 (lock not acquired)
 * @returns a negative error code if the temporary fragment could not be
 *          allocated or the btl compare-and-swap failed
 *
 * This function makes a single attempt to take the lock at {offset} in the
 * peer's state segment. For a remote peer the prior lock value is received
 * in a temporary fragment that is allocated and returned internally.
 */
static inline int ompi_osc_rdma_lock_try_acquire_exclusive (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                            ptrdiff_t offset)
{
    uint64_t lock = (uint64_t) (uintptr_t) peer->state + offset;
    ompi_osc_rdma_lock_t *temp = NULL;
    volatile bool atomic_complete;
    int ret;

    if (!ompi_osc_rdma_peer_local_state (peer)) {
        ompi_osc_rdma_frag_t *frag = NULL;
        int result;

        ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
            return ret;
        }

        /* compare-and-swap with compare value 0 and new value OMPI_OSC_RDMA_LOCK_EXCLUSIVE;
         * the value fetched from the target is written to temp */
        atomic_complete = false;
        ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, lock, frag->handle,
                                                      peer->state_handle, 0, OMPI_OSC_RDMA_LOCK_EXCLUSIVE, 0, 0,
                                                      ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
        if (OPAL_UNLIKELY(OPAL_SUCCESS > ret)) {
            /* do not leak the temporary fragment on the error path */
            ompi_osc_rdma_frag_complete (frag);
            return ret;
        }

        if (0 == ret) {
            /* wait for the atomic operation to complete */
            while (!atomic_complete) {
                ompi_osc_rdma_progress (module);
            }
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "got %lx when attempting compare and swap %" PRIx64 " complete %d",
                             (unsigned long) *temp, lock, atomic_complete));

        /* a prior value of 0 means the swap took place and the lock is now held */
        result = (*temp != 0);

        ompi_osc_rdma_frag_complete (frag);

        return result;
    }

    return ompi_osc_rdma_trylock_local ((int64_t *)(intptr_t) lock);
}
/**
 * ompi_osc_rdma_lock_acquire_exclusive:
 *
 * @param[in] module   - osc rdma module
 * @param[in] peer     - peer to lock
 * @param[in] offset   - offset into the remote peer's state segment
 *
 * @returns OMPI_SUCCESS once the lock has been obtained
 *
 * Repeatedly calls ompi_osc_rdma_lock_try_acquire_exclusive() on the lock
 * at {offset}, running one round of progress after each attempt that
 * returns non-zero, until an attempt returns 0.
 */
static inline int ompi_osc_rdma_lock_acquire_exclusive (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ptrdiff_t offset)
{
    for (;;) {
        if (0 == ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offset)) {
            break;
        }

        ompi_osc_rdma_progress (module);
    }

    return OMPI_SUCCESS;
}
/**
 * ompi_osc_rdma_lock_release_exclusive:
 *
 * @param[in] module   - osc rdma module
 * @param[in] peer     - peer to unlock
 * @param[in] offset   - offset into the remote peer's state segment
 *
 * @returns OMPI_SUCCESS on success or a negative error code from the btl
 *
 * Releases the lock at {offset} in the peer's state structure. For a remote
 * peer this adds -OMPI_OSC_RDMA_LOCK_EXCLUSIVE to the lock word with a
 * network atomic; otherwise ompi_osc_rdma_unlock_local() is used. It is
 * illegal to call this function unless this process holds the lock.
 */
static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ptrdiff_t offset)
{
    uint64_t lock_addr = (uint64_t) (intptr_t) peer->state + offset;
    void *scratch = &module->state->scratch_lock;
    volatile bool done = false;
    int rc;

    if (ompi_osc_rdma_peer_local_state (peer)) {
        ompi_osc_rdma_unlock_local ((volatile ompi_osc_rdma_lock_t *)(intptr_t) lock_addr);
        return OMPI_SUCCESS;
    }

    if (module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS) {
        /* plain atomic add: no fetched value is needed */
        rc = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, lock_addr, peer->state_handle, MCA_BTL_ATOMIC_ADD,
                                                  -OMPI_OSC_RDMA_LOCK_EXCLUSIVE, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
                                                  (void *) &done, NULL);
    } else {
        /* fall back to fetch-and-add; the fetched value lands in the scratch lock */
        rc = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, scratch, lock_addr, module->state_handle,
                                                   peer->state_handle, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE, 0,
                                                   MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, (void *) &done, NULL);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS > rc)) {
        return rc;
    }

    if (OPAL_SUCCESS == rc) {
        /* drive progress until the completion callback has run */
        while (!done) {
            ompi_osc_rdma_progress (module);
        }
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "unlocked target lock %" PRIx64 " with value 0x%lx. old value 0x%"
                         PRIx64, lock_addr, (unsigned long) -OMPI_OSC_RDMA_LOCK_EXCLUSIVE, ((uint64_t *) scratch)[0]));

    return OMPI_SUCCESS;
}
#endif /* OMPI_OSC_RDMA_LOCK_H */

144
ompi/mca/osc/rdma/osc_rdma_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,144 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_rdma.h"
#include "osc_rdma_lock.h"
#include "mpi.h"
/**
 * Record a peer object in the module's peer cache.
 *
 * When the module has a peer array the peer is stored at index peer->rank;
 * otherwise it is inserted into the peer hash table keyed by peer->rank.
 *
 * @returns OMPI_SUCCESS for the array case, or the result of the hash table insert
 */
int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
    if (NULL != module->peer_array) {
        module->peer_array[peer->rank] = peer;
        return OMPI_SUCCESS;
    }

    return opal_hash_table_set_value_uint32 (&module->peer_hash, peer->rank, (void *) peer);
}
/**
 * Tear down the osc/rdma module attached to a window.
 *
 * Synchronizes with the other window members (barrier), removes the module
 * from the component's module table, deregisters memory, releases every
 * cached peer object, frees the module's communicators and buffers, and
 * finally frees the module itself.
 *
 * @param[in] win  window being freed
 *
 * @returns OMPI_SUCCESS (also when the window has no module attached)
 */
int ompi_osc_rdma_free(ompi_win_t *win)
{
    int ret = OMPI_SUCCESS;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    uint32_t key;
    void *node;

    if (NULL == module) {
        return OMPI_SUCCESS;
    }

    if (NULL != module->comm) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "rdma component destroying window with id %d",
                            ompi_comm_get_cid(module->comm));

        /* finish with a barrier */
        if (ompi_group_size(win->w_group) > 1) {
            (void) module->comm->c_coll.coll_barrier (module->comm,
                                                      module->comm->c_coll.coll_barrier_module);
        }

        /* remove from component information */
        OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
        opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.modules,
                                            ompi_comm_get_cid(module->comm));
        OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
    }

    win->w_osc_module = NULL;

    if (module->state) {
        /* only the low 32 bits of region_count are used as the number of regions */
        int region_count = module->state->region_count & 0xffffffffL;
        if (NULL != module->dynamic_handles) {
            for (int i = 0 ; i < region_count ; ++i) {
                ompi_osc_rdma_deregister (module, module->dynamic_handles[i].btl_handle);
            }

            free (module->dynamic_handles);
        }
    }

    OBJ_DESTRUCT(&module->outstanding_locks);
    OBJ_DESTRUCT(&module->lock);
    OBJ_DESTRUCT(&module->peer_lock);
    OBJ_DESTRUCT(&module->all_sync);

    ompi_osc_rdma_deregister (module, module->state_handle);
    ompi_osc_rdma_deregister (module, module->base_handle);

    OPAL_LIST_DESTRUCT(&module->pending_posts);

    if (NULL != module->rdma_frag) {
        ompi_osc_rdma_deregister (module, module->rdma_frag->handle);
    }

    /* remove all cached peers */
    if (NULL == module->peer_array) {
        ret = opal_hash_table_get_first_key_uint32 (&module->peer_hash, &key, (void **) &peer, &node);
        while (OPAL_SUCCESS == ret) {
            OBJ_RELEASE(peer);
            ret = opal_hash_table_get_next_key_uint32 (&module->peer_hash, &key, (void **) &peer,
                                                       node, &node);
        }

        OBJ_DESTRUCT(&module->peer_hash);
    } else {
        /* the array is indexed by peer rank, so every rank of the communicator must be
         * visited; stopping at this process' own rank would leak higher-ranked peers */
        for (int i = 0 ; i < ompi_comm_size (module->comm) ; ++i) {
            if (NULL != module->peer_array[i]) {
                OBJ_RELEASE(module->peer_array[i]);
            }
        }

        free (module->peer_array);
    }

    if (NULL != module->outstanding_lock_array) {
        free (module->outstanding_lock_array);
    }

    if (module->local_leaders && MPI_COMM_NULL != module->local_leaders) {
        ompi_comm_free (&module->local_leaders);
    }

    if (module->shared_comm && MPI_COMM_NULL != module->shared_comm) {
        ompi_comm_free (&module->shared_comm);
    }

    if (module->comm && MPI_COMM_NULL != module->comm) {
        ompi_comm_free (&module->comm);
    }

    if (NULL != module->free_after) {
        free(module->free_after);
    }

    if (module->segment_base) {
        opal_shmem_segment_detach (&module->seg_ds);
        module->segment_base = NULL;
    }

    free (module);

    return OMPI_SUCCESS;
}

369
ompi/mca/osc/rdma/osc_rdma_passive_target.c Обычный файл
Просмотреть файл

@ -0,0 +1,369 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma_passive_target.h"
#include "osc_rdma_comm.h"
#include "mpi.h"
int ompi_osc_rdma_sync (struct ompi_win_t *win)
{
ompi_osc_rdma_progress (GET_MODULE(win));
return OMPI_SUCCESS;
}
/*
 * Complete all outstanding rdma operations of the lock synchronization that
 * covers {target}. Flushing the calling rank itself only runs one round of
 * progress. Returns OMPI_ERR_RMA_SYNC when no lock-type synchronization is
 * found for the target.
 */
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *sync;
    ompi_osc_rdma_peer_t *peer;

    assert (0 <= target);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush starting..."));

    if (target == ompi_comm_rank (module->comm)) {
        /* nothing to flush. call one round of progress */
        ompi_osc_rdma_progress (module);
        return OMPI_SUCCESS;
    }

    OPAL_THREAD_LOCK(&module->lock);

    sync = ompi_osc_rdma_module_sync_lookup (module, target, &peer);
    if (OPAL_LIKELY(NULL != sync && OMPI_OSC_RDMA_SYNC_TYPE_LOCK == sync->type)) {
        OPAL_THREAD_UNLOCK(&module->lock);

        /* finish all outstanding fragments */
        ompi_osc_rdma_sync_rdma_complete (sync);

        return OMPI_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush: target %d is not locked in window %s",
                         target, win->w_name));
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_ERR_RMA_SYNC;
}
/*
 * Complete outstanding rdma operations for every active lock on the window:
 * first the lock-all synchronization (if it is of lock type), then each entry
 * of the outstanding locks table. Returns OMPI_ERR_RMA_SYNC when the module
 * is not in a passive target epoch.
 */
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *entry;
    int rc = OMPI_SUCCESS;
    uint32_t rank_key;
    void *cursor;

    /* flush is only allowed from within a passive target epoch */
    if (!ompi_osc_rdma_in_passive_epoch (module)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush_all entering..."));

    /* globally complete all outstanding rdma requests */
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) {
        ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
    }

    /* flush all locks: walk the table until the iterator stops returning success */
    for (rc = opal_hash_table_get_first_key_uint32 (&module->outstanding_locks, &rank_key, (void **) &entry, &cursor) ;
         OPAL_SUCCESS == rc ;
         rc = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, &rank_key, (void **) &entry,
                                                   cursor, &cursor)) {
        ompi_osc_rdma_sync_rdma_complete (entry);
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush_all complete"));

    return OPAL_SUCCESS;
}
/* Local-completion flush of one target; implemented as a plain ompi_osc_rdma_flush(). */
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    int rc = ompi_osc_rdma_flush (target, win);

    return rc;
}
/* Local-completion flush of all targets; implemented as a plain ompi_osc_rdma_flush_all(). */
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    int rc = ompi_osc_rdma_flush_all (win);

    return rc;
}
/* locking via atomics */

/*
 * Acquire the lock described by {lock} on {peer}, retrying until it is obtained.
 *
 * Shared lock: increment the peer's local lock, retrying while the exclusive
 * bit is set. Exclusive lock: increment the leader's global lock (retrying
 * while any of its upper 32 bits are set), then try to take the peer's local
 * lock exclusively; when that attempt fails the global increment is undone
 * and the whole sequence is retried.
 */
static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                      ompi_osc_rdma_sync_t *lock)
{
    int rc;

    if (MPI_LOCK_EXCLUSIVE != lock->sync.lock.type) {
        for (;;) {
            /* go right to the target to acquire a shared lock */
            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "Incrementing local shared lock"));
            rc = ompi_osc_rdma_lock_acquire_shared (module, peer, 1, offsetof (ompi_osc_rdma_state_t, local_lock),
                                                    OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
            if (OMPI_SUCCESS == rc) {
                return OMPI_SUCCESS;
            }

            ompi_osc_rdma_progress (module);
        }
    }

    for (;;) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "Incrementing global exclusive lock"));
        /* lock the master lock. this requires no rank has a global shared lock */
        rc = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock), 0xffffffff00000000L);
        if (OMPI_SUCCESS == rc) {
            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "Acquiring exclusive lock from peer"));
            rc = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
            if (0 == rc) {
                peer->flags |= OMPI_OSC_RDMA_PEER_EXCLUSIVE;
                break;
            }

            /* release the global lock */
            ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
        }

        /* either step failed: make progress and start over */
        ompi_osc_rdma_progress (module);
    }

    return OMPI_SUCCESS;
}
/*
 * Release the lock described by {lock} on {peer}. A shared lock decrements the
 * peer's local lock; an exclusive lock releases the peer's local lock, then
 * decrements the leader's global lock and clears the peer's exclusive flag.
 */
static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ompi_osc_rdma_sync_t *lock)
{
    if (MPI_LOCK_EXCLUSIVE != lock->sync.lock.type) {
        ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock));
        return OMPI_SUCCESS;
    }

    ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
    ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
    peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE;

    return OMPI_SUCCESS;
}
/*
 * Begin a passive target access epoch on {target}: build a lock synchronization
 * object, acquire the lock with atomics unless MPI_MODE_NOCHECK is asserted, and
 * record the lock in the module. Returns OMPI_ERR_RMA_SYNC on a conflicting epoch
 * and OMPI_ERR_OUT_OF_RESOURCE when no synchronization object can be allocated.
 */
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *sync;
    int rc = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "osc rdma: lock %d %d", target, lock_type));

    if (module->all_sync.epoch_active && (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type)) {
        /* impossible to get an exclusive lock while holding a global shared lock or in a active
         * target access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    /* create lock item */
    sync = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    sync->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    sync->sync.lock.target = target;
    sync->sync.lock.type = lock_type;
    sync->sync.lock.assert = assert;

    sync->peer_list.peer = peer;
    sync->num_peers = 1;
    OBJ_RETAIN(peer);

    /* MPI_MODE_NOCHECK: the lock protocol is skipped entirely */
    if (0 == (assert & MPI_MODE_NOCHECK)) {
        rc = ompi_osc_rdma_lock_atomic_internal (module, peer, sync);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        OBJ_RELEASE(sync);
        return rc;
    }

    ++module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, sync));

    return rc;
}
/*
 * End the passive target access epoch on {target}: remove the lock from the
 * module, complete its outstanding rdma operations, release the lock with
 * atomics unless it was taken with MPI_MODE_NOCHECK, and return the
 * synchronization object. Returns OMPI_ERR_RMA_SYNC if the target is not locked.
 */
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync;
    int rc = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    sync = ompi_osc_rdma_module_lock_find (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    ompi_osc_rdma_module_lock_remove (module, sync);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (sync);

    if (0 == (sync->sync.lock.assert & MPI_MODE_NOCHECK)) {
        rc = ompi_osc_rdma_unlock_atomic_internal (module, peer, sync);
    }

    /* release our reference to this peer */
    OBJ_RELEASE(peer);

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_unlock: unlock of %d complete", target));

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    /* delete the lock */
    ompi_osc_rdma_sync_return (sync);

    return rc;
}
/*
 * Begin a lock-all (shared) passive target epoch on the window.
 *
 * Unless MPI_MODE_NOCHECK is asserted, the upper-half counter of the leader's
 * global lock is incremented; the acquire retries while any of the lower 32
 * bits are set. With MPI_MODE_NOCHECK the caller guarantees there is no
 * conflicting lock, so no atomic is issued. This matches the per-target lock
 * path in ompi_osc_rdma_lock_atomic().
 *
 * Returns OMPI_ERR_RMA_SYNC if an epoch is already active on the window.
 */
int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    /* Check if no_locks is set. TODO: we also need to track whether we are in an
     * active target epoch. Fence can make this tricky to track. */
    if (module->all_sync.epoch_active) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "osc/rdma: attempted "
                             "to lock all when active target epoch is %s and lock all epoch is %s",
                             (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type && module->all_sync.epoch_active) ?
                             "active" : "inactive",
                             (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) ? "active" : "inactive"));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* set up lock */
    lock = &module->all_sync;

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = -1;
    lock->sync.lock.type   = MPI_LOCK_SHARED;
    lock->sync.lock.assert = assert;

    lock->num_peers = ompi_comm_size (module->comm);

    lock->epoch_active = true;
    /* NTH: TODO -- like fence it might be a good idea to create an array to access all peers
     * without having to access the hash table. Such a change would likely increase performance
     * at the expense of memory usage. Ex. if a window has 1M peers then 8MB per process would
     * be needed for this array. */

    /* the lock is only taken when the user has NOT asserted MPI_MODE_NOCHECK */
    if (0 == (assert & MPI_MODE_NOCHECK)) {
        /* increment the global shared lock */
        ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
                                                 offsetof(ompi_osc_rdma_state_t, global_lock),
                                                 0x00000000ffffffffUL);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* acquisition failed: roll the synchronization object back to its idle state */
        lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
        lock->num_peers = 0;
        lock->epoch_active = false;
    } else {
        ++module->passive_target_access_epoch;
    }

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}

/*
 * End a lock-all passive target epoch on the window.
 *
 * Completes all outstanding rdma operations and, unless the epoch was started
 * with MPI_MODE_NOCHECK (in which case the global lock was never incremented),
 * decrements the leader's global shared count.
 *
 * Returns OMPI_ERR_RMA_SYNC if the window is not in a lock-all epoch.
 */
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_unlock_all entering..."));

    OPAL_THREAD_LOCK(&module->lock);

    lock = &module->all_sync;
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock_all: not locked in window %s",
                             win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    /* mirror of the acquire in lock_all: only undo the increment if it was made */
    if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        /* decrement the master lock shared count */
        (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL, offsetof (ompi_osc_rdma_state_t, global_lock));
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    lock->num_peers = 0;
    lock->epoch_active = false;

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_unlock_all complete"));

    return OMPI_SUCCESS;
}

131
ompi/mca/osc/rdma/osc_rdma_passive_target.h Обычный файл
Просмотреть файл

@ -0,0 +1,131 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OSC_RDMA_PASSIVE_TARGET_H)
#define OSC_RDMA_PASSIVE_TARGET_H
#include "osc_rdma.h"
#include "osc_rdma_sync.h"
#include "osc_rdma_lock.h"
/**
* @brief lock the target in the window using network/cpu atomics
*
* @param[in] lock_type mpi lock type (MPI_LOCK_SHARED, MPI_LOCK_EXCLUSIVE)
* @param[in] target target process
* @param[in] assert asserts
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if there is a conflicting RMA epoch
*/
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win);
/**
* @brief unlock the target in the window using network/cpu atomics
*
* @param[in] target target process
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if the target is not locked
*/
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win);
/**
* @brief lock all targets in window using network/cpu atomics
*
* @param[in] assert asserts
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if there is a conflicting RMA epoch
*/
int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win);
/**
* @brief unlock all targets in window using network/cpu atomics
*
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if the window is not in a lock all access epoch
*/
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win);
/**
* @brief synchronize the public and private copies of the window
*
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
*
* Just acts as a memory barrier since this module only supports a unified memory
* model.
*/
int ompi_osc_rdma_sync (struct ompi_win_t *win);
/**
* @brief flush rdma transactions to a target
*
* @param[in] target target process
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if the target is not locked
*/
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win);
/**
* @brief flush rdma transactions to all target(s)
*
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if no processes are locked
*
* osc/rdma does not make a distinction between local and remote rma
* completion. this could change in a future release as small messages
* may be internally buffered.
*/
int ompi_osc_rdma_flush_all (struct ompi_win_t *win);
/**
* @brief flush rdma transactions to a target (local completion)
*
* @param[in] target target process
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if the target is not locked
*
* osc/rdma does not make a distinction between local and remote rma
* completion. this could change in a future release as small messages
* may be internally buffered.
*/
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win);
/**
* @brief flush rdma transactions to all target(s) (local completion)
*
* @param[in] win mpi window
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_RMA_SYNC if no processes are locked
*
* osc/rdma does not make a distinction between local and remote rma
* completion. this could change in a future release as small messages
* may be internally buffered.
*/
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win);
#endif

323
ompi/mca/osc/rdma/osc_rdma_peer.c Обычный файл
Просмотреть файл

@ -0,0 +1,323 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_rdma_comm.h"
#include "ompi/mca/bml/base/base.h"
/* Evaluates to node_id * ceil(comm_size / node_count), where comm_size is the size of the
 * module communicator. The code in this file uses the result as the communicator rank to
 * contact for data associated with node {node_id}. Note that {module} is evaluated more
 * than once. */
#define NODE_ID_TO_RANK(module, node_id) ((node_id) * ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count))
/**
 * @brief find the btl endpoint for a process
 *
 * @param[in] module         osc rdma module
 * @param[in] peer_id        process rank in the module communicator
 *
 * @returns NULL if none of the process' rdma btls is the module's selected btl
 * @returns btl endpoint on success
 */
struct mca_btl_base_endpoint_t *ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, int peer_id)
{
    ompi_proc_t *target_proc = ompi_comm_peer_lookup (module->comm, peer_id);
    mca_bml_base_endpoint_t *bml_ep;
    int btl_count;
    int idx = 0;

    /* for now just use the bml to get the btl endpoint */
    bml_ep = mca_bml_base_get_endpoint (target_proc);

    btl_count = mca_bml_base_btl_array_get_size (&bml_ep->btl_rdma);

    /* scan the rdma btl list for the btl this module selected */
    while (idx < btl_count) {
        if (module->selected_btl == bml_ep->btl_rdma.bml_btls[idx].btl) {
            return bml_ep->btl_rdma.bml_btls[idx].btl_endpoint;
        }

        ++idx;
    }

    /* very unlikely. if this happened the btl selection process is broken */
    return NULL;
}
/**
 * @brief allocate a new peer object for a rank
 *
 * @param[in]  module    osc rdma module
 * @param[in]  peer_id   rank of the peer in the module communicator
 * @param[out] peer_out  new peer object (set to NULL on failure)
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_UNREACH if no btl endpoint is found for the rank
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the peer object cannot be allocated
 *
 * The peer class depends on the window: dynamic windows get a dynamic peer,
 * windows with same_size and same_disp_unit get the smaller basic peer, and
 * all others get an extended peer. Only the data endpoint and rank are filled
 * in here.
 */
int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) {
    struct mca_btl_base_endpoint_t *endpoint;
    ompi_osc_rdma_peer_t *peer;

    *peer_out = NULL;

    endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_id);
    if (OPAL_UNLIKELY(NULL == endpoint)) {
        return OMPI_ERR_UNREACH;
    }

    if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_dynamic_t);
    } else if (module->same_size && module->same_disp_unit) {
        /* use a smaller peer object when same_size and same_disp_unit are set */
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_basic_t);
    } else {
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_extended_t);
    }

    /* object allocation can fail; do not dereference a NULL peer below */
    if (OPAL_UNLIKELY(NULL == peer)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    peer->data_endpoint = endpoint;
    peer->rank = peer_id;

    *peer_out = peer;

    return OMPI_SUCCESS;
}
/**
* @brief finish initializing a peer object
*
* @param[in] module osc rdma module
* @param[in] peer peer object to set up
*
* @returns OMPI_SUCCESS on success
* @returns OMPI_ERR_UNREACH (or OPAL_ERR_UNREACH) if a required btl endpoint is not found
* @returns OMPI_ERR_OUT_OF_RESOURCE if the base registration handle cannot be allocated
* @returns the error code from ompi_osc_get_data_blocking() if a remote read fails
*
* This function reads the registration handle and state pointer from the peer that holds that data. If necessary
* it will then read information about the peer from its state data structure. This information includes the
* displacement unit, base pointer, window size, and registration handle (if applicable).
*/
static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
/* NOTE(review): peer is viewed as an extended peer here; the extended-only fields (disp_unit, size) are
* written only when same_disp_unit / same_size are false -- confirm this matches the class chosen at allocation */
ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
uint64_t peer_data_size;
uint64_t peer_data_offset, array_pointer;
struct mca_btl_base_endpoint_t *array_endpoint;
ompi_osc_rdma_region_t *array_peer_data, *node_peer_data;
ompi_osc_rdma_rank_data_t rank_data;
int registration_handle_size = 0;
int node_id, node_rank, array_index;
int ret, disp_unit;
char *peer_data;
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "configuring peer for rank %d", peer->rank));
/* registration handles are only used when the btl provides a memory registration function */
if (module->selected_btl->btl_register_mem) {
registration_handle_size = module->selected_btl->btl_registration_handle_size;
}
/* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
* calculates the node and offset the mapping can be found. once the mapping has been read the state
* part of the peer structure can be initialized. */
node_id = (peer->rank * module->node_count) / ompi_comm_size (module->comm);
node_rank = NODE_ID_TO_RANK(module, node_id);
array_index = peer->rank - node_rank;
array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
/* lookup the btl endpoint needed to retrieve the mapping */
array_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, node_rank);
if (OPAL_UNLIKELY(NULL == array_endpoint)) {
return OMPI_ERR_UNREACH;
}
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "reading rank data from array rank: %d pointer: 0x%"
PRIx64 ", size: %lu", node_rank, array_pointer, sizeof (rank_data)));
ret = ompi_osc_get_data_blocking (module, array_endpoint, array_pointer, (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
&rank_data, sizeof (rank_data));
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
/* initialize the state part of the peer object. NTH: for now the state data for every node is stored on
* every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
* of this by re-using the endpoint and pointer stored in the node_comm_info array. */
node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
if (registration_handle_size) {
peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
}
peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, rank_data.node_id));
if (OPAL_UNLIKELY(NULL == peer->state_endpoint)) {
return OPAL_ERR_UNREACH;
}
/* nothing more to do for dynamic memory windows */
if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
return OMPI_SUCCESS;
}
/* read window data from the target rank */
if (module->same_disp_unit) {
/* do not bother reading the displacement unit as it is already known */
peer_data_offset = offsetof (ompi_osc_rdma_state_t, regions);
} else {
peer_data_offset = offsetof (ompi_osc_rdma_state_t, disp_unit);
}
/* everything from peer_data_offset to the end of the state structure is fetched into a stack buffer */
peer_data_size = module->state_size - peer_data_offset;
peer_data = alloca (peer_data_size);
/* read window data from the end of the target's state structure */
ret = ompi_osc_get_data_blocking (module, peer->state_endpoint, peer->state + peer_data_offset, peer->state_handle,
peer_data, peer_data_size);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
if (!module->same_disp_unit) {
/* unpack displacement */
memcpy (&ex_peer->disp_unit, peer_data, sizeof (ex_peer->disp_unit));
/* advance past the displacement unit so peer_data points at the regions member */
peer_data += offsetof (ompi_osc_rdma_state_t, regions) - offsetof (ompi_osc_rdma_state_t, disp_unit);
disp_unit = ex_peer->disp_unit;
} else {
disp_unit = module->disp_unit;
}
ompi_osc_rdma_region_t *base_region = (ompi_osc_rdma_region_t *) peer_data;
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "peer %d: remote base region: 0x%" PRIx64
", size: %" PRId64 ", flags: 0x%x, disp_unit: %d", peer->rank, base_region->base, base_region->len,
peer->flags, disp_unit));
if (ompi_osc_rdma_peer_local_base (peer)) {
/* for now we store the local address in the standard place. do not overwrite it */
return OMPI_SUCCESS;
}
ex_peer->super.base = base_region->base;
/* save size and base */
if (!module->same_size) {
ex_peer->size = base_region->len;
}
/* a private copy of the registration handle is only needed for a non-empty region */
if (base_region->len) {
if (registration_handle_size) {
ex_peer->super.base_handle = malloc (registration_handle_size);
if (OPAL_UNLIKELY(NULL == ex_peer->super.base_handle)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* mark the handle as owned by the peer so its destructor frees it */
peer->flags |= OMPI_OSC_RDMA_PEER_BASE_FREE;
memcpy (ex_peer->super.base_handle, base_region->btl_handle_data, registration_handle_size);
}
}
return OMPI_SUCCESS;
}
/**
 * @brief lookup (or allocate) a peer for a rank (internal)
 *
 * @param[in] module         osc rdma module
 * @param[in] peer_id        rank of remote peer (in module communicator)
 *
 * @returns peer object on success
 * @returns NULL on error
 *
 * Returns the cached peer object for the rank if there is one. Otherwise a new
 * peer is allocated, set up, and added to the module's peer cache. This function
 * requires the peer lock to be held and is only expected to be called from
 * the ompi_osc_rdma_peer_lookup() helper function.
 */
static struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup_internal (struct ompi_osc_rdma_module_t *module, int peer_id)
{
    ompi_osc_rdma_peer_t *cached;
    ompi_osc_rdma_peer_t *fresh;
    int rc;

    OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "looking up peer data for rank %d", peer_id));

    cached = ompi_osc_module_get_peer (module, peer_id);
    if (NULL != cached) {
        return cached;
    }

    rc = ompi_osc_rdma_new_peer (module, peer_id, &fresh);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return NULL;
    }

    rc = ompi_osc_rdma_peer_setup (module, fresh);
    if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
        /* a failure to cache the peer means we are out of memory */
        rc = ompi_osc_module_add_peer (module, fresh);
    }

    if (OPAL_SUCCESS != rc) {
        /* setup or caching failed: drop the half-built peer */
        OBJ_RELEASE(fresh);
        return NULL;
    }

    /* ensure the peer hash is updated before we drop the lock */
    opal_atomic_wmb ();

    return fresh;
}
/* look up (or allocate) the peer for peer_id while holding the module's peer lock;
 * returns whatever ompi_osc_rdma_peer_lookup_internal() returns (NULL on error) */
struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup (struct ompi_osc_rdma_module_t *module, int peer_id)
{
    struct ompi_osc_rdma_peer_t *result;

    opal_mutex_lock (&module->peer_lock);
    result = ompi_osc_rdma_peer_lookup_internal (module, peer_id);
    opal_mutex_unlock (&module->peer_lock);

    return result;
}
/******* peer objects *******/
/* constructor: zero every byte of the peer that follows the embedded parent object */
static void ompi_osc_rdma_peer_construct (ompi_osc_rdma_peer_t *peer)
{
    const size_t parent_size = sizeof (peer->super);
    char *members = (char *) peer + parent_size;

    memset (members, 0, sizeof (*peer) - parent_size);
}
/* destructor: free the state handle, but only when one exists and the
 * OMPI_OSC_RDMA_PEER_STATE_FREE flag marks it as needing to be freed */
static void ompi_osc_rdma_peer_destruct (ompi_osc_rdma_peer_t *peer)
{
    if (NULL == peer->state_handle || !(peer->flags & OMPI_OSC_RDMA_PEER_STATE_FREE)) {
        return;
    }

    free (peer->state_handle);
}
/* class instance for the base peer type: parent class opal_object_t, with the
 * constructor and destructor defined above */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t, opal_object_t,
ompi_osc_rdma_peer_construct,
ompi_osc_rdma_peer_destruct);
/* constructor: zero the members a basic peer adds after its embedded parent peer */
static void ompi_osc_rdma_peer_basic_construct (ompi_osc_rdma_peer_basic_t *peer)
{
    const size_t parent_size = sizeof (peer->super);
    char *members = (char *) peer + parent_size;

    memset (members, 0, sizeof (*peer) - parent_size);
}
/* destructor: free the base handle, but only when one exists and the
 * OMPI_OSC_RDMA_PEER_BASE_FREE flag marks it as needing to be freed */
static void ompi_osc_rdma_peer_basic_destruct (ompi_osc_rdma_peer_basic_t *peer)
{
    if (NULL == peer->base_handle || !(peer->super.flags & OMPI_OSC_RDMA_PEER_BASE_FREE)) {
        return;
    }

    free (peer->base_handle);
}
/* class instance for basic peers: parent class ompi_osc_rdma_peer_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_basic_t, ompi_osc_rdma_peer_t,
ompi_osc_rdma_peer_basic_construct,
ompi_osc_rdma_peer_basic_destruct);
/* class instance for extended peers: parent class ompi_osc_rdma_peer_basic_t; no
 * constructor or destructor of its own is registered (both are NULL) */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_extended_t, ompi_osc_rdma_peer_basic_t,
NULL, NULL);
/* constructor: zero the members a dynamic peer adds after its embedded parent peer */
static void ompi_osc_rdma_peer_dynamic_construct (ompi_osc_rdma_peer_dynamic_t *peer)
{
    const size_t parent_size = sizeof (peer->super);
    char *members = (char *) peer + parent_size;

    memset (members, 0, sizeof (*peer) - parent_size);
}
/* destructor: release the cached region array if one was allocated */
static void ompi_osc_rdma_peer_dynamic_destruct (ompi_osc_rdma_peer_dynamic_t *peer)
{
    if (NULL == peer->regions) {
        return;
    }

    free (peer->regions);
}
/* class instance for dynamic-window peers: parent class ompi_osc_rdma_peer_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_dynamic_t, ompi_osc_rdma_peer_t,
ompi_osc_rdma_peer_dynamic_construct,
ompi_osc_rdma_peer_dynamic_destruct);

222
ompi/mca/osc/rdma/osc_rdma_peer.h Обычный файл
Просмотреть файл

@ -0,0 +1,222 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_PEER_H
#define OMPI_OSC_RDMA_PEER_H
#include "osc_rdma_types.h"
struct ompi_osc_rdma_module_t;
/**
 * @brief osc rdma peer object
 *
 * This object is used as a cache for information associated with a peer.
 * It is the parent class of the basic and dynamic peer types below.
 */
struct ompi_osc_rdma_peer_t {
/** parent object. the class constructor zeroes everything after this member */
opal_object_t super;
/** rdma data endpoint for this peer */
struct mca_btl_base_endpoint_t *data_endpoint;
/** endpoint for reading/modifying peer state */
struct mca_btl_base_endpoint_t *state_endpoint;
/** remote peer's state pointer */
osc_rdma_base_t state;
/** registration handle associated with the state. freed by the class destructor
 * only when OMPI_OSC_RDMA_PEER_STATE_FREE is set in flags */
mca_btl_base_registration_handle_t *state_handle;
/** rank of this peer in the window */
int rank;
/** peer flags (bitwise combination of the OMPI_OSC_RDMA_PEER_* values) */
int flags;
/** aggregation support */
ompi_osc_rdma_aggregation_t *aggregate;
};
typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t;
/**
 * @brief peer object used when using dynamic windows
 */
struct ompi_osc_rdma_peer_dynamic_t {
/** parent peer object. the class constructor zeroes everything after this member */
ompi_osc_rdma_peer_t super;
/** last region id seen for this peer */
uint32_t region_id;
/** number of regions in the regions array */
uint32_t region_count;
/** cached array of attached regions for this peer. freed by the class destructor
 * when non-NULL */
struct ompi_osc_rdma_region_t *regions;
};
typedef struct ompi_osc_rdma_peer_dynamic_t ompi_osc_rdma_peer_dynamic_t;
/**
 * @brief basic peer object for non-dynamic windows used when all peers
 *        have the same displacement unit and size
 */
struct ompi_osc_rdma_peer_basic_t {
/** parent peer object. the class constructor zeroes everything after this member */
ompi_osc_rdma_peer_t super;
/** remote peer's base pointer */
osc_rdma_base_t base;
/** registration handle associated with the base. freed by the class destructor
 * only when OMPI_OSC_RDMA_PEER_BASE_FREE is set in the parent's flags */
mca_btl_base_registration_handle_t *base_handle;
};
typedef struct ompi_osc_rdma_peer_basic_t ompi_osc_rdma_peer_basic_t;
/**
 * @brief peer object used when no assumption can be made about the
 *        peer's displacement unit or size
 *
 * Extends the basic peer with a per-peer size and displacement unit.
 */
struct ompi_osc_rdma_peer_extended_t {
/** parent (basic) peer object */
ompi_osc_rdma_peer_basic_t super;
/** remote peer's region size */
osc_rdma_size_t size;
/** displacement unit */
int disp_unit;
};
typedef struct ompi_osc_rdma_peer_extended_t ompi_osc_rdma_peer_extended_t;
/**
 * @brief object class declarations
 *
 * One declaration for each of the four peer structures defined above.
 */
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_t);
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_dynamic_t);
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_basic_t);
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_extended_t);
/**
 * @brief used to identify the node and local rank of a peer
 */
struct ompi_osc_rdma_rank_data_t {
/** index of the node in the node_comm_info array (the original comment read
 * "none"/"none_comm_info", presumably a typo for "node" -- confirm against the module) */
unsigned int node_id;
/** local rank of process */
unsigned int rank;
};
typedef struct ompi_osc_rdma_rank_data_t ompi_osc_rdma_rank_data_t;
/**
 * @brief flag bits stored in the flags member of ompi_osc_rdma_peer_t
 *
 * Each value occupies a distinct bit so the flags can be combined and tested with
 * bitwise operators.
 */
enum {
/** peer is locked for exclusive access */
OMPI_OSC_RDMA_PEER_EXCLUSIVE = 0x01,
/** peer's base is accessible with direct loads/stores */
OMPI_OSC_RDMA_PEER_LOCAL_BASE = 0x02,
/** peer state is local */
OMPI_OSC_RDMA_PEER_LOCAL_STATE = 0x04,
/** currently accumulating on peer */
OMPI_OSC_RDMA_PEER_ACCUMULATING = 0x08,
/** peer is in an active access epoch (pscw) */
OMPI_OSC_RDMA_PEER_ACCESS_ACTIVE_EPOCH = 0x10,
/** peer state handle should be freed (checked by the peer class destructor) */
OMPI_OSC_RDMA_PEER_STATE_FREE = 0x20,
/** peer base handle should be freed (checked by the basic peer class destructor) */
OMPI_OSC_RDMA_PEER_BASE_FREE = 0x40,
};
/**
 * @brief allocate a peer object and initialize some of its structures
 *
 * @param[in]  module         osc rdma module
 * @param[in]  peer_id        peer's rank in the communicator
 * @param[out] peer_out       new peer object
 *
 * @returns OMPI_SUCCESS on success (callers in this component compare the result
 *          against OMPI_SUCCESS and treat anything else as failure)
 *
 * The type of the object returned depends on the window settings. For example for a dynamic window
 * this will return a peer of type \ref ompi_osc_rdma_peer_dynamic_t.
 */
int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out);
/**
 * @brief lookup (or allocate) a peer
 *
 * @param[in] module         osc rdma module
 * @param[in] peer_id        peer's rank in the communicator
 *
 * @returns peer object on success
 * @returns NULL on error
 *
 * This function is used by the ompi_osc_rdma_module_peer() inline function to allocate a peer object. It is not
 * intended to be called from anywhere else. The module's peer lock is taken for the duration of the lookup.
 */
struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup (struct ompi_osc_rdma_module_t *module, int peer_id);
/**
 * @brief flush queued aggregated operation
 *
 * @param[in] peer           osc rdma peer
 *
 * @returns an int status code; the exact values are not visible from this header
 *          (presumably OMPI_SUCCESS on success -- confirm against the implementation)
 */
int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer);
/**
 * @brief lookup the btl endpoint for a peer
 *
 * @param[in] module         osc rdma module
 * @param[in] peer_id        peer's rank in the communicator
 *
 * @returns btl endpoint for the peer on success
 * @returns NULL on failure
 */
struct mca_btl_base_endpoint_t *ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, int peer_id);
/**
 * @brief test whether this process holds an exclusive lock on a peer
 *
 * @param[in] peer            peer object to check
 *
 * @returns true if OMPI_OSC_RDMA_PEER_EXCLUSIVE is set in the peer's flags
 */
static inline bool ompi_osc_rdma_peer_is_exclusive (ompi_osc_rdma_peer_t *peer)
{
    return 0 != (peer->flags & OMPI_OSC_RDMA_PEER_EXCLUSIVE);
}
/**
 * @brief test whether this process is currently accumulating on a peer
 *
 * @param[in] peer            peer object to check
 *
 * @returns true if OMPI_OSC_RDMA_PEER_ACCUMULATING is set in the peer's flags
 */
static inline bool ompi_osc_rdma_peer_is_accumulating (ompi_osc_rdma_peer_t *peer)
{
    return 0 != (peer->flags & OMPI_OSC_RDMA_PEER_ACCUMULATING);
}
/**
 * @brief test whether the peer's base pointer is local to this process
 *
 * @param[in] peer            peer object to check
 *
 * @returns true if OMPI_OSC_RDMA_PEER_LOCAL_BASE is set in the peer's flags
 */
static inline bool ompi_osc_rdma_peer_local_base (ompi_osc_rdma_peer_t *peer)
{
    return 0 != (peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_BASE);
}
/**
 * @brief test whether the peer's state pointer is local to this process
 *
 * @param[in] peer            peer object to check
 *
 * @returns true if OMPI_OSC_RDMA_PEER_LOCAL_STATE is set in the peer's flags
 *
 * The OMPI_OSC_RDMA_PEER_LOCAL_STATE flag will only be set if either 1) we
 * will not be mixing btl atomics and cpu atomics, or 2) it is safe to mix
 * btl and cpu atomics.
 */
static inline bool ompi_osc_rdma_peer_local_state (ompi_osc_rdma_peer_t *peer)
{
    return 0 != (peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_STATE);
}
#endif /* OMPI_OSC_RDMA_PEER_H */

74
ompi/mca/osc/rdma/osc_rdma_request.c Обычный файл
Просмотреть файл

@ -0,0 +1,74 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "ompi/request/request.h"
#include "ompi/mca/osc/osc.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
#include "osc_rdma.h"
#include "osc_rdma_request.h"
/* cancel callback: always fails with MPI_ERR_REQUEST; neither argument is examined */
static int request_cancel(struct ompi_request_t *request, int complete)
{
    (void) request;
    (void) complete;

    return MPI_ERR_REQUEST;
}
/* free callback: a completed request is returned to the component free list and the
 * caller's handle is reset to MPI_REQUEST_NULL; an incomplete request is rejected
 * with MPI_ERR_REQUEST and left untouched */
static int request_free(struct ompi_request_t **ompi_req)
{
    ompi_osc_rdma_request_t *rdma_req = (ompi_osc_rdma_request_t *) *ompi_req;

    if (true == rdma_req->super.req_complete) {
        OMPI_OSC_RDMA_REQUEST_RETURN(rdma_req);
        *ompi_req = MPI_REQUEST_NULL;
        return OMPI_SUCCESS;
    }

    return MPI_ERR_REQUEST;
}
static int request_complete (struct ompi_request_t *request)
{
ompi_osc_rdma_request_t *parent_request = ((ompi_osc_rdma_request_t *) request)->parent_request;
if (parent_request && 0 == OPAL_THREAD_ADD32 (&parent_request->outstanding_requests, -1)) {
ompi_osc_rdma_request_complete (parent_request, OMPI_SUCCESS);
}
return OMPI_SUCCESS;
}
/* constructor: install the request callbacks, mark the request as a window request
 * with no parent, and construct the embedded convertor */
static void request_construct(ompi_osc_rdma_request_t *request)
{
    ompi_request_t *base = &request->super;

    /* callbacks */
    base->req_free = request_free;
    base->req_cancel = request_cancel;
    base->req_complete_cb = request_complete;

    /* request identity and status */
    base->req_type = OMPI_REQUEST_WIN;
    base->req_status._cancelled = 0;

    request->parent_request = 0;

    OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
}
/* destructor: tear down the convertor that request_construct() built */
static void request_destruct(ompi_osc_rdma_request_t *request)
{
    opal_convertor_t *embedded = &request->convertor;

    OBJ_DESTRUCT(embedded);
}
/* class instance for osc rdma requests: parent class ompi_request_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t,
ompi_request_t,
request_construct,
request_destruct);

109
ompi/mca/osc/rdma/osc_rdma_request.h Обычный файл
Просмотреть файл

@ -0,0 +1,109 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_REQUEST_H
#define OMPI_OSC_RDMA_REQUEST_H
#include "osc_rdma.h"
/**
 * @brief kind of operation a request represents
 *
 * Stored in the type member of ompi_osc_rdma_request_t. The names suggest get, put,
 * generic rdma, accumulate, get-accumulate and compare-and-swap operations
 * (NOTE(review): inferred from the identifiers -- confirm against the users of the enum).
 */
enum ompi_osc_rdma_request_type_t {
OMPI_OSC_RDMA_TYPE_GET,
OMPI_OSC_RDMA_TYPE_PUT,
OMPI_OSC_RDMA_TYPE_RDMA,
OMPI_OSC_RDMA_TYPE_ACC,
OMPI_OSC_RDMA_TYPE_GET_ACC,
OMPI_OSC_RDMA_TYPE_CSWAP,
};
typedef enum ompi_osc_rdma_request_type_t ompi_osc_rdma_request_type_t;
/**
 * @brief osc rdma request object
 *
 * Extends ompi_request_t with the state needed to track a one-sided operation.
 * Objects are taken from and returned to the component's request free list by the
 * OMPI_OSC_RDMA_REQUEST_ALLOC / OMPI_OSC_RDMA_REQUEST_RETURN macros.
 */
struct ompi_osc_rdma_request_t {
/** parent request object */
ompi_request_t super;
/** peer associated with the request (set at allocation time) */
ompi_osc_rdma_peer_t *peer;
/** operation type */
ompi_osc_rdma_request_type_t type;
void *origin_addr;
int origin_count;
struct ompi_datatype_t *origin_dt;
void *result_addr;
int result_count;
struct ompi_datatype_t *result_dt;
const void *compare_addr;
ompi_op_t *op;
/** osc rdma module the request belongs to (set at allocation time) */
ompi_osc_rdma_module_t *module;
/** counter decremented each time a child request completes; the request is
 * completed when a decrement brings it to zero */
int32_t outstanding_requests;
/** when true, completion returns the request to the free list instead of marking
 * it complete at the MPI level */
bool internal;
ptrdiff_t offset;
size_t len;
void *ctx;
void *frag;
uint64_t target_address;
/** request to notify when this request completes (NULL if none) */
struct ompi_osc_rdma_request_t *parent_request;
/* used for non-contiguous get accumulate operations */
opal_convertor_t convertor;
/** synchronization object */
struct ompi_osc_rdma_sync_t *sync;
};
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
/*
 * Allocate a request from the component's request free list and initialize it.
 *
 *   rmodule   osc rdma module the request belongs to
 *   rpeer     peer the request is associated with (stored in req->peer)
 *   req       lvalue that receives the ompi_osc_rdma_request_t pointer
 *
 * REQUEST_ALLOC is only called from "top-level" functions (rdma_rput,
 * rdma_rget, etc.), so it's ok to spin here: while the free list is empty the
 * macro calls ompi_osc_rdma_progress() on the module and tries again.
 *
 * The window is read through the rmodule argument, so the macro does not rely on a
 * variable named "module" being in scope at the call site. The req argument is
 * parenthesized wherever it is used.
 */
#define OMPI_OSC_RDMA_REQUEST_ALLOC(rmodule, rpeer, req)                    \
    do {                                                                    \
        opal_free_list_item_t *item;                                        \
        do {                                                                \
            item = opal_free_list_get (&mca_osc_rdma_component.requests);   \
            if (NULL == item) {                                             \
                ompi_osc_rdma_progress (rmodule);                           \
            }                                                               \
        } while (NULL == item);                                             \
        (req) = (ompi_osc_rdma_request_t*) item;                            \
        OMPI_REQUEST_INIT(&(req)->super, false);                            \
        (req)->super.req_mpi_object.win = (rmodule)->win;                   \
        (req)->super.req_complete = false;                                  \
        (req)->super.req_state = OMPI_REQUEST_ACTIVE;                       \
        (req)->module = rmodule;                                            \
        (req)->internal = false;                                            \
        (req)->outstanding_requests = 0;                                    \
        (req)->parent_request = NULL;                                       \
        (req)->peer = (rpeer);                                              \
    } while (0)
/*
 * Finalize a request with OMPI_REQUEST_FINI and hand it back to the component's
 * request free list. The req argument appears twice in the expansion, so it should
 * not have side effects.
 */
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \
do { \
OMPI_REQUEST_FINI(&(req)->super); \
opal_free_list_return (&mca_osc_rdma_component.requests, \
(opal_free_list_item_t *) (req)); \
} while (0)
/**
 * @brief complete an osc rdma request
 *
 * @param[in] request         request to complete
 * @param[in] mpi_error       error code stored in the request status (non-internal requests only)
 *
 * Internal requests are returned to the free list. All other requests have their
 * status error set and are marked complete at the MPI level.
 */
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error)
{
    if (request->internal) {
        /* internal request: return it to the free list without marking it complete */
        OMPI_OSC_RDMA_REQUEST_RETURN (request);
        return;
    }

    request->super.req_status.MPI_ERROR = mpi_error;

    /* mark the request complete at the mpi level */
    ompi_request_complete (&request->super, true);
}
#endif /* OMPI_OSC_RDMA_REQUEST_H */

83
ompi/mca/osc/rdma/osc_rdma_sync.c Обычный файл
Просмотреть файл

@ -0,0 +1,83 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "osc_rdma.h"
#include "osc_rdma_sync.h"
/* constructor: build the embedded list and mutex and reset the scalar members to
 * their "no epoch" defaults */
static void ompi_osc_rdma_sync_constructor (ompi_osc_rdma_sync_t *rdma_sync)
{
    /* embedded objects */
    OBJ_CONSTRUCT(&rdma_sync->aggregations, opal_list_t);
    OBJ_CONSTRUCT(&rdma_sync->lock, opal_mutex_t);

    /* scalar defaults */
    rdma_sync->outstanding_rdma = 0;
    rdma_sync->epoch_active = false;
    rdma_sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
}
/* destructor: tear down the embedded aggregation list and mutex */
static void ompi_osc_rdma_sync_destructor (ompi_osc_rdma_sync_t *rdma_sync)
{
    opal_list_t *pending = &rdma_sync->aggregations;
    opal_mutex_t *guard = &rdma_sync->lock;

    OBJ_DESTRUCT(pending);
    OBJ_DESTRUCT(guard);
}
/* class instance for synchronization objects: parent class opal_object_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_sync_t, opal_object_t, ompi_osc_rdma_sync_constructor,
ompi_osc_rdma_sync_destructor);
/* create a new synchronization object with OBJ_NEW and attach it to the module;
 * returns NULL if the object could not be created */
ompi_osc_rdma_sync_t *ompi_osc_rdma_sync_allocate (struct ompi_osc_rdma_module_t *module)
{
    ompi_osc_rdma_sync_t *fresh = OBJ_NEW (ompi_osc_rdma_sync_t);

    if (OPAL_UNLIKELY(NULL == fresh)) {
        return NULL;
    }

    fresh->module = module;

    return fresh;
}
/* give back a synchronization object by dropping a reference with OBJ_RELEASE */
void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync)
{
    OBJ_RELEASE(rdma_sync);

    return;
}
/**
 * @brief search an array of peers for a rank by repeated halving
 *
 * @param[in]  rank      rank to search for
 * @param[in]  peers     array of peer pointers; the halving step compares ranks, so the
 *                       array is assumed to be ordered by increasing rank
 * @param[in]  nranks    number of entries in peers
 * @param[out] peer      set to the matching peer, or NULL when the rank is not found
 *
 * @returns true if a peer with the requested rank was found, false otherwise
 */
static inline bool ompi_osc_rdma_sync_array_peer (int rank, ompi_osc_rdma_peer_t **peers, size_t nranks,
                                                  struct ompi_osc_rdma_peer_t **peer)
{
    while (0 != nranks) {
        /* the first entry of the current window is always tested directly */
        if (peers[0]->rank == rank) {
            *peer = peers[0];
            return true;
        }

        if (1 == nranks) {
            /* a single, non-matching entry is left */
            break;
        }

        int mid = nranks / 2;

        if (peers[mid]->rank > rank) {
            /* continue in the lower half [0, mid) */
            nranks = mid;
        } else {
            /* continue in the upper half [mid, nranks) */
            peers += mid;
            nranks -= mid;
        }
    }

    *peer = NULL;
    return false;
}
/* report whether target is one of the peers of the module's current PSCW epoch.
 * *peer is set to the matching peer, or to NULL when the module's sync object is not
 * of PSCW type or the target is not in its peer array */
bool ompi_osc_rdma_sync_pscw_peer (ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer)
{
    ompi_osc_rdma_sync_t *epoch = &module->all_sync;

    /* only a PSCW synchronization carries a peer array to search */
    if (OMPI_OSC_RDMA_SYNC_TYPE_PSCW == epoch->type) {
        return ompi_osc_rdma_sync_array_peer (target, epoch->peer_list.peers, epoch->num_peers, peer);
    }

    *peer = NULL;
    return false;
}

158
ompi/mca/osc/rdma/osc_rdma_sync.h Обычный файл
Просмотреть файл

@ -0,0 +1,158 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#if !defined(OSC_RDMA_SYNC_H)
#define OSC_RDMA_SYNC_H
#include "osc_rdma_types.h"
#include "opal/class/opal_object.h"
#include "opal/threads/threads.h"
/**
 * @brief synchronization types
 *
 * Stored in the type member of ompi_osc_rdma_sync_t. A newly constructed
 * synchronization object has type OMPI_OSC_RDMA_SYNC_TYPE_NONE.
 */
enum ompi_osc_rdma_sync_type_t {
/** default value */
OMPI_OSC_RDMA_SYNC_TYPE_NONE,
/** lock access epoch */
OMPI_OSC_RDMA_SYNC_TYPE_LOCK,
/** fence access epoch */
OMPI_OSC_RDMA_SYNC_TYPE_FENCE,
/** post-start-complete-wait access epoch */
OMPI_OSC_RDMA_SYNC_TYPE_PSCW,
};
typedef enum ompi_osc_rdma_sync_type_t ompi_osc_rdma_sync_type_t;
struct ompi_osc_rdma_module_t;
/**
 * @brief synchronization object
 *
 * This structure holds information about an access epoch.
 */
struct ompi_osc_rdma_sync_t {
/** parent object */
opal_object_t super;
/** osc rdma module */
struct ompi_osc_rdma_module_t *module;
/** synchronization type */
ompi_osc_rdma_sync_type_t type;
/** synchronization data */
union {
/** lock specific synchronization data */
struct {
/** lock target rank (-1 for all) */
int target;
/** lock type: MPI_LOCK_SHARED, MPI_LOCK_EXCLUSIVE */
int16_t type;
/** assert specified at lock acquire time. at this time Open MPI
 * only uses 5-bits for asserts. if this number goes over 16 this
 * will need to be changed to accommodate. */
int16_t assert;
} lock;
/** post/start/complete/wait specific synchronization data */
struct {
/** group passed to ompi_osc_rdma_start */
ompi_group_t *group;
} pscw;
} sync;
/** array of peers for this sync */
union {
/** multiple peers (lock all, pscw, fence) */
struct ompi_osc_rdma_peer_t **peers;
/** single peer (targeted lock) */
struct ompi_osc_rdma_peer_t *peer;
} peer_list;
/** number of peers */
int num_peers;
/** communication has started on this epoch */
bool epoch_active;
/** outstanding rdma operations on epoch */
osc_rdma_counter_t outstanding_rdma;
/** aggregated operations in this epoch */
opal_list_t aggregations;
/** lock to protect sync structure members */
opal_mutex_t lock;
};
typedef struct ompi_osc_rdma_sync_t ompi_osc_rdma_sync_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_sync_t);
/**
 * @brief allocate a new synchronization object
 *
 * @param[in] module   osc rdma module
 *
 * @returns NULL on failure
 * @returns a new synchronization object on success
 *
 * On success the module member of the new object is set to the module argument.
 */
ompi_osc_rdma_sync_t *ompi_osc_rdma_sync_allocate (struct ompi_osc_rdma_module_t *module);
/**
 * @brief release a synchronization object
 *
 * @param[in] rdma_sync synchronization object allocated by ompi_osc_rdma_sync_allocate()
 *
 * The object is released with OBJ_RELEASE.
 */
void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync);
/**
 * @brief check if the target is part of a PSCW access epoch
 *
 * @param[in]  module   osc rdma module
 * @param[in]  target   target rank
 * @param[out] peer     peer object (set to NULL when false is returned)
 *
 * @returns false if the window is not in a PSCW access epoch or the peer is not
 *          in the group passed to MPI_Win_start
 * @returns true otherwise
 *
 * This function verifies the target is part of an active PSCW access epoch.
 */
bool ompi_osc_rdma_sync_pscw_peer (struct ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer);
/**
 * @brief increment the outstanding rdma operation counter (atomic)
 *
 * @param[in] rdma_sync         osc rdma synchronization object
 *
 * The counter is updated with ompi_osc_rdma_counter_add and its value is then logged
 * at verbosity level 60.
 */
static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync)
{
    ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, 1);

    /* the counter is a signed type; cast to long so the argument type matches the
     * %ld conversion in the format string */
    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "INC: there are %ld outstanding RDMA operations",
                         (long) rdma_sync->outstanding_rdma));
}
/**
 * @brief decrement the outstanding rdma operation counter (atomic)
 *
 * @param[in] rdma_sync         osc rdma synchronization object
 *
 * The counter is updated with ompi_osc_rdma_counter_add and its value is then logged
 * at verbosity level 60.
 */
static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync)
{
    ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, -1);

    /* the counter is a signed type; cast to long so the argument type matches the
     * %ld conversion in the format string */
    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "DEC: there are %ld outstanding RDMA operations",
                         (long) rdma_sync->outstanding_rdma));
}
#endif /* OSC_RDMA_SYNC_H */

213
ompi/mca/osc/rdma/osc_rdma_types.h Обычный файл
Просмотреть файл

@ -0,0 +1,213 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_TYPES_H
#define OMPI_OSC_RDMA_TYPES_H
#include "ompi_config.h"
/* forward declarations of some other component types */
struct ompi_osc_rdma_frag_t;
struct ompi_osc_rdma_sync_t;
struct ompi_osc_rdma_peer_t;
/* base, size and counter types: 64-bit when OPAL_HAVE_ATOMIC_MATH_64 is set, 32-bit
 * otherwise. ompi_osc_rdma_counter_add maps to the opal atomic add of matching width */
#if OPAL_HAVE_ATOMIC_MATH_64
typedef int64_t osc_rdma_base_t;
typedef int64_t osc_rdma_size_t;
typedef int64_t osc_rdma_counter_t;
#define ompi_osc_rdma_counter_add opal_atomic_add_64
#else
typedef int32_t osc_rdma_base_t;
typedef int32_t osc_rdma_size_t;
typedef int32_t osc_rdma_counter_t;
#define ompi_osc_rdma_counter_add opal_atomic_add_32
#endif
#if OPAL_HAVE_ATOMIC_MATH_64
#define OMPI_OSC_RDMA_LOCK_EXCLUSIVE 0x8000000000000000l
typedef int64_t ompi_osc_rdma_lock_t;
/* atomically add value to the 64-bit lock word at p, with a full memory barrier before
 * and after the add. returns the result of opal_atomic_add_64 minus value, i.e. the
 * contents before the add, assuming opal_atomic_add_64 returns the updated value as
 * noted for the 32-bit variant */
static inline int64_t ompi_osc_rdma_lock_add (volatile int64_t *p, int64_t value)
{
    int64_t updated;

    opal_atomic_mb ();
    updated = opal_atomic_add_64 (p, value);
    opal_atomic_mb ();

    return updated - value;
}
/* compare-and-set on the 64-bit lock word at p, with a full memory barrier before and
 * after the operation. returns the result of opal_atomic_cmpset_64 unchanged */
static inline int ompi_osc_rdma_lock_cmpset (volatile int64_t *p, int64_t comp, int64_t value)
{
    int swapped;

    opal_atomic_mb ();
    swapped = opal_atomic_cmpset_64 (p, comp, value);
    opal_atomic_mb ();

    return swapped;
}
#else
#define OMPI_OSC_RDMA_LOCK_EXCLUSIVE 0x80000000l
typedef int32_t ompi_osc_rdma_lock_t;
/* atomically add value to the 32-bit lock word at p, with a full memory barrier before
 * and after the add. returns the contents of the lock word before the add */
static inline int32_t ompi_osc_rdma_lock_add (volatile int32_t *p, int32_t value)
{
    int32_t updated;

    opal_atomic_mb ();
    /* opal_atomic_add_32 differs from normal atomics in that it returns the new value */
    updated = opal_atomic_add_32 (p, value);
    opal_atomic_mb ();

    return updated - value;
}
/* compare-and-set on the 32-bit lock word at p, with a full memory barrier before and
 * after the operation. returns the result of opal_atomic_cmpset_32 unchanged */
static inline int ompi_osc_rdma_lock_cmpset (volatile int32_t *p, int32_t comp, int32_t value)
{
    int swapped;

    opal_atomic_mb ();
    swapped = opal_atomic_cmpset_32 (p, comp, value);
    opal_atomic_mb ();

    return swapped;
}
#endif /* OPAL_HAVE_ATOMIC_MATH_64 */
/**
 * @brief structure describing a window memory region
 *
 * The structure ends in a flexible array member, so the size of a region entry
 * depends on how many handle bytes follow the fixed members.
 */
struct ompi_osc_rdma_region_t {
/** base of the region */
osc_rdma_base_t base;
/** length (in bytes) of the region */
osc_rdma_size_t len;
/** BTL segment for the region (may be empty) */
unsigned char btl_handle_data[];
};
typedef struct ompi_osc_rdma_region_t ompi_osc_rdma_region_t;
/**
 * @brief data handle for dynamic memory regions
 *
 * This structure holds the btl handle (if one exists) and the
 * reference count for a dynamically attached region. The reference
 * count is used to keep track of the number of times a memory
 * region associated with a page (or set of pages) has been attached.
 */
struct ompi_osc_rdma_handle_t {
/** btl handle for the memory region */
mca_btl_base_registration_handle_t *btl_handle;
/** number of attaches associated with this region */
int refcnt;
};
typedef struct ompi_osc_rdma_handle_t ompi_osc_rdma_handle_t;
/**
* @brief number of state buffers that can be used for storing
* post messages.
*
* This value was chosen because post exposure epochs are expected to be
* small relative to the size of the communicator. The value is constant
* and not exposed as an MCA variable to keep the layout of the
* \ref ompi_osc_rdma_state_t structure simple.
*/
#define OMPI_OSC_RDMA_POST_PEER_MAX 32
/**
 * @brief window state structure
 *
 * This structure holds the information relevant to the window state
 * of a peer. The structure contains synchronization data and includes useful
 * information that can be remotely read by other peers in the window.
 * It ends in a flexible array member holding the attached regions.
 */
struct ompi_osc_rdma_state_t {
/** used when rdma is in use to handle exclusive locks and global shared locks (lock_all) */
ompi_osc_rdma_lock_t global_lock;
/** lock state for this node. the top bit indicates if an exclusive lock exists and the
 * remaining bits count the number of shared locks */
ompi_osc_rdma_lock_t local_lock;
/** lock for the accumulate state to ensure ordering and consistency */
ompi_osc_rdma_lock_t accumulate_lock;
/** persistent scratch space for fetch and op/cswap when the result is not needed */
ompi_osc_rdma_lock_t scratch_lock;
/** current index to post to. compare-and-swap must be used to ensure
 * the index is free */
osc_rdma_counter_t post_index;
/** post buffers */
osc_rdma_counter_t post_peers[OMPI_OSC_RDMA_POST_PEER_MAX];
/** counter for number of post messages received */
osc_rdma_counter_t num_post_msgs;
/** counter for number of complete messages received */
osc_rdma_counter_t num_complete_msgs;
/** lock for the region state to ensure consistency */
ompi_osc_rdma_lock_t regions_lock;
/** displacement unit for this process (stored as a 64-bit value) */
int64_t disp_unit;
/** number of attached regions. this count will be 1 in non-dynamic regions */
osc_rdma_counter_t region_count;
/** attached memory regions */
unsigned char regions[];
};
typedef struct ompi_osc_rdma_state_t ompi_osc_rdma_state_t;
/**
 * @brief aggregation object
 *
 * Derives from opal_list_item_t, so it can be stored on an opal_list_t such as the
 * aggregations list of a synchronization object.
 */
struct ompi_osc_rdma_aggregation_t {
/** parent list item */
opal_list_item_t super;
/** associated peer */
struct ompi_osc_rdma_peer_t *peer;
/** aggregation buffer frag */
struct ompi_osc_rdma_frag_t *frag;
/** synchronization object */
struct ompi_osc_rdma_sync_t *sync;
/** aggregation buffer */
char *buffer;
/** target for the operation */
osc_rdma_base_t target_address;
/** handle for target memory address */
mca_btl_base_registration_handle_t *target_handle;
/** buffer size */
size_t buffer_size;
/** buffer used */
size_t buffer_used;
/** type */
int type;
/** list of associated requests */
opal_list_t requests;
};
typedef struct ompi_osc_rdma_aggregation_t ompi_osc_rdma_aggregation_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_aggregation_t);
#endif /* OMPI_OSC_RDMA_TYPES_H */

Просмотреть файл

@ -4,4 +4,4 @@
# status: e.g. active, maintenance, unmaintained
#
owner: LANL
status: active?
status: active