osc/rdma: add true RDMA one-sided component
This commit adds support for performing one-sided operations over supported hardware (currently Infiniband and Cray Gemini/Aries). This component is still undergoing active development. Current features: - Use network atomic operations (fadd, cswap) for implementing locking and PSCW synchronization. - Aggregate small contiguous puts. - Reduced memory footprint by storing window data (pointer, keys, etc) at the lowest rank on each node. The data is fetched as each process needs to communicate with a new peer. This is a trade-off between the performance of the first operation on a peer and the memory utilization of a window. TODO: - Add support for the accumulate_ops info key. If it is known that the same op or same op/no op is used it may be possible to use hardware atomics for fetch-and-op and compare-and-swap. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
131681acc6
Коммит
d8df9d414d
64
ompi/mca/osc/rdma/Makefile.am
Обычный файл
64
ompi/mca/osc/rdma/Makefile.am
Обычный файл
@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
rdma_sources = \
|
||||
osc_rdma.h \
|
||||
osc_rdma_module.c \
|
||||
osc_rdma_comm.h \
|
||||
osc_rdma_comm.c \
|
||||
osc_rdma_accumulate.c \
|
||||
osc_rdma_accumulate.h \
|
||||
osc_rdma_component.c \
|
||||
osc_rdma_frag.h \
|
||||
osc_rdma_frag.c \
|
||||
osc_rdma_request.h \
|
||||
osc_rdma_request.c \
|
||||
osc_rdma_active_target.h \
|
||||
osc_rdma_active_target.c \
|
||||
osc_rdma_passive_target.h \
|
||||
osc_rdma_passive_target.c \
|
||||
osc_rdma_lock.h \
|
||||
osc_rdma_peer.h \
|
||||
osc_rdma_peer.c \
|
||||
osc_rdma_dynamic.h \
|
||||
osc_rdma_dynamic.c \
|
||||
osc_rdma_sync.h \
|
||||
osc_rdma_sync.c \
|
||||
osc_rdma_types.h
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_ompi_osc_rdma_DSO
|
||||
component_noinst =
|
||||
component_install = mca_osc_rdma.la
|
||||
else
|
||||
component_noinst = libmca_osc_rdma.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(ompilibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_osc_rdma_la_SOURCES = $(rdma_sources)
|
||||
mca_osc_rdma_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_osc_rdma_la_SOURCES = $(rdma_sources)
|
||||
libmca_osc_rdma_la_LDFLAGS = -module -avoid-version
|
26
ompi/mca/osc/rdma/configure.m4
Обычный файл
26
ompi/mca/osc/rdma/configure.m4
Обычный файл
@ -0,0 +1,26 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_ompi_osc_rdma_POST_CONFIG(will_build)
|
||||
# ----------------------------------------
|
||||
# Only require the tag if we're actually going to be built, since bml
|
||||
# is one of the ones frequently disabled for large installs.
|
||||
AC_DEFUN([MCA_ompi_osc_rdma_POST_CONFIG], [
|
||||
AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
|
||||
])dnl
|
||||
|
||||
# MCA_ompi_osc_rdma_CONFIG(action-if-can-compile,
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
# We can always build, unless we were explicitly disabled.
|
||||
AC_DEFUN([MCA_ompi_osc_rdma_CONFIG],[
|
||||
AC_CONFIG_FILES([ompi/mca/osc/rdma/Makefile])
|
||||
[$1]
|
||||
])dnl
|
506
ompi/mca/osc/rdma/osc_rdma.h
Обычный файл
506
ompi/mca/osc/rdma/osc_rdma.h
Обычный файл
@ -0,0 +1,506 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_OSC_RDMA_H
|
||||
#define OMPI_OSC_RDMA_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/class/opal_free_list.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/threads/threads.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "opal/mca/shmem/shmem.h"
|
||||
#include "opal/mca/shmem/base/base.h"
|
||||
|
||||
#include "ompi/win/win.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/osc/osc.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "ompi/memchecker.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "opal/align.h"
|
||||
|
||||
#include "osc_rdma_types.h"
|
||||
#include "osc_rdma_sync.h"
|
||||
|
||||
#include "osc_rdma_peer.h"
|
||||
|
||||
#include "opal_stdint.h"
|
||||
|
||||
/**
|
||||
* @brief osc rdma component structure
|
||||
*/
|
||||
struct ompi_osc_rdma_component_t {
|
||||
/** Extend the basic osc component interface */
|
||||
ompi_osc_base_component_t super;
|
||||
|
||||
/** lock access to modules */
|
||||
opal_mutex_t lock;
|
||||
|
||||
/** cid -> module mapping */
|
||||
opal_hash_table_t modules;
|
||||
|
||||
/** free list of ompi_osc_rdma_frag_t structures */
|
||||
opal_free_list_t frags;
|
||||
|
||||
/** Free list of requests */
|
||||
opal_free_list_t requests;
|
||||
|
||||
/** RDMA component buffer size */
|
||||
unsigned int buffer_size;
|
||||
|
||||
/** aggregation limit */
|
||||
unsigned int aggregation_limit;
|
||||
|
||||
/** List of requests that need to be freed */
|
||||
opal_list_t request_gc;
|
||||
|
||||
/** List of buffers that need to be freed */
|
||||
opal_list_t buffer_gc;
|
||||
|
||||
/** Maximum number of segments that can be attached to a dynamic window */
|
||||
unsigned int max_attach;
|
||||
|
||||
/** Default value of the no_locks info key for new windows */
|
||||
bool no_locks;
|
||||
|
||||
/** Priority of the osc/rdma component */
|
||||
unsigned int priority;
|
||||
|
||||
/** aggregation free list */
|
||||
opal_free_list_t aggregate;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
|
||||
|
||||
struct ompi_osc_rdma_frag_t;
|
||||
|
||||
/**
|
||||
* @brief osc rdma module structure
|
||||
*
|
||||
* Each MPI window is associated with a single osc module. This struct
|
||||
* stores the data relevant to the osc/rdma component.
|
||||
*/
|
||||
struct ompi_osc_rdma_module_t {
|
||||
/** Extend the basic osc module interface */
|
||||
ompi_osc_base_module_t super;
|
||||
|
||||
/** pointer back to MPI window */
|
||||
struct ompi_win_t *win;
|
||||
|
||||
/** Mutex lock protecting module data */
|
||||
opal_mutex_t lock;
|
||||
|
||||
|
||||
/* window configuration */
|
||||
|
||||
/** value of same_disp_unit info key for this window */
|
||||
bool same_disp_unit;
|
||||
|
||||
/** value of same_size info key for this window */
|
||||
bool same_size;
|
||||
|
||||
/** window should have accumulate ordering... */
|
||||
bool accumulate_ordering;
|
||||
|
||||
/** passive-target synchronization will not be used in this window */
|
||||
bool no_locks;
|
||||
|
||||
/** flavor of this window */
|
||||
int flavor;
|
||||
|
||||
/** size of local window */
|
||||
size_t size;
|
||||
|
||||
/** Local displacement unit. */
|
||||
int disp_unit;
|
||||
|
||||
|
||||
/** global leader */
|
||||
ompi_osc_rdma_peer_t *leader;
|
||||
|
||||
/** pointer to free on cleanup (may be NULL) */
|
||||
void *free_after;
|
||||
|
||||
/** local state structure (shared memory) */
|
||||
ompi_osc_rdma_state_t *state;
|
||||
|
||||
/** node-level communication data (shared memory) */
|
||||
unsigned char *node_comm_info;
|
||||
|
||||
/* only relevant on the lowest rank on each node (shared memory) */
|
||||
ompi_osc_rdma_rank_data_t *rank_array;
|
||||
|
||||
|
||||
/** communicator created with this window. This is the cid used
|
||||
* in the component's modules mapping. */
|
||||
ompi_communicator_t *comm;
|
||||
|
||||
/* temporary communicators for window initialization */
|
||||
ompi_communicator_t *local_leaders;
|
||||
ompi_communicator_t *shared_comm;
|
||||
|
||||
/** node id of this rank */
|
||||
int node_id;
|
||||
|
||||
/** number of nodes */
|
||||
int node_count;
|
||||
|
||||
/** handle valid for local state (valid for local data for MPI_Win_allocate) */
|
||||
mca_btl_base_registration_handle_t *state_handle;
|
||||
|
||||
/** registration handle for the window base (only used for MPI_Win_create) */
|
||||
mca_btl_base_registration_handle_t *base_handle;
|
||||
|
||||
/** size of a region */
|
||||
size_t region_size;
|
||||
|
||||
/** size of the state structure */
|
||||
size_t state_size;
|
||||
|
||||
/** offset in the shared memory segment where the state array starts */
|
||||
size_t state_offset;
|
||||
|
||||
/* ********************* sync data ************************ */
|
||||
|
||||
/** global sync object (PSCW, fence, lock all) */
|
||||
ompi_osc_rdma_sync_t all_sync;
|
||||
|
||||
/** current group associated with the pscw exposure epoch */
|
||||
struct ompi_group_t *pw_group;
|
||||
|
||||
/** list of unmatched post messages */
|
||||
opal_list_t pending_posts;
|
||||
|
||||
/* ********************* LOCK data ************************ */
|
||||
|
||||
/** number of outstanding locks */
|
||||
osc_rdma_counter_t passive_target_access_epoch;
|
||||
|
||||
/** origin side list of locks currently outstanding */
|
||||
opal_hash_table_t outstanding_locks;
|
||||
|
||||
/** array of locks (small jobs) */
|
||||
ompi_osc_rdma_sync_t **outstanding_lock_array;
|
||||
|
||||
|
||||
/* ******************* peer storage *********************** */
|
||||
|
||||
/** hash table of allocated peers */
|
||||
opal_hash_table_t peer_hash;
|
||||
|
||||
/** array of allocated peers (small jobs) */
|
||||
ompi_osc_rdma_peer_t **peer_array;
|
||||
|
||||
/** lock for peer hash table/array */
|
||||
opal_mutex_t peer_lock;
|
||||
|
||||
|
||||
/** BTL in use */
|
||||
struct mca_btl_base_module_t *selected_btl;
|
||||
|
||||
/** registered fragment used for locally buffered RDMA transfers */
|
||||
struct ompi_osc_rdma_frag_t *rdma_frag;
|
||||
|
||||
/** registration handles for dynamically attached regions. These are not stored
|
||||
* in the state structure as it is entirely local. */
|
||||
ompi_osc_rdma_handle_t *dynamic_handles;
|
||||
|
||||
/** shared memory segment. this segment holds this node's portion of the rank -> node
|
||||
* mapping array, node communication data (node_comm_info), state for all local ranks,
|
||||
* and data for all local ranks (MPI_Win_allocate only) */
|
||||
void *segment_base;
|
||||
|
||||
/** opal shared memory structure for the shared memory segment */
|
||||
opal_shmem_ds_t seg_ds;
|
||||
|
||||
|
||||
/* performance values */
|
||||
|
||||
/** number of times a put had to be retried */
|
||||
unsigned long put_retry_count;
|
||||
|
||||
/** number of times a get had to be retried */
|
||||
unsigned long get_retry_count;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
|
||||
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
|
||||
|
||||
/* Cast the w_osc_module field of a window to the osc/rdma module type. The
 * argument is parenthesized so that any pointer expression (for example
 * `wins + i`) binds to the -> operator as a whole. */
#define GET_MODULE(win) ((ompi_osc_rdma_module_t *) (win)->w_osc_module)
|
||||
|
||||
int ompi_osc_rdma_free (struct ompi_win_t *win);
|
||||
|
||||
|
||||
/* peer functions */
|
||||
|
||||
/**
|
||||
* @brief cache a peer object
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer peer object to cache
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_OUT_OF_RESOURCE on failure
|
||||
*/
|
||||
int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);
|
||||
|
||||
/**
|
||||
* @brief check if a peer object is cached for a remote rank
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer_id remote peer rank
|
||||
*
|
||||
* @returns peer object on success
|
||||
* @returns NULL if a peer object is not cached for the peer
|
||||
*/
|
||||
static inline ompi_osc_rdma_peer_t *ompi_osc_module_get_peer (ompi_osc_rdma_module_t *module, int peer_id)
|
||||
{
|
||||
if (NULL == module->peer_array) {
|
||||
ompi_osc_rdma_peer_t *peer = NULL;
|
||||
(void) opal_hash_table_get_value_uint32 (&module->peer_hash, peer_id, (void **) &peer);
|
||||
return peer;
|
||||
}
|
||||
|
||||
return module->peer_array[peer_id];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief get the peer object for a remote rank
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer_id remote peer rank
|
||||
*/
|
||||
static inline ompi_osc_rdma_peer_t *ompi_osc_rdma_module_peer (ompi_osc_rdma_module_t *module, int peer_id)
|
||||
{
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
|
||||
peer = ompi_osc_module_get_peer (module, peer_id);
|
||||
if (NULL != peer) {
|
||||
return peer;
|
||||
}
|
||||
|
||||
return ompi_osc_rdma_peer_lookup (module, peer_id);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief check if this process has this process is in a passive target access epoch
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
*/
|
||||
static inline bool ompi_osc_rdma_in_passive_epoch (ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
return 0 != module->passive_target_access_epoch;
|
||||
}
|
||||
|
||||
/**
 * @brief register a memory region with the selected BTL
 *
 * @param[in]  module        osc rdma module
 * @param[in]  endpoint      BTL endpoint passed through to btl_register_mem
 * @param[in]  ptr           base of the region
 * @param[in]  size          size of the region in bytes
 * @param[in]  flags         registration flags passed through to the BTL
 * @param[out] handle        resulting registration handle
 * @param[in]  line          caller line (used only in the failure message)
 * @param[in]  file          caller file (used only in the failure message)
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the BTL returned a NULL handle
 *
 * If the selected BTL has no btl_register_mem function, *handle is set to
 * NULL and OMPI_SUCCESS is returned. Use the ompi_osc_rdma_register() macro,
 * which fills in line and file.
 */
static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, void *ptr,
                                           size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
{
    if (!module->selected_btl->btl_register_mem) {
        /* this BTL does not register memory */
        *handle = NULL;
        return OMPI_SUCCESS;
    }

    *handle = module->selected_btl->btl_register_mem (module->selected_btl, endpoint, ptr, size, flags);
    if (OPAL_UNLIKELY(NULL == *handle)) {
        OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "failed to register pointer with selected BTL. base: %p, "
                             "size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    return OMPI_SUCCESS;
}

#define ompi_osc_rdma_register(...) _ompi_osc_rdma_register(__VA_ARGS__, __LINE__, __FILE__)
|
||||
|
||||
/**
 * @brief release a registration obtained from the selected BTL
 *
 * @param[in] module         osc rdma module
 * @param[in] handle         registration handle (NULL is ignored)
 * @param[in] line           caller line (currently unused)
 * @param[in] file           caller file (currently unused)
 *
 * Use the ompi_osc_rdma_deregister() macro, which fills in line and file.
 */
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
{
    if (!handle) {
        return;
    }

    module->selected_btl->btl_deregister_mem (module->selected_btl, handle);
}

#define ompi_osc_rdma_deregister(...) _ompi_osc_rdma_deregister(__VA_ARGS__, __LINE__, __FILE__)
|
||||
|
||||
/**
 * @brief invoke the progress function of the selected BTL's component once
 *
 * @param[in] module         osc rdma module
 */
static inline void ompi_osc_rdma_progress (ompi_osc_rdma_module_t *module)
{
    (void) module->selected_btl->btl_component->btl_progress ();
}
|
||||
|
||||
/**
|
||||
* Find the first outstanding lock of the target.
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] target target rank
|
||||
* @param[out] peer peer object associated with the target
|
||||
*
|
||||
* @returns an outstanding lock on success
|
||||
*
|
||||
* This function looks for an outstanding lock to the target. If a lock exists it is returned.
|
||||
*/
|
||||
static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_lock_find (ompi_osc_rdma_module_t *module, int target,
|
||||
ompi_osc_rdma_peer_t **peer)
|
||||
{
|
||||
ompi_osc_rdma_sync_t *outstanding_lock = NULL;
|
||||
|
||||
if (OPAL_LIKELY(NULL != module->outstanding_lock_array)) {
|
||||
outstanding_lock = module->outstanding_lock_array[target];
|
||||
} else {
|
||||
(void) opal_hash_table_get_value_uint32 (&module->outstanding_locks, (uint32_t) target, (void **) &outstanding_lock);
|
||||
}
|
||||
|
||||
if (NULL != outstanding_lock && peer) {
|
||||
*peer = outstanding_lock->peer_list.peer;
|
||||
}
|
||||
|
||||
return outstanding_lock;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an outstanding lock
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] lock lock object
|
||||
*
|
||||
* This function inserts a lock object to the list of outstanding locks. The caller must be holding the module
|
||||
* lock.
|
||||
*/
|
||||
static inline void ompi_osc_rdma_module_lock_insert (struct ompi_osc_rdma_module_t *module, ompi_osc_rdma_sync_t *lock)
|
||||
{
|
||||
if (OPAL_LIKELY(NULL != module->outstanding_lock_array)) {
|
||||
module->outstanding_lock_array[lock->sync.lock.target] = lock;
|
||||
} else {
|
||||
(void) opal_hash_table_set_value_uint32 (&module->outstanding_locks, (uint32_t) lock->sync.lock.target, (void *) lock);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Remove an outstanding lock
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] lock lock object
|
||||
*
|
||||
* This function removes a lock object to the list of outstanding locks. The caller must be holding the module
|
||||
* lock.
|
||||
*/
|
||||
static inline void ompi_osc_rdma_module_lock_remove (struct ompi_osc_rdma_module_t *module, ompi_osc_rdma_sync_t *lock)
|
||||
{
|
||||
if (OPAL_LIKELY(NULL != module->outstanding_lock_array)) {
|
||||
module->outstanding_lock_array[lock->sync.lock.target] = NULL;
|
||||
} else {
|
||||
(void) opal_hash_table_remove_value_uint32 (&module->outstanding_locks, (uint32_t) lock->sync.lock.target);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup a synchronization object associated with the target
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] target target rank
|
||||
* @param[out] peer peer object
|
||||
*
|
||||
* @returns NULL if the target is not locked, fenced, or part of a pscw sync
|
||||
* @returns synchronization object on success
|
||||
*
|
||||
* This function returns the synchronization object associated with an access epoch for
|
||||
* the target. If the target is not part of any current access epoch then NULL is returned.
|
||||
*/
|
||||
static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer)
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc/rdma: looking for synchronization object for target %d", target));
|
||||
|
||||
switch (module->all_sync.type) {
|
||||
case OMPI_OSC_RDMA_SYNC_TYPE_NONE:
|
||||
if (!module->no_locks) {
|
||||
return ompi_osc_rdma_module_lock_find (module, target, peer);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
|
||||
case OMPI_OSC_RDMA_SYNC_TYPE_LOCK:
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc/rdma: found fence/lock_all access epoch for target %d", target));
|
||||
|
||||
/* fence epoch is now active */
|
||||
module->all_sync.epoch_active = true;
|
||||
*peer = ompi_osc_rdma_module_peer (module, target);
|
||||
|
||||
return &module->all_sync;
|
||||
case OMPI_OSC_RDMA_SYNC_TYPE_PSCW:
|
||||
if (ompi_osc_rdma_sync_pscw_peer (module, target, peer)) {
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc/rdma: found PSCW access epoch target for %d", target));
|
||||
return &module->all_sync;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief complete all outstanding rdma operations to all peers
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
*/
|
||||
static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
|
||||
{
|
||||
ompi_osc_rdma_aggregation_t *aggregation, *next;
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
|
||||
if (opal_list_get_size (&sync->aggregations)) {
|
||||
OPAL_THREAD_SCOPED_LOCK(&sync->lock,
|
||||
OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) {
|
||||
ompi_osc_rdma_peer_aggregate_flush (aggregation->peer);
|
||||
});
|
||||
}
|
||||
|
||||
do {
|
||||
module->selected_btl->btl_component->btl_progress ();
|
||||
} while (sync->outstanding_rdma);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief check if an access epoch is active
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
*
|
||||
* @returns true if any type of access epoch is active
|
||||
* @returns false otherwise
|
||||
*
|
||||
* This function is used to check for conflicting access epochs.
|
||||
*/
|
||||
static inline bool ompi_osc_rdma_access_epoch_active (ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
return (module->all_sync.epoch_active || ompi_osc_rdma_in_passive_epoch (module));
|
||||
}
|
||||
|
||||
/**
 * @brief give an aggregation object back to the component free list
 *
 * @param[in] aggregation    aggregation object to return
 *
 * If the aggregation is attached to a sync object it is first removed from
 * that sync object's aggregations list.
 */
static inline void ompi_osc_rdma_aggregation_return (ompi_osc_rdma_aggregation_t *aggregation)
{
    if (NULL != aggregation->sync) {
        opal_list_remove_item (&aggregation->sync->aggregations, (opal_list_item_t *) aggregation);
    }

    opal_free_list_return(&mca_osc_rdma_component.aggregate, (opal_free_list_item_t *) aggregation);
}
|
||||
|
||||
#endif /* OMPI_OSC_RDMA_H */
|
907
ompi/mca/osc/rdma/osc_rdma_accumulate.c
Обычный файл
907
ompi/mca/osc/rdma/osc_rdma_accumulate.c
Обычный файл
@ -0,0 +1,907 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "osc_rdma_accumulate.h"
|
||||
#include "osc_rdma_request.h"
|
||||
#include "osc_rdma_comm.h"
|
||||
|
||||
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
|
||||
|
||||
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
|
||||
void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
|
||||
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
|
||||
mca_btl_base_registration_handle_t *target_handle, int target_count,
|
||||
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module,
|
||||
ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
do {
|
||||
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
|
||||
(void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
}
|
||||
|
||||
if (NULL != result_buffer) {
|
||||
/* get accumulate */
|
||||
|
||||
ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype,
|
||||
result_buffer, result_count, result_datatype);
|
||||
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (&ompi_mpi_op_no_op.op != op) {
|
||||
if (&ompi_mpi_op_replace.op != op) {
|
||||
ret = ompi_osc_base_sndrcv_op (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address,
|
||||
target_count, target_datatype, op);
|
||||
} else {
|
||||
ret = ompi_datatype_sndrcv (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address,
|
||||
target_count, target_datatype);
|
||||
}
|
||||
}
|
||||
|
||||
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
|
||||
(void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
}
|
||||
} while (0);
|
||||
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (request) {
|
||||
/* NTH: is it ok to use an ompi error code here? */
|
||||
ompi_osc_rdma_request_complete (request, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void *compare_buffer, void *result_buffer,
|
||||
ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer,
|
||||
uint64_t target_address, mca_btl_base_registration_handle_t *target_handle,
|
||||
ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
|
||||
memcpy (result_buffer, (void *) (uintptr_t) target_address, datatype->super.size);
|
||||
|
||||
if (0 == memcmp (compare_buffer, result_buffer, datatype->super.size)) {
|
||||
memcpy ((void *) (uintptr_t) target_address, source_buffer, datatype->super.size);
|
||||
}
|
||||
|
||||
ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* completion of an accumulate put */
|
||||
static void ompi_osc_rdma_acc_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *data, int status)
|
||||
{
|
||||
ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
|
||||
ompi_osc_rdma_sync_t *sync = request->sync;
|
||||
ompi_osc_rdma_peer_t *peer = request->peer;
|
||||
|
||||
ompi_osc_rdma_frag_complete (request->frag);
|
||||
ompi_osc_rdma_request_complete (request, status);
|
||||
|
||||
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
|
||||
(void) ompi_osc_rdma_lock_release_exclusive (sync->module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
}
|
||||
|
||||
ompi_osc_rdma_sync_rdma_dec (sync);
|
||||
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
|
||||
}
|
||||
|
||||
/* completion of an accumulate get operation */
|
||||
static void ompi_osc_rdma_acc_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *data, int status)
|
||||
{
|
||||
ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
|
||||
intptr_t source = (intptr_t) local_address + request->offset;
|
||||
ompi_osc_rdma_sync_t *sync = request->sync;
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
|
||||
assert (OMPI_SUCCESS == status);
|
||||
|
||||
if (OMPI_SUCCESS == status && OMPI_OSC_RDMA_TYPE_GET_ACC == request->type) {
|
||||
if (NULL == request->result_addr) {
|
||||
/* result buffer is not necessarily contiguous. use the opal datatype engine to
|
||||
* copy the data over in this case */
|
||||
struct iovec iov = {.iov_base = (void *) source, request->len};
|
||||
uint32_t iov_count = 1;
|
||||
size_t size = request->len;
|
||||
|
||||
opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size);
|
||||
opal_convertor_cleanup (&request->convertor);
|
||||
} else {
|
||||
/* copy contiguous data to the result buffer */
|
||||
ompi_datatype_sndrcv ((void *) source, request->len, MPI_BYTE, request->result_addr,
|
||||
request->result_count, request->result_dt);
|
||||
}
|
||||
|
||||
if (&ompi_mpi_op_no_op.op == request->op) {
|
||||
/* this is a no-op. nothing more to do except release resources and the accumulate lock */
|
||||
ompi_osc_rdma_acc_put_complete (btl, endpoint, local_address, local_handle, context, data, status);
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* accumulate the data */
|
||||
if (&ompi_mpi_op_replace.op != request->op) {
|
||||
ompi_op_reduce (request->op, request->origin_addr, (void *) source, request->origin_count, request->origin_dt);
|
||||
}
|
||||
|
||||
/* initiate the put of the accumulated data */
|
||||
status = module->selected_btl->btl_put (module->selected_btl, endpoint, (void *) source,
|
||||
request->target_address, local_handle,
|
||||
(mca_btl_base_registration_handle_t *) request->ctx,
|
||||
request->len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete,
|
||||
request, NULL);
|
||||
/* TODO -- we can do better. probably should queue up the next step and handle it in progress */
|
||||
assert (OPAL_SUCCESS == status);
|
||||
}
|
||||
|
||||
static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count, ompi_datatype_t *source_datatype,
|
||||
void *result, int result_count, ompi_datatype_t *result_datatype,
|
||||
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
|
||||
mca_btl_base_registration_handle_t *target_handle, int target_count,
|
||||
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
|
||||
unsigned long len = target_count * target_datatype->super.size;
|
||||
ompi_osc_rdma_frag_t *frag = NULL;
|
||||
unsigned long aligned_len, offset;
|
||||
char *ptr = NULL;
|
||||
int ret;
|
||||
|
||||
offset = target_address & btl_alignment_mask;;
|
||||
aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;
|
||||
|
||||
ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
|
||||
"Could not allocate an rdma fragment for get accumulate"));
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
/* to ensure order wait until the previous accumulate completes */
|
||||
while (ompi_osc_rdma_peer_is_accumulating (peer)) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
ompi_osc_rdma_progress (module);
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
}
|
||||
|
||||
peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
|
||||
(void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
}
|
||||
|
||||
/* set up the request */
|
||||
request->frag = frag;
|
||||
request->origin_addr = (void *) source;
|
||||
request->origin_dt = source_datatype;
|
||||
request->origin_count = source_count;
|
||||
request->ctx = (void *) target_handle;
|
||||
request->result_addr = result;
|
||||
request->result_count = result_count;
|
||||
request->result_dt = result_datatype;
|
||||
request->offset = (ptrdiff_t) target_address & btl_alignment_mask;
|
||||
request->target_address = target_address;
|
||||
request->len = len;
|
||||
request->op = op;
|
||||
request->sync = sync;
|
||||
|
||||
ompi_osc_rdma_sync_rdma_inc (sync);
|
||||
|
||||
if (&ompi_mpi_op_replace.op != op || result) {
|
||||
/* align the target address */
|
||||
target_address = target_address & ~btl_alignment_mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
|
||||
"initiating btl get local: {%p, %p}, remote: {0x%" PRIx64 ", %p}...",
|
||||
ptr, (void *) frag->handle, target_address, (void *) target_handle));
|
||||
|
||||
ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr,
|
||||
target_address, frag->handle, target_handle, aligned_len,
|
||||
0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_get_complete,
|
||||
request, NULL);
|
||||
} else {
|
||||
/* copy the put accumulate data */
|
||||
memcpy (ptr, source, len);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
|
||||
"initiating btl put..."));
|
||||
|
||||
ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr,
|
||||
target_address, frag->handle, target_handle, len, 0,
|
||||
MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete,
|
||||
request, NULL);
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "btl operation failed with ret = %d", ret));
|
||||
|
||||
ompi_osc_rdma_cleanup_rdma (sync, frag, NULL, NULL);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_buffer, int source_count,
|
||||
ompi_datatype_t *source_datatype, void *result_buffer, int result_count,
|
||||
ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
|
||||
mca_btl_base_registration_handle_t *target_handle, int target_count,
|
||||
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
struct iovec source_iovec[OMPI_OSC_RDMA_DECODE_MAX], target_iovec[OMPI_OSC_RDMA_DECODE_MAX];
|
||||
const size_t acc_limit = (mca_osc_rdma_component.buffer_size >> 3);
|
||||
uint32_t source_primitive_count, target_primitive_count;
|
||||
opal_convertor_t source_convertor, target_convertor;
|
||||
uint32_t source_iov_count, target_iov_count;
|
||||
uint32_t source_iov_index, target_iov_index;
|
||||
ompi_datatype_t *source_primitive, *target_primitive;
|
||||
/* needed for opal_convertor_raw but not used */
|
||||
size_t source_size, target_size;
|
||||
ompi_osc_rdma_request_t *subreq;
|
||||
size_t result_position;
|
||||
ptrdiff_t lb, extent;
|
||||
int ret, acc_len;
|
||||
bool done;
|
||||
|
||||
(void) ompi_datatype_get_extent (target_datatype, &lb, &extent);
|
||||
target_address += lb;
|
||||
|
||||
/* fast path for accumulate on built-in types */
|
||||
if (OPAL_LIKELY((!source_count || ompi_datatype_is_predefined (source_datatype)) &&
|
||||
ompi_datatype_is_predefined (target_datatype) &&
|
||||
(!result_count || ompi_datatype_is_predefined (result_datatype)) &&
|
||||
(target_datatype->super.size * target_count <= acc_limit))) {
|
||||
if (NULL == request) {
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
|
||||
if (NULL == request) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
request->internal = true;
|
||||
request->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
|
||||
}
|
||||
|
||||
if (source_datatype) {
|
||||
(void) ompi_datatype_get_extent (source_datatype, &lb, &extent);
|
||||
source_buffer = (void *)((intptr_t) source_buffer + lb);
|
||||
}
|
||||
|
||||
if (result_datatype) {
|
||||
(void) ompi_datatype_get_extent (result_datatype, &lb, &extent);
|
||||
result_buffer = (void *)((intptr_t) result_buffer + lb);
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_gacc_contig (sync, source_buffer, source_count, source_datatype, result_buffer,
|
||||
result_count, result_datatype, peer, target_address,
|
||||
target_handle, target_count, target_datatype, op,
|
||||
request);
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
if (source_datatype) {
|
||||
/* the convertors will handle the lb */
|
||||
(void) ompi_datatype_get_extent (source_datatype, &lb, &extent);
|
||||
source_buffer = (void *)((intptr_t) source_buffer - lb);
|
||||
}
|
||||
|
||||
if (result_datatype) {
|
||||
(void) ompi_datatype_get_extent (result_datatype, &lb, &extent);
|
||||
result_buffer = (void *)((intptr_t) result_buffer - lb);
|
||||
}
|
||||
}
|
||||
|
||||
/* the convertor will handle lb from here */
|
||||
(void) ompi_datatype_get_extent (target_datatype, &lb, &extent);
|
||||
target_address -= lb;
|
||||
|
||||
/* get the primitive datatype info */
|
||||
ret = ompi_osc_base_get_primitive_type_info (target_datatype, &target_primitive, &target_primitive_count);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
/* target datatype is not made up of a single basic datatype */
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (source_datatype) {
|
||||
ret = ompi_osc_base_get_primitive_type_info (source_datatype, &source_primitive, &source_primitive_count);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
/* target datatype is not made up of a single basic datatype */
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(source_primitive != target_primitive)) {
|
||||
return MPI_ERR_TYPE;
|
||||
}
|
||||
}
|
||||
|
||||
/* prepare convertors for the source and target. these convertors will be used to determine the
|
||||
* contiguous segments within the source and target. */
|
||||
/* the source may be NULL if using MPI_OP_NO_OP with MPI_Get_accumulate */
|
||||
if (source_datatype) {
|
||||
OBJ_CONSTRUCT(&source_convertor, opal_convertor_t);
|
||||
ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_buffer,
|
||||
0, &source_convertor);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* target_datatype can never be NULL */
|
||||
OBJ_CONSTRUCT(&target_convertor, opal_convertor_t);
|
||||
ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &target_datatype->super, target_count,
|
||||
(void *) (intptr_t) target_address, 0, &target_convertor);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (request) {
|
||||
/* keep the request from completing until all the transfers have started */
|
||||
request->outstanding_requests = 1;
|
||||
}
|
||||
|
||||
target_iov_index = 0;
|
||||
target_iov_count = 0;
|
||||
result_position = 0;
|
||||
|
||||
do {
|
||||
/* decode segments of the source data */
|
||||
source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
|
||||
source_iov_index = 0;
|
||||
/* opal_convertor_raw returns done when it has reached the end of the data */
|
||||
if (!source_datatype) {
|
||||
done = true;
|
||||
source_iovec[0].iov_len = (size_t) -1;
|
||||
source_iovec[0].iov_base = NULL;
|
||||
source_iov_count = 1;
|
||||
} else {
|
||||
done = opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
|
||||
}
|
||||
|
||||
/* loop on the target segments until we have exhaused the decoded source data */
|
||||
while (source_iov_index != source_iov_count) {
|
||||
if (target_iov_index == target_iov_count) {
|
||||
/* decode segments of the target buffer */
|
||||
target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
|
||||
target_iov_index = 0;
|
||||
(void) opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
|
||||
}
|
||||
|
||||
/* we already checked that the target was large enough. this should be impossible */
|
||||
assert (0 != target_iov_count);
|
||||
|
||||
/* determine how much to put in this operation */
|
||||
acc_len = min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len);
|
||||
acc_len = min((size_t) acc_len, acc_limit);
|
||||
|
||||
/* execute the get */
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
|
||||
if (NULL == subreq) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
continue;
|
||||
}
|
||||
subreq->internal = true;
|
||||
subreq->parent_request = request;
|
||||
if (request) {
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
|
||||
}
|
||||
|
||||
if (result_datatype) {
|
||||
/* prepare a convertor for this part of the result */
|
||||
opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count,
|
||||
result_buffer, 0, &subreq->convertor);
|
||||
opal_convertor_set_position (&subreq->convertor, &result_position);
|
||||
subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC;
|
||||
} else {
|
||||
subreq->type = OMPI_OSC_RDMA_TYPE_ACC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
|
||||
"target index = %d, target = {%p, %lu}, source_index = %d, source = {%p, %lu}, result = %p, result position = %lu, "
|
||||
"acc_len = %d, count = %lu",
|
||||
target_iov_index, target_iovec[target_iov_index].iov_base, (unsigned long) target_iovec[target_iov_index].iov_len,
|
||||
source_iov_index, source_iovec[source_iov_index].iov_base, (unsigned long) source_iovec[source_iov_index].iov_len,
|
||||
result_buffer, (unsigned long) result_position, acc_len, (unsigned long)(acc_len / target_primitive->super.size)));
|
||||
|
||||
|
||||
ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size,
|
||||
target_primitive, NULL, 0, NULL, peer, (uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base,
|
||||
target_handle, acc_len / target_primitive->super.size, target_primitive, op, subreq);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
|
||||
/* something bad happened. need to figure out how to handle these errors */
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* progress and try again */
|
||||
ompi_osc_rdma_progress (module);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* adjust io vectors */
|
||||
target_iovec[target_iov_index].iov_len -= acc_len;
|
||||
source_iovec[source_iov_index].iov_len -= acc_len;
|
||||
target_iovec[target_iov_index].iov_base = (void *)((intptr_t) target_iovec[target_iov_index].iov_base + acc_len);
|
||||
source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
|
||||
result_position += acc_len;
|
||||
|
||||
source_iov_index += !source_datatype || (0 == source_iovec[source_iov_index].iov_len);
|
||||
target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
|
||||
}
|
||||
} while (!done);
|
||||
|
||||
if (request) {
|
||||
/* release our reference so the request can complete */
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
|
||||
}
|
||||
|
||||
if (source_datatype) {
|
||||
opal_convertor_cleanup (&source_convertor);
|
||||
OBJ_DESTRUCT(&source_convertor);
|
||||
}
|
||||
|
||||
opal_convertor_cleanup (&target_convertor);
|
||||
OBJ_DESTRUCT(&target_convertor);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/**
 * Completion callback for the (currently disabled) network-atomic compare-and-swap.
 *
 * The second 64-bit slot of the local buffer holds the address of the user's result
 * buffer; the first slot holds the value returned by the atomic. The callback copies
 * the value out, drops the sync's outstanding rdma count and releases the fragment.
 *
 * @param[in] local_address  local bounce buffer (two 64-bit slots)
 * @param[in] context        synchronization object (ompi_osc_rdma_sync_t *)
 * @param[in] data           fragment backing local_address (ompi_osc_rdma_frag_t *)
 */
static void ompi_osc_rdma_cas_atomic_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                               void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                               void *context, void *data, int status)
{
    const int64_t *slots = (int64_t *) local_address;
    void *user_result = (void *)(intptr_t) slots[1];

    /* copy the result */
    memcpy (user_result, local_address, 8);

    ompi_osc_rdma_sync_rdma_dec ((ompi_osc_rdma_sync_t *) context);
    ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
}
|
||||
|
||||
/**
 * Compare-and-swap using the BTL's network atomic (currently compiled out).
 *
 * Only 8-byte datatypes are supported. A 16 byte fragment is used as the local
 * buffer: the first 8 bytes receive the old target value and the second 8 bytes
 * carry the address of the user's result buffer to the completion callback.
 *
 * @param[in]  sync            synchronization object the operation belongs to
 * @param[in]  source_buffer   value to store if the comparison succeeds
 * @param[in]  compare_buffer  value to compare the target against
 * @param[out] result_buffer   receives the previous target value
 * @param[in]  datatype        datatype of the operands
 * @param[in]  peer            target peer
 * @param[in]  target_address  remote address of the target value
 * @param[in]  target_handle   registration handle covering the remote region
 *
 * @returns OMPI_SUCCESS if the operation completed or was started
 * @returns OMPI_ERR_NOT_SUPPORTED if the datatype is not 8 bytes
 * @returns an error from fragment allocation or the BTL otherwise
 */
static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer,
                                            void *result_buffer, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer,
                                            uint64_t target_address, mca_btl_base_registration_handle_t *target_handle)
{
    ompi_osc_rdma_module_t *module = sync->module;
    ompi_osc_rdma_frag_t *frag = NULL;
    char *ptr;
    int ret;

    /* XXX -- TODO -- Update the BTL interface to allow for other CAS sizes */
    if (datatype->super.size != 8) {
        return OMPI_ERR_NOT_SUPPORTED;
    }

    ret = ompi_osc_rdma_frag_alloc (module, 16, &frag, &ptr);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* store the destination in the temporary buffer */
    ((int64_t *) ptr)[1] = (intptr_t) result_buffer;

    /* the completion callback interprets its context as the sync object, so pass the
     * sync (not the module) */
    ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->data_endpoint, ptr, target_address,
                                                  frag->handle, target_handle, ((int64_t *)compare_buffer)[0],
                                                  *((int64_t *) source_buffer), 0, MCA_BTL_NO_ORDER,
                                                  ompi_osc_rdma_cas_atomic_complete, sync, frag);
    if (OPAL_UNLIKELY(0 > ret)) {
        /* the operation was not started so the callback will never release the fragment */
        ompi_osc_rdma_frag_complete (frag);
        return ret;
    }

    if (1 != ret) {
        /* the operation is in flight. the callback will drop this count */
        ompi_osc_rdma_sync_rdma_inc (sync);
    } else {
        /* the operation completed immediately. finish up here */
        memcpy (result_buffer, ptr, 8);

        ompi_osc_rdma_frag_complete (frag);
    }

    return OMPI_SUCCESS;
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_cas_get_complete:
|
||||
* Note: This function will not work as is in a heterogeneous environment.
|
||||
*/
|
||||
static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *data, int status)
|
||||
{
|
||||
ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) context;
|
||||
ompi_osc_rdma_sync_t *sync = request->sync;
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
intptr_t source = (intptr_t) local_address + request->offset;
|
||||
ompi_osc_rdma_frag_t *frag = request->frag;
|
||||
ompi_osc_rdma_peer_t *peer = request->peer;
|
||||
int ret;
|
||||
|
||||
if (OMPI_SUCCESS == status) {
|
||||
/* copy data to the user buffer (for gacc) */
|
||||
memcpy (request->result_addr, (void *) source, request->len);
|
||||
memcpy ((void *) source, request->origin_addr, request->len);
|
||||
|
||||
if (0 == memcmp ((void *) source, request->compare_addr, request->len)) {
|
||||
/* the target and compare buffers match so write the source to the target */
|
||||
ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, local_address,
|
||||
request->target_address, local_handle,
|
||||
(mca_btl_base_registration_handle_t *) request->ctx,
|
||||
request->len, 0, MCA_BTL_NO_ORDER,
|
||||
ompi_osc_rdma_acc_put_complete, request, NULL);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
|
||||
OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output, "could not start put to complete accumulate "
|
||||
"operation. opal return code: %d", ret));
|
||||
}
|
||||
|
||||
/* TODO -- we can do better. probably should queue up the next step and handle it in progress */
|
||||
assert (OPAL_SUCCESS == ret);
|
||||
} else {
|
||||
/* this is a no-op. nothing more to do except release the accumulate lock */
|
||||
ompi_osc_rdma_frag_complete (frag);
|
||||
|
||||
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
|
||||
(void) ompi_osc_rdma_lock_release_exclusive (module, request->peer,
|
||||
offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
}
|
||||
|
||||
/* the request is now complete and the outstanding rdma operation is complete */
|
||||
ompi_osc_rdma_request_complete (request, status);
|
||||
|
||||
ompi_osc_rdma_sync_rdma_dec (sync);
|
||||
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer, void *result_buffer,
|
||||
ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
|
||||
mca_btl_base_registration_handle_t *target_handle)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
|
||||
unsigned long offset, aligned_len, len = datatype->super.size;
|
||||
ompi_osc_rdma_frag_t *frag = NULL;
|
||||
ompi_osc_rdma_request_t *request;
|
||||
char *ptr = NULL;
|
||||
int ret;
|
||||
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
|
||||
if (NULL == request) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
request->internal = true;
|
||||
request->type = OMPI_OSC_RDMA_TYPE_CSWAP;
|
||||
request->sync = sync;
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
/* to ensure order wait until the previous accumulate completes */
|
||||
while (ompi_osc_rdma_peer_is_accumulating (peer)) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
ompi_osc_rdma_progress (module);
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
}
|
||||
peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
offset = target_address & btl_alignment_mask;;
|
||||
aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;
|
||||
|
||||
ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
|
||||
"Could not allocate an rdma fragment for get accumulate. Falling back on point-to-point"));
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
|
||||
(void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
|
||||
}
|
||||
|
||||
/* set up the request */
|
||||
request->frag = frag;
|
||||
request->origin_addr = (void *) source_buffer;
|
||||
request->ctx = (void *) target_handle;
|
||||
request->result_addr = result_buffer;
|
||||
request->compare_addr = compare_buffer;
|
||||
request->result_dt = datatype;
|
||||
request->offset = (ptrdiff_t) offset;
|
||||
request->target_address = target_address;
|
||||
request->len = len;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl get..."));
|
||||
|
||||
ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr,
|
||||
target_address, frag->handle, target_handle,
|
||||
aligned_len, 0, MCA_BTL_NO_ORDER,
|
||||
ompi_osc_rdma_cas_get_complete, request, NULL);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
ompi_osc_rdma_frag_complete (frag);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ompi_osc_rdma_sync_rdma_inc (sync);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
 * ompi_osc_rdma_compare_and_swap:
 *
 * Compare the value at the target with *compare_addr and, if equal, replace it with
 * *origin_addr. The previous target value is returned in *result_addr.
 *
 * @param[in]  origin_addr   value to store on a successful comparison
 * @param[in]  compare_addr  value to compare against
 * @param[out] result_addr   receives the previous target value
 * @param[in]  dt            datatype of all three buffers and the target
 * @param[in]  target_rank   rank of the target in the window's group
 * @param[in]  target_disp   displacement into the target's window
 * @param[in]  win           window object
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers target_rank
 * @returns an error from the remote segment lookup or the local/rdma implementation
 */
int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr, void *result_addr,
                                    struct ompi_datatype_t *dt, int target_rank, OPAL_PTRDIFF_TYPE target_disp,
                                    struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    mca_btl_base_registration_handle_t *target_handle;
    ompi_osc_rdma_sync_t *sync;
    uint64_t target_address;
    int ret;

    sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "cswap: 0x%lx, 0x%lx, 0x%lx, %s, %d, %d, %s",
                         (unsigned long) origin_addr, (unsigned long) compare_addr, (unsigned long) result_addr,
                         dt->name, target_rank, (int) target_disp, win->w_name));

    /* look up the segment using the real operand size. the operation transfers
     * dt->super.size bytes, which is not necessarily 8 */
    ret = osc_rdma_get_remote_segment (module, peer, target_disp, dt->super.size, &target_address, &target_handle);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
        return ret;
    }

#if 0
    if (MCA_OSC_RDMA_SAME_OP <= module->accumulate_ops) {
        /* the user has indicated that they will only use the same op (or same op and no op)
         * for operations on overlapping memory ranges. that indicates it is safe to go ahead
         * and use network atomic operations. */
        ret = ompi_osc_rdma_cas_atomic (sync, origin_addr, compare_addr, result_addr, dt,
                                        peer, target_address, target_handle);
        if (OMPI_SUCCESS == ret) {
            return OMPI_SUCCESS;
        }
    } else
#endif

    if (ompi_osc_rdma_peer_local_base (peer)) {
        /* the target memory is directly accessible from this process */
        return ompi_osc_rdma_cas_local (origin_addr, compare_addr, result_addr, dt,
                                        peer, target_address, target_handle, module);
    }

    return cas_rdma (sync, origin_addr, compare_addr, result_addr, dt, peer, target_address,
                     target_handle);
}
|
||||
|
||||
|
||||
static inline
|
||||
int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype, void *result_addr, int result_count,
|
||||
struct ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer,
|
||||
int target_rank, MPI_Aint target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
|
||||
ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
mca_btl_base_registration_handle_t *target_handle;
|
||||
uint64_t target_address;
|
||||
int ret;
|
||||
|
||||
/* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */
|
||||
if ((result_addr && 0 == result_count) || 0 == target_count) {
|
||||
if (request) {
|
||||
ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_datatype->super.size * target_count,
|
||||
&target_address, &target_handle);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ompi_osc_rdma_peer_local_base (peer)) {
|
||||
/* local/self optimization */
|
||||
return ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count,
|
||||
result_datatype, peer, target_address, target_handle, target_count,
|
||||
target_datatype, op, module, request);
|
||||
}
|
||||
|
||||
return ompi_osc_rdma_gacc_master (sync, origin_addr, origin_count, origin_datatype, result_addr, result_count,
|
||||
result_datatype, peer, target_address, target_handle, target_count,
|
||||
target_datatype, op, request);
|
||||
}
|
||||
|
||||
/**
 * Blocking-interface get-accumulate entry point.
 *
 * Looks up the synchronization object for target_rank and forwards to
 * ompi_osc_rdma_rget_accumulate_internal() without a request.
 *
 * @returns OMPI_ERR_RMA_SYNC if no synchronization object covers target_rank
 * @returns the result of the internal implementation otherwise
 */
int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count,
                                  struct ompi_datatype_t *origin_datatype,
                                  void *result_addr, int result_count,
                                  struct ompi_datatype_t *result_datatype,
                                  int target_rank, MPI_Aint target_disp,
                                  int target_count, struct ompi_datatype_t *target_datatype,
                                  struct ompi_op_t *op, struct ompi_win_t *win)
{
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *sync = ompi_osc_rdma_module_sync_lookup (GET_MODULE(win), target_rank, &peer);

    if (OPAL_UNLIKELY(NULL == sync)) {
        /* the target is not part of any active access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "get_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name,
                         (unsigned long) result_addr, result_count, result_datatype->name, target_rank,
                         (unsigned long) target_disp, target_count, target_datatype->name, op->o_name,
                         win->w_name));

    return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype,
                                                   result_addr, result_count, result_datatype,
                                                   peer, target_rank, target_disp, target_count,
                                                   target_datatype, op, NULL);
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype,
|
||||
void *result_addr, int result_count,
|
||||
struct ompi_datatype_t *result_datatype,
|
||||
int target_rank, MPI_Aint target_disp,
|
||||
int target_count, struct ompi_datatype_t *target_datatype,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win,
|
||||
ompi_request_t **request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
ompi_osc_rdma_request_t *rdma_request;
|
||||
ompi_osc_rdma_sync_t *sync;
|
||||
int ret;
|
||||
|
||||
sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
|
||||
if (OPAL_UNLIKELY(NULL == sync)) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"rget_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
|
||||
(unsigned long) origin_addr, origin_count, origin_datatype->name,
|
||||
(unsigned long) result_addr, result_count, result_datatype->name, target_rank,
|
||||
(unsigned long) target_disp, target_count, target_datatype->name, op->o_name,
|
||||
win->w_name));
|
||||
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
|
||||
if (OPAL_UNLIKELY(NULL == rdma_request)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype, result_addr,
|
||||
result_count, result_datatype, peer, target_rank, target_disp,
|
||||
target_count, target_datatype, op, rdma_request);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
|
||||
return ret;
|
||||
}
|
||||
|
||||
*request = &rdma_request->super;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target_rank,
|
||||
OPAL_PTRDIFF_TYPE target_disp, struct ompi_op_t *op, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
ompi_osc_rdma_sync_t *sync;
|
||||
|
||||
sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
|
||||
if (OPAL_UNLIKELY(NULL == sync)) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "fop: %p, %s, %d, %lu, %s, %s",
|
||||
result_addr, dt->name, target_rank, (unsigned long) target_disp, op->o_name, win->w_name));
|
||||
|
||||
return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer, target_rank,
|
||||
target_disp, 1, dt, op, NULL);
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype, int target_rank,
|
||||
OPAL_PTRDIFF_TYPE target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
|
||||
struct ompi_win_t *win, struct ompi_request_t **request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
ompi_osc_rdma_request_t *rdma_request;
|
||||
ompi_osc_rdma_sync_t *sync;
|
||||
int ret;
|
||||
|
||||
sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
|
||||
if (OPAL_UNLIKELY(NULL == sync)) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "racc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
|
||||
(unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
|
||||
(unsigned long) target_disp, target_count, target_datatype->name, op->o_name, win->w_name));
|
||||
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
|
||||
if (OPAL_UNLIKELY(NULL == rdma_request)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype, NULL, 0,
|
||||
NULL, peer, target_rank, target_disp, target_count, target_datatype,
|
||||
op, rdma_request);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
|
||||
return ret;
|
||||
}
|
||||
|
||||
*request = &rdma_request->super;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype, int target_rank,
|
||||
OPAL_PTRDIFF_TYPE target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
|
||||
struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
ompi_osc_rdma_sync_t *sync;
|
||||
|
||||
sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
|
||||
if (OPAL_UNLIKELY(NULL == sync)) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "acc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
|
||||
(unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
|
||||
(unsigned long) target_disp, target_count, target_datatype->name, op->o_name, win->w_name));
|
||||
|
||||
return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype, NULL, 0,
|
||||
NULL, peer, target_rank, target_disp, target_count, target_datatype,
|
||||
op, NULL);
|
||||
}
|
57
ompi/mca/osc/rdma/osc_rdma_accumulate.h
Обычный файл
57
ompi/mca/osc/rdma/osc_rdma_accumulate.h
Обычный файл
@ -0,0 +1,57 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(OSC_RDMA_ACCUMULATE_H)
|
||||
#define OSC_RDMA_ACCUMULATE_H
|
||||
|
||||
#include "osc_rdma.h"
|
||||
|
||||
int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare_addr,
|
||||
void *result_addr, struct ompi_datatype_t *dt,
|
||||
int target, OPAL_PTRDIFF_TYPE target_disp,
|
||||
struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target, OPAL_PTRDIFF_TYPE target_disp,
|
||||
int target_count, struct ompi_datatype_t *target_dt,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win);
|
||||
int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr,
|
||||
struct ompi_datatype_t *dt, int target,
|
||||
OPAL_PTRDIFF_TYPE target_disp,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype,
|
||||
void *result_addr, int result_count,
|
||||
struct ompi_datatype_t *result_datatype,
|
||||
int target_rank, MPI_Aint target_disp,
|
||||
int target_count, struct ompi_datatype_t *target_datatype,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_dt,
|
||||
int target, OPAL_PTRDIFF_TYPE target_disp,
|
||||
int target_count, struct ompi_datatype_t *target_dt,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype,
|
||||
void *result_addr, int result_count,
|
||||
struct ompi_datatype_t *result_datatype,
|
||||
int target_rank, MPI_Aint target_disp,
|
||||
int target_count, struct ompi_datatype_t *target_datatype,
|
||||
struct ompi_op_t *op, struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
|
||||
#endif /* OSC_RDMA_ACCUMULATE_H */
|
652
ompi/mca/osc/rdma/osc_rdma_active_target.c
Обычный файл
652
ompi/mca/osc/rdma/osc_rdma_active_target.c
Обычный файл
@ -0,0 +1,652 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_frag.h"
|
||||
#include "osc_rdma_active_target.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/threads/mutex.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_pending_post_t:
|
||||
*
|
||||
* Describes a post operation that was encountered outside it's
|
||||
* matching start operation.
|
||||
*/
|
||||
struct ompi_osc_rdma_pending_post_t {
|
||||
opal_list_item_t super;
|
||||
int rank;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;
|
||||
|
||||
static OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);
|
||||
|
||||
/**
|
||||
* Dummy completion function for atomic operations
|
||||
*/
|
||||
void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *data, int status)
|
||||
{
|
||||
volatile bool *atomic_complete = (volatile bool *) context;
|
||||
|
||||
if (atomic_complete) {
|
||||
*atomic_complete = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * compare_ranks:
 *
 * @param[in] ptra    Pointer to integer item
 * @param[in] ptrb    Pointer to integer item
 *
 * @returns 0 if *ptra == *ptrb
 * @returns -1 if *ptra < *ptrb
 * @returns 1 otherwise
 *
 * qsort() comparison callback used to sort the rank list. It can be removed
 * if groups are always in order. The arguments are only read, so they are
 * accessed through const-qualified pointers.
 */
static int compare_ranks (const void *ptra, const void *ptrb)
{
    const int a = *((const int *) ptra);
    const int b = *((const int *) ptrb);

    if (a < b) {
        return -1;
    }

    if (a > b) {
        return 1;
    }

    return 0;
}
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_get_comm_ranks:
|
||||
*
|
||||
* @param[in] module - OSC RDMA module
|
||||
* @param[in] sub_group - Group with ranks to translate
|
||||
*
|
||||
* @returns an array of translated ranks on success or NULL on failure
|
||||
*
|
||||
* Translate the ranks given in {sub_group} into ranks in the
|
||||
* communicator used to create {module}.
|
||||
*/
|
||||
static ompi_osc_rdma_peer_t **ompi_osc_rdma_get_peers (ompi_osc_rdma_module_t *module, ompi_group_t *sub_group)
|
||||
{
|
||||
int size = ompi_group_size(sub_group);
|
||||
ompi_osc_rdma_peer_t **peers;
|
||||
int *ranks1, *ranks2;
|
||||
int ret;
|
||||
|
||||
ranks1 = malloc (sizeof(int) * size);
|
||||
ranks2 = malloc (sizeof(int) * size);
|
||||
peers = malloc (sizeof (ompi_osc_rdma_peer_t *) * size);
|
||||
if (NULL == ranks1 || NULL == ranks2 || NULL == peers) {
|
||||
free (ranks1);
|
||||
free (ranks2);
|
||||
free (peers);
|
||||
}
|
||||
|
||||
for (int i = 0 ; i < size ; ++i) {
|
||||
ranks1[i] = i;
|
||||
}
|
||||
|
||||
ret = ompi_group_translate_ranks (sub_group, size, ranks1, module->comm->c_local_group,
|
||||
ranks2);
|
||||
free (ranks1);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
free (ranks2);
|
||||
free (peers);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
qsort (ranks2, size, sizeof (int), compare_ranks);
|
||||
for (int i = 0 ; i < size ; ++i) {
|
||||
peers[i] = ompi_osc_rdma_module_peer (module, ranks2[i]);
|
||||
if (NULL == peers[i]) {
|
||||
free (peers);
|
||||
peers = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
OBJ_RETAIN(peers[i]);
|
||||
}
|
||||
free (ranks2);
|
||||
|
||||
return peers;
|
||||
}
|
||||
|
||||
/**
 * Release a peer array
 *
 * @param[in] peers   array of peer objects
 * @param[in] npeers  number of entries in {peers}
 *
 * Drops one reference on each of the {npeers} entries and then frees the
 * array itself.
 */
static void ompi_osc_rdma_release_peers (ompi_osc_rdma_peer_t **peers, int npeers)
{
    int idx = 0;

    while (idx < npeers) {
        OBJ_RELEASE(peers[idx]);
        ++idx;
    }

    free (peers);
}
|
||||
|
||||
/**
 * Process a post message from {rank}
 *
 * @param[in] module  osc rdma module
 * @param[in] rank    rank of the peer that posted
 * @param[in] peers   peers of the current start epoch (not read when npeers is 0)
 * @param[in] npeers  number of entries in {peers}
 *
 * If {rank} belongs to one of the {npeers} peers the module's count of
 * received post messages is incremented. Otherwise the post is appended to
 * the module's pending_posts list (under the module lock) for a later start.
 */
static void ompi_osc_rdma_handle_post (ompi_osc_rdma_module_t *module, int rank, ompi_osc_rdma_peer_t **peers, int npeers) {
    ompi_osc_rdma_state_t *state = module->state;
    ompi_osc_rdma_pending_post_t *queued;
    bool expected = false;

    /* look for the posting peer in the group. stop at the first match */
    for (int idx = 0 ; idx < npeers && !expected ; ++idx) {
        expected = (peers[idx]->rank == rank);
    }

    if (expected) {
        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                             "got expected post from %d. still expecting posts from %d processes",
                             rank, (int) (npeers - state->num_post_msgs - 1)));
        ++state->num_post_msgs;
        return;
    }

    /* the post is not part of this start epoch. remember it for a later one */
    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "got unexpected post from %d "
                         ". queueing for later", rank));
    queued = OBJ_NEW(ompi_osc_rdma_pending_post_t);
    queued->rank = rank;
    OPAL_THREAD_SCOPED_LOCK(&module->lock, opal_list_append (&module->pending_posts, &queued->super));
}
|
||||
|
||||
/**
 * Begin an exposure (post) epoch for the processes in {group}
 *
 * @param[in] group   group of processes to post to
 * @param[in] assert  assertion flags. MPI_MODE_NOCHECK skips the post messages
 * @param[in] win     window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if the module already has a post group
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the temporary fragment or the peer
 *          list could not be obtained
 *
 * The group is retained and stored in module->pw_group. Unless NOCHECK is
 * given or the group is empty, every peer in the group is then notified:
 * a slot index is obtained by atomically incrementing the peer's post_index
 * and (my rank + 1) is compare-and-swapped into post_peers[index] of the
 * peer's state, retrying while the slot is occupied. While retrying, posts
 * that arrived in the local state are drained to avoid a circular wait.
 *
 * On failure the group reference is dropped again and module->pw_group is
 * cleared; both failure points occur before any post message is sent.
 */
int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t **peers;
    int my_rank = ompi_comm_rank (module->comm);
    int group_size = ompi_group_size (group);
    ompi_osc_rdma_state_t *state = module->state;
    volatile bool atomic_complete;
    ompi_osc_rdma_frag_t *frag;
    osc_rdma_counter_t *temp;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_post_atomic entering..."));

    OPAL_THREAD_LOCK(&module->lock);

    /* ensure we're not already in a post. checked before the group is
     * retained so that the error path has nothing to undo */
    if (NULL != module->pw_group) {
        OPAL_THREAD_UNLOCK(&(module->lock));
        return OMPI_ERR_RMA_SYNC;
    }

    /* save the group */
    OBJ_RETAIN(group);
    ompi_group_increment_proc_count(group);
    module->pw_group = group;

    /* Update completion counter.  Can't have received any completion
       messages yet; complete won't send a completion header until
       we've sent a post header. */
    state->num_complete_msgs = 0;
    OPAL_THREAD_UNLOCK(&module->lock);

    /* nothing to send. return before allocating any resources */
    if ((assert & MPI_MODE_NOCHECK) || 0 == group_size) {
        return OMPI_SUCCESS;
    }

    /* allocate a temporary buffer for atomic response */
    ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto error;
    }

    /* translate group ranks into the communicator */
    peers = ompi_osc_rdma_get_peers (module, group);
    if (OPAL_UNLIKELY(NULL == peers)) {
        ompi_osc_rdma_frag_complete (frag);
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto error;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "sending post messages"));

    /* send a hello counter to everyone in group */
    for (int i = 0 ; i < group_size ; ++i) {
        ompi_osc_rdma_peer_t *peer = peers[i];
        uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index);
        int post_index;

        if (peer->rank == my_rank) {
            /* posting to myself: no peer list is passed, so the post is
             * queued as a pending post */
            ompi_osc_rdma_handle_post (module, my_rank, NULL, 0);
            continue;
        }

        /* get a post index */
        atomic_complete = false;
        if (!ompi_osc_rdma_peer_local_state (peer)) {
            do {
                ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, target, frag->handle,
                                                            peer->state_handle, MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
                                                            ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
                assert (OPAL_SUCCESS >= ret);

                if (OMPI_SUCCESS == ret) {
                    /* wait for the fetched value to land in temp */
                    while (!atomic_complete) {
                        ompi_osc_rdma_progress (module);
                    }

                    break;
                }

                /* could not start the operation. progress and retry */
                ompi_osc_rdma_progress (module);
            } while (1);
        } else {
            *temp = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
        }

        /* the mask wraps the counter into the post_peers array */
        post_index = (*temp) & (OMPI_OSC_RDMA_POST_PEER_MAX - 1);

        target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) +
            sizeof (osc_rdma_counter_t) * post_index;

        do {
            OPAL_OUTPUT_VERBOSE((80, ompi_osc_base_framework.framework_output,
                                 "Attempting to post to index %d @ rank %d", post_index, peer->rank));

            /* try to post. if the value isn't 0 then another rank is occupying this index */
            if (!ompi_osc_rdma_peer_local_state (peer)) {
                atomic_complete = false;
                ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, target, frag->handle, peer->state_handle,
                                                              0, 1 + (int64_t) my_rank, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
                                                              (void *) &atomic_complete, NULL);
                assert (OPAL_SUCCESS >= ret);

                if (OMPI_SUCCESS == ret) {
                    while (!atomic_complete) {
                        ompi_osc_rdma_progress (module);
                    }
                } else {
                    ompi_osc_rdma_progress (module);
                    continue;
                }

            } else {
                /* temp becomes 0 when the local compare-and-set succeeded */
                *temp = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
            }

            if (OPAL_LIKELY(0 == *temp)) {
                break;
            }

            /* prevent circular wait by checking for post messages received */
            for (int j = 0 ; j < OMPI_OSC_RDMA_POST_PEER_MAX ; ++j) {
                /* no post at this index (yet) */
                if (0 == state->post_peers[j]) {
                    continue;
                }

                ompi_osc_rdma_handle_post (module, state->post_peers[j] - 1, NULL, 0);
                state->post_peers[j] = 0;
            }

            usleep (100);
        } while (1);
    }

    ompi_osc_rdma_frag_complete (frag);

    ompi_osc_rdma_release_peers (peers, group_size);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "post complete"));

    return OMPI_SUCCESS;

 error:
    /* no post was sent. leave the post epoch and drop the group reference */
    OPAL_THREAD_LOCK(&module->lock);
    module->pw_group = NULL;
    OPAL_THREAD_UNLOCK(&module->lock);

    ompi_group_decrement_proc_count(group);
    OBJ_RELEASE(group);

    return ret;
}
|
||||
|
||||
/**
 * Begin a PSCW access epoch (start) on the processes in {group}
 *
 * @param[in] group   group of processes that will be accessed
 * @param[in] assert  assertion flags. MPI_MODE_NOCHECK skips waiting for posts
 * @param[in] win     window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if an access epoch is already active
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the peer list could not be built
 *
 * The peer list is built before any synchronization state is modified so
 * that a failure leaves the module exactly as it was. On success the group
 * is retained and stored in the module's sync object and, unless NOCHECK
 * was given, the call blocks until a post has been seen from every peer in
 * the group (queued pending posts are consumed first).
 *
 * NOTE(review): the module lock is held while waiting and
 * ompi_osc_rdma_handle_post() takes the same lock when it queues an
 * unexpected post -- confirm the lock is recursive or a no-op in that case.
 */
int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_pending_post_t *pending_post, *next;
    ompi_osc_rdma_state_t *state = module->state;
    ompi_osc_rdma_sync_t *sync = &module->all_sync;
    ompi_osc_rdma_peer_t **peers = NULL;
    int group_size = ompi_group_size (group);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_start entering..."));

    OPAL_THREAD_LOCK(&module->lock);

    /* check if we are already in an access epoch */
    if (ompi_osc_rdma_access_epoch_active (module)) {
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    if (0 != group_size) {
        /* translate the group ranks into the communicator. done first so
         * that nothing has to be rolled back if this fails */
        peers = ompi_osc_rdma_get_peers (module, group);
        if (NULL == peers) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    /* mark all procs in this group as being in an access epoch */
    sync->num_peers = group_size;
    sync->sync.pscw.group = group;

    /* haven't processed any post messages yet */
    state->num_post_msgs = 0;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_start entering with group size %d...",
                         sync->num_peers));

    if (0 == group_size) {
        /* nothing more to do. this is an empty start epoch */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_SUCCESS;
    }

    opal_atomic_wmb ();

    sync->type = OMPI_OSC_RDMA_SYNC_TYPE_PSCW;

    /* prevent us from entering a passive-target, fence, or another pscw access epoch until
     * the matching complete is called */
    sync->epoch_active = true;

    sync->peer_list.peers = peers;

    /* save the group */
    OBJ_RETAIN(group);
    ompi_group_increment_proc_count(group);

    if (!(assert & MPI_MODE_NOCHECK)) {
        /* look through list of pending posts */
        OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_rdma_pending_post_t) {
            for (int i = 0 ; i < group_size ; ++i) {
                ompi_osc_rdma_peer_t *peer = sync->peer_list.peers[i];

                if (pending_post->rank == peer->rank) {
                    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                         "found queued post from %d. still expecting posts from %d processes",
                                         peer->rank, (int) (group_size - state->num_post_msgs - 1)));
                    opal_list_remove_item (&module->pending_posts, &pending_post->super);
                    OBJ_RELEASE(pending_post);
                    /* only one thread can process post messages so there is no need of atomics here */
                    ++state->num_post_msgs;
                    break;
                }
            }
        }

        /* wait for all post messages to arrive */
        while (state->num_post_msgs != group_size) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "Waiting for post messages. Have %d of %d",
                                 (int) state->num_post_msgs, group_size));
            for (int i = 0 ; i < OMPI_OSC_RDMA_POST_PEER_MAX ; ++i) {
                /* no post at this index (yet) */
                if (0 == state->post_peers[i]) {
                    continue;
                }

                /* slots hold (rank + 1). consume the slot after handling it */
                ompi_osc_rdma_handle_post (module, state->post_peers[i] - 1, sync->peer_list.peers, group_size);
                state->post_peers[i] = 0;
            }

            ompi_osc_rdma_progress (module);
        }
    } else {
        /* NOCHECK: treat every post as already received */
        state->num_post_msgs = group_size;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_start complete"));

    OPAL_THREAD_UNLOCK(&module->lock);
    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
 * End a PSCW access epoch (complete)
 *
 * @param[in] win  window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if the module is not in a PSCW access epoch
 *
 * Resets the module's sync object, drops the group reference taken by
 * start, waits for outstanding rdma operations on the sync object and then
 * atomically increments num_complete_msgs in the state of every peer of the
 * epoch. The peer references held for the epoch are released at the end.
 */
int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *sync = &module->all_sync;
    ompi_osc_rdma_peer_t **peers;
    ompi_group_t *group;
    int group_size;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete entering..."));

    OPAL_THREAD_LOCK(&module->lock);
    if (OMPI_OSC_RDMA_SYNC_TYPE_PSCW != sync->type) {
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* phase 1 cleanup sync object */
    group = sync->sync.pscw.group;
    group_size = sync->num_peers;
    sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    sync->epoch_active = false;

    /* phase 2 cleanup group. this is the only release of the reference
     * taken by start */
    ompi_group_decrement_proc_count(group);
    OBJ_RELEASE(group);

    peers = sync->peer_list.peers;
    if (NULL == peers) {
        /* empty peer list */
        OPAL_THREAD_UNLOCK(&(module->lock));
        return OMPI_SUCCESS;
    }

    sync->peer_list.peers = NULL;

    OPAL_THREAD_UNLOCK(&(module->lock));

    ompi_osc_rdma_sync_rdma_complete (sync);

    /* for each process in the group increment their number of complete messages */
    for (int i = 0 ; i < group_size ; ++i) {
        ompi_osc_rdma_peer_t *peer = peers[i];
        intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_complete_msgs);
        int ret;

        if (!ompi_osc_rdma_peer_local_state (peer)) {
            do {
                if (MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags) {
                    ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, target, peer->state_handle,
                                                               1, MCA_BTL_ATOMIC_ADD, 0, MCA_BTL_NO_ORDER,
                                                               ompi_osc_rdma_atomic_complete, NULL, NULL);
                } else {
                    /* don't care about the read value so use the scratch lock */
                    ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, &module->state->scratch_lock,
                                                                target, module->state_handle, peer->state_handle, 1, MCA_BTL_ATOMIC_ADD,
                                                                0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, NULL, NULL);
                }

                if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
                    break;
                }

                /* could not start the operation. drive progress before
                 * retrying, as the other retry loops in this component do */
                ompi_osc_rdma_progress (module);
            } while (1);
        } else {
            (void) ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) target, 1);
        }
    }

    /* release our reference to peers in this group */
    ompi_osc_rdma_release_peers (peers, group_size);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_complete complete"));

    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
 * End an exposure epoch (wait)
 *
 * @param[in] win  window
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if the module has no post group
 *
 * Drives progress until the number of complete messages recorded in the
 * module state equals the size of the post group, then resets the counter,
 * clears module->pw_group and drops the group reference.
 */
int ompi_osc_rdma_wait_atomic (ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_state_t *state = module->state;
    ompi_group_t *post_group;
    int expected;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_wait entering..."));

    OPAL_THREAD_LOCK(&module->lock);
    post_group = module->pw_group;
    if (NULL == post_group) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_wait_atomic no post group"));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    expected = ompi_group_size (post_group);
    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_wait_atomic group size %d, complete messages %d",
                         expected, (int) state->num_complete_msgs));

    /* spin until every process in the post group has completed */
    for ( ; ; ) {
        if (expected == state->num_complete_msgs) {
            break;
        }

        ompi_osc_rdma_progress (module);
        opal_atomic_mb ();
    }

    /* leave the exposure epoch */
    OPAL_THREAD_LOCK(&module->lock);
    state->num_complete_msgs = 0;
    post_group = module->pw_group;
    module->pw_group = NULL;
    OPAL_THREAD_UNLOCK(&module->lock);

    ompi_group_decrement_proc_count(post_group);
    OBJ_RELEASE(post_group);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_wait complete"));

    return OMPI_SUCCESS;
}
|
||||
|
||||
|
||||
/**
 * Non-blocking test for the end of an exposure epoch (test)
 *
 * @param[in]  win   window
 * @param[out] flag  set to non-zero if all complete messages have arrived
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if the module has no post group
 *
 * When the number of complete messages equals the size of the post group
 * the exposure epoch is ended: the counter is reset, module->pw_group is
 * cleared and the group reference is dropped. The check and the teardown of
 * the module state happen under a single hold of the module lock so that
 * only one caller can end the epoch. Otherwise progress is driven once and
 * the epoch is left open.
 */
int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_state_t *state = module->state;
    ompi_group_t *group;
    int group_size;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_test_atomic entering..."));

    OPAL_THREAD_LOCK(&module->lock);
    group = module->pw_group;
    if (NULL == group) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_test_atomic no post group"));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    group_size = ompi_group_size (group);

    *flag = (group_size == state->num_complete_msgs);
    if (*flag) {
        /* epoch is over. reset the state while the lock is still held, as
         * ompi_osc_rdma_wait_atomic does */
        state->num_complete_msgs = 0;
        module->pw_group = NULL;
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_test_atomic flag %d", *flag));

    if (!*flag) {
        ompi_osc_rdma_progress (module);
        return OMPI_SUCCESS;
    }

    /* drop the reference taken by post */
    ompi_group_decrement_proc_count(group);
    OBJ_RELEASE(group);

    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
 * Fence synchronization
 *
 * @param[in] assert  assertion flags (MPI_MODE_NOSUCCEED and MPI_MODE_NOPRECEDE are examined)
 * @param[in] win     window
 *
 * @returns OMPI_ERR_RMA_SYNC if the module is in a passive-target epoch
 * @returns the result of the communicator barrier otherwise
 *
 * Unless NOSUCCEED is given the module's sync object is switched to the
 * fence type covering the whole communicator. With NOPRECEDE only a barrier
 * is performed; otherwise outstanding rdma operations on the sync object
 * are completed before the barrier. The module lock is held from the sync
 * update until the function returns.
 */
int ompi_osc_rdma_fence_atomic (int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    const bool no_succeed = (0 != (assert & MPI_MODE_NOSUCCEED));
    const bool no_precede = (0 != (assert & MPI_MODE_NOPRECEDE));
    int rc = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: fence start"));

    /* can't enter an active target epoch while a lock is active */
    if (ompi_osc_rdma_in_passive_epoch (module)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: could not enter fence. already in an access epoch"));
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* active sends are now active (we will close the epoch if NOSUCCEED is specified) */
    if (!no_succeed) {
        module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_FENCE;
        module->all_sync.num_peers = ompi_comm_size (module->comm);
        /* NTH: should add a fast access array for peers here later. for now just use the
         * hash table. */
    }

    /* technically it is possible to enter a lock epoch (which will close the fence epoch) if
     * no communication has occurred. this flag will be set on the next put, get, accumulate, etc. */
    module->all_sync.epoch_active = false;

    if (no_precede) {
        /* short-circuit the noprecede case */
        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                             "osc rdma: fence end (short circuit)"));
        /* no communication can occur until a peer has entered the same fence epoch. for now
         * a barrier is used to ensure this is the case. */
        rc = module->comm->c_coll.coll_barrier(module->comm, module->comm->c_coll.coll_barrier_module);
        OPAL_THREAD_UNLOCK(&module->lock);
        return rc;
    }

    ompi_osc_rdma_sync_rdma_complete (&module->all_sync);

    /* ensure all writes to my memory are complete */
    rc = module->comm->c_coll.coll_barrier(module->comm, module->comm->c_coll.coll_barrier_module);

    if (no_succeed) {
        /* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
         * stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */
        module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: fence end: %d", rc));

    OPAL_THREAD_UNLOCK(&module->lock);

    return rc;
}
|
42
ompi/mca/osc/rdma/osc_rdma_active_target.h
Обычный файл
42
ompi/mca/osc/rdma/osc_rdma_active_target.h
Обычный файл
@ -0,0 +1,42 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(OSC_RDMA_ACTIVE_TARGET_H)
|
||||
#define OSC_RDMA_ACTIVE_TARGET_H
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_sync.h"
|
||||
#include "osc_rdma_lock.h"
|
||||
|
||||
/*
 * Active-target synchronization entry points. All of them are defined in
 * osc_rdma_active_target.c and return an OMPI error code.
 */

/* fence synchronization */
int ompi_osc_rdma_fence_atomic (int assert, struct ompi_win_t *win);

/* PSCW access epoch: start ... complete */
int ompi_osc_rdma_start_atomic (struct ompi_group_t *group, int assert, struct ompi_win_t *win);
int ompi_osc_rdma_complete_atomic (struct ompi_win_t *win);

/* PSCW exposure epoch: post ... wait (or test) */
int ompi_osc_rdma_post_atomic (struct ompi_group_t *group, int assert, struct ompi_win_t *win);
int ompi_osc_rdma_wait_atomic (struct ompi_win_t *win);
int ompi_osc_rdma_test_atomic (struct ompi_win_t *win, int *flag);
|
||||
|
||||
#endif /* OSC_RDMA_ACTIVE_TARGET_H */
|
874
ompi/mca/osc/rdma/osc_rdma_comm.c
Обычный файл
874
ompi/mca/osc/rdma/osc_rdma_comm.c
Обычный файл
@ -0,0 +1,874 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "osc_rdma_comm.h"
|
||||
#include "osc_rdma_sync.h"
|
||||
#include "osc_rdma_request.h"
|
||||
#include "osc_rdma_dynamic.h"
|
||||
|
||||
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
|
||||
|
||||
/**
 * btl completion callback used by ompi_osc_get_data_blocking()
 *
 * @param[in] context  pointer to the bool completion flag to set
 * @param[in] status   completion status (asserted to be OPAL_SUCCESS)
 *
 * Sets the flag {context} points to. The remaining arguments are unused.
 */
static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                        void *context, void *data, int status)
{
    bool *done_flag = (bool *) context;

    assert (OPAL_SUCCESS == status);
    *done_flag = true;
}
|
||||
|
||||
/**
 * Read {len} bytes from a remote region and block until they have arrived
 *
 * @param[in]  module          osc rdma module
 * @param[in]  endpoint        btl endpoint of the remote process
 * @param[in]  source_address  remote address to read from (asserted to satisfy the btl get alignment)
 * @param[in]  source_handle   btl registration handle for the remote region
 * @param[out] data            local destination buffer
 * @param[in]  len             number of bytes to read
 *
 * @returns OMPI_SUCCESS on success
 * @returns the error from the fragment allocation or from btl_get on failure
 *
 * If the btl requires registered memory and {len} reaches the btl's local
 * registration threshold the data is read into a temporary fragment and
 * copied to {data} afterwards; otherwise it is read directly into {data}.
 */
int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
                                uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
                                void *data, size_t len)
{
    mca_btl_base_registration_handle_t *landing_handle = NULL;
    ompi_osc_rdma_frag_t *bounce = NULL;
    volatile bool read_complete = false;
    char *landing = data;
    int rc;

    if (module->selected_btl->btl_register_mem && len >= module->selected_btl->btl_get_local_registration_threshold) {
        /* read into a fragment instead of the user buffer */
        rc = ompi_osc_rdma_frag_alloc (module, len, &bounce, &landing);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "error allocating a fragment!"));
            return rc;
        }

        landing_handle = bounce->handle;
    }

    assert (!(source_address & (module->selected_btl->btl_get_alignment - 1)));

    /* retry for as long as the btl reports a temporary resource shortage */
    for ( ; ; ) {
        rc = module->selected_btl->btl_get (module->selected_btl, endpoint, landing, source_address,
                                            landing_handle, source_handle, len, 0, MCA_BTL_NO_ORDER,
                                            ompi_osc_get_data_complete, (void *) &read_complete, NULL);
        if (OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc)) {
            break;
        }

        ompi_osc_rdma_progress (module);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS > rc)) {
        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "btl get failed with ret = %d", rc));

        if (NULL != bounce) {
            ompi_osc_rdma_frag_complete (bounce);
        }

        return rc;
    }

    /* block until the callback is called */
    while (!read_complete) {
        ompi_osc_rdma_progress (module);
    }

    opal_memchecker_base_mem_defined (landing, len);

    if (NULL != bounce) {
        /* move the data to its final destination. done with the fragment */
        memcpy (data, landing, len);
        ompi_osc_rdma_frag_complete (bounce);
    }

    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* @brief function signature for the rdma transfer function used by ompi_osc_rdma_master_noncontig()
|
||||
*
|
||||
* @param[in] peer peer object for remote peer
|
||||
* @param[in] remote_address base of remote region (destination for put, source for get)
|
||||
* @param[in] remote_handle btl registration handle for remote region (must be valid for the entire region)
|
||||
* @param[in] local_address base of local region (source for put, destination for get)
|
||||
* @param[in] size number of bytes to transfer
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] request osc rdma request if used (can be NULL)
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_OUT_OF_RESOURCE on temporary error
|
||||
* @returns other OMPI error on fatal error
|
||||
*
|
||||
* This function does the work of scheduling a contiguous transfer between the local and remote regions.
|
||||
*/
|
||||
typedef int (*ompi_osc_rdma_fn_t) (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *remote_handle, void *local_address, size_t size,
|
||||
ompi_osc_rdma_request_t *request);
|
||||
|
||||
/**
|
||||
* @brief break down rdma transaction into contiguous regions
|
||||
*
|
||||
* @param[in] local_address base of local region (source for put, destination for get)
|
||||
* @param[in] local_count number of elements in local region
|
||||
* @param[in] local_datatype datatype of local region
|
||||
* @param[in] peer peer object for remote peer
|
||||
* @param[in] remote_address base of remote region (destination for put, source for get)
|
||||
* @param[in] remote_handle btl registration handle for remote region (must be valid for the entire region)
|
||||
* @param[in] remote_count number of elements in remote region
|
||||
* @param[in] remote_datatype datatype of remote region
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] request osc rdma request if used (can be NULL)
|
||||
* @param[in] max_rdma_len maximum length of an rdma request (usually btl limitation)
|
||||
* @param[in] rdma_fn function to use for contiguous rdma operations
|
||||
* @param[in] alloc_reqs true if rdma_fn requires a valid request object (any allocated objects will be marked internal)
|
||||
*
|
||||
* This function does the work of breaking a non-contiguous rdma transfer into contiguous components. It will
|
||||
* continue to submit rdma transfers until the entire region is transferred or a fatal error occurs.
|
||||
*/
|
||||
static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count, ompi_datatype_t *local_datatype,
|
||||
ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *remote_handle, int remote_count,
|
||||
ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
|
||||
const ompi_osc_rdma_fn_t rdma_fn,const bool alloc_reqs)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX];
|
||||
opal_convertor_t local_convertor, remote_convertor;
|
||||
uint32_t local_iov_count, remote_iov_count;
|
||||
uint32_t local_iov_index, remote_iov_index;
|
||||
/* needed for opal_convertor_raw but not used */
|
||||
size_t local_size, remote_size, rdma_len;
|
||||
ompi_osc_rdma_request_t *subreq;
|
||||
int ret;
|
||||
bool done;
|
||||
|
||||
subreq = NULL;
|
||||
|
||||
/* prepare convertors for the source and target. these convertors will be used to determine the
|
||||
* contiguous segments within the source and target. */
|
||||
OBJ_CONSTRUCT(&remote_convertor, opal_convertor_t);
|
||||
ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &remote_datatype->super, remote_count,
|
||||
(void *) (intptr_t) remote_address, 0, &remote_convertor);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&local_convertor, opal_convertor_t);
|
||||
ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &local_datatype->super, local_count,
|
||||
local_address, 0, &local_convertor);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (request) {
|
||||
/* keep the request from completing until all the transfers have started */
|
||||
request->outstanding_requests = 1;
|
||||
}
|
||||
|
||||
local_iov_index = 0;
|
||||
local_iov_count = 0;
|
||||
|
||||
do {
|
||||
/* decode segments of the remote data */
|
||||
remote_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
|
||||
remote_iov_index = 0;
|
||||
|
||||
/* opal_convertor_raw returns done when it has reached the end of the data */
|
||||
done = opal_convertor_raw (&remote_convertor, remote_iovec, &remote_iov_count, &remote_size);
|
||||
|
||||
/* loop on the target segments until we have exhaused the decoded source data */
|
||||
while (remote_iov_index != remote_iov_count) {
|
||||
if (local_iov_index == local_iov_count) {
|
||||
/* decode segments of the target buffer */
|
||||
local_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
|
||||
local_iov_index = 0;
|
||||
(void) opal_convertor_raw (&local_convertor, local_iovec, &local_iov_count, &local_size);
|
||||
}
|
||||
|
||||
/* we already checked that the target was large enough. this should be impossible */
|
||||
assert (0 != local_iov_count);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((40, ompi_osc_base_framework.framework_output, "local index = %d, local = {%p, %lu}, "
|
||||
"source_index = %d, source = {%p, %lu}", local_iov_index, local_iovec[local_iov_index].iov_base,
|
||||
(unsigned long) local_iovec[local_iov_index].iov_len, remote_iov_index, remote_iovec[remote_iov_index].iov_base,
|
||||
(unsigned long) remote_iovec[remote_iov_index].iov_len));
|
||||
|
||||
/* determine how much to transfer in this operation */
|
||||
rdma_len = min(min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len);
|
||||
|
||||
/* execute the get */
|
||||
if (!subreq && alloc_reqs) {
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
|
||||
if (NULL == subreq) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
continue;
|
||||
}
|
||||
subreq->internal = true;
|
||||
subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
|
||||
subreq->parent_request = request;
|
||||
|
||||
if (request) {
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
|
||||
}
|
||||
} else if (!alloc_reqs) {
|
||||
subreq = request;
|
||||
}
|
||||
|
||||
ret = rdma_fn (sync, peer, (uint64_t) (intptr_t) remote_iovec[remote_iov_index].iov_base, remote_handle,
|
||||
local_iovec[local_iov_index].iov_base, rdma_len, subreq);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
|
||||
if (request) {
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
|
||||
}
|
||||
|
||||
if (alloc_reqs) {
|
||||
OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
|
||||
}
|
||||
|
||||
/* something bad happened. need to figure out best way to handle rma errors */
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* progress and try again */
|
||||
ompi_osc_rdma_progress (module);
|
||||
continue;
|
||||
}
|
||||
subreq = NULL;
|
||||
|
||||
/* adjust io vectors */
|
||||
local_iovec[local_iov_index].iov_len -= rdma_len;
|
||||
remote_iovec[remote_iov_index].iov_len -= rdma_len;
|
||||
local_iovec[local_iov_index].iov_base = (void *)((intptr_t) local_iovec[local_iov_index].iov_base + rdma_len);
|
||||
remote_iovec[remote_iov_index].iov_base = (void *)((intptr_t) remote_iovec[remote_iov_index].iov_base + rdma_len);
|
||||
|
||||
local_iov_index += (0 == local_iovec[local_iov_index].iov_len);
|
||||
remote_iov_index += (0 == remote_iovec[remote_iov_index].iov_len);
|
||||
}
|
||||
} while (!done);
|
||||
|
||||
if (request) {
|
||||
/* release our reference so the request can complete */
|
||||
if (1 == request->outstanding_requests) {
|
||||
ompi_osc_rdma_request_complete (request, OMPI_SUCCESS);
|
||||
}
|
||||
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
|
||||
}
|
||||
|
||||
/* clean up convertors */
|
||||
opal_convertor_cleanup (&local_convertor);
|
||||
OBJ_DESTRUCT(&local_convertor);
|
||||
opal_convertor_cleanup (&remote_convertor);
|
||||
OBJ_DESTRUCT(&remote_convertor);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int ompi_osc_rdma_master (ompi_osc_rdma_sync_t *sync, void *local_address, int local_count,
|
||||
ompi_datatype_t *local_datatype, ompi_osc_rdma_peer_t *peer,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
int remote_count, ompi_datatype_t *remote_datatype,
|
||||
ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
|
||||
const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
|
||||
{
|
||||
size_t rdma_len;
|
||||
ptrdiff_t lb, extent;
|
||||
int ret;
|
||||
|
||||
rdma_len = local_datatype->super.size * local_count;
|
||||
|
||||
/* fast path for contiguous rdma */
|
||||
if (OPAL_LIKELY(ompi_datatype_is_contiguous_memory_layout (local_datatype, local_count) &&
|
||||
ompi_datatype_is_contiguous_memory_layout (remote_datatype, remote_count) &&
|
||||
rdma_len <= max_rdma_len)) {
|
||||
if (NULL == request && alloc_reqs) {
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);
|
||||
if (NULL == request) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
request->internal = true;
|
||||
request->type = OMPI_OSC_RDMA_TYPE_RDMA;
|
||||
}
|
||||
|
||||
/* ignore failure here */
|
||||
(void) ompi_datatype_get_extent (local_datatype, &lb, &extent);
|
||||
local_address = (void *)((intptr_t) local_address + lb);
|
||||
|
||||
(void) ompi_datatype_get_extent (remote_datatype, &lb, &extent);
|
||||
remote_address += lb;
|
||||
|
||||
do {
|
||||
ret = rdma_fn (sync, peer, remote_address, remote_handle, local_address, rdma_len, request);
|
||||
if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ompi_osc_rdma_progress (sync->module);
|
||||
} while (1);
|
||||
}
|
||||
|
||||
return ompi_osc_rdma_master_noncontig (sync, local_address, local_count, local_datatype, peer, remote_address,
|
||||
remote_handle, remote_count, remote_datatype, request,
|
||||
max_rdma_len, rdma_fn, alloc_reqs);
|
||||
}
|
||||
|
||||
static int ompi_osc_rdma_copy_local (const void *source, int source_count, ompi_datatype_t *source_datatype,
|
||||
void *target, int target_count, ompi_datatype_t *target_datatype,
|
||||
ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "performing local copy from %p -> %p", source, target));
|
||||
|
||||
opal_atomic_mb ();
|
||||
ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype);
|
||||
|
||||
if (request) {
|
||||
ompi_osc_rdma_request_complete (request, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * @brief btl completion callback for puts started by ompi_osc_rdma_put_contig()
 *
 * The context is either a sync object or a request pointer with its lowest bit set. The callback
 * data is the fragment used to stage the data (may be NULL, in which case local_handle is
 * deregistered instead).
 */
static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                        void *context, void *data, int status)
{
    const intptr_t tagged_context = (intptr_t) context;
    ompi_osc_rdma_frag_t *rdma_frag = (ompi_osc_rdma_frag_t *) data;
    ompi_osc_rdma_sync_t *owner_sync;

    assert (OPAL_SUCCESS == status);

    /* the lowest bit is used as a flag indicating this put operation has a request */
    if (tagged_context & 0x1) {
        ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) (tagged_context & ~1);

        /* read the sync object out of the request before the request is completed */
        owner_sync = req->sync;

        /* NTH -- TODO: better error handling */
        ompi_osc_rdma_request_complete (req, status);
    } else {
        owner_sync = (ompi_osc_rdma_sync_t *) context;
    }

    /* release whichever local resource backed the transfer */
    if (NULL == rdma_frag) {
        ompi_osc_rdma_deregister (owner_sync->module, local_handle);
    } else {
        ompi_osc_rdma_frag_complete (rdma_frag);
    }

    ompi_osc_rdma_sync_rdma_dec (owner_sync);
}
|
||||
|
||||
/**
 * @brief btl completion callback for an aggregated put (see ompi_osc_rdma_peer_aggregate_flush())
 *
 * The context is the aggregation object. Its fragment is completed, every request attached to the
 * aggregation is completed with the btl status, and the aggregation is returned before the sync
 * object's outstanding rdma counter is decremented.
 */
static void ompi_osc_rdma_aggregate_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                                  void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                                  void *context, void *data, int status)
{
    ompi_osc_rdma_aggregation_t *agg = (ompi_osc_rdma_aggregation_t *) context;
    /* keep a copy of the sync pointer: it is needed after the aggregation has been returned */
    ompi_osc_rdma_sync_t *owner_sync = agg->sync;
    ompi_osc_rdma_request_t *cur = NULL, *following;

    assert (OPAL_SUCCESS == status);

    ompi_osc_rdma_frag_complete (agg->frag);

    /* detach and complete every request that was folded into this aggregation */
    OPAL_LIST_FOREACH_SAFE(cur, following, &agg->requests, ompi_osc_rdma_request_t) {
        opal_list_remove_item (&agg->requests, (opal_list_item_t *) cur);
        ompi_osc_rdma_request_complete (cur, status);
    }

    ompi_osc_rdma_aggregation_return (agg);

    /* make sure the aggregation is returned before marking the operation as complete */
    opal_atomic_wmb ();

    ompi_osc_rdma_sync_rdma_dec (owner_sync);
}
|
||||
|
||||
static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
|
||||
mca_btl_base_registration_handle_t *target_handle, void *ptr,
|
||||
mca_btl_base_registration_handle_t *local_handle, size_t size,
|
||||
mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) {
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating the btl put of %llu bytes to remote "
|
||||
"address %" PRIx64 ", sync object %p...", (unsigned long long) size, target_address, (void *) sync));
|
||||
|
||||
/* flag outstanding rma requests */
|
||||
ompi_osc_rdma_sync_rdma_inc (sync);
|
||||
|
||||
do {
|
||||
ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address,
|
||||
local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER,
|
||||
cb, context, cbdata);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
++module->put_retry_count;
|
||||
|
||||
if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* spin a bit on progress */
|
||||
for (int i = 0 ; i < 10 ; ++i) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
} while (1);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "btl put failed with code %d", ret));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * @brief append data (and optionally a request) to an aggregation buffer
 *
 * No bounds check is performed here; the callers in this file verify that the data fits before
 * calling.
 */
static void ompi_osc_rdma_aggregate_append (ompi_osc_rdma_aggregation_t *aggregation, ompi_osc_rdma_request_t *request,
                                            void *source_buffer, size_t size)
{
    /* copy the new data right behind what is already buffered, then account for it */
    memcpy (aggregation->buffer + aggregation->buffer_used, source_buffer, size);
    aggregation->buffer_used += size;

    if (NULL == request) {
        return;
    }

    /* remember the request so it can be completed together with the aggregated put */
    opal_list_append (&aggregation->requests, (opal_list_item_t *) request);
}
|
||||
|
||||
static int ompi_osc_rdma_aggregate_alloc (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
|
||||
mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
|
||||
ompi_osc_rdma_request_t *request, int type)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
ompi_osc_rdma_aggregation_t *aggregation;
|
||||
int ret;
|
||||
|
||||
aggregation = (ompi_osc_rdma_aggregation_t *) opal_free_list_get (&mca_osc_rdma_component.aggregate);
|
||||
if (OPAL_UNLIKELY(NULL == aggregation)) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_frag_alloc (module, mca_osc_rdma_component.aggregation_limit, &aggregation->frag,
|
||||
&aggregation->buffer);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
opal_free_list_return(&mca_osc_rdma_component.aggregate, (opal_free_list_item_t *) aggregation);
|
||||
return ret;
|
||||
}
|
||||
|
||||
peer->aggregate = aggregation;
|
||||
|
||||
aggregation->target_address = target_address;
|
||||
aggregation->target_handle = target_handle;
|
||||
aggregation->buffer_size = mca_osc_rdma_component.aggregation_limit;
|
||||
aggregation->sync = sync;
|
||||
aggregation->peer = peer;
|
||||
aggregation->type = type;
|
||||
aggregation->buffer_used = 0;
|
||||
|
||||
ompi_osc_rdma_aggregate_append (aggregation, request, source_buffer, size);
|
||||
|
||||
OPAL_THREAD_SCOPED_LOCK(&sync->lock, opal_list_append (&sync->aggregations, (opal_list_item_t *) aggregation));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
|
||||
mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
|
||||
ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate;
|
||||
mca_btl_base_registration_handle_t *local_handle = NULL;
|
||||
ompi_osc_rdma_frag_t *frag = NULL;
|
||||
char *ptr = source_buffer;
|
||||
void *cbcontext;
|
||||
int ret;
|
||||
|
||||
if (aggregation) {
|
||||
if (size <= (aggregation->buffer_size - aggregation->buffer_used) && (target_handle == aggregation->target_handle) &&
|
||||
(target_address == aggregation->target_address + aggregation->buffer_used)) {
|
||||
assert (OMPI_OSC_RDMA_TYPE_PUT == aggregation->type);
|
||||
ompi_osc_rdma_aggregate_append (aggregation, request, source_buffer, size);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* can't aggregate this operation. flush the previous segment */
|
||||
ret = ompi_osc_rdma_peer_aggregate_flush (peer);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (size <= (mca_osc_rdma_component.aggregation_limit >> 2)) {
|
||||
ret = ompi_osc_rdma_aggregate_alloc (sync, peer, target_address, target_handle, source_buffer, size, request,
|
||||
OMPI_OSC_RDMA_TYPE_PUT);
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (module->selected_btl->btl_register_mem && size > module->selected_btl->btl_put_local_registration_threshold) {
|
||||
ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
memcpy (ptr, source_buffer, size);
|
||||
local_handle = frag->handle;
|
||||
}
|
||||
}
|
||||
|
||||
/* increment the outstanding request counter in the request object */
|
||||
if (request) {
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
|
||||
cbcontext = (void *) ((intptr_t) request | 1);
|
||||
request->sync = sync;
|
||||
} else {
|
||||
cbcontext = (void *) sync;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, ompi_osc_rdma_put_complete,
|
||||
cbcontext, frag);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
 * @brief btl completion callback for gets started by ompi_osc_rdma_get_contig()
 *
 * The context is the request describing the get. The callback data is the fragment the data was
 * received into (may be NULL). When a fragment was used the requested bytes are copied from the
 * fragment (skipping request->offset bytes) into the origin buffer; otherwise local_handle is
 * deregistered.
 */
static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                        void *context, void *data, int status)
{
    ompi_osc_rdma_request_t *req = (ompi_osc_rdma_request_t *) context;
    ompi_osc_rdma_frag_t *rdma_frag = (ompi_osc_rdma_frag_t *) data;
    /* everything needed from the request is read up front */
    ompi_osc_rdma_sync_t *owner_sync = req->sync;
    void *user_buffer = req->origin_addr;
    intptr_t staged_data = (intptr_t) local_address + req->offset;

    OPAL_OUTPUT_VERBOSE((status ? 10 : 60, ompi_osc_base_framework.framework_output, "btl get operation complete with status %d",
                         status));

    assert (OPAL_SUCCESS == status);

    if (NULL == rdma_frag) {
        /* the data landed directly in the origin buffer */
        ompi_osc_rdma_deregister (owner_sync->module, local_handle);
    } else {
        if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
            OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "copying %lu bytes from temporary buffer %p to destination %p",
                                 req->len, (void *) staged_data, user_buffer));
            memcpy (user_buffer, (void *) staged_data, req->len);
        }

        ompi_osc_rdma_frag_complete (rdma_frag);
    }

    ompi_osc_rdma_sync_rdma_dec (owner_sync);

    ompi_osc_rdma_request_complete (req, status);
}
|
||||
|
||||
/**
 * @brief start the put for a peer's pending aggregation (if any)
 *
 * The peer's aggregate pointer is cleared whether or not the put could be started. On failure the
 * aggregation's fragment is released through ompi_osc_rdma_cleanup_rdma() and the aggregation is
 * returned.
 *
 * @param[in] peer  peer object to flush
 *
 * @returns OMPI_SUCCESS if there was nothing to flush or the put was started
 * @returns the error from ompi_osc_rdma_put_real() otherwise
 */
int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer)
{
    ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate;
    int ret;

    if (NULL == aggregation) {
        return OMPI_SUCCESS;
    }

    assert (OMPI_OSC_RDMA_TYPE_PUT == aggregation->type);

    ret = ompi_osc_rdma_put_real (aggregation->sync, peer, aggregation->target_address, aggregation->target_handle,
                                  aggregation->buffer, aggregation->frag->handle, aggregation->buffer_used,
                                  ompi_osc_rdma_aggregate_put_complete, (void *) aggregation, NULL);

    /* the aggregation is no longer open for appends, whatever the outcome */
    peer->aggregate = NULL;

    /* success is the expected outcome, hint the compiler accordingly */
    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        return OMPI_SUCCESS;
    }

    ompi_osc_rdma_cleanup_rdma (aggregation->sync, aggregation->frag, NULL, NULL);

    ompi_osc_rdma_aggregation_return (aggregation);

    return ret;
}
|
||||
|
||||
|
||||
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
|
||||
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
|
||||
ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
|
||||
mca_btl_base_registration_handle_t *local_handle = NULL;
|
||||
ompi_osc_rdma_frag_t *frag = NULL;
|
||||
osc_rdma_size_t aligned_len;
|
||||
osc_rdma_base_t aligned_source_base, aligned_source_bound;
|
||||
char *ptr = target_buffer;
|
||||
int ret;
|
||||
|
||||
aligned_source_base = source_address & ~btl_alignment_mask;
|
||||
aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask;
|
||||
aligned_len = aligned_source_bound - aligned_source_base;
|
||||
|
||||
request->offset = source_address - aligned_source_base;
|
||||
request->len = size;
|
||||
request->origin_addr = target_buffer;
|
||||
request->sync = sync;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating get from remote ptr %" PRIx64 " to local ptr %p",
|
||||
source_address, target_buffer));
|
||||
|
||||
if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) ||
|
||||
(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
|
||||
ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
/* check for alignment */
|
||||
if (!(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
|
||||
(void) ompi_osc_rdma_register (module, peer->data_endpoint, target_buffer, size, MCA_BTL_REG_FLAG_LOCAL_WRITE,
|
||||
&local_handle);
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(NULL == local_handle)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
} else {
|
||||
local_handle = frag->handle;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl get... source: %" PRIx64
|
||||
" (handle 0x%llx, 0x%llx), %" PRIu64 ", destination: %p, %" PRIu64, source_address,
|
||||
((unsigned long long *) source_handle)[0], ((unsigned long long *) source_handle)[1],
|
||||
aligned_len, ptr, aligned_len));
|
||||
|
||||
ompi_osc_rdma_sync_rdma_inc (sync);
|
||||
|
||||
do {
|
||||
ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, aligned_source_base, local_handle,
|
||||
source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
|
||||
request, frag);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
++module->get_retry_count;
|
||||
|
||||
if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* spin a bit on progress */
|
||||
for (int i = 0 ; i < 10 ; ++i) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
} while (1);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "btl get failed with ret = %d", ret));
|
||||
|
||||
ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
|
||||
struct ompi_datatype_t *origin_datatype, ompi_osc_rdma_peer_t *peer,
|
||||
OPAL_PTRDIFF_TYPE target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
mca_btl_base_registration_handle_t *target_handle;
|
||||
uint64_t target_address;
|
||||
int ret;
|
||||
|
||||
/* short-circuit case */
|
||||
if (0 == origin_count || 0 == target_count) {
|
||||
if (request) {
|
||||
ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_datatype->super.size * target_count,
|
||||
&target_address, &target_handle);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* optimize communication with peers that we can do direct load and store operations on */
|
||||
if (ompi_osc_rdma_peer_local_base (peer)) {
|
||||
return ompi_osc_rdma_copy_local (origin_addr, origin_count, origin_datatype, (void *) (intptr_t) target_address,
|
||||
target_count, target_datatype, request);
|
||||
}
|
||||
|
||||
return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, target_address, target_handle,
|
||||
target_count, target_datatype, request, module->selected_btl->btl_put_limit,
|
||||
ompi_osc_rdma_put_contig, false);
|
||||
}
|
||||
|
||||
static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
|
||||
ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE source_disp, int source_count,
|
||||
struct ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
mca_btl_base_registration_handle_t *source_handle;
|
||||
uint64_t source_address;
|
||||
int ret;
|
||||
|
||||
/* short-circuit case */
|
||||
if (0 == origin_count || 0 == source_count) {
|
||||
if (request) {
|
||||
ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ret = osc_rdma_get_remote_segment (module, peer, source_disp, source_datatype->super.size * source_count,
|
||||
&source_address, &source_handle);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* optimize self/local communication */
|
||||
if (ompi_osc_rdma_peer_local_base (peer)) {
|
||||
return ompi_osc_rdma_copy_local ((void *) (intptr_t) source_address, source_count, source_datatype,
|
||||
origin_addr, origin_count, origin_datatype, request);
|
||||
}
|
||||
|
||||
return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, source_address,
|
||||
source_handle, source_count, source_datatype, request,
|
||||
module->selected_btl->btl_get_limit, ompi_osc_rdma_get_contig, true);
|
||||
}
|
||||
/**
 * @brief put entry point of the osc rdma component
 *
 * Looks up the sync object and peer for target_rank and forwards to ompi_osc_rdma_put_w_req()
 * without a request object.
 *
 * @returns OMPI_ERR_RMA_SYNC if no sync object is found for the target
 * @returns the result of ompi_osc_rdma_put_w_req() otherwise
 */
int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                       int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                       struct ompi_datatype_t *target_datatype, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *active_sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &target_peer);

    if (OPAL_UNLIKELY(NULL == active_sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "put: 0x%lx, %d, %s, %d, %d, %d, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
                         (int) target_disp, target_count, target_datatype->name, win->w_name));

    return ompi_osc_rdma_put_w_req (active_sync, origin_addr, origin_count, origin_datatype, target_peer, target_disp,
                                    target_count, target_datatype, NULL);
}
|
||||
|
||||
/**
 * @brief request-based put entry point of the osc rdma component
 *
 * Allocates a request of type OMPI_OSC_RDMA_TYPE_PUT and forwards to ompi_osc_rdma_put_w_req().
 * The request is handed back through *request only on success; on failure it is returned to its
 * pool and *request is left untouched.
 *
 * @returns OMPI_ERR_RMA_SYNC if no sync object is found for the target
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 * @returns the result of ompi_osc_rdma_put_w_req() otherwise
 */
int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                        int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                        struct ompi_datatype_t *target_datatype, struct ompi_win_t *win,
                        struct ompi_request_t **request)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_request_t *put_req;
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *active_sync;
    int rc;

    active_sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &target_peer);
    if (OPAL_UNLIKELY(NULL == active_sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "rput: 0x%lx, %d, %s, %d, %d, "
                         "%d, %s, %s", (unsigned long) origin_addr, origin_count,
                         origin_datatype->name, target_rank, (int) target_disp, target_count,
                         target_datatype->name, win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(module, target_peer, put_req);
    if (NULL == put_req) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    put_req->type = OMPI_OSC_RDMA_TYPE_PUT;

    rc = ompi_osc_rdma_put_w_req (active_sync, origin_addr, origin_count, origin_datatype, target_peer, target_disp,
                                  target_count, target_datatype, put_req);
    if (OMPI_SUCCESS == rc) {
        /* hand the request to the caller */
        *request = (ompi_request_t *) put_req;
        return OMPI_SUCCESS;
    }

    /* the put could not be started: the request is of no use to the caller */
    OMPI_OSC_RDMA_REQUEST_RETURN(put_req);
    return rc;
}
|
||||
|
||||
/**
 * @brief get entry point of the osc rdma component
 *
 * Looks up the sync object and peer for source_rank and forwards to ompi_osc_rdma_get_w_req()
 * without a request object.
 *
 * @returns OMPI_ERR_RMA_SYNC if no sync object is found for the source
 * @returns the result of ompi_osc_rdma_get_w_req() otherwise
 */
int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                       int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count,
                       struct ompi_datatype_t *source_datatype, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *source_peer;
    ompi_osc_rdma_sync_t *active_sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &source_peer);

    if (OPAL_UNLIKELY(NULL == active_sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "get: 0x%lx, %d, %s, %d, %d, "
                         "%d, %s, %s", (unsigned long) origin_addr, origin_count,
                         origin_datatype->name, source_rank, (int) source_disp, source_count,
                         source_datatype->name, win->w_name));

    return ompi_osc_rdma_get_w_req (active_sync, origin_addr, origin_count, origin_datatype, source_peer,
                                    source_disp, source_count, source_datatype, NULL);
}
|
||||
|
||||
/**
 * @brief request-based get entry point of the osc rdma component
 *
 * Allocates a request of type OMPI_OSC_RDMA_TYPE_GET and forwards to ompi_osc_rdma_get_w_req().
 * The request is handed back through *request only on success; on failure it is returned to its
 * pool and *request is left untouched.
 *
 * @returns OMPI_ERR_RMA_SYNC if no sync object is found for the source
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no request could be allocated
 * @returns the result of ompi_osc_rdma_get_w_req() otherwise
 */
int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                        int source_rank, OPAL_PTRDIFF_TYPE source_disp, int source_count,
                        struct ompi_datatype_t *source_datatype, struct ompi_win_t *win,
                        struct ompi_request_t **request)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_request_t *get_req;
    ompi_osc_rdma_peer_t *source_peer;
    ompi_osc_rdma_sync_t *active_sync;
    int rc;

    active_sync = ompi_osc_rdma_module_sync_lookup (module, source_rank, &source_peer);
    if (OPAL_UNLIKELY(NULL == active_sync)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "rget: 0x%lx, %d, %s, %d, %d, "
                         "%d, %s, %s", (unsigned long) origin_addr, origin_count,
                         origin_datatype->name, source_rank, (int) source_disp, source_count,
                         source_datatype->name, win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(module, source_peer, get_req);
    if (NULL == get_req) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    get_req->type = OMPI_OSC_RDMA_TYPE_GET;

    rc = ompi_osc_rdma_get_w_req (active_sync, origin_addr, origin_count, origin_datatype, source_peer,
                                  source_disp, source_count, source_datatype, get_req);
    if (OMPI_SUCCESS == rc) {
        /* hand the request to the caller */
        *request = (ompi_request_t *) get_req;
        return OMPI_SUCCESS;
    }

    /* the get could not be started: the request is of no use to the caller */
    OMPI_OSC_RDMA_REQUEST_RETURN(get_req);
    return rc;
}
|
136
ompi/mca/osc/rdma/osc_rdma_comm.h
Обычный файл
136
ompi/mca/osc/rdma/osc_rdma_comm.h
Обычный файл
@ -0,0 +1,136 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(OMPI_OSC_RDMA_COMM_H)
|
||||
#define OMPI_OSC_RDMA_COMM_H
|
||||
|
||||
#include "osc_rdma_dynamic.h"
|
||||
#include "osc_rdma_request.h"
|
||||
#include "osc_rdma_sync.h"
|
||||
#include "osc_rdma_lock.h"
|
||||
|
||||
/* number of iovec entries decoded per opal_convertor_raw() call */
#define OMPI_OSC_RDMA_DECODE_MAX 64

/* smaller of two values. NOTE: both arguments may be evaluated more than once */
#define min(a,b) (((a) < (b)) ? (a) : (b))

/* bit mask for an alignment of x bytes; an alignment of 0 yields an empty mask */
#define ALIGNMENT_MASK(x) ((x) ? ((x) - 1) : 0)
|
||||
|
||||
/* helper functions */
|
||||
static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_frag_t *frag,
|
||||
mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request)
|
||||
{
|
||||
if (frag) {
|
||||
ompi_osc_rdma_frag_complete (frag);
|
||||
} else {
|
||||
ompi_osc_rdma_deregister (sync->module, handle);
|
||||
}
|
||||
|
||||
if (request) {
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
|
||||
}
|
||||
|
||||
ompi_osc_rdma_sync_rdma_dec (sync);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief find the remote segment associated with a memory region
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer peer object for remote peer
|
||||
* @param[in] target_disp displacement in remote region
|
||||
* @param[in] length length of remote region
|
||||
* @param[out] remote_address remote address
|
||||
* @param[out] remote_handle btl handle for remote region (valid over entire region)
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_RANGE if the address range is not valid at the remote window
|
||||
* @returns other OMPI error on error
|
||||
*/
|
||||
static inline int osc_rdma_get_remote_segment (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, OPAL_PTRDIFF_TYPE target_disp,
|
||||
size_t length, uint64_t *remote_address, mca_btl_base_registration_handle_t **remote_handle)
|
||||
{
|
||||
ompi_osc_rdma_region_t *region;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "getting remote address for peer %d target_disp %lu",
|
||||
peer->rank, (unsigned long) target_disp));
|
||||
|
||||
if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
|
||||
ret = ompi_osc_rdma_find_dynamic_region (module, peer, (uint64_t) target_disp, length, ®ion);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
|
||||
"could not retrieve region for %" PRIx64 " from window rank %d", (uint64_t) target_disp, peer->rank));
|
||||
return ret;
|
||||
}
|
||||
|
||||
*remote_address = (uint64_t) target_disp;
|
||||
*remote_handle = (mca_btl_base_registration_handle_t *) region->btl_handle_data;
|
||||
} else {
|
||||
ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
|
||||
int disp_unit = (module->same_disp_unit) ? module->disp_unit : ex_peer->disp_unit;
|
||||
size_t size = (module->same_size) ? module->size : (size_t) ex_peer->size;
|
||||
|
||||
*remote_address = ex_peer->super.base +disp_unit * target_disp;
|
||||
if (OPAL_UNLIKELY(*remote_address + length > (ex_peer->super.base + size))) {
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "remote address range 0x%" PRIx64 " - 0x%" PRIx64
|
||||
" is out of range. Valid address range is 0x%" PRIx64 " - 0x%" PRIx64 " (%" PRIu64 " bytes)",
|
||||
*remote_address, *remote_address + length, ex_peer->super.base, ex_peer->super.base + size,
|
||||
(uint64_t) size));
|
||||
return OMPI_ERR_RMA_RANGE;
|
||||
}
|
||||
|
||||
*remote_handle = ex_peer->super.base_handle;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output,
|
||||
"remote address: 0x%" PRIx64 ", handle: %p", *remote_address, (void *) *remote_handle));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* prototypes for implementations of MPI RMA window functions. these will be called from the
|
||||
* mpi interface (ompi/mpi/c) */
|
||||
int ompi_osc_rdma_put (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
|
||||
int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_dt, struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_rdma_get (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
|
||||
int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_dt, struct ompi_win_t *win);
|
||||
|
||||
int ompi_osc_rdma_rput (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
|
||||
int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
int ompi_osc_rdma_rget (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
|
||||
int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
|
||||
struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
|
||||
struct ompi_request_t **request);
|
||||
|
||||
/**
|
||||
* @brief read data from a remote memory region (blocking)
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] endpoint btl endpoint
|
||||
* @param[in] source_address remote address to read from
|
||||
* @param[in] source_handle btl registration handle for remote region (must be valid for the entire region)
|
||||
* @param[in] data local buffer to store to
|
||||
* @param[in] len number of bytes to read
|
||||
*
|
||||
* This is an internal function for reading data from a remote peer. It is used to read peer and state
|
||||
* data that is stored on the remote peer. The peer object does not have to be fully initialized to
|
||||
* work. Only the btl endpoint is needed.
|
||||
*/
|
||||
int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
|
||||
void *data, size_t len);
|
||||
|
||||
#endif /* OMPI_OSC_RDMA_COMM_H */
|
1196
ompi/mca/osc/rdma/osc_rdma_component.c
Обычный файл
1196
ompi/mca/osc/rdma/osc_rdma_component.c
Обычный файл
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
372
ompi/mca/osc/rdma/osc_rdma_dynamic.c
Обычный файл
372
ompi/mca/osc/rdma/osc_rdma_dynamic.c
Обычный файл
@ -0,0 +1,372 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "osc_rdma_comm.h"
|
||||
#include "osc_rdma_lock.h"
|
||||
|
||||
#include "mpi.h"
|
||||
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_find_region_containing:
|
||||
*
|
||||
* @param[in] regions sorted list of regions
|
||||
* @param[in] min_index minimum index to search (call with 0)
|
||||
* @param[in] max_index maximum index to search (call with length - 1)
|
||||
* @param[in] base base of region to search for
|
||||
* @param[in] bound bound of region to search for
|
||||
* @param[in] region_size size of an ompi_osc_rdma_region_t object
|
||||
* @param[out] region_index index of region if found (may be NULL)
|
||||
*
|
||||
* @returns an index on success or -1 on failure
|
||||
*
|
||||
* This function searches through a sorted list of rdma regions {regions} and finds
|
||||
* the region that contains the region specified by {base} and {bound}. If a
|
||||
* matching region is found the index of that region is returned else the function
|
||||
* returns -1.
|
||||
*/
|
||||
static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi_osc_rdma_region_t *regions, int min_index,
|
||||
int max_index, intptr_t base, intptr_t bound,
|
||||
size_t region_size, int *region_index)
|
||||
{
|
||||
int mid_index = (max_index + min_index) >> 1;
|
||||
ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
|
||||
intptr_t region_bound;
|
||||
|
||||
if (min_index > max_index) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
region_bound = (intptr_t) (region->base + region->len);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
|
||||
(void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index,
|
||||
min_index, max_index));
|
||||
|
||||
if (region->base > base) {
|
||||
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index);
|
||||
} else if (bound <= region_bound) {
|
||||
if (region_index) {
|
||||
*region_index = mid_index;
|
||||
}
|
||||
|
||||
return region;
|
||||
}
|
||||
|
||||
return ompi_osc_rdma_find_region_containing (regions, mid_index+1, max_index, base, bound, region_size, region_index);
|
||||
}
|
||||
|
||||
/* binary search for insertion point */
|
||||
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
|
||||
size_t region_size, int *region_index)
|
||||
{
|
||||
int mid_index = (max_index + min_index) >> 1;
|
||||
ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
|
||||
|
||||
if (max_index < min_index) {
|
||||
*region_index = mid_index;
|
||||
return region;
|
||||
}
|
||||
|
||||
if (region->base > base) {
|
||||
return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index);
|
||||
} else {
|
||||
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
|
||||
}
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
const int my_rank = ompi_comm_rank (module->comm);
|
||||
ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
|
||||
ompi_osc_rdma_region_t *region;
|
||||
osc_rdma_counter_t region_count;
|
||||
osc_rdma_counter_t region_id;
|
||||
intptr_t page_size = getpagesize ();
|
||||
int region_index;
|
||||
int ret;
|
||||
|
||||
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
return OMPI_ERR_RMA_FLAVOR;
|
||||
}
|
||||
|
||||
if (0 == len) {
|
||||
/* shot-circuit 0-byte case */
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
region_count = module->state->region_count & 0xffffffffL;
|
||||
region_id = module->state->region_count >> 32;
|
||||
|
||||
if (region_count == mca_osc_rdma_component.max_attach) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_ATTACH;
|
||||
}
|
||||
|
||||
/* see if a matching region already exists */
|
||||
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
|
||||
(intptr_t) base + len, module->region_size, ®ion_index);
|
||||
if (NULL != region) {
|
||||
++module->dynamic_handles[region_index].refcnt;
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
/* no need to invalidate remote caches */
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* region is in flux */
|
||||
module->state->region_count = -1;
|
||||
opal_atomic_wmb ();
|
||||
|
||||
ompi_osc_rdma_lock_acquire_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
|
||||
|
||||
/* do a binary seach for where the region should be inserted */
|
||||
if (region_count) {
|
||||
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
|
||||
module->region_size, ®ion_index);
|
||||
|
||||
if (region_index < region_count) {
|
||||
memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);
|
||||
|
||||
if (module->selected_btl->btl_register_mem) {
|
||||
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
|
||||
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
region_index = 0;
|
||||
region = (ompi_osc_rdma_region_t *) module->state->regions;
|
||||
}
|
||||
|
||||
/* it is wasteful to register less than a page. this may allow the remote side to access more
|
||||
* memory but the MPI standard covers this with calling the calling behavior erroneous */
|
||||
region->base = OPAL_ALIGN((intptr_t) base - page_size + 1, page_size, intptr_t);
|
||||
region->len = OPAL_ALIGN(len, page_size, size_t);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Attaching memory region %p-%p at index %d",
|
||||
base, (void *)((intptr_t) base + len), region_index));
|
||||
|
||||
if (module->selected_btl->btl_register_mem) {
|
||||
mca_btl_base_registration_handle_t *handle;
|
||||
|
||||
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
|
||||
&handle);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_ATTACH;
|
||||
}
|
||||
|
||||
memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
|
||||
module->dynamic_handles[region_index].btl_handle = handle;
|
||||
} else {
|
||||
module->dynamic_handles[region_index].btl_handle = NULL;
|
||||
}
|
||||
|
||||
module->dynamic_handles[region_index].refcnt = 1;
|
||||
|
||||
for (int i = 0 ; i < region_count + 1 ; ++i) {
|
||||
region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + i * module->region_size);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Cache[%d] = {%p, %lu}",
|
||||
i, (void *) region->base, (unsigned long) region->len));
|
||||
}
|
||||
|
||||
|
||||
opal_atomic_mb ();
|
||||
/* the region state has changed */
|
||||
module->state->region_count = ((region_id + 1) << 32) | (region_count + 1);
|
||||
|
||||
ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock));
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
const int my_rank = ompi_comm_rank (module->comm);
|
||||
ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
|
||||
osc_rdma_counter_t region_count, region_id;
|
||||
ompi_osc_rdma_region_t *region;
|
||||
int region_index;
|
||||
|
||||
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
|
||||
return OMPI_ERR_WIN;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
/* the upper 4 bytes of the region count are an instance counter */
|
||||
region_count = module->state->region_count & 0xffffffffL;
|
||||
region_id = module->state->region_count >> 32;
|
||||
|
||||
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
|
||||
region_count - 1, (intptr_t) base, (intptr_t) base + 1,
|
||||
module->region_size, ®ion_index);
|
||||
if (NULL == region) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if (--module->dynamic_handles[region_index].refcnt > 0) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* lock the region so it can't change while a peer is reading it */
|
||||
ompi_osc_rdma_lock_acquire_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "Detaching memory region %p-%p at index %d",
|
||||
base, (void *)((intptr_t) base + region->len), region_index));
|
||||
|
||||
if (module->selected_btl && module->selected_btl->btl_register_mem) {
|
||||
ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
|
||||
|
||||
if (region_index < region_count - 1) {
|
||||
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
|
||||
(region_count - region_index - 1) * sizeof (void *));
|
||||
}
|
||||
|
||||
memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));
|
||||
}
|
||||
|
||||
if (region_index < region_count - 1) {
|
||||
memmove (region, (void *)((intptr_t) region + module->region_size),
|
||||
(region_count - region_index - 1) * module->region_size);;
|
||||
}
|
||||
|
||||
module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);
|
||||
|
||||
ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief refresh the local view of the dynamic memory region
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer peer object to refresh
|
||||
*
|
||||
* This function does the work of keeping the local view of a remote peer in sync with what is attached
|
||||
* to the remote window. It is called on every address translation since there is no way (currently) to
|
||||
* detect that the attached regions have changed. To reduce the amount of data read we first read the
|
||||
* region count (which contains an id). If that hasn't changed the region data is not updated. If the
|
||||
* list of attached regions has changed then all valid regions are read from the peer while holding
|
||||
* their region lock.
|
||||
*/
|
||||
static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_dynamic_t *peer) {
|
||||
osc_rdma_counter_t region_count, region_id;
|
||||
uint64_t source_address;
|
||||
int ret;
|
||||
|
||||
/* this loop is meant to prevent us from reading data while the remote side is in attach */
|
||||
do {
|
||||
osc_rdma_counter_t remote_value;
|
||||
|
||||
source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, region_count);
|
||||
ret = ompi_osc_get_data_blocking (module, peer->super.state_endpoint, source_address, peer->super.state_handle,
|
||||
&remote_value, sizeof (remote_value));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
region_id = remote_value >> 32;
|
||||
region_count = remote_value & 0xffffffffl;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "remote memory region: 0x%lx, 0x%lx",
|
||||
(unsigned long) region_id, (unsigned long) region_count));
|
||||
/* check if the region is changing */
|
||||
} while (0xffffffffl == region_count);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "target has region_id 0x%lx, region_count 0x%lx "
|
||||
"(cached: 0x%x, 0x%x)", (unsigned long) region_id, (unsigned long) region_count, peer->region_id,
|
||||
peer->region_count));
|
||||
|
||||
if (0 == region_count) {
|
||||
return OMPI_ERR_RMA_RANGE;
|
||||
}
|
||||
|
||||
/* check if the cached copy is out of date */
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
if (peer->region_id != region_id) {
|
||||
unsigned region_len = module->region_size * region_count;
|
||||
void *temp;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "dynamic memory cache is out of data. reloading from peer"));
|
||||
|
||||
/* allocate only enough space for the remote regions */
|
||||
temp = realloc (peer->regions, region_len);
|
||||
if (NULL == temp) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
peer->regions = temp;
|
||||
|
||||
/* lock the region */
|
||||
ompi_osc_rdma_lock_acquire_shared (module, &peer->super, 1, offsetof (ompi_osc_rdma_state_t, regions_lock),
|
||||
OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
|
||||
|
||||
source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, regions);
|
||||
ret = ompi_osc_get_data_blocking (module, peer->super.state_endpoint, source_address, peer->super.state_handle,
|
||||
peer->regions, region_len);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* release the region lock */
|
||||
ompi_osc_rdma_lock_release_shared (module, &peer->super, -1, offsetof (ompi_osc_rdma_state_t, regions_lock));
|
||||
|
||||
/* update cached region ids */
|
||||
peer->region_id = region_id;
|
||||
peer->region_count = region_count;
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_find_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t base, size_t len,
|
||||
ompi_osc_rdma_region_t **region)
|
||||
{
|
||||
ompi_osc_rdma_peer_dynamic_t *dy_peer = (ompi_osc_rdma_peer_dynamic_t *) peer;
|
||||
intptr_t bound = (intptr_t) base + len;
|
||||
ompi_osc_rdma_region_t *regions;
|
||||
int ret, region_count;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "locating dynamic memory region matching: %"
|
||||
PRIx64 "-%" PRIx64 " (len %lu)", base, base + len, (unsigned long) len));
|
||||
|
||||
ret = ompi_osc_rdma_refresh_dynamic_region (module, dy_peer);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
regions = dy_peer->regions;
|
||||
region_count = dy_peer->region_count;
|
||||
|
||||
*region = ompi_osc_rdma_find_region_containing (regions, 0, region_count - 1, (intptr_t) base, bound, module->region_size, NULL);
|
||||
if (!*region) {
|
||||
return OMPI_ERR_RMA_RANGE;
|
||||
}
|
||||
|
||||
/* round a matching region */
|
||||
return OMPI_SUCCESS;
|
||||
}
|
60
ompi/mca/osc/rdma/osc_rdma_dynamic.h
Обычный файл
60
ompi/mca/osc/rdma/osc_rdma_dynamic.h
Обычный файл
@ -0,0 +1,60 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(OMPI_OSC_RDMA_DYNAMIC_H)
#define OMPI_OSC_RDMA_DYNAMIC_H

#include "osc_rdma.h"

/**
 * @brief attach a region to a window
 *
 * @param[in] win   mpi window
 * @param[in] base  base pointer of region
 * @param[in] len   region size
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_FLAVOR if the window is not a dynamic window
 * @returns OMPI_ERR_RMA_ATTACH if the region could not be attached
 *
 * This function attaches a region to the local window. After this call
 * completes the region will be available for RMA access by all peers in
 * the window.
 */
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len);

/**
 * @brief detach a region from a window
 *
 * @param[in] win   mpi window
 * @param[in] base  base pointer of region specified to ompi_osc_rdma_attach()
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_WIN if the window is not a dynamic window
 * @returns OMPI_ERROR if the region is not attached
 *
 * This function requires that a region containing {base} has been attached
 * using the ompi_osc_rdma_attach() function.
 */
int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base);

/**
 * @brief find dynamic region associated with a peer, base, and len
 *
 * @param[in]  module  osc rdma module
 * @param[in]  peer    peer object for remote peer
 * @param[in]  base    base pointer for region
 * @param[in]  len     length of region
 * @param[out] region  region structure for the region
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_OUT_OF_RESOURCE on resource failure
 * @returns OMPI_ERR_RMA_RANGE if no region matches
 */
int ompi_osc_rdma_find_dynamic_region (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t base, size_t len,
                                       ompi_osc_rdma_region_t **region);

#endif /* OMPI_OSC_RDMA_DYNAMIC_H */
|
16
ompi/mca/osc/rdma/osc_rdma_frag.c
Обычный файл
16
ompi/mca/osc/rdma/osc_rdma_frag.c
Обычный файл
@ -0,0 +1,16 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_frag.h"
|
||||
|
||||
/* class definition for ompi_osc_rdma_frag_t, derived from opal_free_list_item_t.
 * NOTE(review): the two NULL arguments are presumably the constructor and destructor slots
 * (none needed because the fields are set up when a frag is allocated) -- confirm */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_frag_t, opal_free_list_item_t, NULL, NULL);
|
125
ompi/mca/osc/rdma/osc_rdma_frag.h
Обычный файл
125
ompi/mca/osc/rdma/osc_rdma_frag.h
Обычный файл
@ -0,0 +1,125 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OSC_RDMA_FRAG_H
|
||||
#define OSC_RDMA_FRAG_H
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "opal/align.h"
|
||||
|
||||
/** Communication buffer for packing messages */
|
||||
struct ompi_osc_rdma_frag_t {
|
||||
/* free list item this fragment is built on; super.ptr is used as the start of the buffer */
opal_free_list_item_t super;
|
||||
|
||||
/* start of unused space */
unsigned char *top;
|
||||
|
||||
/* space remaining in buffer */
uint32_t remain_len;
|
||||
/* Number of operations which have started writing into the frag, but not yet completed doing so.
 * The module's own reference is counted as well: the count starts at 1 when the frag becomes the
 * module's current fragment and the frag is returned to the free list when it reaches 0 */
int32_t pending;
|
||||
|
||||
/* module that allocated the fragment; used to deregister the buffer */
ompi_osc_rdma_module_t *module;
|
||||
/* registration handle for the buffer; NULL while the buffer is not registered */
mca_btl_base_registration_handle_t *handle;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);
|
||||
|
||||
|
||||
/**
 * @brief drop one reference on a fragment
 *
 * @param[in] frag  fragment to release
 *
 * Atomically decrements the pending count. When the count reaches zero the buffer is
 * deregistered, the cached handle is cleared, and the fragment is returned to the
 * component's free list.
 */
static inline void ompi_osc_rdma_frag_complete (ompi_osc_rdma_frag_t *frag)
{
    if (0 != OPAL_THREAD_ADD32(&frag->pending, -1)) {
        /* the fragment is still referenced elsewhere */
        return;
    }

    opal_atomic_rmb ();

    ompi_osc_rdma_deregister (frag->module, frag->handle);
    frag->handle = NULL;

    opal_free_list_return (&mca_osc_rdma_component.frags, (opal_free_list_item_t *) frag);
}
|
||||
|
||||
/*
|
||||
* Note: module lock must be held during this operation
|
||||
*/
|
||||
static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size_t request_len,
|
||||
ompi_osc_rdma_frag_t **buffer, char **ptr)
|
||||
{
|
||||
ompi_osc_rdma_frag_t *curr;
|
||||
int ret;
|
||||
|
||||
/* ensure all buffers are 8-byte aligned */
|
||||
request_len = OPAL_ALIGN(request_len, 8, size_t);
|
||||
|
||||
if (request_len > (mca_osc_rdma_component.buffer_size >> 1)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
curr = module->rdma_frag;
|
||||
if (OPAL_UNLIKELY(NULL == curr || curr->remain_len < request_len)) {
|
||||
if (NULL == curr || (NULL != curr && curr->pending > 1)) {
|
||||
opal_free_list_item_t *item = NULL;
|
||||
|
||||
/* release the initial reference to the buffer */
|
||||
module->rdma_frag = NULL;
|
||||
|
||||
if (curr) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
ompi_osc_rdma_frag_complete (curr);
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
}
|
||||
|
||||
item = opal_free_list_get (&mca_osc_rdma_component.frags);
|
||||
if (OPAL_UNLIKELY(NULL == item)) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
curr = module->rdma_frag = (ompi_osc_rdma_frag_t *) item;
|
||||
|
||||
curr->handle = NULL;
|
||||
curr->pending = 1;
|
||||
curr->module = module;
|
||||
}
|
||||
|
||||
curr->top = curr->super.ptr;
|
||||
curr->remain_len = mca_osc_rdma_component.buffer_size;
|
||||
|
||||
if (curr->remain_len < request_len) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
|
||||
if (!curr->handle && module->selected_btl->btl_register_mem) {
|
||||
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, curr->super.ptr, mca_osc_rdma_component.buffer_size,
|
||||
MCA_BTL_REG_FLAG_ACCESS_ANY, &curr->handle);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
*ptr = (char *) curr->top;
|
||||
*buffer = curr;
|
||||
|
||||
curr->top += request_len;
|
||||
curr->remain_len -= request_len;
|
||||
OPAL_THREAD_ADD32(&curr->pending, 1);
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
302
ompi/mca/osc/rdma/osc_rdma_lock.h
Обычный файл
302
ompi/mca/osc/rdma/osc_rdma_lock.h
Обычный файл
@ -0,0 +1,302 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(OMPI_OSC_RDMA_LOCK_H)
|
||||
#define OMPI_OSC_RDMA_LOCK_H
|
||||
|
||||
#include "osc_rdma_types.h"
|
||||
#include "osc_rdma_frag.h"
|
||||
|
||||
/* Try once, without waiting, to move the local lock word at {lock} from 0 to
 * OMPI_OSC_RDMA_LOCK_EXCLUSIVE. The return value is the logical negation of what
 * ompi_osc_rdma_lock_cmpset() returns.
 * NOTE(review): assuming cmpset returns non-zero when the swap took place, a return of 0 means
 * the lock was acquired and 1 means it was busy -- confirm against ompi_osc_rdma_lock_cmpset */
static inline int ompi_osc_rdma_trylock_local (volatile ompi_osc_rdma_lock_t *lock)
|
||||
{
|
||||
return !ompi_osc_rdma_lock_cmpset (lock, 0, OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
|
||||
}
|
||||
|
||||
/* Counterpart of ompi_osc_rdma_trylock_local(): adds -OMPI_OSC_RDMA_LOCK_EXCLUSIVE to the local
 * lock word at {lock} and ignores the value returned by the add. An add is used rather than a
 * store of 0, so any other contribution to the lock word is left in place */
static inline void ompi_osc_rdma_unlock_local (volatile ompi_osc_rdma_lock_t *lock)
|
||||
{
|
||||
(void) ompi_osc_rdma_lock_add (lock, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dummy completion function for atomic operations
*
* The shared-lock helpers in this header hand this function to btl_atomic_op/btl_atomic_fop
* together with the address of a volatile bool and then call ompi_osc_rdma_progress() until
* that flag reads true.
* NOTE(review): the definition is not in this header; presumably it sets that flag -- confirm.
|
||||
*/
|
||||
void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
void *context, void *data, int status);
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_lock_acquire_shared:
|
||||
*
|
||||
* @param[in] peer - owner of lock
|
||||
* @param[in] value - increment value
|
||||
* @param[in] offset - offset of lock in remote peer's state segment
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success and another ompi error code on failure
|
||||
*
|
||||
* This function increments a remote shared lock. The value provided in
|
||||
* {value} should be the negative of the one used for ompi_osc_rdma_lock_acquire_shared.
|
||||
* It is erroneous to release a shared lock not held by the calling process.
|
||||
*/
|
||||
static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
|
||||
ompi_osc_rdma_lock_t value, ptrdiff_t offset)
|
||||
{
|
||||
uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
|
||||
void *temp = &module->state->scratch_lock;
|
||||
volatile bool atomic_complete = false;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "decrementing shared lock %" PRIx64 " by %lx\n", lock,
|
||||
(unsigned long) value));
|
||||
|
||||
/* spin until the lock has been acquired */
|
||||
if (!ompi_osc_rdma_peer_local_state (peer)) {
|
||||
if (module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS) {
|
||||
ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, (intptr_t) lock, peer->state_handle,
|
||||
MCA_BTL_ATOMIC_ADD, value, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
|
||||
(void *) &atomic_complete, NULL);
|
||||
} else {
|
||||
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, (intptr_t) lock, module->state_handle,
|
||||
peer->state_handle, MCA_BTL_ATOMIC_ADD, value, 0, MCA_BTL_NO_ORDER,
|
||||
ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS == ret) {
|
||||
while (!atomic_complete) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
} else if (1 == OPAL_SUCCESS) {
|
||||
ret = OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
return ret;
|
||||
} else {
|
||||
(void) ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, value);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_lock_acquire_shared:
|
||||
*
|
||||
* @param[in] module - osc rdma module
|
||||
* @param[in] peer - owner of lock
|
||||
* @param[in] value - increment value
|
||||
* @param[in] offset - offset of lock in remote peer's state segment
|
||||
* @param[in] check - check value for success
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success and another ompi error code on failure
|
||||
*
|
||||
* This function increments a remote shared lock and checks it against the
|
||||
* check value in {check}. If any of the bits in the prior counter value
|
||||
* match those in {check} the function decrements the value and tries again.
|
||||
*/
|
||||
static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
|
||||
ompi_osc_rdma_lock_t value, ptrdiff_t offset,
|
||||
ompi_osc_rdma_lock_t check)
|
||||
{
|
||||
intptr_t lock = (intptr_t) peer->state + offset;
|
||||
volatile bool atomic_complete;
|
||||
ompi_osc_rdma_lock_t *temp;
|
||||
int ret;
|
||||
|
||||
/* spin until the lock has been acquired */
|
||||
if (!ompi_osc_rdma_peer_local_state (peer)) {
|
||||
ompi_osc_rdma_frag_t *frag;
|
||||
|
||||
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
do {
|
||||
atomic_complete = false;
|
||||
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, (void *) temp, lock, frag->handle,
|
||||
peer->state_handle, MCA_BTL_ATOMIC_ADD, value, 0, MCA_BTL_NO_ORDER,
|
||||
ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS > ret)) {
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "failed to increment shared lock. ret: %d", ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (1 != ret) {
|
||||
/* wait for completion of the atomic operation */
|
||||
while (!atomic_complete) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "incremented shared lock 0x%lx by 0x%lx. Old value 0x%lx",
|
||||
(unsigned long) lock, (unsigned long) value, (unsigned long) *temp));
|
||||
|
||||
if (!(*temp & check)) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* NTH: i think this is correct. backoff! */
|
||||
ompi_osc_rdma_lock_release_shared (module, peer, -value, offset);
|
||||
ompi_osc_rdma_progress (module);
|
||||
} while (1);
|
||||
|
||||
ompi_osc_rdma_frag_complete (frag);
|
||||
} else {
|
||||
ompi_osc_rdma_lock_t lock_state;
|
||||
do {
|
||||
lock_state = ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, value);
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "incremented local shared lock by 0x%lx. Old value 0x%lx",
|
||||
(unsigned long) value, (unsigned long) lock_state));
|
||||
if (!(lock_state & check)) {
|
||||
break;
|
||||
}
|
||||
|
||||
(void) ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, -value);
|
||||
ompi_osc_rdma_progress (module);
|
||||
} while (1);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_lock_try_acquire_exclusive:
|
||||
*
|
||||
* @param[in] peer - peer to lock
|
||||
* @param[in] temp - temporary registered location for lock result
|
||||
* @param[in] temp_seg - registered segment for temp
|
||||
* @param[in] offset - offset into the remote peer's state segment
|
||||
*
|
||||
* @returns 0 on success, 1 on failure
|
||||
*
|
||||
* This function attempts to lock the lock at {offset} on the remote
|
||||
* peer. The buffer pointer to by {temp} must not be modified until
|
||||
* this functions completes.
|
||||
*/
|
||||
static inline int ompi_osc_rdma_lock_try_acquire_exclusive (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
|
||||
ptrdiff_t offset)
|
||||
{
|
||||
uint64_t lock = (uint64_t) (uintptr_t) peer->state + offset;
|
||||
ompi_osc_rdma_lock_t *temp = NULL;
|
||||
volatile bool atomic_complete;
|
||||
int ret;
|
||||
|
||||
if (!ompi_osc_rdma_peer_local_state (peer)) {
|
||||
ompi_osc_rdma_frag_t *frag = NULL;
|
||||
int result;
|
||||
|
||||
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* set the temporary value so we can detect success. note that a lock should never be -1 */
|
||||
atomic_complete = false;
|
||||
ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, lock, frag->handle,
|
||||
peer->state_handle, 0, OMPI_OSC_RDMA_LOCK_EXCLUSIVE, 0, 0,
|
||||
ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS > ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (0 == ret) {
|
||||
/* wait for the atomic operation to complete */
|
||||
while (!atomic_complete) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "got %lx when attempting compare and swap %" PRIx64 " complete %d",
|
||||
(unsigned long) *temp, lock, atomic_complete));
|
||||
result = (*temp != 0);
|
||||
|
||||
ompi_osc_rdma_frag_complete (frag);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
return ompi_osc_rdma_trylock_local ((int64_t *)(intptr_t) lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_lock_acquire_exclusive:
|
||||
*
|
||||
* @param[in] peer - peer to lock
|
||||
* @param[in] temp - temporary registered location for lock result
|
||||
* @param[in] temp_seg - registered segment for temp
|
||||
* @param[in] offset - offset into the remote peer's state segment
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success or another ompi error code on failure
|
||||
*
|
||||
* This function locks the lock at {offset} on the remote peer. The
|
||||
* buffer pointed to by {temp} must not be modified until this
|
||||
* function completes.
|
||||
*/
|
||||
static inline int ompi_osc_rdma_lock_acquire_exclusive (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
|
||||
ptrdiff_t offset)
|
||||
{
|
||||
while (ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offset)) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* ompi_osc_rdma_lock_release_exclusive:
|
||||
*
|
||||
* @param[in] peer - peer to unlock
|
||||
* @param[in] offset - offset into the remote peer's state segment
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success or another ompi error code on failure
|
||||
*
|
||||
* This function unlocks the lock at {offset} in the remote peer's state
|
||||
* structure. It is illegal to call this function unless this process
|
||||
* holds the lock.
|
||||
*/
|
||||
static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
|
||||
ptrdiff_t offset)
|
||||
{
|
||||
uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
|
||||
void *temp = &module->state->scratch_lock;
|
||||
volatile bool atomic_complete = false;
|
||||
int ret;
|
||||
|
||||
if (!ompi_osc_rdma_peer_local_state (peer)) {
|
||||
if (module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS) {
|
||||
ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, lock, peer->state_handle, MCA_BTL_ATOMIC_ADD,
|
||||
-OMPI_OSC_RDMA_LOCK_EXCLUSIVE, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
|
||||
(void *) &atomic_complete, NULL);
|
||||
} else {
|
||||
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, lock, module->state_handle,
|
||||
peer->state_handle, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE, 0,
|
||||
MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS > ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS == ret) {
|
||||
while (!atomic_complete) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "unlocked target lock %" PRIx64 " with value 0x%lx. old value 0x%"
|
||||
PRIx64, lock, (unsigned long) -OMPI_OSC_RDMA_LOCK_EXCLUSIVE, ((uint64_t *) temp)[0]));
|
||||
} else {
|
||||
ompi_osc_rdma_unlock_local ((volatile ompi_osc_rdma_lock_t *)(intptr_t) lock);
|
||||
}
|
||||
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
#endif /* OMPI_OSC_RDMA_LOCK_H */
|
144
ompi/mca/osc/rdma/osc_rdma_module.c
Обычный файл
144
ompi/mca/osc/rdma/osc_rdma_module.c
Обычный файл
@ -0,0 +1,144 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_lock.h"
|
||||
|
||||
#include "mpi.h"
|
||||
|
||||
/**
 * Cache a peer object in the module.
 *
 * The peer is stored under its rank: directly in module->peer_array when the
 * array exists, otherwise in the module->peer_hash hash table.
 *
 * @param[in] module  osc rdma module
 * @param[in] peer    peer object to cache
 *
 * @returns OMPI_SUCCESS when stored in the array, otherwise the result of
 *          the hash table insertion
 */
int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
    if (NULL != module->peer_array) {
        module->peer_array[peer->rank] = peer;
        return OMPI_SUCCESS;
    }

    return opal_hash_table_set_value_uint32 (&module->peer_hash, peer->rank, (void *) peer);
}
|
||||
|
||||
/**
 * Destroy the osc/rdma module attached to a window and release its resources.
 *
 * Does nothing if the window has no module. Otherwise it runs a barrier on
 * the module communicator (when the window group has more than one member),
 * removes the module from the component table, deregisters memory, releases
 * all cached peers, frees the communicators and finally frees the module.
 *
 * @param[in] win  mpi window
 *
 * @returns OMPI_SUCCESS
 */
int ompi_osc_rdma_free(ompi_win_t *win)
{
    int ret = OMPI_SUCCESS;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    uint32_t key;
    void *node;

    if (NULL == module) {
        return OMPI_SUCCESS;
    }

    if (NULL != module->comm) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "rdma component destroying window with id %d",
                            ompi_comm_get_cid(module->comm));

        /* finish with a barrier */
        if (ompi_group_size(win->w_group) > 1) {
            (void) module->comm->c_coll.coll_barrier (module->comm,
                                                      module->comm->c_coll.coll_barrier_module);
        }

        /* remove from component information */
        OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
        opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.modules,
                                            ompi_comm_get_cid(module->comm));
        OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
    }

    win->w_osc_module = NULL;

    if (module->state) {
        /* only the low 32 bits of region_count are used as the count here */
        int region_count = module->state->region_count & 0xffffffffL;
        if (NULL != module->dynamic_handles) {
            for (int i = 0 ; i < region_count ; ++i) {
                ompi_osc_rdma_deregister (module, module->dynamic_handles[i].btl_handle);
            }

            free (module->dynamic_handles);
        }
    }

    OBJ_DESTRUCT(&module->outstanding_locks);
    OBJ_DESTRUCT(&module->lock);
    OBJ_DESTRUCT(&module->peer_lock);
    OBJ_DESTRUCT(&module->all_sync);

    ompi_osc_rdma_deregister (module, module->state_handle);
    ompi_osc_rdma_deregister (module, module->base_handle);

    OPAL_LIST_DESTRUCT(&module->pending_posts);

    if (NULL != module->rdma_frag) {
        ompi_osc_rdma_deregister (module, module->rdma_frag->handle);
    }

    /* remove all cached peers */
    if (NULL == module->peer_array) {
        ret = opal_hash_table_get_first_key_uint32 (&module->peer_hash, &key, (void **) &peer, &node);
        while (OPAL_SUCCESS == ret) {
            OBJ_RELEASE(peer);
            ret = opal_hash_table_get_next_key_uint32 (&module->peer_hash, &key, (void **) &peer,
                                                       node, &node);
        }

        OBJ_DESTRUCT(&module->peer_hash);
    } else {
        /* the array is indexed by peer rank so every rank in the communicator
         * must be visited, not just the ranks below our own */
        for (int i = 0 ; i < ompi_comm_size (module->comm) ; ++i) {
            if (NULL != module->peer_array[i]) {
                OBJ_RELEASE(module->peer_array[i]);
            }
        }

        free (module->peer_array);
    }

    if (NULL != module->outstanding_lock_array) {
        free (module->outstanding_lock_array);
    }

    if (module->local_leaders && MPI_COMM_NULL != module->local_leaders) {
        ompi_comm_free (&module->local_leaders);
    }

    if (module->shared_comm && MPI_COMM_NULL != module->shared_comm) {
        ompi_comm_free (&module->shared_comm);
    }

    if (module->comm && MPI_COMM_NULL != module->comm) {
        ompi_comm_free (&module->comm);
    }

    if (NULL != module->free_after) {
        free(module->free_after);
    }

    if (module->segment_base) {
        opal_shmem_segment_detach (&module->seg_ds);
        module->segment_base = NULL;
    }

    free (module);

    return OMPI_SUCCESS;
}
|
369
ompi/mca/osc/rdma/osc_rdma_passive_target.c
Обычный файл
369
ompi/mca/osc/rdma/osc_rdma_passive_target.c
Обычный файл
@ -0,0 +1,369 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "osc_rdma_passive_target.h"
|
||||
#include "osc_rdma_comm.h"
|
||||
|
||||
#include "mpi.h"
|
||||
|
||||
|
||||
int ompi_osc_rdma_sync (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_progress (GET_MODULE(win));
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_sync_t *lock;
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
|
||||
assert (0 <= target);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush starting..."));
|
||||
|
||||
if (ompi_comm_rank (module->comm) == target) {
|
||||
/* nothing to flush. call one round of progress */
|
||||
ompi_osc_rdma_progress (module);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
lock = ompi_osc_rdma_module_sync_lookup (module, target, &peer);
|
||||
if (OPAL_UNLIKELY(NULL == lock || OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type)) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush: target %d is not locked in window %s",
|
||||
target, win->w_name));
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
/* finish all outstanding fragments */
|
||||
ompi_osc_rdma_sync_rdma_complete (lock);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_sync_t *lock;
|
||||
int ret = OMPI_SUCCESS;
|
||||
uint32_t key;
|
||||
void *node;
|
||||
|
||||
/* flush is only allowed from within a passive target epoch */
|
||||
if (!ompi_osc_rdma_in_passive_epoch (module)) {
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush_all entering..."));
|
||||
|
||||
/* globally complete all outstanding rdma requests */
|
||||
if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) {
|
||||
ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
|
||||
}
|
||||
|
||||
/* flush all locks */
|
||||
ret = opal_hash_table_get_first_key_uint32 (&module->outstanding_locks, &key, (void **) &lock, &node);
|
||||
while (OPAL_SUCCESS == ret) {
|
||||
ompi_osc_rdma_sync_rdma_complete (lock);
|
||||
ret = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, &key, (void **) &lock,
|
||||
node, &node);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_flush_all complete"));
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* Local-completion flush of a single target; implemented as a plain
 * ompi_osc_rdma_flush of that target. */
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    int rc = ompi_osc_rdma_flush (target, win);

    return rc;
}
|
||||
|
||||
|
||||
/* Local-completion flush of all targets; implemented as a plain
 * ompi_osc_rdma_flush_all. */
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    int rc = ompi_osc_rdma_flush_all (win);

    return rc;
}
|
||||
|
||||
/* locking via atomics */
|
||||
static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
|
||||
ompi_osc_rdma_sync_t *lock)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) {
|
||||
do {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "Incrementing global exclusive lock"));
|
||||
/* lock the master lock. this requires no rank has a global shared lock */
|
||||
ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock), 0xffffffff00000000L);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
ompi_osc_rdma_progress (module);
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "Acquiring exclusive lock from peer"));
|
||||
ret = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
|
||||
if (ret) {
|
||||
/* release the global lock */
|
||||
ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
|
||||
ompi_osc_rdma_progress (module);
|
||||
continue;
|
||||
}
|
||||
|
||||
peer->flags |= OMPI_OSC_RDMA_PEER_EXCLUSIVE;
|
||||
break;
|
||||
} while (1);
|
||||
} else {
|
||||
do {
|
||||
/* go right to the target to acquire a shared lock */
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "Incrementing local shared lock"));
|
||||
ret = ompi_osc_rdma_lock_acquire_shared (module, peer, 1, offsetof (ompi_osc_rdma_state_t, local_lock),
|
||||
OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
|
||||
if (OMPI_SUCCESS == ret) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ompi_osc_rdma_progress (module);
|
||||
} while (1);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
|
||||
ompi_osc_rdma_sync_t *lock)
|
||||
{
|
||||
if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) {
|
||||
ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
|
||||
ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
|
||||
peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE;
|
||||
} else {
|
||||
ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock));
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Lock {target} in the window.
 *
 * Allocates a synchronization object describing the lock, acquires the
 * lock with ompi_osc_rdma_lock_atomic_internal unless MPI_MODE_NOCHECK is set
 * in {assert}, and on success records the lock in the module and bumps the
 * passive target epoch counter.
 *
 * Returns OMPI_ERR_RMA_SYNC if the all_sync epoch is active and is either not
 * a lock epoch or the requested lock is exclusive, OMPI_ERR_OUT_OF_RESOURCE
 * if no synchronization object can be allocated, otherwise the result of the
 * acquisition (OMPI_SUCCESS when it is skipped). */
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *sync;
    int rc = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "osc rdma: lock %d %d", target, lock_type));

    if (module->all_sync.epoch_active) {
        /* impossible to get an exclusive lock while holding a global shared lock or in a active
         * target access epoch */
        if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type) {
            return OMPI_ERR_RMA_SYNC;
        }
    }

    /* create lock item */
    sync = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == sync)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    sync->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    sync->sync.lock.target = target;
    sync->sync.lock.type = lock_type;
    sync->sync.lock.assert = assert;

    /* the sync object keeps a reference to its single peer */
    sync->peer_list.peer = peer;
    sync->num_peers = 1;
    OBJ_RETAIN(peer);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        rc = ompi_osc_rdma_lock_atomic_internal (module, peer, sync);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        OBJ_RELEASE(sync);
        return rc;
    }

    ++module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, sync));

    return rc;
}
|
||||
|
||||
|
||||
/* Unlock {target} in the window.
 *
 * Looks up and removes the lock recorded for the target, completes its
 * outstanding rdma operations, releases the lock unless it was taken with
 * MPI_MODE_NOCHECK, drops the peer reference and returns the synchronization
 * object. Returns OMPI_ERR_RMA_SYNC if the target is not locked. */
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *target_peer;
    ompi_osc_rdma_sync_t *sync;
    int rc = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    sync = ompi_osc_rdma_module_lock_find (module, target, &target_peer);
    if (OPAL_UNLIKELY(NULL == sync)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    ompi_osc_rdma_module_lock_remove (module, sync);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (sync);

    /* nothing was acquired when the lock was taken with MPI_MODE_NOCHECK */
    if (0 == (sync->sync.lock.assert & MPI_MODE_NOCHECK)) {
        rc = ompi_osc_rdma_unlock_atomic_internal (module, target_peer, sync);
    }

    /* release our reference to this peer */
    OBJ_RELEASE(target_peer);

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_unlock: unlock of %d complete", target));

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    /* delete the lock */
    ompi_osc_rdma_sync_return (sync);

    return rc;
}
|
||||
|
||||
int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_sync_t *lock;
|
||||
int ret = OMPI_SUCCESS;
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
/* Check if no_locks is set. TODO: we also need to track whether we are in an
|
||||
* active target epoch. Fence can make this tricky to track. */
|
||||
if (module->all_sync.epoch_active) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "osc/rdma: attempted "
|
||||
"to lock all when active target epoch is %s and lock all epoch is %s",
|
||||
(OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type && module->all_sync.epoch_active) ?
|
||||
"active" : "inactive",
|
||||
(OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) ? "active" : "inactive"));
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* set up lock */
|
||||
lock = &module->all_sync;
|
||||
|
||||
lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
|
||||
lock->sync.lock.target = -1;
|
||||
lock->sync.lock.type = MPI_LOCK_SHARED;
|
||||
lock->sync.lock.assert = assert;
|
||||
lock->num_peers = ompi_comm_size (module->comm);
|
||||
|
||||
lock->epoch_active = true;
|
||||
/* NTH: TODO -- like fence it might be a good idea to create an array to access all peers
|
||||
* without having to access the hash table. Such a change would likely increase performance
|
||||
* at the expense of memory usage. Ex. if a window has 1M peers then 8MB per process would
|
||||
* be needed for this array. */
|
||||
|
||||
if (0 != (assert & MPI_MODE_NOCHECK)) {
|
||||
/* increment the global shared lock */
|
||||
ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
|
||||
offsetof(ompi_osc_rdma_state_t, global_lock),
|
||||
0x00000000ffffffffUL);
|
||||
}
|
||||
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS != ret)) {
|
||||
lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
|
||||
lock->num_peers = 0;
|
||||
lock->epoch_active = false;
|
||||
} else {
|
||||
++module->passive_target_access_epoch;
|
||||
}
|
||||
|
||||
opal_atomic_wmb ();
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
ompi_osc_rdma_sync_t *lock;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_unlock_all entering..."));
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
lock = &module->all_sync;
|
||||
if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type) {
|
||||
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_unlock_all: not locked in window %s",
|
||||
win->w_name));
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
/* finish all outstanding fragments */
|
||||
ompi_osc_rdma_sync_rdma_complete (lock);
|
||||
|
||||
if (0 != (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
|
||||
/* decrement the master lock shared count */
|
||||
(void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL, offsetof (ompi_osc_rdma_state_t, global_lock));
|
||||
}
|
||||
|
||||
lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
|
||||
lock->num_peers = 0;
|
||||
lock->epoch_active = false;
|
||||
|
||||
--module->passive_target_access_epoch;
|
||||
|
||||
opal_atomic_wmb ();
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_unlock_all complete"));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
131
ompi/mca/osc/rdma/osc_rdma_passive_target.h
Обычный файл
131
ompi/mca/osc/rdma/osc_rdma_passive_target.h
Обычный файл
@ -0,0 +1,131 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(OSC_RDMA_PASSIVE_TARGET_H)
|
||||
#define OSC_RDMA_PASSIVE_TARGET_H
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_sync.h"
|
||||
#include "osc_rdma_lock.h"
|
||||
|
||||
/**
|
||||
* @brief lock the target in the window using network/cpu atomics
|
||||
*
|
||||
* @param[in] lock_type mpi lock type (MPI_LOCK_SHARED, MPI_LOCK_EXCLUSIVE)
|
||||
* @param[in] target target process
|
||||
* @param[in] assert asserts
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if there is a conflicting RMA epoch
|
||||
*/
|
||||
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief unlock the target in the window using network/cpu atomics
|
||||
*
|
||||
* @param[in] target target process
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if the target is not locked
|
||||
*/
|
||||
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief lock all targets in window using network/cpu atomics
|
||||
*
|
||||
* @param[in] assert asserts
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if there is a conflicting RMA epoch
|
||||
*/
|
||||
int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief unlock all targets in window using network/cpu atomics
|
||||
*
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if the window is not in a lock all access epoch
|
||||
*/
|
||||
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief synchronize the public and private copies of the window
|
||||
*
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
*
|
||||
* Just acts as a memory barrier since this module only supports a unified memory
|
||||
* model.
|
||||
*/
|
||||
int ompi_osc_rdma_sync (struct ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief flush rdma transactions to a target
|
||||
*
|
||||
* @param[in] target target process
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if the target is not locked
|
||||
*/
|
||||
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief flush rdma transactions to all target(s)
|
||||
*
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if no processes are locked
|
||||
*
|
||||
* osc/rdma does not make a distinction between local and remote rma
|
||||
* completion. this could change in a future release as small messages
|
||||
* may be internally buffered.
|
||||
*/
|
||||
int ompi_osc_rdma_flush_all (struct ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief flush rdma transactions to a target (local completion)
|
||||
*
|
||||
* @param[in] target target process
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if the target is not locked
|
||||
*
|
||||
* osc/rdma does not make a distinction between local and remote rma
|
||||
* completion. this could change in a future release as small messages
|
||||
* may be internally buffered.
|
||||
*/
|
||||
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win);
|
||||
|
||||
/**
|
||||
* @brief flush rdma transactions to all target(s) (local completion)
|
||||
*
|
||||
* @param[in] win mpi window
|
||||
*
|
||||
* @returns OMPI_SUCCESS on success
|
||||
* @returns OMPI_ERR_RMA_SYNC if no processes are locked
|
||||
*
|
||||
* osc/rdma does not make a distinction between local and remote rma
|
||||
* completion. this could change in a future release as small messages
|
||||
* may be internally buffered.
|
||||
*/
|
||||
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win);
|
||||
|
||||
#endif
|
323
ompi/mca/osc/rdma/osc_rdma_peer.c
Обычный файл
323
ompi/mca/osc/rdma/osc_rdma_peer.c
Обычный файл
@ -0,0 +1,323 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "osc_rdma_comm.h"
|
||||
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
|
||||
/* Multiply {node_id} by the communicator size divided by the module's node
 * count, rounded up.
 * NOTE(review): presumably this yields the rank of the first process on the
 * node, assuming ranks are grouped into equally sized per-node blocks -- confirm
 * against the code that assigns node ids. */
#define NODE_ID_TO_RANK(module, node_id) ((node_id) * ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count))
|
||||
|
||||
/**
|
||||
* @brief find the btl endpoint for a process
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer_id process rank in the module communicator
|
||||
*
|
||||
* @returns NULL on error
|
||||
* @returns btl endpoint on success
|
||||
*/
|
||||
struct mca_btl_base_endpoint_t *ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, int peer_id)
|
||||
{
|
||||
ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id);
|
||||
mca_bml_base_endpoint_t *bml_endpoint;
|
||||
int num_btls;
|
||||
|
||||
/* for not just use the bml to get the btl endpoint */
|
||||
bml_endpoint = mca_bml_base_get_endpoint (proc);
|
||||
|
||||
num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
|
||||
|
||||
for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) {
|
||||
if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btl) {
|
||||
return bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint;
|
||||
}
|
||||
}
|
||||
|
||||
/* very unlikely. if this happened the btl section process is broken */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
 * @brief allocate a peer object and fill in its endpoint and rank
 *
 * @param[in]  module        osc rdma module
 * @param[in]  peer_id       peer's rank in the module communicator
 * @param[out] peer_out      new peer object (set to NULL on failure)
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_UNREACH if no btl endpoint was found for the peer
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the peer object could not be allocated
 *
 * The class of the allocated object depends on the window: dynamic windows get a
 * dynamic peer, windows with same_size and same_disp_unit get a basic peer, and
 * all other windows get an extended peer.
 */
int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) {
    struct mca_btl_base_endpoint_t *endpoint;
    ompi_osc_rdma_peer_t *peer;

    *peer_out = NULL;

    endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_id);
    if (OPAL_UNLIKELY(NULL == endpoint)) {
        return OMPI_ERR_UNREACH;
    }

    if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_dynamic_t);
    } else if (module->same_size && module->same_disp_unit) {
        /* use a smaller peer object when same_size and same_disp_unit are set */
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_basic_t);
    } else {
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_extended_t);
    }

    /* the allocation can fail. do not touch the object in that case */
    if (OPAL_UNLIKELY(NULL == peer)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    peer->data_endpoint = endpoint;
    peer->rank = peer_id;

    *peer_out = peer;

    return OMPI_SUCCESS;
}
|
||||
|
||||
/**
|
||||
* @brief finish initializing a peer object
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer peer object to set up
|
||||
*
|
||||
* This function reads the registration handle and state pointer from the peer that holds that data. If necessary
|
||||
* it will then ready information about the peer from its state data structure. This information includes the
|
||||
* displacement unit, base pointer, window size, and registation handle (if applicable).
|
||||
*/
|
||||
static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
|
||||
{
|
||||
ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
|
||||
uint64_t peer_data_size;
|
||||
uint64_t peer_data_offset, array_pointer;
|
||||
struct mca_btl_base_endpoint_t *array_endpoint;
|
||||
ompi_osc_rdma_region_t *array_peer_data, *node_peer_data;
|
||||
ompi_osc_rdma_rank_data_t rank_data;
|
||||
int registration_handle_size = 0;
|
||||
int node_id, node_rank, array_index;
|
||||
int ret, disp_unit;
|
||||
char *peer_data;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "configuring peer for rank %d", peer->rank));
|
||||
|
||||
if (module->selected_btl->btl_register_mem) {
|
||||
registration_handle_size = module->selected_btl->btl_registration_handle_size;
|
||||
}
|
||||
|
||||
/* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
|
||||
* calculates the node and offset the mapping can be found. once the mapping has been read the state
|
||||
* part of the peer structure can be initialized. */
|
||||
node_id = (peer->rank * module->node_count) / ompi_comm_size (module->comm);
|
||||
node_rank = NODE_ID_TO_RANK(module, node_id);
|
||||
array_index = peer->rank - node_rank;
|
||||
array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
|
||||
array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
|
||||
|
||||
/* lookup the btl endpoint needed to retrieve the mapping */
|
||||
array_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, node_rank);
|
||||
if (OPAL_UNLIKELY(NULL == array_endpoint)) {
|
||||
return OMPI_ERR_UNREACH;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "reading rank data from array rank: %d pointer: 0x%"
|
||||
PRIx64 ", size: %lu", node_rank, array_pointer, sizeof (rank_data)));
|
||||
|
||||
ret = ompi_osc_get_data_blocking (module, array_endpoint, array_pointer, (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
|
||||
&rank_data, sizeof (rank_data));
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on
|
||||
* every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
|
||||
* of this by re-using the endpoint and pointer stored in the node_comm_info array. */
|
||||
node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
|
||||
|
||||
peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
|
||||
|
||||
if (registration_handle_size) {
|
||||
peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
|
||||
}
|
||||
|
||||
peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, rank_data.node_id));
|
||||
if (OPAL_UNLIKELY(NULL == peer->state_endpoint)) {
|
||||
return OPAL_ERR_UNREACH;
|
||||
}
|
||||
|
||||
/* nothing more to do for dynamic memory windows */
|
||||
if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* read window data from the target rank */
|
||||
if (module->same_disp_unit) {
|
||||
/* do not bother reading the displacement unit as it is already known */
|
||||
peer_data_offset = offsetof (ompi_osc_rdma_state_t, regions);
|
||||
} else {
|
||||
peer_data_offset = offsetof (ompi_osc_rdma_state_t, disp_unit);
|
||||
}
|
||||
|
||||
peer_data_size = module->state_size - peer_data_offset;
|
||||
peer_data = alloca (peer_data_size);
|
||||
|
||||
/* read window data from the end of the target's state structure */
|
||||
ret = ompi_osc_get_data_blocking (module, peer->state_endpoint, peer->state + peer_data_offset, peer->state_handle,
|
||||
peer_data, peer_data_size);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!module->same_disp_unit) {
|
||||
/* unpack displacement */
|
||||
memcpy (&ex_peer->disp_unit, peer_data, sizeof (ex_peer->disp_unit));
|
||||
peer_data += offsetof (ompi_osc_rdma_state_t, regions) - offsetof (ompi_osc_rdma_state_t, disp_unit);
|
||||
disp_unit = ex_peer->disp_unit;
|
||||
} else {
|
||||
disp_unit = module->disp_unit;
|
||||
}
|
||||
|
||||
ompi_osc_rdma_region_t *base_region = (ompi_osc_rdma_region_t *) peer_data;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "peer %d: remote base region: 0x%" PRIx64
|
||||
", size: %" PRId64 ", flags: 0x%x, disp_unit: %d", peer->rank, base_region->base, base_region->len,
|
||||
peer->flags, disp_unit));
|
||||
|
||||
if (ompi_osc_rdma_peer_local_base (peer)) {
|
||||
/* for now we store the local address in the standard place. do no overwrite it */
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ex_peer->super.base = base_region->base;
|
||||
|
||||
/* save size and base */
|
||||
if (!module->same_size) {
|
||||
ex_peer->size = base_region->len;
|
||||
}
|
||||
|
||||
if (base_region->len) {
|
||||
if (registration_handle_size) {
|
||||
ex_peer->super.base_handle = malloc (registration_handle_size);
|
||||
if (OPAL_UNLIKELY(NULL == ex_peer->super.base_handle)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
peer->flags |= OMPI_OSC_RDMA_PEER_BASE_FREE;
|
||||
|
||||
memcpy (ex_peer->super.base_handle, base_region->btl_handle_data, registration_handle_size);
|
||||
}
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief lookup (or allocate) a peer for a rank (internal)
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer_id rank of remote peer (in module communicator)
|
||||
*
|
||||
* @returns peer object on success
|
||||
* @returns NULL on error
|
||||
*
|
||||
* This is an internal function for looking up or allocating a peer object for a window rank. This
|
||||
* function requires the peer lock to be held and is only expected to be called from itself or
|
||||
* the ompi_osc_rdma_peer_lookup() helper function.
|
||||
*/
|
||||
static struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup_internal (struct ompi_osc_rdma_module_t *module, int peer_id)
|
||||
{
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
int ret;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "looking up peer data for rank %d", peer_id));
|
||||
|
||||
peer = ompi_osc_module_get_peer (module, peer_id);
|
||||
if (NULL != peer) {
|
||||
return peer;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_new_peer (module, peer_id, &peer);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret = ompi_osc_rdma_peer_setup (module, peer);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OBJ_RELEASE(peer);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ret = ompi_osc_module_add_peer (module, peer);
|
||||
if (OPAL_SUCCESS != ret) {
|
||||
/* out of memory */
|
||||
OBJ_RELEASE(peer);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* ensure the peer hash is updated before we drop the lock */
|
||||
opal_atomic_wmb ();
|
||||
|
||||
return peer;
|
||||
}
|
||||
|
||||
struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup (struct ompi_osc_rdma_module_t *module, int peer_id)
|
||||
{
|
||||
struct ompi_osc_rdma_peer_t *peer;
|
||||
|
||||
opal_mutex_lock (&module->peer_lock);
|
||||
peer = ompi_osc_rdma_peer_lookup_internal (module, peer_id);
|
||||
opal_mutex_unlock (&module->peer_lock);
|
||||
|
||||
return peer;
|
||||
}
|
||||
|
||||
|
||||
/******* peer objects *******/
|
||||
|
||||
/* constructor: zero every member that follows the parent (super) object */
static void ompi_osc_rdma_peer_construct (ompi_osc_rdma_peer_t *peer)
{
    const size_t parent_size = sizeof (peer->super);

    memset ((char *) peer + parent_size, 0, sizeof (*peer) - parent_size);
}
|
||||
|
||||
/* destructor: free the state handle, but only when the peer is flagged as needing it */
static void ompi_osc_rdma_peer_destruct (ompi_osc_rdma_peer_t *peer)
{
    if (NULL == peer->state_handle) {
        return;
    }

    if (0 == (peer->flags & OMPI_OSC_RDMA_PEER_STATE_FREE)) {
        return;
    }

    free (peer->state_handle);
}
|
||||
|
||||
/* class instance: base peer object, derived from opal_object_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t, opal_object_t, ompi_osc_rdma_peer_construct, ompi_osc_rdma_peer_destruct);
|
||||
|
||||
/* constructor: zero the members the basic peer adds after its parent (super) object */
static void ompi_osc_rdma_peer_basic_construct (ompi_osc_rdma_peer_basic_t *peer)
{
    const size_t parent_size = sizeof (peer->super);

    memset ((char *) peer + parent_size, 0, sizeof (*peer) - parent_size);
}
|
||||
|
||||
/* destructor: free the base handle, but only when the peer is flagged as needing it */
static void ompi_osc_rdma_peer_basic_destruct (ompi_osc_rdma_peer_basic_t *peer)
{
    if (NULL == peer->base_handle) {
        return;
    }

    if (0 == (peer->super.flags & OMPI_OSC_RDMA_PEER_BASE_FREE)) {
        return;
    }

    free (peer->base_handle);
}
|
||||
|
||||
/* class instance: basic peer object, derived from ompi_osc_rdma_peer_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_basic_t, ompi_osc_rdma_peer_t, ompi_osc_rdma_peer_basic_construct,
                   ompi_osc_rdma_peer_basic_destruct);
|
||||
|
||||
/* class instance: extended peer object, derived from the basic peer. it has no
 * constructor or destructor of its own */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_extended_t, ompi_osc_rdma_peer_basic_t, NULL, NULL);
|
||||
|
||||
/* constructor: zero the members the dynamic peer adds after its parent (super) object */
static void ompi_osc_rdma_peer_dynamic_construct (ompi_osc_rdma_peer_dynamic_t *peer)
{
    const size_t parent_size = sizeof (peer->super);

    memset ((char *) peer + parent_size, 0, sizeof (*peer) - parent_size);
}
|
||||
|
||||
/* destructor: release the cached region array */
static void ompi_osc_rdma_peer_dynamic_destruct (ompi_osc_rdma_peer_dynamic_t *peer)
{
    /* free (NULL) is a no-op so no guard is needed */
    free (peer->regions);
}
|
||||
|
||||
/* class instance: dynamic-window peer object, derived from ompi_osc_rdma_peer_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_dynamic_t, ompi_osc_rdma_peer_t, ompi_osc_rdma_peer_dynamic_construct,
                   ompi_osc_rdma_peer_dynamic_destruct);
|
222
ompi/mca/osc/rdma/osc_rdma_peer.h
Обычный файл
222
ompi/mca/osc/rdma/osc_rdma_peer.h
Обычный файл
@ -0,0 +1,222 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_OSC_RDMA_PEER_H
|
||||
#define OMPI_OSC_RDMA_PEER_H
|
||||
|
||||
#include "osc_rdma_types.h"
|
||||
|
||||
struct ompi_osc_rdma_module_t;
|
||||
|
||||
/**
|
||||
* @brief osc rdma peer object
|
||||
*
|
||||
* This object is used as a cache for information associated with a peer.
|
||||
*/
|
||||
struct ompi_osc_rdma_peer_t {
|
||||
opal_object_t super;
|
||||
|
||||
/** rdma data endpoint for this peer */
|
||||
struct mca_btl_base_endpoint_t *data_endpoint;
|
||||
|
||||
/** endpoint for reading/modifying peer state */
|
||||
struct mca_btl_base_endpoint_t *state_endpoint;
|
||||
|
||||
/** remote peer's state pointer */
|
||||
osc_rdma_base_t state;
|
||||
|
||||
/** registration handle associated with the state */
|
||||
mca_btl_base_registration_handle_t *state_handle;
|
||||
|
||||
/** rank of this peer in the window */
|
||||
int rank;
|
||||
|
||||
/** peer flags */
|
||||
int flags;
|
||||
|
||||
/** aggregation support */
|
||||
ompi_osc_rdma_aggregation_t *aggregate;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t;
|
||||
|
||||
/**
|
||||
* @brief peer object used when using dynamic windows
|
||||
*/
|
||||
struct ompi_osc_rdma_peer_dynamic_t {
|
||||
ompi_osc_rdma_peer_t super;
|
||||
|
||||
/** last region id seen for this peer */
|
||||
uint32_t region_id;
|
||||
|
||||
/** number of regions in the regions array */
|
||||
uint32_t region_count;
|
||||
|
||||
/** cached array of attached regions for this peer */
|
||||
struct ompi_osc_rdma_region_t *regions;
|
||||
};
|
||||
|
||||
typedef struct ompi_osc_rdma_peer_dynamic_t ompi_osc_rdma_peer_dynamic_t;
|
||||
|
||||
/**
|
||||
* @brief basic peer object for non-dynamic windows used when all peers
|
||||
* have the same displacement unit and size
|
||||
*/
|
||||
struct ompi_osc_rdma_peer_basic_t {
|
||||
ompi_osc_rdma_peer_t super;
|
||||
|
||||
/** remote peer's base pointer */
|
||||
osc_rdma_base_t base;
|
||||
|
||||
/** registration handle associated with the base */
|
||||
mca_btl_base_registration_handle_t *base_handle;
|
||||
};
|
||||
|
||||
typedef struct ompi_osc_rdma_peer_basic_t ompi_osc_rdma_peer_basic_t;
|
||||
|
||||
/**
|
||||
* @brief peer object used when no assumption can be made about the
|
||||
* peer's displacement unit or size
|
||||
*/
|
||||
struct ompi_osc_rdma_peer_extended_t {
|
||||
ompi_osc_rdma_peer_basic_t super;
|
||||
|
||||
/** remote peer's region size */
|
||||
osc_rdma_size_t size;
|
||||
|
||||
/** displacement unit */
|
||||
int disp_unit;
|
||||
};
|
||||
|
||||
typedef struct ompi_osc_rdma_peer_extended_t ompi_osc_rdma_peer_extended_t;
|
||||
|
||||
/**
|
||||
* @brief object class declarations
|
||||
*/
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_t);
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_dynamic_t);
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_basic_t);
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_peer_extended_t);
|
||||
|
||||
/**
 * @brief identifies the node and the local rank of a peer
 */
typedef struct ompi_osc_rdma_rank_data_t {
    /** index of the node in the node_comm_info array */
    unsigned int node_id;
    /** local rank of the process */
    unsigned int rank;
} ompi_osc_rdma_rank_data_t;
|
||||
|
||||
/* peer flag bits stored in the flags member of ompi_osc_rdma_peer_t */
enum {
    /** peer is locked for exclusive access */
    OMPI_OSC_RDMA_PEER_EXCLUSIVE           = (1 << 0),
    /** peer's base is accessible with direct loads/stores */
    OMPI_OSC_RDMA_PEER_LOCAL_BASE          = (1 << 1),
    /** peer state is local */
    OMPI_OSC_RDMA_PEER_LOCAL_STATE         = (1 << 2),
    /** currently accumulating on peer */
    OMPI_OSC_RDMA_PEER_ACCUMULATING        = (1 << 3),
    /** peer is in an active access epoch (pscw) */
    OMPI_OSC_RDMA_PEER_ACCESS_ACTIVE_EPOCH = (1 << 4),
    /** peer state handle should be freed */
    OMPI_OSC_RDMA_PEER_STATE_FREE          = (1 << 5),
    /** peer base handle should be freed */
    OMPI_OSC_RDMA_PEER_BASE_FREE           = (1 << 6),
};
|
||||
|
||||
/**
|
||||
* @brief allocate a peer object and initialize some of its structures
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer_id peer's rank in the communicator
|
||||
* @param[out] peer_out new peer object
|
||||
*
|
||||
* The type of the object returned depends on the window settings. For example for a dynamic window
|
||||
* this will return a peer of type \ref ompi_osc_rdma_peer_dynamic_t.
|
||||
*/
|
||||
int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out);
|
||||
|
||||
/**
|
||||
* @brief lookup (or allocate) a peer
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer_id peer's rank in the communicator
|
||||
*
|
||||
* This function is used by the ompi_osc_rdma_module_peer() inline function to allocate a peer object. It is not
|
||||
* intended to be called from anywhere else.
|
||||
*/
|
||||
struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup (struct ompi_osc_rdma_module_t *module, int peer_id);
|
||||
|
||||
/**
|
||||
* @brief flush queued aggregated operation
|
||||
*
|
||||
* @param[in] peer osc rdma peer
|
||||
*/
|
||||
int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer);
|
||||
|
||||
/**
|
||||
* @brief lookup the btl endpoint for a peer
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
* @param[in] peer_id peer's rank in the communicator
|
||||
*
|
||||
* @returns btl endpoint for the peer on success
|
||||
* @returns NULL on failure
|
||||
*/
|
||||
struct mca_btl_base_endpoint_t *ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, int peer_id);
|
||||
|
||||
/**
|
||||
* @brief check if this process holds an exclusive lock on a peer
|
||||
*
|
||||
* @param[in] peer peer object to check
|
||||
*/
|
||||
static inline bool ompi_osc_rdma_peer_is_exclusive (ompi_osc_rdma_peer_t *peer)
|
||||
{
|
||||
return !!(peer->flags & OMPI_OSC_RDMA_PEER_EXCLUSIVE);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief check if this process is currently accumulating on a peer
|
||||
*
|
||||
* @param[in] peer peer object to check
|
||||
*/
|
||||
static inline bool ompi_osc_rdma_peer_is_accumulating (ompi_osc_rdma_peer_t *peer)
|
||||
{
|
||||
return !!(peer->flags & OMPI_OSC_RDMA_PEER_ACCUMULATING);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief check if the peer's base pointer is local to this process
|
||||
*
|
||||
* @param[in] peer peer object to check
|
||||
*/
|
||||
static inline bool ompi_osc_rdma_peer_local_base (ompi_osc_rdma_peer_t *peer)
|
||||
{
|
||||
return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_BASE);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief check if the peer's state pointer is local to this process
|
||||
*
|
||||
* @param[in] peer peer object to check
|
||||
*
|
||||
* The OMPI_OSC_RDMA_PEER_LOCAL_STATE flag will only be set if either 1) we
|
||||
* will not be mixing btl atomics and cpu atomics, or 2) it is safe to mix
|
||||
* btl and cpu atomics.
|
||||
*/
|
||||
static inline bool ompi_osc_rdma_peer_local_state (ompi_osc_rdma_peer_t *peer)
|
||||
{
|
||||
return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_STATE);
|
||||
}
|
||||
|
||||
|
||||
#endif /* OMPI_OSC_RDMA_PEER_H */
|
74
ompi/mca/osc/rdma/osc_rdma_request.c
Обычный файл
74
ompi/mca/osc/rdma/osc_rdma_request.c
Обычный файл
@ -0,0 +1,74 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/osc/osc.h"
|
||||
#include "ompi/mca/osc/base/base.h"
|
||||
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_request.h"
|
||||
|
||||
static int request_cancel(struct ompi_request_t *request, int complete)
|
||||
{
|
||||
return MPI_ERR_REQUEST;
|
||||
}
|
||||
|
||||
static int request_free(struct ompi_request_t **ompi_req)
|
||||
{
|
||||
ompi_osc_rdma_request_t *request =
|
||||
(ompi_osc_rdma_request_t*) *ompi_req;
|
||||
|
||||
if (true != request->super.req_complete) {
|
||||
return MPI_ERR_REQUEST;
|
||||
}
|
||||
|
||||
OMPI_OSC_RDMA_REQUEST_RETURN(request);
|
||||
|
||||
*ompi_req = MPI_REQUEST_NULL;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int request_complete (struct ompi_request_t *request)
|
||||
{
|
||||
ompi_osc_rdma_request_t *parent_request = ((ompi_osc_rdma_request_t *) request)->parent_request;
|
||||
|
||||
if (parent_request && 0 == OPAL_THREAD_ADD32 (&parent_request->outstanding_requests, -1)) {
|
||||
ompi_osc_rdma_request_complete (parent_request, OMPI_SUCCESS);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* constructor: install the request callbacks and construct the embedded convertor */
static void request_construct(ompi_osc_rdma_request_t *request)
{
    ompi_request_t *base = &request->super;

    base->req_type = OMPI_REQUEST_WIN;
    base->req_status._cancelled = 0;

    /* callbacks */
    base->req_free = request_free;
    base->req_cancel = request_cancel;
    base->req_complete_cb = request_complete;

    request->parent_request = NULL;
    OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
}
|
||||
|
||||
/* destructor: tear down the convertor set up by request_construct() */
static void request_destruct(ompi_osc_rdma_request_t *request)
{
    OBJ_DESTRUCT(&request->convertor);
}
|
||||
|
||||
/* class instance: osc rdma request, derived from ompi_request_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t, ompi_request_t, request_construct, request_destruct);
|
109
ompi/mca/osc/rdma/osc_rdma_request.h
Обычный файл
109
ompi/mca/osc/rdma/osc_rdma_request.h
Обычный файл
@ -0,0 +1,109 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_OSC_RDMA_REQUEST_H
|
||||
#define OMPI_OSC_RDMA_REQUEST_H
|
||||
|
||||
#include "osc_rdma.h"
|
||||
|
||||
/* kinds of osc rdma requests */
typedef enum ompi_osc_rdma_request_type_t {
    OMPI_OSC_RDMA_TYPE_GET,
    OMPI_OSC_RDMA_TYPE_PUT,
    OMPI_OSC_RDMA_TYPE_RDMA,
    OMPI_OSC_RDMA_TYPE_ACC,
    OMPI_OSC_RDMA_TYPE_GET_ACC,
    OMPI_OSC_RDMA_TYPE_CSWAP,
} ompi_osc_rdma_request_type_t;
|
||||
|
||||
struct ompi_osc_rdma_request_t {
|
||||
ompi_request_t super;
|
||||
|
||||
ompi_osc_rdma_peer_t *peer;
|
||||
|
||||
ompi_osc_rdma_request_type_t type;
|
||||
void *origin_addr;
|
||||
int origin_count;
|
||||
struct ompi_datatype_t *origin_dt;
|
||||
|
||||
void *result_addr;
|
||||
int result_count;
|
||||
struct ompi_datatype_t *result_dt;
|
||||
|
||||
const void *compare_addr;
|
||||
|
||||
ompi_op_t *op;
|
||||
|
||||
ompi_osc_rdma_module_t *module;
|
||||
int32_t outstanding_requests;
|
||||
bool internal;
|
||||
|
||||
ptrdiff_t offset;
|
||||
size_t len;
|
||||
void *ctx;
|
||||
void *frag;
|
||||
|
||||
uint64_t target_address;
|
||||
|
||||
struct ompi_osc_rdma_request_t *parent_request;
|
||||
/* used for non-contiguous get accumulate operations */
|
||||
opal_convertor_t convertor;
|
||||
|
||||
/** synchronization object */
|
||||
struct ompi_osc_rdma_sync_t *sync;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
|
||||
|
||||
/* Allocate a request from the component free list and initialize it for the
 * module (rmodule) and peer (rpeer). The result is stored in req.
 *
 * REQUEST_ALLOC is only called from "top-level" functions (rdma_rput,
 * rdma_rget, etc.), so it's ok to spin here (progressing the module) until a
 * free list item becomes available.
 *
 * The window is taken from the rmodule argument. The macro does not rely on
 * the caller having a variable named "module" in scope. */
#define OMPI_OSC_RDMA_REQUEST_ALLOC(rmodule, rpeer, req)                \
    do {                                                                \
        opal_free_list_item_t *item;                                    \
        do {                                                            \
            item = opal_free_list_get (&mca_osc_rdma_component.requests); \
            if (NULL == item) {                                         \
                ompi_osc_rdma_progress (rmodule);                       \
            }                                                           \
        } while (NULL == item);                                         \
        (req) = (ompi_osc_rdma_request_t*) item;                        \
        OMPI_REQUEST_INIT(&(req)->super, false);                        \
        (req)->super.req_mpi_object.win = (rmodule)->win;               \
        (req)->super.req_complete = false;                              \
        (req)->super.req_state = OMPI_REQUEST_ACTIVE;                   \
        (req)->module = (rmodule);                                      \
        (req)->internal = false;                                        \
        (req)->outstanding_requests = 0;                                \
        (req)->parent_request = NULL;                                   \
        (req)->peer = (rpeer);                                          \
    } while (0)
|
||||
|
||||
/* finalize a request and hand it back to the component's request free list */
#define OMPI_OSC_RDMA_REQUEST_RETURN(req)                                   \
    do {                                                                    \
        OMPI_REQUEST_FINI(&(req)->super);                                   \
        opal_free_list_return (&mca_osc_rdma_component.requests,            \
                               (opal_free_list_item_t *) (req));            \
    } while (0)
|
||||
|
||||
/**
 * @brief complete an osc rdma request
 *
 * @param[in] request        request to complete
 * @param[in] mpi_error      value stored in the request status (ignored for internal requests)
 *
 * Internal requests are returned to the free list. All other requests get their
 * status error code set and are marked complete at the mpi level.
 */
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error)
{
    if (request->internal) {
        OMPI_OSC_RDMA_REQUEST_RETURN (request);
        return;
    }

    request->super.req_status.MPI_ERROR = mpi_error;

    /* mark the request complete at the mpi level */
    ompi_request_complete (&request->super, true);
}
|
||||
|
||||
#endif /* OMPI_OSC_RDMA_REQUEST_H */
|
83
ompi/mca/osc/rdma/osc_rdma_sync.c
Обычный файл
83
ompi/mca/osc/rdma/osc_rdma_sync.c
Обычный файл
@ -0,0 +1,83 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "osc_rdma.h"
|
||||
#include "osc_rdma_sync.h"
|
||||
|
||||
/* constructor: reset the epoch bookkeeping and construct the embedded list and mutex */
static void ompi_osc_rdma_sync_constructor (ompi_osc_rdma_sync_t *rdma_sync)
{
    /* plain members */
    rdma_sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    rdma_sync->epoch_active = false;
    rdma_sync->outstanding_rdma = 0;

    /* embedded objects */
    OBJ_CONSTRUCT(&rdma_sync->aggregations, opal_list_t);
    OBJ_CONSTRUCT(&rdma_sync->lock, opal_mutex_t);
}
|
||||
|
||||
/* destructor: tear down the embedded objects set up by the constructor */
static void ompi_osc_rdma_sync_destructor (ompi_osc_rdma_sync_t *rdma_sync)
{
    OBJ_DESTRUCT(&rdma_sync->aggregations);
    OBJ_DESTRUCT(&rdma_sync->lock);
}
|
||||
|
||||
/* class instance: synchronization object, derived from opal_object_t */
OBJ_CLASS_INSTANCE(ompi_osc_rdma_sync_t, opal_object_t,
                   ompi_osc_rdma_sync_constructor,
                   ompi_osc_rdma_sync_destructor);
|
||||
|
||||
ompi_osc_rdma_sync_t *ompi_osc_rdma_sync_allocate (struct ompi_osc_rdma_module_t *module)
|
||||
{
|
||||
ompi_osc_rdma_sync_t *rdma_sync;
|
||||
|
||||
rdma_sync = OBJ_NEW (ompi_osc_rdma_sync_t);
|
||||
if (OPAL_UNLIKELY(NULL == rdma_sync)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rdma_sync->module = module;
|
||||
return rdma_sync;
|
||||
}
|
||||
|
||||
/* drop a reference to a synchronization object */
void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync)
{
    OBJ_RELEASE(rdma_sync);
}
|
||||
|
||||
/* search an array of peers for the one with the given rank by repeatedly halving
 * the search window. NOTE(review): the halving search presumes peers is ordered
 * by ascending rank -- confirm against the code that builds the array.
 *
 * on a match *peer is set to the peer and true is returned. otherwise *peer is
 * set to NULL and false is returned. */
static inline bool ompi_osc_rdma_sync_array_peer (int rank, ompi_osc_rdma_peer_t **peers, size_t nranks,
                                                  struct ompi_osc_rdma_peer_t **peer)
{
    for (;;) {
        /* window is empty or holds a single non-matching peer */
        if (0 == nranks || (1 == nranks && peers[0]->rank != rank)) {
            *peer = NULL;
            return false;
        }

        /* first peer of the window matches */
        if (peers[0]->rank == rank) {
            *peer = peers[0];
            return true;
        }

        int mid = nranks / 2;

        if (peers[mid]->rank > rank) {
            /* continue in the lower half */
            nranks = mid;
        } else {
            /* continue in the upper half (including the middle element) */
            peers += mid;
            nranks -= mid;
        }
    }
}
|
||||
|
||||
/* look target up in the peer array of the module's all_sync object. when the sync
 * type is not PSCW the lookup is skipped, *peer is set to NULL and false is returned */
bool ompi_osc_rdma_sync_pscw_peer (ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer)
{
    ompi_osc_rdma_sync_t *all_sync = &module->all_sync;

    /* check synchronization type */
    if (OMPI_OSC_RDMA_SYNC_TYPE_PSCW == all_sync->type) {
        return ompi_osc_rdma_sync_array_peer (target, all_sync->peer_list.peers, all_sync->num_peers, peer);
    }

    *peer = NULL;
    return false;
}
|
158
ompi/mca/osc/rdma/osc_rdma_sync.h
Обычный файл
158
ompi/mca/osc/rdma/osc_rdma_sync.h
Обычный файл
@ -0,0 +1,158 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#if !defined(OSC_RDMA_SYNC_H)
|
||||
#define OSC_RDMA_SYNC_H
|
||||
|
||||
#include "osc_rdma_types.h"
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "opal/threads/threads.h"
|
||||
|
||||
/**
 * @brief synchronization types
 */
typedef enum ompi_osc_rdma_sync_type_t {
    /** default value */
    OMPI_OSC_RDMA_SYNC_TYPE_NONE,
    /** lock access epoch */
    OMPI_OSC_RDMA_SYNC_TYPE_LOCK,
    /** fence access epoch */
    OMPI_OSC_RDMA_SYNC_TYPE_FENCE,
    /** post-start-complete-wait access epoch */
    OMPI_OSC_RDMA_SYNC_TYPE_PSCW,
} ompi_osc_rdma_sync_type_t;
|
||||
|
||||
struct ompi_osc_rdma_module_t;
|
||||
|
||||
/**
|
||||
* @brief synchronization object
|
||||
*
|
||||
* This structure holds information about an access epoch.
|
||||
*/
|
||||
struct ompi_osc_rdma_sync_t {
|
||||
opal_object_t super;
|
||||
|
||||
/** osc rdma module */
|
||||
struct ompi_osc_rdma_module_t *module;
|
||||
|
||||
/** synchronization type */
|
||||
ompi_osc_rdma_sync_type_t type;
|
||||
|
||||
/** synchronization data */
|
||||
union {
|
||||
/** lock specific synchronization data */
|
||||
struct {
|
||||
/** lock target rank (-1 for all) */
|
||||
int target;
|
||||
|
||||
/** lock type: MPI_LOCK_SHARED, MPI_LOCK_EXCLUSIVE */
|
||||
int16_t type;
|
||||
|
||||
/** assert specified at lock acquire time. at this time Open MPI
|
||||
* only uses 5-bits for asserts. if this number goes over 16 this
|
||||
* will need to be changed to accomodate. */
|
||||
int16_t assert;
|
||||
} lock;
|
||||
|
||||
/** post/start/complete/wait specific synchronization data */
|
||||
struct {
|
||||
/** group passed to ompi_osc_rdma_start */
|
||||
ompi_group_t *group;
|
||||
} pscw;
|
||||
} sync;
|
||||
|
||||
/** array of peers for this sync */
|
||||
union {
|
||||
/** multiple peers (lock all, pscw, fence) */
|
||||
struct ompi_osc_rdma_peer_t **peers;
|
||||
/** single peer (targeted lock) */
|
||||
struct ompi_osc_rdma_peer_t *peer;
|
||||
} peer_list;
|
||||
|
||||
/** number of peers */
|
||||
int num_peers;
|
||||
|
||||
/** communication has started on this epoch */
|
||||
bool epoch_active;
|
||||
|
||||
/** outstanding rdma operations on epoch */
|
||||
osc_rdma_counter_t outstanding_rdma;
|
||||
|
||||
/** aggregated operations in this epoch */
|
||||
opal_list_t aggregations;
|
||||
|
||||
/** lock to protect sync structure members */
|
||||
opal_mutex_t lock;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_sync_t ompi_osc_rdma_sync_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_sync_t);
|
||||
|
||||
/**
|
||||
* @brief allocate a new synchronization object
|
||||
*
|
||||
* @param[in] module osc rdma module
|
||||
*
|
||||
* @returns NULL on failure
|
||||
* @returns a new synchronization object on success
|
||||
*/
|
||||
ompi_osc_rdma_sync_t *ompi_osc_rdma_sync_allocate (struct ompi_osc_rdma_module_t *module);
|
||||
|
||||
/**
 * @brief release a synchronization object
 *
 * @param[in] rdma_sync   synchronization object allocated by ompi_osc_rdma_sync_allocate()
 *
 * NOTE(review): whether a NULL argument is tolerated cannot be determined
 * from this declaration -- check the implementation before relying on it.
 */
|
||||
void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync);
|
||||
|
||||
/**
 * @brief check if the target is part of a PSCW access epoch
 *
 * @param[in]  module   osc rdma module
 * @param[in]  target   target rank
 * @param[out] peer     peer object (presumably only meaningful when the
 *                      function returns true -- TODO confirm)
 *
 * @returns false if the window is not in a PSCW access epoch or the peer is not
 *          in the group passed to MPI_Win_start
 * @returns true otherwise
 *
 * This function verifies the target is part of an active PSCW access epoch.
 */
|
||||
bool ompi_osc_rdma_sync_pscw_peer (struct ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer);
|
||||
|
||||
/**
|
||||
* @brief increment the outstanding rdma operation counter (atomic)
|
||||
*
|
||||
* @param[in] rdma_sync osc rdma synchronization object
|
||||
*/
|
||||
static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync)
|
||||
{
|
||||
ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, 1);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "INC: there are %ld outstanding RDMA operations",
|
||||
(unsigned long) rdma_sync->outstanding_rdma));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief decrement the outstanding rdma operation counter (atomic)
|
||||
*
|
||||
* @param[in] rdma_sync osc rdma synchronization object
|
||||
*/
|
||||
static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync)
|
||||
{
|
||||
ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, -1);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "DEC: there are %ld outstanding RDMA operations",
|
||||
(unsigned long) rdma_sync->outstanding_rdma));
|
||||
}
|
||||
|
||||
#endif /* OSC_RDMA_SYNC_H */
|
213
ompi/mca/osc/rdma/osc_rdma_types.h
Обычный файл
213
ompi/mca/osc/rdma/osc_rdma_types.h
Обычный файл
@ -0,0 +1,213 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OMPI_OSC_RDMA_TYPES_H
|
||||
#define OMPI_OSC_RDMA_TYPES_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
/* forward declarations of some other component types */
|
||||
struct ompi_osc_rdma_frag_t;
|
||||
struct ompi_osc_rdma_sync_t;
|
||||
struct ompi_osc_rdma_peer_t;
|
||||
|
||||
/* Width selection for the basic component types: when OPAL_HAVE_ATOMIC_MATH_64
 * is set the base, size and counter types are 64-bit and counter updates map to
 * opal_atomic_add_64; otherwise 32-bit types and opal_atomic_add_32 are used. */
#if OPAL_HAVE_ATOMIC_MATH_64
|
||||
|
||||
/** type used for region bases and target addresses */
typedef int64_t osc_rdma_base_t;
|
||||
/** type used for region lengths */
typedef int64_t osc_rdma_size_t;
|
||||
/** type used for counters updated with ompi_osc_rdma_counter_add */
typedef int64_t osc_rdma_counter_t;
|
||||
|
||||
/** add to an osc_rdma_counter_t (64-bit variant) */
#define ompi_osc_rdma_counter_add opal_atomic_add_64
|
||||
|
||||
#else
|
||||
|
||||
/** type used for region bases and target addresses */
typedef int32_t osc_rdma_base_t;
|
||||
/** type used for region lengths */
typedef int32_t osc_rdma_size_t;
|
||||
/** type used for counters updated with ompi_osc_rdma_counter_add */
typedef int32_t osc_rdma_counter_t;
|
||||
|
||||
/** add to an osc_rdma_counter_t (32-bit variant) */
#define ompi_osc_rdma_counter_add opal_atomic_add_32
|
||||
|
||||
#endif
|
||||
|
||||
#if OPAL_HAVE_ATOMIC_MATH_64
|
||||
|
||||
/** constant with only the most significant bit of a 64-bit word set.
 * NOTE(review): presumably the exclusive-lock flag kept in the top bit of an
 * ompi_osc_rdma_lock_t word -- confirm against the locking code */
#define OMPI_OSC_RDMA_LOCK_EXCLUSIVE 0x8000000000000000l
|
||||
|
||||
/** lock word type used when OPAL_HAVE_ATOMIC_MATH_64 is set */
typedef int64_t ompi_osc_rdma_lock_t;
|
||||
|
||||
/**
 * @brief add a value to a 64-bit lock word, with a memory barrier on each side
 *
 * @param[in,out] p       lock word to update
 * @param[in]     value   amount to add
 *
 * @returns the result of opal_atomic_add_64() minus value. NOTE(review):
 *          opal_atomic_add_64 presumably yields the post-add value, which
 *          would make this the value held before the update -- confirm.
 */
static inline int64_t ompi_osc_rdma_lock_add (volatile int64_t *p, int64_t value)
{
    int64_t updated;

    opal_atomic_mb ();
    updated = opal_atomic_add_64 (p, value);
    opal_atomic_mb ();

    /* strip the addend back out of the result of the add */
    return updated - value;
}
|
||||
|
||||
/**
 * @brief compare-and-set on a 64-bit lock word, with a memory barrier on each side
 *
 * @param[in,out] p       lock word to update
 * @param[in]     comp    value the lock word is compared against
 * @param[in]     value   value to store
 *
 * @returns the result of opal_atomic_cmpset_64() unchanged (presumably
 *          non-zero when the store took place -- confirm against the opal
 *          atomics)
 */
static inline int ompi_osc_rdma_lock_cmpset (volatile int64_t *p, int64_t comp, int64_t value)
{
    int swapped;

    opal_atomic_mb ();
    swapped = opal_atomic_cmpset_64 (p, comp, value);
    opal_atomic_mb ();

    return swapped;
}
|
||||
|
||||
#else
|
||||
|
||||
/** constant with only bit 31 set (the most significant bit of a 32-bit word).
 * NOTE(review): presumably the exclusive-lock flag kept in the top bit of an
 * ompi_osc_rdma_lock_t word -- confirm against the locking code */
#define OMPI_OSC_RDMA_LOCK_EXCLUSIVE 0x80000000l
|
||||
|
||||
/** lock word type used when OPAL_HAVE_ATOMIC_MATH_64 is not set */
typedef int32_t ompi_osc_rdma_lock_t;
|
||||
|
||||
/**
 * @brief add a value to a 32-bit lock word, with a memory barrier on each side
 *
 * @param[in,out] p       lock word to update
 * @param[in]     value   amount to add
 *
 * @returns the value the lock word held before the addition
 */
static inline int32_t ompi_osc_rdma_lock_add (volatile int32_t *p, int32_t value)
{
    int32_t updated;

    opal_atomic_mb ();
    /* unlike a conventional fetch-and-add, opal_atomic_add_32 returns the new value */
    updated = opal_atomic_add_32 (p, value);
    opal_atomic_mb ();

    /* remove the addend again to recover the prior value */
    return updated - value;
}
|
||||
|
||||
/**
 * @brief compare-and-set on a 32-bit lock word, with a memory barrier on each side
 *
 * @param[in,out] p       lock word to update
 * @param[in]     comp    value the lock word is compared against
 * @param[in]     value   value to store
 *
 * @returns the result of opal_atomic_cmpset_32() unchanged (presumably
 *          non-zero when the store took place -- confirm against the opal
 *          atomics)
 */
static inline int ompi_osc_rdma_lock_cmpset (volatile int32_t *p, int32_t comp, int32_t value)
{
    int swapped;

    opal_atomic_mb ();
    swapped = opal_atomic_cmpset_32 (p, comp, value);
    opal_atomic_mb ();

    return swapped;
}
|
||||
|
||||
#endif /* OPAL_HAVE_ATOMIC_MATH_64 */
|
||||
|
||||
/**
|
||||
* @brief structure describing a window memory region
|
||||
*/
|
||||
struct ompi_osc_rdma_region_t {
|
||||
/** base of the region */
|
||||
osc_rdma_base_t base;
|
||||
/** length (in bytes) of the region */
|
||||
osc_rdma_size_t len;
|
||||
/** BTL segment for the region (may be empty) */
|
||||
unsigned char btl_handle_data[];
|
||||
};
|
||||
typedef struct ompi_osc_rdma_region_t ompi_osc_rdma_region_t;
|
||||
|
||||
/**
|
||||
* @brief data handle for dynamic memory regions
|
||||
*
|
||||
* This structure holds the btl handle (if one exists) and the
|
||||
* reference count for a dynamically attached region. The reference
|
||||
* count is used to keep track of the number of times a memory
|
||||
* region associated with a page (or set of pages) has been attached.
|
||||
*/
|
||||
struct ompi_osc_rdma_handle_t {
|
||||
/** btl handle for the memory region */
|
||||
mca_btl_base_registration_handle_t *btl_handle;
|
||||
/** number of attaches assocated with this region */
|
||||
int refcnt;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_handle_t ompi_osc_rdma_handle_t;
|
||||
|
||||
/**
 * @brief number of state buffers that can be used for storing
 *        post messages.
 *
 * This value was chosen because post exposure epochs are expected to be
 * small relative to the size of the communicator. The value is constant
 * and not exposed as an MCA variable to keep the layout of the
 * \ref ompi_osc_rdma_state_t structure simple. It is the length of the
 * post_peers array in that structure.
 */
|
||||
#define OMPI_OSC_RDMA_POST_PEER_MAX 32
|
||||
|
||||
/**
|
||||
* @brief window state structure
|
||||
*
|
||||
* This structure holds the information relevant to the window state
|
||||
* of a peer. The structure synchronization data and includes useful
|
||||
* information that can be remotely read by other peers in the window.
|
||||
*/
|
||||
struct ompi_osc_rdma_state_t {
|
||||
/** used when rdma is in use to handle excusive locks and global shared locks (lock_all) */
|
||||
ompi_osc_rdma_lock_t global_lock;
|
||||
/** lock state for this node. the top bit indicates if a exclusive lock exists and the
|
||||
* remaining bits count the number of shared locks */
|
||||
ompi_osc_rdma_lock_t local_lock;
|
||||
/** lock for the accumulate state to ensure ordering and consistency */
|
||||
ompi_osc_rdma_lock_t accumulate_lock;
|
||||
/** persistent scratch space for fetch and op/cswap when the result is not needed */
|
||||
ompi_osc_rdma_lock_t scratch_lock;
|
||||
/** current index to post to. compare-and-swap must be used to ensure
|
||||
* the index is free */
|
||||
osc_rdma_counter_t post_index;
|
||||
/** post buffers */
|
||||
osc_rdma_counter_t post_peers[OMPI_OSC_RDMA_POST_PEER_MAX];
|
||||
/** counter for number of post messages received */
|
||||
osc_rdma_counter_t num_post_msgs;
|
||||
/** counter for number of complete messages received */
|
||||
osc_rdma_counter_t num_complete_msgs;
|
||||
/** lock for the region state to ensure consistency */
|
||||
ompi_osc_rdma_lock_t regions_lock;
|
||||
/** displacement unit for this process */
|
||||
int64_t disp_unit;
|
||||
/** number of attached regions. this count will be 1 in non-dynamic regions */
|
||||
osc_rdma_counter_t region_count;
|
||||
/** attached memory regions */
|
||||
unsigned char regions[];
|
||||
};
|
||||
typedef struct ompi_osc_rdma_state_t ompi_osc_rdma_state_t;
|
||||
|
||||
struct ompi_osc_rdma_aggregation_t {
|
||||
opal_list_item_t super;
|
||||
|
||||
/** associated peer */
|
||||
struct ompi_osc_rdma_peer_t *peer;
|
||||
|
||||
/** aggregation buffer frag */
|
||||
struct ompi_osc_rdma_frag_t *frag;
|
||||
|
||||
/** synchronization object */
|
||||
struct ompi_osc_rdma_sync_t *sync;
|
||||
|
||||
/** aggregation buffer */
|
||||
char *buffer;
|
||||
|
||||
/** target for the operation */
|
||||
osc_rdma_base_t target_address;
|
||||
|
||||
/** handle for target memory address */
|
||||
mca_btl_base_registration_handle_t *target_handle;
|
||||
|
||||
/** buffer size */
|
||||
size_t buffer_size;
|
||||
|
||||
/** buffer used */
|
||||
size_t buffer_used;
|
||||
|
||||
/** type */
|
||||
int type;
|
||||
|
||||
/** list of associated requests */
|
||||
opal_list_t requests;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_aggregation_t ompi_osc_rdma_aggregation_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_aggregation_t);
|
||||
|
||||
#endif /* OMPI_OSC_RDMA_TYPES_H */
|
@ -4,4 +4,4 @@
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner: LANL
|
||||
status: active?
|
||||
status: active
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user