1
1

osc/rdma: performance improvements and bug fixes

This commit is a large update to the osc/rdma component. Included in
this commit:

 - Add support for using hardware atomics for fetch-and-op and single
   count accumulate  when using the accumulate lock. This will improve
   the performance of these operations even when not setting the
   single intrinsic info key.

 - Rework how large accumulates are done. They now block on the get
   operation to fix some bugs discovered by an IBM one-sided test. I
   may roll back some of the changes if the underlying bug in the
   original design is discovered. There appears to be no real
   difference (on the hardware this was tested with) in performance, so
   it's probably a non-issue. References #2530.

 - Add support for an additional lock-all algorithm: on-demand. The
   on-demand algorithm will attempt to acquire the peer lock when
   starting an RMA operation. The lock algorithm default has not
   changed. The algorithm can be selected by setting the
   osc_rdma_locking_mode MCA variable. The valid values are two_level
   and on_demand.

 - Make use of the btl_flush function if available. This can improve
   performance with some btls.

 - When using btl_flush do not keep track of the number of put
   operations. This reduces the number of atomic operations in the
   critical path.

 - Make the window buffers more friendly to multi-threaded
   applications. This was done by dropping support for multiple
   buffers per MPI window. I intend to re-add that support once the
   underlying performance bug under the old buffering scheme is
   fixed.

 - Fix a bug in request completion in the accumulate, get, and put
   paths. This also helps with #2530.

 - General code cleanup and fixes.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2018-03-15 12:20:41 -06:00 коммит произвёл Nathan Hjelm
родитель 5f58e7b961
Коммит 7f4872d483
16 изменённых файлов: 1030 добавлений и 929 удалений

Просмотреть файл

@ -8,7 +8,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
@ -50,6 +50,11 @@
#include "opal_stdint.h"
enum {
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
};
/**
* @brief osc rdma component structure
*/
@ -87,6 +92,9 @@ struct ompi_osc_rdma_component_t {
/** Default value of the no_locks info key for new windows */
bool no_locks;
/** Locking mode to use as the default for all windows */
int locking_mode;
/** Accumulate operations will only operate on a single intrinsic datatype */
bool acc_single_intrinsic;
@ -119,6 +127,8 @@ struct ompi_osc_rdma_module_t {
/** Mutex lock protecting module data */
opal_mutex_t lock;
/** locking mode to use */
int locking_mode;
/* window configuration */
@ -147,10 +157,12 @@ struct ompi_osc_rdma_module_t {
/** Local displacement unit. */
int disp_unit;
/** global leader */
ompi_osc_rdma_peer_t *leader;
/** my peer structure */
ompi_osc_rdma_peer_t *my_peer;
/** pointer to free on cleanup (may be NULL) */
void *free_after;
@ -276,6 +288,16 @@ int ompi_osc_rdma_free (struct ompi_win_t *win);
*/
int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);
/**
* @brief demand lock a peer
*
* @param[in] module osc rdma module
* @param[in] peer peer to lock
*
* @returns OMPI_SUCCESS on success
*/
int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);
/**
* @brief check if a peer object is cached for a remote rank
*
@ -449,10 +471,18 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
}
return NULL;
case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
case OMPI_OSC_RDMA_SYNC_TYPE_LOCK:
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence/lock_all access epoch for target %d", target);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found lock_all access epoch for target %d", target);
*peer = ompi_osc_rdma_module_peer (module, target);
if (OPAL_UNLIKELY(OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode &&
!ompi_osc_rdma_peer_is_demand_locked (*peer))) {
ompi_osc_rdma_demand_lock_peer (module, *peer);
}
return &module->all_sync;
case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence access epoch for target %d", target);
/* fence epoch is now active */
module->all_sync.epoch_active = true;
*peer = ompi_osc_rdma_module_peer (module, target);
@ -470,6 +500,62 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
return NULL;
}
/**
 * @brief check whether btl_flush-based completion should be used
 *
 * @param[in] module  osc rdma module
 *
 * @returns true if the selected btl provides a flush function
 *
 * The btl_flush entry point only exists in BTL interface versions >= 3.1.0,
 * and even then a btl may leave it NULL, so both conditions are checked.
 * Declared inline to match the other static helpers in this header and to
 * avoid unused-function warnings in translation units that do not call it.
 */
static inline bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    /* !! normalizes the function-pointer check to a bool */
    return !!(module->selected_btl->btl_flush);
#else
    /* older btl interfaces predate btl_flush entirely */
    return false;
#endif
}
/**
* @brief increment the outstanding rdma operation counter (atomic)
*
* @param[in] rdma_sync osc rdma synchronization object
*/
static inline void ompi_osc_rdma_sync_rdma_inc_always (ompi_osc_rdma_sync_t *rdma_sync)
{
    /* atomically bump the outstanding-operation count on this sync object */
    ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, 1);
    /* NOTE(review): this re-read of the counter is not atomic with the add above,
     * so the logged value may be stale under contention; it is logging-only.
     * %lu (not %ld) matches the unsigned long cast of the argument. */
    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "inc: there are %lu outstanding rdma operations",
                     (unsigned long) rdma_sync->outstanding_rdma.counter);
}
/* increment the outstanding rdma operation counter unless completion is
 * tracked through btl_flush, in which case no per-operation count is kept */
static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    if (!ompi_osc_rdma_use_btl_flush (rdma_sync->module)) {
        ompi_osc_rdma_sync_rdma_inc_always (rdma_sync);
    }
#else
    /* no btl_flush support in this BTL interface version; always count */
    ompi_osc_rdma_sync_rdma_inc_always (rdma_sync);
#endif
}
/**
* @brief decrement the outstanding rdma operation counter (atomic)
*
* @param[in] rdma_sync osc rdma synchronization object
*/
static inline void ompi_osc_rdma_sync_rdma_dec_always (ompi_osc_rdma_sync_t *rdma_sync)
{
    /* write barrier: make the completed operation's data visible before the
     * counter decrement can release a waiter in sync_rdma_complete() */
    opal_atomic_wmb ();
    ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, -1);
    /* NOTE(review): logging-only re-read; may be stale under contention.
     * %lu (not %ld) matches the unsigned long cast of the argument. */
    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "dec: there are %lu outstanding rdma operations",
                     (unsigned long) rdma_sync->outstanding_rdma.counter);
}
/* decrement the outstanding rdma operation counter unless completion is
 * tracked through btl_flush, in which case no per-operation count is kept */
static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    if (!ompi_osc_rdma_use_btl_flush (rdma_sync->module)) {
        ompi_osc_rdma_sync_rdma_dec_always (rdma_sync);
    }
#else
    /* no btl_flush support in this BTL interface version; always count */
    ompi_osc_rdma_sync_rdma_dec_always (rdma_sync);
#endif
}
/**
* @brief complete all outstanding rdma operations to all peers
*
@ -477,18 +563,31 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
*/
static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
{
ompi_osc_rdma_aggregation_t *aggregation, *next;
if (opal_list_get_size (&sync->aggregations)) {
ompi_osc_rdma_aggregation_t *aggregation, *next;
OPAL_THREAD_SCOPED_LOCK(&sync->lock,
OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) {
fprintf (stderr, "Flushing aggregation %p, peeer %p\n", aggregation, aggregation->peer);
ompi_osc_rdma_peer_aggregate_flush (aggregation->peer);
});
}
#if !defined(BTL_VERSION) || (BTL_VERSION < 310)
do {
opal_progress ();
} while (sync->outstanding_rdma);
} while (ompi_osc_rdma_sync_get_count (sync));
#else
mca_btl_base_module_t *btl_module = sync->module->selected_btl;
do {
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
opal_progress ();
} else {
btl_module->btl_flush (btl_module, NULL);
}
} while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1)));
#endif
}
/**

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -8,7 +8,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
@ -55,6 +55,7 @@ static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pend
pending_op->op_buffer = NULL;
pending_op->op_result = NULL;
pending_op->op_complete = false;
pending_op->cbfunc = NULL;
}
static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op)
@ -79,10 +80,16 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b
{
ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", pending_op, status);
if (pending_op->op_result) {
memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size);
}
if (NULL != pending_op->cbfunc) {
pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status);
}
if (NULL != pending_op->op_frag) {
ompi_osc_rdma_frag_complete (pending_op->op_frag);
pending_op->op_frag = NULL;
@ -194,7 +201,8 @@ static void ompi_osc_rdma_handle_post (ompi_osc_rdma_module_t *module, int rank,
if (rank == peers[j]->rank) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "got expected post from %d. still expecting posts from %d processes",
rank, (int) (npeers - state->num_post_msgs - 1));
++state->num_post_msgs;
/* an atomic is not really necessary as this function is currently used but it doesn't hurt */
ompi_osc_rdma_counter_add (&state->num_post_msgs, 1);
return;
}
}
@ -206,13 +214,91 @@ static void ompi_osc_rdma_handle_post (ompi_osc_rdma_module_t *module, int rank,
OPAL_THREAD_SCOPED_LOCK(&module->lock, opal_list_append (&module->pending_posts, &pending_post->super));
}
/* drain the local post buffer: hand every non-empty slot to
 * ompi_osc_rdma_handle_post() and clear the slot for reuse. slots hold the
 * posting rank + 1 so that 0 can mean "empty". */
static void ompi_osc_rdma_check_posts (ompi_osc_rdma_module_t *module)
{
    ompi_osc_rdma_sync_t *sync = &module->all_sync;
    ompi_osc_rdma_state_t *state = module->state;
    /* only a PSCW epoch filters posts against an expected peer list */
    const int peer_count = (OMPI_OSC_RDMA_SYNC_TYPE_PSCW == sync->type) ? sync->num_peers : 0;

    for (int slot = 0 ; slot < OMPI_OSC_RDMA_POST_PEER_MAX ; ++slot) {
        if (0 != state->post_peers[slot]) {
            /* translate the stored value back to a rank and consume the slot */
            ompi_osc_rdma_handle_post (module, state->post_peers[slot] - 1, sync->peer_list.peers, peer_count);
            state->post_peers[slot] = 0;
        }
    }
}
/**
 * @brief deliver a post message to a single peer's post buffer
 *
 * @param[in] module  osc rdma module
 * @param[in] peer    peer to post to
 *
 * @returns OMPI_SUCCESS on success, or an error code from a failed
 *          remote atomic operation
 *
 * Protocol: fetch-and-add the peer's post_index to claim a slot, then
 * compare-and-swap (my_rank + 1) into that slot until it is observed empty
 * (0). Remote peers are reached with btl atomics; local (shared-state) peers
 * with CPU atomics. While waiting for the slot to free, locally received
 * posts are drained to avoid a circular wait between ranks posting to each
 * other.
 */
static int ompi_osc_rdma_post_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
/* address of the post_index counter in the peer's exposed state region */
uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index);
ompi_osc_rdma_lock_t post_index, result, _tmp_value;
int my_rank = ompi_comm_rank (module->comm);
int ret;
/* posting to self never goes through the network path */
if (peer->rank == my_rank) {
ompi_osc_rdma_handle_post (module, my_rank, NULL, 0);
return OMPI_SUCCESS;
}
/* get a post index */
if (!ompi_osc_rdma_peer_local_state (peer)) {
ret = ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
} else {
/* peer state is directly addressable; use a CPU atomic instead of the btl */
post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
}
/* the post buffer is a power-of-two ring; wrap the claimed index */
post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1;
/* address of the claimed slot within the peer's post_peers array */
target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) +
sizeof (osc_rdma_counter_t) * post_index;
do {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", (int)post_index, peer->rank);
_tmp_value = 0;
/* try to post. if the value isn't 0 then another rank is occupying this index */
if (!ompi_osc_rdma_peer_local_state (peer)) {
ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
} else {
/* result is 0 on a successful exchange to mirror the cswap convention above */
result = !ompi_osc_rdma_lock_compare_exchange ((osc_rdma_counter_t *) target, &_tmp_value,
1 + (osc_rdma_counter_t) my_rank);
}
if (OPAL_LIKELY(0 == result)) {
break;
}
/* prevent circular wait by checking for post messages received */
ompi_osc_rdma_check_posts (module);
/* back off briefly before retrying the compare-and-swap */
nanosleep (&(struct timespec) {.tv_sec = 0, .tv_nsec = 100}, NULL);
} while (1);
return OMPI_SUCCESS;
}
int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_peer_t **peers;
int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_state_t *state = module->state;
int ret;
int ret = OMPI_SUCCESS;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post: %p, %d, %s", (void*) group, assert, win->w_name);
@ -253,67 +339,17 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
/* send a hello counter to everyone in group */
for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) {
ompi_osc_rdma_peer_t *peer = peers[i];
uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index);
ompi_osc_rdma_lock_t post_index;
if (peer->rank == my_rank) {
ompi_osc_rdma_handle_post (module, my_rank, NULL, 0);
continue;
ret = ompi_osc_rdma_post_peer (module, peers[i]);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
break;
}
/* get a post index */
if (!ompi_osc_rdma_peer_local_state (peer)) {
ret = ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true);
assert (OMPI_SUCCESS == ret);
} else {
post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
}
post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1;
target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) +
sizeof (osc_rdma_counter_t) * post_index;
do {
ompi_osc_rdma_lock_t result;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", (int)post_index, peer->rank);
/* try to post. if the value isn't 0 then another rank is occupying this index */
if (!ompi_osc_rdma_peer_local_state (peer)) {
ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result);
assert (OMPI_SUCCESS == ret);
} else {
ompi_osc_rdma_lock_t _tmp_value = 0;
result = !ompi_osc_rdma_lock_compare_exchange ((osc_rdma_counter_t *) target, &_tmp_value, 1 + (osc_rdma_counter_t) my_rank);
}
if (OPAL_LIKELY(0 == result)) {
break;
}
/* prevent circular wait by checking for post messages received */
for (int j = 0 ; j < OMPI_OSC_RDMA_POST_PEER_MAX ; ++j) {
/* no post at this index (yet) */
if (0 == state->post_peers[j]) {
continue;
}
ompi_osc_rdma_handle_post (module, state->post_peers[j] - 1, NULL, 0);
state->post_peers[j] = 0;
}
usleep (100);
} while (1);
}
ompi_osc_rdma_release_peers (peers, ompi_group_size(module->pw_group));
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post complete");
return OMPI_SUCCESS;
return ret;
}
int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
@ -379,8 +415,7 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win
"from %d processes", peer->rank, (int) (group_size - state->num_post_msgs - 1));
opal_list_remove_item (&module->pending_posts, &pending_post->super);
OBJ_RELEASE(pending_post);
/* only one thread can process post messages so there is no need of atomics here */
++state->num_post_msgs;
ompi_osc_rdma_counter_add (&state->num_post_msgs, 1);
break;
}
}
@ -390,16 +425,7 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int assert, ompi_win_t *win
while (state->num_post_msgs != group_size) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "waiting for post messages. have %d of %d",
(int) state->num_post_msgs, group_size);
for (int i = 0 ; i < OMPI_OSC_RDMA_POST_PEER_MAX ; ++i) {
/* no post at this index (yet) */
if (0 == state->post_peers[i]) {
continue;
}
ompi_osc_rdma_handle_post (module, state->post_peers[i] - 1, sync->peer_list.peers, group_size);
state->post_peers[i] = 0;
}
ompi_osc_rdma_check_posts (module);
ompi_osc_rdma_progress (module);
}
} else {
@ -500,7 +526,6 @@ int ompi_osc_rdma_wait_atomic (ompi_win_t *win)
}
OPAL_THREAD_LOCK(&module->lock);
state->num_complete_msgs = 0;
group = module->pw_group;
module->pw_group = NULL;
OPAL_THREAD_UNLOCK(&module->lock);
@ -551,6 +576,8 @@ int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag)
OBJ_RELEASE(group);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "test complete. returning flag: true");
return OMPI_SUCCESS;
}
@ -567,6 +594,8 @@ int ompi_osc_rdma_fence_atomic (int assert, ompi_win_t *win)
return OMPI_ERR_RMA_SYNC;
}
/* NTH: locking here isn't really needed per se but it may make user synchronization errors more
* predictable. if the user is using RMA correctly then there will be no contention on this lock. */
OPAL_THREAD_LOCK(&module->lock);
/* active sends are now active (we will close the epoch if NOSUCCEED is specified) */
@ -578,22 +607,17 @@ int ompi_osc_rdma_fence_atomic (int assert, ompi_win_t *win)
}
/* technically it is possible to enter a lock epoch (which will close the fence epoch) if
* no communication has occurred. this flag will be set on the next put, get, accumulate, etc. */
* no communication has occurred. this flag will be set to true on the next put, get,
* accumulate, etc if no other synchronization call is made. <sarcasm> yay fence </sarcasm> */
module->all_sync.epoch_active = false;
/* short-circuit the noprecede case */
if (0 != (assert & MPI_MODE_NOPRECEDE)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fence complete (short circuit)");
/* no communication can occur until a peer has entered the same fence epoch. for now
* a barrier is used to ensure this is the case. */
ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
/* there really is no practical difference between NOPRECEDE and the normal case. in both cases there
* may be local stores that will not be visible as they should if we do not barrier. since that is the
* case there is no optimization for NOPRECEDE */
ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
/* ensure all writes to my memory are complete */
/* ensure all writes to my memory are complete (both local stores, and RMA operations) */
ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
if (assert & MPI_MODE_NOSUCCEED) {

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Research Organization for Information Science
@ -21,6 +21,27 @@
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
#include "opal/align.h"
/* helper functions */
/**
 * @brief release the resources associated with one rdma operation
 *
 * @param[in] sync        osc rdma synchronization object the operation ran under
 * @param[in] dec_always  true if the operation incremented the counter with
 *                        ompi_osc_rdma_sync_rdma_inc_always() and must therefore
 *                        decrement it unconditionally (even when btl_flush is in use)
 * @param[in] frag        bounce-buffer fragment used by the operation (may be NULL)
 * @param[in] handle      local registration handle to release when no frag was used
 * @param[in] request     associated request, if any (its outstanding count is dropped)
 *
 * Used both on the failure path of put/get and when retiring completed
 * aggregations.
 */
static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, bool dec_always, ompi_osc_rdma_frag_t *frag,
mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request)
{
/* the operation either borrowed a fragment or registered memory, never both */
if (frag) {
ompi_osc_rdma_frag_complete (frag);
} else {
ompi_osc_rdma_deregister (sync->module, handle);
}
if (request) {
(void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
}
/* mirror whichever increment variant the caller used so the counter balances */
if (dec_always) {
ompi_osc_rdma_sync_rdma_dec_always (sync);
} else {
ompi_osc_rdma_sync_rdma_dec (sync);
}
}
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
ompi_osc_rdma_request_t *request);
@ -37,17 +58,30 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b
uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
void *data, size_t len)
{
const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
volatile bool read_complete = false;
size_t aligned_len, offset;
uint64_t aligned_addr = (source_address + btl_alignment_mask) & ~btl_alignment_mask;
char *ptr = data;
int ret;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "reading state data from endpoint %p. source: 0x%" PRIx64 ", len: %lu",
(void *) endpoint, source_address, (unsigned long) len);
offset = source_address & btl_alignment_mask;
aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "reading data from endpoint %p. source: 0x%" PRIx64 " (aligned: 0x%" PRIx64
"), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len,
(unsigned long) aligned_len);
if (module->selected_btl->btl_register_mem && len >= module->selected_btl->btl_get_local_registration_threshold) {
ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr);
do {
ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) {
ompi_osc_rdma_progress (module);
}
} while (OMPI_ERR_OUT_OF_RESOURCE == ret);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer");
return ret;
@ -61,10 +95,10 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b
assert (!(source_address & ALIGNMENT_MASK(module->selected_btl->btl_get_alignment)));
do {
ret = module->selected_btl->btl_get (module->selected_btl, endpoint, ptr, source_address,
local_handle, source_handle, len, 0, MCA_BTL_NO_ORDER,
ret = module->selected_btl->btl_get (module->selected_btl, endpoint, ptr, aligned_addr,
local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
ompi_osc_get_data_complete, (void *) &read_complete, NULL);
if (OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
if (!ompi_osc_rdma_oor (ret)) {
break;
}
@ -91,7 +125,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b
opal_memchecker_base_mem_defined (ptr, len);
if (frag) {
memcpy (data, ptr, len);
memcpy (data, ptr + offset, len);
/* done with the fragment */
ompi_osc_rdma_frag_complete (frag);
@ -191,7 +225,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
remote_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
remote_iov_index = 0;
/* opal_convertor_raw returns done when it has reached the end of the data */
/* opal_convertor_raw returns true when it has reached the end of the data */
done = opal_convertor_raw (&remote_convertor, remote_iovec, &remote_iov_count, &remote_size);
/* loop on the target segments until we have exhausted the decoded source data */
@ -232,7 +266,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
if (request) {
(void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
ompi_osc_rdma_request_deref (request);
}
if (alloc_reqs) {
@ -262,11 +296,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
if (request) {
/* release our reference so the request can complete */
if (1 == request->outstanding_requests) {
ompi_osc_rdma_request_complete (request, OMPI_SUCCESS);
}
(void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
ompi_osc_rdma_request_deref (request);
}
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished scheduling rdma on non-contiguous datatype(s)");
@ -353,14 +383,12 @@ static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struc
void *context, void *data, int status)
{
ompi_osc_rdma_sync_t *sync = (ompi_osc_rdma_sync_t *) context;
ompi_osc_rdma_frag_t *frag = (ompi_osc_rdma_frag_t *) data;
ompi_osc_rdma_request_t *request = NULL;
assert (OPAL_SUCCESS == status);
/* the lowest bit is used as a flag indicating this put operation has a request */
if ((intptr_t) context & 0x1) {
request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1);
ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1);
sync = request->sync;
/* NTH -- TODO: better error handling */
@ -370,15 +398,42 @@ static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struc
OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on sync %p. local "
"address %p. opal status %d", (void *) sync, local_address, status);
if (frag) {
ompi_osc_rdma_frag_complete (frag);
} else {
if (data) {
ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
} else if (local_handle) {
ompi_osc_rdma_deregister (sync->module, local_handle);
}
ompi_osc_rdma_sync_rdma_dec (sync);
}
/**
 * @brief btl put completion callback used when btl_flush tracks completion
 *
 * @param[in] btl           btl the put was issued on
 * @param[in] endpoint      btl endpoint of the target
 * @param[in] local_address local address of the put source buffer
 * @param[in] local_handle  local registration handle (may be NULL)
 * @param[in] context       module pointer, or request pointer with the low bit set
 * @param[in] data          bounce-buffer fragment, if one was used (may be NULL)
 * @param[in] status        opal status of the put operation
 *
 * Unlike ompi_osc_rdma_put_complete() this callback must not touch the sync
 * object (it may already have been released by the time flush completes), so
 * it only releases local resources and completes any attached request.
 */
static void ompi_osc_rdma_put_complete_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                              void *local_address, mca_btl_base_registration_handle_t *local_handle,
                                              void *context, void *data, int status)
{
    ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) context;

    assert (OPAL_SUCCESS == status);

    /* the lowest bit is used as a flag indicating this put operation has a request */
    if ((intptr_t) context & 0x1) {
        /* fixed: was "request = request = ...", a redundant self-assignment */
        ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1);
        module = request->module;

        /* NTH -- TODO: better error handling */
        ompi_osc_rdma_request_complete (request, status);
    }

    OSC_RDMA_VERBOSE(status ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_TRACE, "btl put complete on module %p. local "
                     "address %p. opal status %d", (void *) module, local_address, status);

    /* release the bounce fragment if one was used, otherwise any registration */
    if (data) {
        ompi_osc_rdma_frag_complete ((ompi_osc_rdma_frag_t *) data);
    } else if (local_handle) {
        ompi_osc_rdma_deregister (module, local_handle);
    }
}
static void ompi_osc_rdma_aggregate_put_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *data, int status)
@ -424,14 +479,12 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee
++module->put_retry_count;
if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) {
if (!ompi_osc_rdma_oor (ret)) {
break;
}
/* spin a bit on progress */
for (int i = 0 ; i < 10 ; ++i) {
ompi_osc_rdma_progress (module);
}
ompi_osc_rdma_progress (module);
} while (1);
OSC_RDMA_VERBOSE(10, "btl put failed with opal error code %d", ret);
@ -498,18 +551,20 @@ static int ompi_osc_rdma_aggregate_alloc (ompi_osc_rdma_sync_t *sync, ompi_osc_r
return OMPI_SUCCESS;
}
static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
ompi_osc_rdma_request_t *request)
int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate;
mca_btl_base_registration_handle_t *local_handle = NULL;
mca_btl_base_rdma_completion_fn_t cbfunc = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
char *ptr = source_buffer;
void *cbcontext;
int ret;
#if 0
if (aggregation) {
if (size <= (aggregation->buffer_size - aggregation->buffer_used) && (target_handle == aggregation->target_handle) &&
(target_address == aggregation->target_address + aggregation->buffer_used)) {
@ -535,6 +590,7 @@ static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
return ret;
}
}
#endif
if (module->selected_btl->btl_register_mem && size > module->selected_btl->btl_put_local_registration_threshold) {
ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr);
@ -549,23 +605,36 @@ static int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
}
}
if (ompi_osc_rdma_use_btl_flush (module)) {
/* NTH: when using the btl_flush function there is no guarantee that the callback will happen
* before the flush is complete. because of this there is a chance that the sync object will be
* released before there is a callback. to handle this case we call different callback that doesn't
* use the sync object. it's possible the btl semantics will change in the future and the callback
* will happen *before* flush is considered complete. if that is the case this workaround can be
* removed */
cbcontext = (void *) module;
if (request || local_handle || frag) {
cbfunc = ompi_osc_rdma_put_complete_flush;
}
/* else the callback function is a no-op so do not bother specifying one */
} else {
cbcontext = (void *) sync;
cbfunc = ompi_osc_rdma_put_complete;
}
/* increment the outstanding request counter in the request object */
if (request) {
(void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
cbcontext = (void *) ((intptr_t) request | 1);
request->sync = sync;
} else {
cbcontext = (void *) sync;
}
ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, ompi_osc_rdma_put_complete,
ret = ompi_osc_rdma_put_real (sync, peer, target_address, target_handle, ptr, local_handle, size, cbfunc,
cbcontext, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
return OMPI_SUCCESS;
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
ompi_osc_rdma_cleanup_rdma (sync, false, frag, local_handle, request);
}
ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request);
return ret;
}
@ -584,20 +653,26 @@ static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struc
assert (OPAL_SUCCESS == status);
if (request->buffer || NULL != frag) {
if (request->buffer || frag) {
if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
memcpy (origin_addr, (void *) source, request->len);
}
}
if (NULL == request->buffer) {
/* completion detection can handle this case without the counter when using btl_flush */
ompi_osc_rdma_sync_rdma_dec (sync);
} else {
/* the counter was needed to keep track of the number of outstanding operations */
ompi_osc_rdma_sync_rdma_dec_always (sync);
}
if (NULL != frag) {
ompi_osc_rdma_frag_complete (frag);
} else {
ompi_osc_rdma_deregister (sync->module, local_handle);
}
ompi_osc_rdma_sync_rdma_dec (sync);
ompi_osc_rdma_request_complete (request, status);
}
@ -624,7 +699,7 @@ int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer)
return OMPI_SUCCESS;
}
ompi_osc_rdma_cleanup_rdma (aggregation->sync, aggregation->frag, NULL, NULL);
ompi_osc_rdma_cleanup_rdma (aggregation->sync, false, aggregation->frag, NULL, NULL);
ompi_osc_rdma_aggregation_return (aggregation);
@ -648,7 +723,7 @@ static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_
ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
(void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
ompi_osc_rdma_request_deref (request);
}
return ret;
@ -665,6 +740,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
osc_rdma_size_t aligned_len;
osc_rdma_base_t aligned_source_base, aligned_source_bound;
char *ptr = target_buffer;
bool counter_needs_inc = false;
int ret;
aligned_source_base = source_address & ~btl_alignment_mask;
@ -746,19 +822,31 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
request->origin_addr = target_buffer;
request->sync = sync;
ompi_osc_rdma_sync_rdma_inc (sync);
if (request->buffer) {
/* always increment the outstanding RDMA counter as the btl_flush function does not guarantee callback completion,
* just operation completion. */
counter_needs_inc = true;
ompi_osc_rdma_sync_rdma_inc_always (sync);
} else {
/* if this operation is being buffered with a frag then ompi_osc_rdma_sync_rdma_complete() can use the number
* of pending operations on the rdma_frag as an indicator as to whether the operation is complete. this can
* only be done since there is only one rdma frag per module. if that changes this logic will need to be changed
* as well. this path also covers the case where the get operation is not buffered. */
ompi_osc_rdma_sync_rdma_inc (sync);
}
do {
ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, aligned_source_base, local_handle,
source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr,
aligned_source_base, local_handle, source_handle,
aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
request, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
return OMPI_SUCCESS;
}
++module->get_retry_count;
if (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret) {
if (!ompi_osc_rdma_oor (ret)) {
break;
}
@ -770,7 +858,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "btl get failed with opal error code %d", ret);
ompi_osc_rdma_cleanup_rdma (sync, frag, local_handle, request);
ompi_osc_rdma_cleanup_rdma (sync, counter_needs_inc, frag, local_handle, request);
return ret;
}

Просмотреть файл

@ -24,23 +24,6 @@
#define min(a,b) ((a) < (b) ? (a) : (b))
#define ALIGNMENT_MASK(x) ((x) ? (x) - 1 : 0)
/* helper functions */
/* Tear down the per-operation RDMA resources after an operation finishes or
 * fails: retire the user request's reference, return the bounce fragment (or
 * drop the one-shot local registration when the operation was unbuffered),
 * and decrement the sync object's outstanding-RDMA count. */
static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_frag_t *frag,
                                               mca_btl_base_registration_handle_t *handle, ompi_osc_rdma_request_t *request)
{
    /* the request reference is independent of the buffer bookkeeping */
    if (NULL != request) {
        (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
    }

    /* a NULL frag means the data did not go through a bounce buffer and the
     * memory was registered just for this operation */
    if (NULL == frag) {
        ompi_osc_rdma_deregister (sync->module, handle);
    } else {
        ompi_osc_rdma_frag_complete (frag);
    }

    ompi_osc_rdma_sync_rdma_dec (sync);
}
/**
* @brief find a remote segment associate with the memory region
*
@ -134,4 +117,8 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, struct mca_btl_b
uint64_t source_address, mca_btl_base_registration_handle_t *source_handle,
void *data, size_t len);
int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
ompi_osc_rdma_request_t *request);
#endif /* OMPI_OSC_RDMA_COMM_H */

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
@ -78,6 +78,12 @@ static char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, char *ke
static char *ompi_osc_rdma_btl_names;
static char *ompi_osc_rdma_mtl_names;
/* Enumerator values accepted by the osc_rdma_locking_mode MCA variable:
 * "two_level" takes a global (leader) lock plus the per-peer lock, while
 * "on_demand" acquires a peer's lock only when an RMA operation first targets
 * that peer.  The NULL string terminates the list for the var enum parser. */
static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = {
    {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"},
    {.value = OMPI_OSC_RDMA_LOCKING_ON_DEMAND, .string = "on_demand"},
    {.string = NULL},
};
ompi_osc_rdma_component_t mca_osc_rdma_component = {
.super = {
.osc_version = {
@ -171,88 +177,97 @@ static int ompi_osc_rdma_pvar_read (const struct mca_base_pvar_t *pvar, void *va
static int ompi_osc_rdma_component_register (void)
{
char *description_str;
mca_base_var_enum_t *new_enum;
mca_osc_rdma_component.no_locks = false;
asprintf(&description_str, "Enable optimizations available only if MPI_LOCK is "
"not used. Info key of same name overrides this value (default: %s)",
mca_osc_rdma_component.no_locks ? "true" : "false");
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version,
"no_locks", description_str,
"not used. Info key of same name overrides this value (default: %s)",
mca_osc_rdma_component.no_locks ? "true" : "false");
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "no_locks", description_str,
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks);
free(description_str);
mca_osc_rdma_component.acc_single_intrinsic = false;
asprintf(&description_str, "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
"that will not use anything more than a single predefined datatype (default: %s)",
mca_osc_rdma_component.acc_single_intrinsic ? "true" : "false");
"that will not use anything more than a single predefined datatype (default: %s)",
mca_osc_rdma_component.acc_single_intrinsic ? "true" : "false");
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic",
description_str,
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic);
free(description_str);
mca_osc_rdma_component.acc_use_amo = true;
asprintf(&description_str, "Enable the use of network atomic memory operations when using single "
"intrinsic optimizations. If not set network compare-and-swap will be "
"used instread (default: %s)", mca_osc_rdma_component.acc_use_amo ? "true" : "false");
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo",
description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_use_amo);
"intrinsic optimizations. If not set network compare-and-swap will be "
"used instread (default: %s)", mca_osc_rdma_component.acc_use_amo ? "true" : "false");
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo", description_str,
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP,
&mca_osc_rdma_component.acc_use_amo);
free(description_str);
mca_osc_rdma_component.buffer_size = 32768;
asprintf(&description_str, "Size of temporary buffers (default: %d)", mca_osc_rdma_component.buffer_size);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size",
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&mca_osc_rdma_component.buffer_size);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size", description_str,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size);
free(description_str);
mca_osc_rdma_component.max_attach = 32;
asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
"Keep in mind that each attached buffer will use a potentially limited "
"resource (default: %d)", mca_osc_rdma_component.max_attach);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach",
description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
"Keep in mind that each attached buffer will use a potentially limited "
"resource (default: %d)", mca_osc_rdma_component.max_attach);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
free(description_str);
mca_osc_rdma_component.aggregation_limit = 1024;
asprintf(&description_str, "Maximum size of an aggregated put/get. Messages are aggregated for consecutive"
"put and get operations. In some cases this may lead to higher latency but "
"should also lead to higher bandwidth utilization. Set to 0 to disable (default: %d)",
mca_osc_rdma_component.aggregation_limit);
"put and get operations. In some cases this may lead to higher latency but "
"should also lead to higher bandwidth utilization. Set to 0 to disable (default: %d)",
mca_osc_rdma_component.aggregation_limit);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "aggregation_limit",
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.aggregation_limit);
free(description_str);
mca_osc_rdma_component.priority = 90;
mca_osc_rdma_component.priority = 101;
asprintf(&description_str, "Priority of the osc/rdma component (default: %d)",
mca_osc_rdma_component.priority);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority",
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
mca_osc_rdma_component.priority);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", description_str,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.priority);
free(description_str);
ompi_osc_rdma_btl_names = "openib,ugni";
(void) mca_base_var_enum_create ("osc_rdma_locking_mode", ompi_osc_rdma_locking_modes, &new_enum);
mca_osc_rdma_component.locking_mode = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL;
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "locking_mode",
"Locking mode to use for passive-target synchronization (default: two_level)",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.locking_mode);
OBJ_RELEASE(new_enum);
ompi_osc_rdma_btl_names = "openib,ugni,uct,ucp";
asprintf(&description_str, "Comma-delimited list of BTL component names to allow without verifying "
"connectivity. Do not add a BTL to to this list unless it can reach all "
"processes in any communicator used with an MPI window (default: %s)",
ompi_osc_rdma_btl_names);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls",
description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
"connectivity. Do not add a BTL to to this list unless it can reach all "
"processes in any communicator used with an MPI window (default: %s)",
ompi_osc_rdma_btl_names);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls", description_str,
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
free(description_str);
ompi_osc_rdma_mtl_names = "psm2";
asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma "
"osc component favoring pt2pt osc (default: %s)", ompi_osc_rdma_mtl_names);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls",
description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
"osc component favoring pt2pt osc (default: %s)", ompi_osc_rdma_mtl_names);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls", description_str,
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_mtl_names);
free(description_str);
/* register performance variables */
(void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "put_retry_count",
@ -481,6 +496,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
return ret;
}
module->my_peer = my_peer;
module->free_after = module->rank_array;
my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
my_peer->state = (uint64_t) (uintptr_t) module->state;
@ -503,8 +519,13 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
ex_peer->size = size;
}
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
ex_peer->super.base_handle = module->state_handle;
if (!module->use_cpu_atomics) {
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
/* base is local and cpu atomics are available */
ex_peer->super.base_handle = module->state_handle;
} else {
ex_peer->super.base_handle = module->base_handle;
}
}
}
@ -701,6 +722,10 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, temp[0].rank);
}
if (my_rank == peer_rank) {
module->my_peer = peer;
}
if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor || MPI_WIN_FLAVOR_CREATE == module->flavor) {
/* use the peer's BTL endpoint directly */
peer->data_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_rank);
@ -745,20 +770,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
/* Check whether the currently-selected MTL appears in the osc_rdma "mtls"
 * list; the caller uses a success return to lower this component's priority
 * in favor of the pt2pt OSC component.
 * NOTE(review): this span interleaves pre- and post-change lines of a diff --
 * both the mtl_match bookkeeping and the early-return form of the loop are
 * present, the early return path does not free mtls_to_use, and the trailing
 * `return -1;` is unreachable.  Confirm against the real source file. */
static int ompi_osc_rdma_query_mtls (void)
{
    char **mtls_to_use;
    bool mtl_match = false;

    /* the MCA variable is a comma-delimited list of MTL component names */
    mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ',');
    if (mtls_to_use && ompi_mtl_base_selected_component) {
        for (int i = 0 ; mtls_to_use[i] ; ++i) {
            if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
                mtl_match = true;
                break;
            }
        }
        for (int i = 0 ; mtls_to_use[i] ; ++i) {
            if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
                /* match found -- NOTE(review): mtls_to_use is not freed here */
                return OMPI_SUCCESS;
            }
        }
    }
    opal_argv_free (mtls_to_use);
    return mtl_match ? OMPI_SUCCESS : OMPI_ERR_NOT_FOUND;
    return -1;  /* unreachable */
}
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl)
@ -1115,7 +1136,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
module->same_disp_unit = check_config_value_bool ("same_disp_unit", info);
module->same_size = check_config_value_bool ("same_size", info);
module->no_locks = check_config_value_bool ("no_locks", info);
module->acc_single_intrinsic = check_config_value_bool ("ompi_single_accumulate", info);
module->locking_mode = mca_osc_rdma_component.locking_mode;
module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info);
module->acc_use_amo = mca_osc_rdma_component.acc_use_amo;
module->all_sync.module = module;

Просмотреть файл

@ -1,7 +1,7 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -16,34 +16,14 @@
#include "osc_rdma.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
struct ompi_osc_rdma_frag_t {
opal_free_list_item_t super;
/* start of unused space */
unsigned char *top;
/* space remaining in buffer */
uint32_t remain_len;
/* Number of operations which have started writing into the frag, but not yet completed doing so */
int32_t pending;
ompi_osc_rdma_module_t *module;
mca_btl_base_registration_handle_t *handle;
};
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);
/* Drop one reference to an RDMA bounce fragment; the last reference recycles
 * the fragment.
 * NOTE(review): this span interleaves two versions from a diff -- the old
 * path (deregister the handle and return the frag to the component free list)
 * and the new path (atomically re-arm pending/curr_index so the module's
 * single frag is reused in place).  Confirm against the real source. */
static inline void ompi_osc_rdma_frag_complete (ompi_osc_rdma_frag_t *frag)
{
    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "returning frag. pending = %d", frag->pending);
    if (0 == OPAL_THREAD_ADD_FETCH32(&frag->pending, -1)) {
        /* ensure all prior writes into the fragment are visible before reuse */
        opal_atomic_rmb ();
        ompi_osc_rdma_deregister (frag->module, frag->handle);
        frag->handle = NULL;
        opal_free_list_return (&mca_osc_rdma_component.frags, (opal_free_list_item_t *) frag);
        /* re-arm: restore the initial reference and rewind the buffer offset */
        (void) opal_atomic_swap_32 (&frag->pending, 1);
        (void) opal_atomic_swap_64 (&frag->curr_index, 0);
    }
}
@ -53,7 +33,8 @@ static inline void ompi_osc_rdma_frag_complete (ompi_osc_rdma_frag_t *frag)
static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size_t request_len,
ompi_osc_rdma_frag_t **buffer, char **ptr)
{
ompi_osc_rdma_frag_t *curr;
ompi_osc_rdma_frag_t *curr = module->rdma_frag;
int64_t my_index;
int ret;
/* ensure all buffers are 8-byte aligned */
@ -63,59 +44,55 @@ static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size
return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
}
OPAL_THREAD_LOCK(&module->lock);
curr = module->rdma_frag;
if (OPAL_UNLIKELY(NULL == curr || curr->remain_len < request_len)) {
if (NULL == curr || (NULL != curr && curr->pending > 1)) {
opal_free_list_item_t *item = NULL;
if (NULL == curr) {
opal_free_list_item_t *item = NULL;
/* release the initial reference to the buffer */
module->rdma_frag = NULL;
item = opal_free_list_get (&mca_osc_rdma_component.frags);
if (OPAL_UNLIKELY(NULL == item)) {
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
if (curr) {
ompi_osc_rdma_frag_complete (curr);
}
curr = (ompi_osc_rdma_frag_t *) item;
item = opal_free_list_get (&mca_osc_rdma_component.frags);
if (OPAL_UNLIKELY(NULL == item)) {
OPAL_THREAD_UNLOCK(&module->lock);
curr->handle = NULL;
curr->pending = 1;
curr->module = module;
curr->curr_index = 0;
if (module->selected_btl->btl_register_mem) {
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, curr->super.ptr, mca_osc_rdma_component.buffer_size,
MCA_BTL_REG_FLAG_ACCESS_ANY, &curr->handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
}
curr = module->rdma_frag = (ompi_osc_rdma_frag_t *) item;
if (!opal_atomic_compare_exchange_strong_ptr (&module->rdma_frag, &(void *){NULL}, curr)) {
ompi_osc_rdma_deregister (module, curr->handle);
curr->handle = NULL;
curr->pending = 1;
curr->module = module;
}
curr->top = curr->super.ptr;
curr->remain_len = mca_osc_rdma_component.buffer_size;
opal_free_list_return (&mca_osc_rdma_component.frags, &curr->super);
if (curr->remain_len < request_len) {
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
curr = module->rdma_frag;
}
}
if (!curr->handle && module->selected_btl->btl_register_mem) {
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, curr->super.ptr, mca_osc_rdma_component.buffer_size,
MCA_BTL_REG_FLAG_ACCESS_ANY, &curr->handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
}
*ptr = (char *) curr->top;
*buffer = curr;
curr->top += request_len;
curr->remain_len -= request_len;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "allocating frag. pending = %d", curr->pending);
OPAL_THREAD_ADD_FETCH32(&curr->pending, 1);
OPAL_THREAD_UNLOCK(&module->lock);
my_index = opal_atomic_fetch_add_64 (&curr->curr_index, request_len);
if (my_index + request_len > mca_osc_rdma_component.buffer_size) {
if (my_index <= mca_osc_rdma_component.buffer_size) {
/* this thread caused the buffer to spill over */
ompi_osc_rdma_frag_complete (curr);
}
ompi_osc_rdma_frag_complete (curr);
return OPAL_ERR_OUT_OF_RESOURCE;
}
*ptr = (void *) ((intptr_t) curr->super.ptr + my_index);
*buffer = curr;
return OMPI_SUCCESS;
}

Просмотреть файл

@ -34,9 +34,10 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b
void *context, void *data, int status);
__opal_attribute_always_inline__
static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address,
int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result,
const bool wait_for_completion)
static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
uint64_t address, mca_btl_base_registration_handle_t *address_handle, int op,
int64_t operand, int flags, int64_t *result, const bool wait_for_completion,
ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext)
{
ompi_osc_rdma_pending_op_t *pending_op;
int ret;
@ -49,8 +50,13 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om
}
pending_op->op_result = (void *) result;
pending_op->op_size = sizeof (ompi_osc_rdma_lock_t);
pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
OBJ_RETAIN(pending_op);
if (cbfunc) {
pending_op->cbfunc = cbfunc;
pending_op->cbdata = cbdata;
pending_op->cbcontext = cbcontext;
}
/* spin until the btl has accepted the operation */
do {
@ -59,9 +65,9 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om
}
if (NULL != pending_op->op_frag) {
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, pending_op->op_buffer,
(intptr_t) address, pending_op->op_frag->handle, peer->state_handle,
op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, endpoint, pending_op->op_buffer,
(intptr_t) address, pending_op->op_frag->handle, address_handle,
op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
(void *) pending_op, NULL);
}
@ -71,10 +77,78 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om
ompi_osc_rdma_progress (module);
} while (1);
if (OPAL_SUCCESS != ret) {
if (OPAL_LIKELY(1 == ret)) {
*result = ((int64_t *) pending_op->op_buffer)[0];
ret = OMPI_SUCCESS;
ompi_osc_rdma_atomic_complete (module->selected_btl, endpoint, pending_op->op_buffer,
pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS);
}
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
} else if (wait_for_completion) {
while (!pending_op->op_complete) {
ompi_osc_rdma_progress (module);
}
}
OBJ_RELEASE(pending_op);
return ret;
}
/* Convenience wrapper for fetch-and-op on a lock word.  Lock words live in a
 * peer's state segment, so the operation goes through the peer's state
 * endpoint and registration handle with no atomic flags and no callback. */
__opal_attribute_always_inline__
static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address,
                                              int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result,
                                              const bool wait_for_completion)
{
    struct mca_btl_base_endpoint_t *state_endpoint = peer->state_endpoint;
    mca_btl_base_registration_handle_t *state_handle = peer->state_handle;

    return ompi_osc_rdma_btl_fop (module, state_endpoint, address, state_handle, op, operand, /*flags*/ 0,
                                  result, wait_for_completion, /*cbfunc*/ NULL, /*cbdata*/ NULL, /*cbcontext*/ NULL);
}
__opal_attribute_always_inline__
static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
uint64_t address, mca_btl_base_registration_handle_t *address_handle,
int op, int64_t operand, int flags, const bool wait_for_completion,
ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext)
{
ompi_osc_rdma_pending_op_t *pending_op;
int ret;
if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
return ompi_osc_rdma_btl_fop (module, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion,
cbfunc, cbdata, cbcontext);
}
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
assert (NULL != pending_op);
OBJ_RETAIN(pending_op);
if (cbfunc) {
pending_op->cbfunc = cbfunc;
pending_op->cbdata = cbdata;
pending_op->cbcontext = cbcontext;
}
/* spin until the btl has accepted the operation */
do {
ret = module->selected_btl->btl_atomic_op (module->selected_btl, endpoint, (intptr_t) address, address_handle,
op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
(void *) pending_op, NULL);
if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) {
break;
}
ompi_osc_rdma_progress (module);
} while (1);
if (OPAL_SUCCESS != ret) {
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
if (OPAL_LIKELY(1 == ret)) {
if (cbfunc) {
cbfunc (cbdata, cbcontext, OMPI_SUCCESS);
}
ret = OMPI_SUCCESS;
}
} else if (wait_for_completion) {
@ -91,23 +165,38 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om
/* Convenience wrapper for an atomic op on a lock word in a peer's state
 * segment: route through the peer's state endpoint/handle with no atomic
 * flags and no completion callback. */
__opal_attribute_always_inline__
static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address,
                                             int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion)
{
    struct mca_btl_base_endpoint_t *state_endpoint = peer->state_endpoint;
    mca_btl_base_registration_handle_t *state_handle = peer->state_handle;

    return ompi_osc_rdma_btl_op (module, state_endpoint, address, state_handle, op, operand, /*flags*/ 0,
                                 wait_for_completion, /*cbfunc*/ NULL, /*cbdata*/ NULL, /*cbcontext*/ NULL);
}
__opal_attribute_always_inline__
static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
uint64_t address, mca_btl_base_registration_handle_t *address_handle,
int64_t compare, int64_t value, int flags, int64_t *result)
{
ompi_osc_rdma_pending_op_t *pending_op;
int ret;
if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
return ompi_osc_rdma_lock_btl_fop (module, peer, address, op, operand, NULL, wait_for_completion);
}
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
assert (NULL != pending_op);
OBJ_RETAIN(pending_op);
pending_op->op_result = (void *) result;
pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
/* spin until the btl has accepted the operation */
do {
ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, (intptr_t) address, peer->state_handle,
op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
(void *) pending_op, NULL);
if (NULL == pending_op->op_frag) {
ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer);
}
if (NULL != pending_op->op_frag) {
ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, endpoint, pending_op->op_buffer,
address, pending_op->op_frag->handle, address_handle, compare,
value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op,
NULL);
}
if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) {
break;
@ -116,12 +205,14 @@ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, omp
} while (1);
if (OPAL_SUCCESS != ret) {
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
if (OPAL_LIKELY(1 == ret)) {
*result = ((int64_t *) pending_op->op_buffer)[0];
ret = OMPI_SUCCESS;
}
} else if (wait_for_completion) {
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
} else {
while (!pending_op->op_complete) {
ompi_osc_rdma_progress (module);
}
@ -136,49 +227,7 @@ __opal_attribute_always_inline__
/* Compare-and-swap on a lock word in a peer's state segment.
 * NOTE(review): this span interleaves two versions from a diff -- the full
 * old implementation (pending-op object, frag-backed result buffer, btl
 * cswap spin loop) followed by the new one-line delegate to
 * ompi_osc_rdma_btl_cswap; the second return is unreachable here.  Confirm
 * against the real source. */
static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address,
                                                ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result)
{
    ompi_osc_rdma_pending_op_t *pending_op;
    int ret;

    pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
    assert (NULL != pending_op);
    /* extra reference held for the btl completion callback */
    OBJ_RETAIN(pending_op);

    pending_op->op_result = (void *) result;
    pending_op->op_size = sizeof (*result);

    /* spin until the btl has accepted the operation */
    do {
        if (NULL == pending_op->op_frag) {
            /* result lands in a registered bounce buffer carved from a frag */
            ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer);
        }
        if (NULL != pending_op->op_frag) {
            ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, pending_op->op_buffer,
                                                          address, pending_op->op_frag->handle, peer->state_handle, compare,
                                                          value, 0, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL);
        }
        if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) {
            break;
        }
        ompi_osc_rdma_progress (module);
    } while (1);

    if (OPAL_SUCCESS != ret) {
        /* need to release here because ompi_osc_rdma_atomic_complete was not called */
        OBJ_RELEASE(pending_op);
        if (OPAL_LIKELY(1 == ret)) {
            /* a return of 1 from the btl means the operation already completed */
            ret = OMPI_SUCCESS;
        }
    } else {
        while (!pending_op->op_complete) {
            ompi_osc_rdma_progress (module);
        }
    }

    OBJ_RELEASE(pending_op);

    return ret;
    /* NOTE(review): new-version delegate, unreachable in this rendering */
    return ompi_osc_rdma_btl_cswap (module, peer->state_endpoint, address, peer->state_handle, compare, value, 0, result);
}
/**
@ -311,7 +360,8 @@ static inline int ompi_osc_rdma_lock_try_acquire_exclusive (ompi_osc_rdma_module
if (0 == lock_state) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "exclusive lock acquired");
} else {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "could not acquire exclusive lock");
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "could not acquire exclusive lock. lock state 0x%" PRIx64,
(unsigned long) lock_state);
}
#endif
@ -362,11 +412,14 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t *
uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
int ret = OMPI_SUCCESS;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d\n", lock, peer->rank);
if (!ompi_osc_rdma_peer_local_state (peer)) {
ret = ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE,
false);
if (OMPI_SUCCESS != ret) {
abort ();
}
} else {
ompi_osc_rdma_unlock_local ((volatile ompi_osc_rdma_lock_t *)(intptr_t) lock);
}

Просмотреть файл

@ -8,7 +8,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
@ -113,23 +113,29 @@ int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
ompi_osc_rdma_sync_t *lock)
{
const int locking_mode = module->locking_mode;
int ret;
if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) {
do {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global exclusive lock");
/* lock the master lock. this requires no rank has a global shared lock */
ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock), 0xffffffff00000000L);
if (OMPI_SUCCESS != ret) {
ompi_osc_rdma_progress (module);
continue;
if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
/* lock the master lock. this requires no rank has a global shared lock */
ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock),
0xffffffff00000000L);
if (OMPI_SUCCESS != ret) {
ompi_osc_rdma_progress (module);
continue;
}
}
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring exclusive lock on peer");
ret = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
if (ret) {
/* release the global lock */
ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
}
ompi_osc_rdma_progress (module);
continue;
}
@ -157,20 +163,48 @@ static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *mo
/* Release the passive-target lock held on a single peer: drop the peer's
 * exclusive lock (and, in two-level mode, the leader's global lock) for
 * MPI_LOCK_EXCLUSIVE, or decrement the peer's shared count otherwise, then
 * clear the corresponding peer flag.
 * NOTE(review): this span interleaves diff lines -- the unconditional global
 * release is immediately followed by the new two-level-only conditional
 * release, so the global lock appears to be released twice here.  Confirm
 * against the real source. */
static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ompi_osc_rdma_sync_t *lock)
{
    const int locking_mode = module->locking_mode;

    if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock on peer");
        ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock");
        ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
        if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock");
            ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
        }
        peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE;
    } else {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global shared lock");
        ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock));
        peer->flags &= ~OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;
    }

    return OMPI_SUCCESS;
}
/* Acquire the passive-target lock on a single peer the first time an RMA
 * operation targets it (on-demand locking mode).  The peer's mutex serializes
 * the check-and-acquire; the sync object's mutex protects the
 * demand_locked_peers list that unlock-all walks to release these locks.
 *
 * Fix: the original discarded the result of ompi_osc_rdma_lock_atomic_internal
 * and marked the peer demand-locked even when the acquire failed.  Now the
 * peer is recorded only on success and any error is propagated (the fast path
 * still returns OMPI_SUCCESS, so callers that ignored the old constant return
 * are unaffected).
 *
 * @param module  osc/rdma module (must be in a lock-all epoch)
 * @param peer    peer to lock on demand
 * @return OMPI_SUCCESS or the error from the lock acquisition
 */
int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
    ompi_osc_rdma_sync_t *lock = &module->all_sync;
    int ret = OMPI_SUCCESS;

    /* check for bad usage */
    assert (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == lock->type);

    OPAL_THREAD_SCOPED_LOCK(&peer->lock,
    do {
        if (!ompi_osc_rdma_peer_is_demand_locked (peer)) {
            ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
            if (OMPI_SUCCESS == ret) {
                /* record the peer so unlock-all can release this lock later */
                OPAL_THREAD_SCOPED_LOCK(&lock->lock, opal_list_append (&lock->demand_locked_peers, &peer->super));
                peer->flags |= OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;
            }
        }
    } while (0);
    );

    return ret;
}
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
@ -315,9 +349,14 @@ int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
if (0 == (assert & MPI_MODE_NOCHECK)) {
/* increment the global shared lock */
ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
offsetof(ompi_osc_rdma_state_t, global_lock),
0x00000000ffffffffUL);
if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == module->locking_mode) {
ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
offsetof(ompi_osc_rdma_state_t, global_lock),
0x00000000ffffffffUL);
} else {
/* always lock myself */
ret = ompi_osc_rdma_demand_lock_peer (module, module->my_peer);
}
}
if (OPAL_LIKELY(OMPI_SUCCESS != ret)) {
@ -357,8 +396,19 @@ int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
ompi_osc_rdma_sync_rdma_complete (lock);
if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
/* decrement the master lock shared count */
(void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL, offsetof (ompi_osc_rdma_state_t, global_lock));
if (OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode) {
ompi_osc_rdma_peer_t *peer, *next;
/* drop all on-demand locks */
OPAL_LIST_FOREACH_SAFE(peer, next, &lock->demand_locked_peers, ompi_osc_rdma_peer_t) {
(void) ompi_osc_rdma_unlock_atomic_internal (module, peer, lock);
opal_list_remove_item (&lock->demand_locked_peers, &peer->super);
}
} else {
/* decrement the master lock shared count */
(void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL,
offsetof (ompi_osc_rdma_state_t, global_lock));
}
}
lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;

Просмотреть файл

@ -303,7 +303,7 @@ static void ompi_osc_rdma_peer_destruct (ompi_osc_rdma_peer_t *peer)
}
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t, opal_object_t,
OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t, opal_list_item_t,
ompi_osc_rdma_peer_construct,
ompi_osc_rdma_peer_destruct);

Просмотреть файл

@ -22,7 +22,7 @@ struct ompi_osc_rdma_module_t;
* This object is used as a cache for information associated with a peer.
*/
struct ompi_osc_rdma_peer_t {
opal_object_t super;
opal_list_item_t super;
/** rdma data endpoint for this peer */
struct mca_btl_base_endpoint_t *data_endpoint;
@ -36,6 +36,9 @@ struct ompi_osc_rdma_peer_t {
/** registration handle associated with the state */
mca_btl_base_registration_handle_t *state_handle;
/** lock to protect peer structure */
opal_mutex_t lock;
/** rank of this peer in the window */
int rank;
@ -134,6 +137,8 @@ enum {
OMPI_OSC_RDMA_PEER_STATE_FREE = 0x20,
/** peer base handle should be freed */
OMPI_OSC_RDMA_PEER_BASE_FREE = 0x40,
/** peer was demand locked as part of lock-all (when in demand locking mode) */
OMPI_OSC_RDMA_PEER_DEMAND_LOCKED = 0x80,
};
/**
@ -248,5 +253,15 @@ static inline bool ompi_osc_rdma_peer_local_state (ompi_osc_rdma_peer_t *peer)
return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_STATE);
}
/**
 * @brief check if the peer has been demand locked as part of the current epoch
 *
 * @param[in] peer peer object to check
 *
 * @returns true if OMPI_OSC_RDMA_PEER_DEMAND_LOCKED is set in the peer's flags
 */
static inline bool ompi_osc_rdma_peer_is_demand_locked (ompi_osc_rdma_peer_t *peer)
{
    return (peer->flags & OMPI_OSC_RDMA_PEER_DEMAND_LOCKED) != 0;
}
#endif /* OMPI_OSC_RDMA_PEER_H */

Просмотреть файл

@ -1,7 +1,7 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
@ -44,27 +44,17 @@ static int request_free(struct ompi_request_t **ompi_req)
return OMPI_SUCCESS;
}
static int request_complete (struct ompi_request_t *request)
{
ompi_osc_rdma_request_t *parent_request = ((ompi_osc_rdma_request_t *) request)->parent_request;
if (parent_request && 0 == OPAL_THREAD_ADD_FETCH32 (&parent_request->outstanding_requests, -1)) {
ompi_osc_rdma_request_complete (parent_request, OMPI_SUCCESS);
}
return OMPI_SUCCESS;
}
static void request_construct(ompi_osc_rdma_request_t *request)
{
request->super.req_type = OMPI_REQUEST_WIN;
request->super.req_status._cancelled = 0;
request->super.req_free = request_free;
request->super.req_cancel = request_cancel;
request->super.req_complete_cb = request_complete;
request->parent_request = NULL;
request->to_free = NULL;
request->buffer = NULL;
request->internal = false;
request->cleanup = NULL;
request->outstanding_requests = 0;
OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
}

Просмотреть файл

@ -1,7 +1,7 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -25,26 +25,22 @@ enum ompi_osc_rdma_request_type_t {
};
typedef enum ompi_osc_rdma_request_type_t ompi_osc_rdma_request_type_t;
struct ompi_osc_rdma_request_t;
typedef void (*ompi_osc_rdma_request_cleanup_fn_t) (struct ompi_osc_rdma_request_t *);
struct ompi_osc_rdma_request_t {
ompi_request_t super;
ompi_osc_rdma_peer_t *peer;
ompi_osc_rdma_request_cleanup_fn_t cleanup;
ompi_osc_rdma_request_type_t type;
void *to_free;
void *origin_addr;
int origin_count;
struct ompi_datatype_t *origin_dt;
void *result_addr;
int result_count;
struct ompi_datatype_t *result_dt;
const void *compare_addr;
ompi_op_t *op;
ompi_osc_rdma_module_t *module;
int32_t outstanding_requests;
volatile int32_t outstanding_requests;
bool internal;
ptrdiff_t offset;
@ -69,35 +65,45 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
rdma_rget, etc.), so it's ok to spin here... */
#define OMPI_OSC_RDMA_REQUEST_ALLOC(rmodule, rpeer, req) \
do { \
opal_free_list_item_t *item; \
do { \
item = opal_free_list_get (&mca_osc_rdma_component.requests); \
if (NULL == item) { \
ompi_osc_rdma_progress (rmodule); \
} \
} while (NULL == item); \
req = (ompi_osc_rdma_request_t*) item; \
OMPI_REQUEST_INIT(&req->super, false); \
req->super.req_mpi_object.win = module->win; \
req->super.req_state = OMPI_REQUEST_ACTIVE; \
req->module = rmodule; \
req->peer = (rpeer); \
(req) = OBJ_NEW(ompi_osc_rdma_request_t); \
OMPI_REQUEST_INIT(&(req)->super, false); \
(req)->super.req_mpi_object.win = (rmodule)->win; \
(req)->super.req_state = OMPI_REQUEST_ACTIVE; \
(req)->module = rmodule; \
(req)->peer = (rpeer); \
} while (0)
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \
do { \
OMPI_REQUEST_FINI(&(req)->super); \
free ((req)->buffer); \
(req)->buffer = NULL; \
(req)->parent_request = NULL; \
(req)->internal = false; \
(req)->outstanding_requests = 0; \
opal_free_list_return (&mca_osc_rdma_component.requests, \
(opal_free_list_item_t *) (req)); \
free (req); \
} while (0)
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error);
/* drop one outstanding-operation reference; the caller that releases the
 * last reference (fetch-add returns the pre-decrement value 1) is
 * responsible for completing the request. */
static inline void ompi_osc_rdma_request_deref (ompi_osc_rdma_request_t *request)
{
    int32_t prev = OPAL_THREAD_FETCH_ADD32 (&request->outstanding_requests, -1);
    if (prev == 1) {
        ompi_osc_rdma_request_complete (request, OMPI_SUCCESS);
    }
}
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error)
{
ompi_osc_rdma_request_t *parent_request = request->parent_request;
if (request->cleanup) {
request->cleanup (request);
}
free (request->to_free);
if (parent_request) {
ompi_osc_rdma_request_deref (parent_request);
}
if (!request->internal) {
request->super.req_status.MPI_ERROR = mpi_error;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -16,15 +16,17 @@ static void ompi_osc_rdma_sync_constructor (ompi_osc_rdma_sync_t *rdma_sync)
{
rdma_sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
rdma_sync->epoch_active = false;
rdma_sync->outstanding_rdma = 0;
rdma_sync->outstanding_rdma.counter = 0;
OBJ_CONSTRUCT(&rdma_sync->aggregations, opal_list_t);
OBJ_CONSTRUCT(&rdma_sync->lock, opal_mutex_t);
OBJ_CONSTRUCT(&rdma_sync->demand_locked_peers, opal_list_t);
}
/* tear down the sync object's containers; members are independent, so they
 * are destructed in reverse construction order by convention */
static void ompi_osc_rdma_sync_destructor (ompi_osc_rdma_sync_t *rdma_sync)
{
    OBJ_DESTRUCT(&rdma_sync->demand_locked_peers);
    OBJ_DESTRUCT(&rdma_sync->lock);
    OBJ_DESTRUCT(&rdma_sync->aggregations);
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_sync_t, opal_object_t, ompi_osc_rdma_sync_constructor,

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -33,6 +33,13 @@ typedef enum ompi_osc_rdma_sync_type_t ompi_osc_rdma_sync_type_t;
struct ompi_osc_rdma_module_t;
/** operation counter padded out to a full cache line to avoid false sharing
 *  between threads updating adjacent counters.
 *  NOTE(review): the 7 * 8-byte padding yields 64 bytes only if
 *  osc_rdma_counter_t is 8 bytes -- confirm against its typedef; the
 *  declaration site also applies __opal_attribute_aligned__(64). */
struct ompi_osc_rdma_sync_aligned_counter_t {
    volatile osc_rdma_counter_t counter;
    /* pad out to next cache line */
    uint64_t padding[7];
};
typedef struct ompi_osc_rdma_sync_aligned_counter_t ompi_osc_rdma_sync_aligned_counter_t;
/**
* @brief synchronization object
*
@ -78,6 +85,9 @@ struct ompi_osc_rdma_sync_t {
struct ompi_osc_rdma_peer_t *peer;
} peer_list;
/** demand locked peers (lock-all) */
opal_list_t demand_locked_peers;
/** number of peers */
int num_peers;
@ -85,7 +95,7 @@ struct ompi_osc_rdma_sync_t {
bool epoch_active;
/** outstanding rdma operations on epoch */
osc_rdma_counter_t outstanding_rdma;
ompi_osc_rdma_sync_aligned_counter_t outstanding_rdma __opal_attribute_aligned__(64);
/** aggregated operations in this epoch */
opal_list_t aggregations;
@ -129,30 +139,10 @@ void ompi_osc_rdma_sync_return (ompi_osc_rdma_sync_t *rdma_sync);
*/
bool ompi_osc_rdma_sync_pscw_peer (struct ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer);
/**
* @brief increment the outstanding rdma operation counter (atomic)
*
* @param[in] rdma_sync osc rdma synchronization object
*/
static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync)
static inline int64_t ompi_osc_rdma_sync_get_count (ompi_osc_rdma_sync_t *rdma_sync)
{
ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, 1);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "inc: there are %ld outstanding rdma operations",
(unsigned long) rdma_sync->outstanding_rdma);
}
/**
* @brief decrement the outstanding rdma operation counter (atomic)
*
* @param[in] rdma_sync osc rdma synchronization object
*/
static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync)
{
ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma, -1);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "dec: there are %ld outstanding rdma operations",
(unsigned long) rdma_sync->outstanding_rdma);
return rdma_sync->outstanding_rdma.counter;
}
#endif /* OSC_RDMA_SYNC_H */

Просмотреть файл

@ -205,6 +205,8 @@ typedef struct ompi_osc_rdma_aggregation_t ompi_osc_rdma_aggregation_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_aggregation_t);
typedef void (*ompi_osc_rdma_pending_op_cb_fn_t) (void *, void *, int);
struct ompi_osc_rdma_pending_op_t {
opal_list_item_t super;
struct ompi_osc_rdma_frag_t *op_frag;
@ -212,12 +214,29 @@ struct ompi_osc_rdma_pending_op_t {
void *op_result;
size_t op_size;
volatile bool op_complete;
ompi_osc_rdma_pending_op_cb_fn_t cbfunc;
void *cbdata;
void *cbcontext;
};
typedef struct ompi_osc_rdma_pending_op_t ompi_osc_rdma_pending_op_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_op_t);
/** Communication buffer for packing messages */
struct ompi_osc_rdma_frag_t {
    /** fragments are recycled through an opal free list */
    opal_free_list_item_t super;
    /* Number of operations which have started writing into the frag, but not yet completed doing so */
    volatile int32_t pending;
    /* write cursor into the fragment buffer; volatile 64-bit value --
     * presumably claimed/advanced atomically by concurrent writers, TODO
     * confirm against the frag allocation code */
    volatile int64_t curr_index;
    /* back-pointer to the owning module -- NOTE(review): usage not visible
     * in this view */
    struct ompi_osc_rdma_module_t *module;
    /* btl registration handle for the fragment buffer (needed for RDMA
     * access to the buffer) */
    mca_btl_base_registration_handle_t *handle;
};
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);
#define OSC_RDMA_VERBOSE(x, ...) OPAL_OUTPUT_VERBOSE((x, ompi_osc_base_framework.framework_output, __VA_ARGS__))
#endif /* OMPI_OSC_RDMA_TYPES_H */