openmpi/ompi/mca/osc/rdma/osc_rdma_passive_target.c
Nathan Hjelm 7f4872d483 osc/rdma: performance improvements and bug fixes
This commit is a large update to the osc/rdma component. Included in
this commit:

 - Add support for using hardware atomics for fetch-and-op and
   single-count accumulate when using the accumulate lock. This
   improves the performance of these operations even when the single
   intrinsic info key is not set; a usage sketch follows.
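
   A minimal sketch of passing such an info key at window creation (the
   exact key name and accepted values are assumptions here; verify
   "acc_single_intrinsic" against the osc/rdma documentation):

       MPI_Info info;
       MPI_Win win;
       int buf[128];

       MPI_Info_create (&info);
       /* assumed key name/value -- check before relying on it */
       MPI_Info_set (info, "acc_single_intrinsic", "true");
       MPI_Win_create (buf, sizeof (buf), sizeof (int), info,
                       MPI_COMM_WORLD, &win);
       MPI_Info_free (&info);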

 - Rework how large accumulates are done. They now block on the get
   operation to fix some bugs discovered by an IBM one-sided test. I
   may roll back some of the changes if the underlying bug in the
   original design is discovered. There appears to be no real
   difference in performance (on the hardware this was tested with),
   so it is probably a non-issue. References #2530.

 - Add support for an additional lock-all algorithm: on-demand. The
   on-demand algorithm attempts to acquire the peer lock when starting
   an RMA operation. The default locking algorithm has not changed.
   The algorithm can be selected by setting the osc_rdma_locking_mode
   MCA variable; the valid values are two_level and on_demand (see the
   example below).
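
   For example, to select the on-demand algorithm at launch time (the
   application name is illustrative):

       mpirun --mca osc_rdma_locking_mode on_demand ./my_rma_app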

 - Make use of the btl_flush function if available. This can improve
   performance with some btls.

 - When using btl_flush, do not keep track of the number of put
   operations. This reduces the number of atomic operations in the
   critical path.

 - Make the window buffers friendlier to multi-threaded
   applications. This was done by dropping support for multiple
   buffers per MPI window. I intend to re-add that support once the
   underlying performance bug in the old buffering scheme is fixed.

 - Fix a bug in request completion in the accumulate, get, and put
   paths. This also helps with #2530.

 - General code cleanup and fixes.
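
For context, the passive-target path this commit touches is driven by
the standard MPI-3 lock/unlock pattern. A minimal sketch (target rank
and values are illustrative; the mapping in the comments applies when
the osc/rdma component is selected):

    MPI_Win win;
    int winbuf = 0, value = 42;

    MPI_Win_create (&winbuf, sizeof (winbuf), sizeof (int),
                    MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    MPI_Win_lock (MPI_LOCK_EXCLUSIVE, 0, 0, win); /* ompi_osc_rdma_lock_atomic */
    MPI_Put (&value, 1, MPI_INT, 0, 0, 1, MPI_INT, win);
    MPI_Win_flush (0, win);                       /* ompi_osc_rdma_flush */
    MPI_Win_unlock (0, win);                      /* ompi_osc_rdma_unlock_atomic */

    MPI_Win_free (&win);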

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2018-03-15 14:53:53 -06:00

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2018 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2010      IBM Corporation.  All rights reserved.
 * Copyright (c) 2012-2013 Sandia National Laboratories.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "osc_rdma_passive_target.h"
#include "osc_rdma_comm.h"

#include "mpi.h"

int ompi_osc_rdma_sync (struct ompi_win_t *win)
{
    ompi_osc_rdma_progress (GET_MODULE(win));
    return OMPI_SUCCESS;
}

int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    ompi_osc_rdma_peer_t *peer;

    assert (0 <= target);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush: %d, %s", target, win->w_name);

    OPAL_THREAD_LOCK(&module->lock);

    lock = ompi_osc_rdma_module_sync_lookup (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == lock || OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "flush: target %d is not locked in window %s",
                         target, win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush on target %d complete", target);

    return OMPI_SUCCESS;
}

int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;
    uint32_t key;
    void *node;

    /* flush is only allowed from within a passive target epoch */
    if (!ompi_osc_rdma_in_passive_epoch (module)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all: %s", win->w_name);

    /* globally complete all outstanding rdma requests */
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) {
        ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
    }

    /* flush all locks */
    ret = opal_hash_table_get_first_key_uint32 (&module->outstanding_locks, &key, (void **) &lock, &node);
    while (OPAL_SUCCESS == ret) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "flushing lock %p", (void *) lock);
        ompi_osc_rdma_sync_rdma_complete (lock);
        ret = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, &key, (void **) &lock,
                                                   node, &node);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all complete");

    return OPAL_SUCCESS;
}
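
/* flush_local and flush_local_all are implemented as full flushes: remote
 * completion trivially provides local completion, so delegating to the
 * functions above is correct, if stronger than the MPI standard requires. */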
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    return ompi_osc_rdma_flush (target, win);
}

int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    return ompi_osc_rdma_flush_all (win);
}

/* locking via atomics */
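
/*
 * Lock word layout, as implied by the masks and increments used below (a
 * reading of the code, not an authoritative specification):
 *
 *  - each rank exposes a 64-bit local_lock word in its window state. Shared
 *    holders are counted in it, and ompi_osc_rdma_lock_try_acquire_exclusive
 *    claims the whole word for MPI_LOCK_EXCLUSIVE.
 *
 *  - the leader additionally exposes a 64-bit global_lock word for the
 *    two-level algorithm: the low 32 bits count ranks holding per-peer
 *    exclusive locks (incremented by 1) and the high 32 bits count lock_all
 *    holders (incremented by 0x0000000100000000). Each acquire fails if the
 *    other half is non-zero, which is what the 0xffffffff00000000 and
 *    0x00000000ffffffff masks check.
 */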
static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                      ompi_osc_rdma_sync_t *lock)
{
    const int locking_mode = module->locking_mode;
    int ret;

    if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) {
        do {
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global exclusive lock");
            if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
                /* lock the master lock. this requires no rank has a global shared lock */
                ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock),
                                                         0xffffffff00000000L);
                if (OMPI_SUCCESS != ret) {
                    ompi_osc_rdma_progress (module);
                    continue;
                }
            }

            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring exclusive lock on peer");
            ret = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
            if (ret) {
                /* release the global lock */
                if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
                    ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
                }

                ompi_osc_rdma_progress (module);
                continue;
            }

            peer->flags |= OMPI_OSC_RDMA_PEER_EXCLUSIVE;
            break;
        } while (1);
    } else {
        do {
            /* go right to the target to acquire a shared lock */
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global shared lock");

            ret = ompi_osc_rdma_lock_acquire_shared (module, peer, 1, offsetof (ompi_osc_rdma_state_t, local_lock),
                                                     OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
            if (OMPI_SUCCESS == ret) {
                return OMPI_SUCCESS;
            }

            ompi_osc_rdma_progress (module);
        } while (1);
    }

    return OMPI_SUCCESS;
}

static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ompi_osc_rdma_sync_t *lock)
{
    const int locking_mode = module->locking_mode;

    if (MPI_LOCK_EXCLUSIVE == lock->sync.lock.type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock on peer");
        ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
        if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == locking_mode) {
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock");
            ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
        }

        peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE;
    } else {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global shared lock");
        ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock));
        peer->flags &= ~OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;
    }

    return OMPI_SUCCESS;
}
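
/* On-demand locking support: in the on-demand locking mode a peer is locked
 * lazily by the first RMA operation that targets it within a lock_all epoch.
 * This helper takes the peer lock and records the peer on the sync object's
 * demand_locked_peers list so ompi_osc_rdma_unlock_all_atomic can release it
 * later. */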
int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
{
    ompi_osc_rdma_sync_t *lock = &module->all_sync;
    int ret = OMPI_SUCCESS;

    /* check for bad usage */
    assert (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == lock->type);

    OPAL_THREAD_SCOPED_LOCK(&peer->lock,
                            do {
                                if (!ompi_osc_rdma_peer_is_demand_locked (peer)) {
                                    ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
                                    OPAL_THREAD_SCOPED_LOCK(&lock->lock, opal_list_append (&lock->demand_locked_peers, &peer->super));
                                    peer->flags |= OMPI_OSC_RDMA_PEER_DEMAND_LOCKED;
                                }
                            } while (0);
                            );

    /* propagate the acquisition status instead of discarding it */
    return ret;
}

int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock: %d, %d, %d, %s", lock_type, target, assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    if (module->all_sync.epoch_active && (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type)) {
        /* impossible to get an exclusive lock while holding a global shared lock or in an active
         * target access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    /* clear the global sync object (in case MPI_Win_fence was called) */
    module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;

    /* create lock item */
    lock = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = target;
    lock->sync.lock.type = lock_type;
    lock->sync.lock.assert = assert;

    lock->peer_list.peer = peer;
    lock->num_peers = 1;
    OBJ_RETAIN(peer);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
    }

    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        ++module->passive_target_access_epoch;

        opal_atomic_wmb ();

        OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, lock));
    } else {
        OBJ_RELEASE(lock);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock %d complete", target);

    return ret;
}

int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock: %d, %s", target, win->w_name);

    lock = ompi_osc_rdma_module_lock_find (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "target %d is not locked in window %s",
                         target, win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    ompi_osc_rdma_module_lock_remove (module, lock);

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    if (!(lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_unlock_atomic_internal (module, peer, lock);
    }

    /* release our reference to this peer */
    OBJ_RELEASE(peer);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock %d complete", target);

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    /* delete the lock */
    ompi_osc_rdma_sync_return (lock);

    return ret;
}
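
/* MPI_Win_lock_all uses the module-wide all_sync object as its lock rather
 * than allocating a per-target sync object as MPI_Win_lock does above. */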
int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all: %d, %s", assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);
    if (module->all_sync.epoch_active) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted lock_all when active target epoch is %s "
                         "and lock all epoch is %s",
                         (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type && module->all_sync.epoch_active) ?
                         "active" : "inactive",
                         (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) ? "active" : "inactive");
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* set up lock */
    lock = &module->all_sync;

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = -1;
    lock->sync.lock.type = MPI_LOCK_SHARED;
    lock->sync.lock.assert = assert;
    lock->num_peers = ompi_comm_size (module->comm);

    lock->epoch_active = true;
    /* NTH: TODO -- like fence it might be a good idea to create an array to access all peers
     * without having to access the hash table. Such a change would likely increase performance
     * at the expense of memory usage. Ex. if a window has 1M peers then 8MB per process would
     * be needed for this array. */
    if (0 == (assert & MPI_MODE_NOCHECK)) {
        /* increment the global shared lock */
        if (OMPI_OSC_RDMA_LOCKING_TWO_LEVEL == module->locking_mode) {
            ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
                                                     offsetof(ompi_osc_rdma_state_t, global_lock),
                                                     0x00000000ffffffffUL);
        } else {
            /* always lock myself */
            ret = ompi_osc_rdma_demand_lock_peer (module, module->my_peer);
        }
    }

    /* failure is the unlikely path here; the original OPAL_LIKELY hint was inverted */
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
        lock->num_peers = 0;
        lock->epoch_active = false;
    } else {
        ++module->passive_target_access_epoch;
    }

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all complete");

    return ret;
}

int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all: %s", win->w_name);

    OPAL_THREAD_LOCK(&module->lock);

    lock = &module->all_sync;
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "not locked in window %s", win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        if (OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode) {
            ompi_osc_rdma_peer_t *peer, *next;

            /* drop all on-demand locks */
            OPAL_LIST_FOREACH_SAFE(peer, next, &lock->demand_locked_peers, ompi_osc_rdma_peer_t) {
                (void) ompi_osc_rdma_unlock_atomic_internal (module, peer, lock);
                opal_list_remove_item (&lock->demand_locked_peers, &peer->super);
            }
        } else {
            /* decrement the master lock shared count */
            (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL,
                                                      offsetof (ompi_osc_rdma_state_t, global_lock));
        }
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    lock->num_peers = 0;
    lock->epoch_active = false;

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all complete");

    return OMPI_SUCCESS;
}