1
1
openmpi/ompi/mca/osc/rdma/osc_rdma_passive_target.c
Nathan Hjelm d3d779f6d9 osc/rdma: clear all_sync object when obtaining a lock
This commit fixes a bad synchronization detection bug that occurs when
mixing MPI_Win_fence() and MPI_Win_lock(). If no communication has
occurred in the fence epoch it is safe to just clear the all_sync
object (it was set up by fence).

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2016-05-02 15:28:47 -06:00

384 строки
13 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma_passive_target.h"
#include "osc_rdma_comm.h"
#include "mpi.h"
int ompi_osc_rdma_sync (struct ompi_win_t *win)
{
ompi_osc_rdma_progress (GET_MODULE(win));
return OMPI_SUCCESS;
}
/**
 * Complete all outstanding RDMA operations directed at a single target.
 *
 * @param[in] target  rank of the target (must be non-negative; checked with assert only)
 * @param[in] win     window the operations were issued on
 *
 * @returns OMPI_SUCCESS once the target's sync object has been completed (or
 *          immediately after one progress call if the target is the calling rank)
 * @returns OMPI_ERR_RMA_SYNC if no sync object is found for the target or the
 *          one found is not a lock-type sync
 */
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer;
    ompi_osc_rdma_sync_t *target_sync;
    int target_locked;

    assert (0 <= target);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush: %d, %s", target, win->w_name);

    if (target == ompi_comm_rank (module->comm)) {
        /* flushing ourselves: there is nothing to wait for, just progress once */
        ompi_osc_rdma_progress (module);
        return OMPI_SUCCESS;
    }

    /* the lookup is done under the module lock; completion happens after it is dropped */
    OPAL_THREAD_LOCK(&module->lock);
    target_sync = ompi_osc_rdma_module_sync_lookup (module, target, &peer);
    target_locked = (NULL != target_sync && OMPI_OSC_RDMA_SYNC_TYPE_LOCK == target_sync->type);
    OPAL_THREAD_UNLOCK(&module->lock);

    if (OPAL_UNLIKELY(!target_locked)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "flush: target %d is not locked in window %s",
                         target, win->w_name);
        return OMPI_ERR_RMA_SYNC;
    }

    /* wait for every outstanding fragment attached to this sync object */
    ompi_osc_rdma_sync_rdma_complete (target_sync);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush on target %d complete", target);

    return OMPI_SUCCESS;
}
/**
 * Complete all outstanding RDMA operations on a window, for every target.
 *
 * Completes the module-wide sync object when it is a lock-type sync, then
 * walks the outstanding_locks hash table and completes each entry.
 *
 * @param[in] win  window to flush
 *
 * @returns OPAL_SUCCESS after all sync objects have been completed
 * @returns OMPI_ERR_RMA_SYNC if the module is not in a passive-target epoch
 */
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *entry;
    uint32_t entry_key;
    void *cursor;
    int rc;

    /* flushing is only meaningful while a passive-target epoch is open */
    if (!ompi_osc_rdma_in_passive_epoch (module)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all: %s", win->w_name);

    /* the module-wide sync object only needs completing when it is a lock-type sync */
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) {
        ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
    }

    /* complete every sync object stored in the outstanding_locks table */
    for (rc = opal_hash_table_get_first_key_uint32 (&module->outstanding_locks, &entry_key,
                                                    (void **) &entry, &cursor) ;
         OPAL_SUCCESS == rc ;
         rc = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, &entry_key,
                                                   (void **) &entry, cursor, &cursor)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "flushing lock %p", (void *) entry);
        ompi_osc_rdma_sync_rdma_complete (entry);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all complete");

    return OPAL_SUCCESS;
}
/**
 * Locally complete operations issued to one target.
 *
 * This implementation does not distinguish local from remote completion: it
 * forwards to ompi_osc_rdma_flush() and returns whatever that call returns.
 *
 * @param[in] target  rank of the target
 * @param[in] win     window the operations were issued on
 */
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    const int rc = ompi_osc_rdma_flush (target, win);

    return rc;
}
/**
 * Locally complete operations issued to all targets.
 *
 * This implementation does not distinguish local from remote completion: it
 * forwards to ompi_osc_rdma_flush_all() and returns whatever that call returns.
 *
 * @param[in] win  window to flush
 */
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    const int rc = ompi_osc_rdma_flush_all (win);

    return rc;
}
/* locking via atomics */
/**
 * Acquire the remote lock(s) backing a single-target lock, retrying until successful.
 *
 * Shared request: repeatedly tries to take a shared hold on the peer's
 * local_lock, progressing the module between attempts.
 *
 * Exclusive request: takes a shared hold (+1) on the leader's global_lock and
 * then tries the peer's local_lock exclusively. If the second step fails the
 * global hold is given back (-1) and the whole sequence is retried. On success
 * the peer is flagged with OMPI_OSC_RDMA_PEER_EXCLUSIVE.
 *
 * NOTE(review): the final argument of ompi_osc_rdma_lock_acquire_shared looks
 * like a mask of conflicting bits in the lock word -- confirm in its definition.
 *
 * @param[in] module  osc/rdma module
 * @param[in] peer    peer object for the target
 * @param[in] lock    sync object describing the requested lock (sync.lock.type is read)
 *
 * @returns OMPI_SUCCESS (the function only returns after the lock is held)
 */
static inline int ompi_osc_rdma_lock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                      ompi_osc_rdma_sync_t *lock)
{
    int rc;

    if (MPI_LOCK_EXCLUSIVE != lock->sync.lock.type) {
        for ( ; ; ) {
            /* go right to the target to acquire a shared lock */
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global shared lock");
            rc = ompi_osc_rdma_lock_acquire_shared (module, peer, 1, offsetof (ompi_osc_rdma_state_t, local_lock),
                                                    OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
            if (OMPI_SUCCESS == rc) {
                return OMPI_SUCCESS;
            }

            /* could not get it this time: make progress and try again */
            ompi_osc_rdma_progress (module);
        }
    }

    for ( ; ; ) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "incrementing global exclusive lock");
        /* lock the master lock. this requires no rank has a global shared lock */
        rc = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 1, offsetof (ompi_osc_rdma_state_t, global_lock), 0xffffffff00000000L);
        if (OMPI_SUCCESS == rc) {
            OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring exclusive lock on peer");
            rc = ompi_osc_rdma_lock_try_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));
            if (!rc) {
                /* both levels are held; remember that this peer is locked exclusively */
                peer->flags |= OMPI_OSC_RDMA_PEER_EXCLUSIVE;
                return OMPI_SUCCESS;
            }

            /* the peer lock is busy: give the global hold back before retrying */
            ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));
        }

        ompi_osc_rdma_progress (module);
    }
}
/**
 * Release the remote lock(s) backing a single-target lock.
 *
 * Shared lock: drops the shared hold (-1) on the peer's local_lock.
 * Exclusive lock: releases the peer's local_lock, drops the hold (-1) on the
 * leader's global_lock, and clears OMPI_OSC_RDMA_PEER_EXCLUSIVE on the peer.
 *
 * @param[in] module  osc/rdma module
 * @param[in] peer    peer object for the target
 * @param[in] lock    sync object describing the held lock (sync.lock.type is read)
 *
 * @returns OMPI_SUCCESS unconditionally; results of the release calls are not checked
 */
static inline int ompi_osc_rdma_unlock_atomic_internal (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer,
                                                        ompi_osc_rdma_sync_t *lock)
{
    if (MPI_LOCK_EXCLUSIVE != lock->sync.lock.type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global shared lock");
        ompi_osc_rdma_lock_release_shared (module, peer, -1, offsetof (ompi_osc_rdma_state_t, local_lock));

        return OMPI_SUCCESS;
    }

    /* exclusive: undo the two acquisitions in reverse order (peer first, then leader) */
    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock on peer");
    ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, local_lock));

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "decrementing global exclusive lock");
    ompi_osc_rdma_lock_release_shared (module, module->leader, -1, offsetof (ompi_osc_rdma_state_t, global_lock));

    peer->flags &= ~OMPI_OSC_RDMA_PEER_EXCLUSIVE;

    return OMPI_SUCCESS;
}
/**
 * Begin a passive-target access epoch on a single target.
 *
 * A new lock-type sync object is allocated for the target, the remote lock is
 * acquired (unless MPI_MODE_NOCHECK is asserted), and the sync object is
 * inserted into the module's lock table.
 *
 * @param[in] lock_type  requested lock type (MPI_LOCK_EXCLUSIVE or shared)
 * @param[in] target     rank to lock
 * @param[in] assert     assertion flags; MPI_MODE_NOCHECK skips the remote acquisition
 * @param[in] win        window to lock
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if the window was created with no_locks, if an
 *          active-target epoch is in progress, or if an exclusive lock is
 *          requested while a lock_all epoch is active
 * @returns OMPI_ERR_OUT_OF_RESOURCE if no sync object could be allocated
 */
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_peer_t *peer = ompi_osc_rdma_module_peer (module, target);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock: %d, %d, %d, %s", lock_type, target, assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    if (module->all_sync.epoch_active && (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type || MPI_LOCK_EXCLUSIVE == lock_type)) {
        /* impossible to get an exclusive lock while holding a global shared lock or in an active
         * target access epoch */
        return OMPI_ERR_RMA_SYNC;
    }

    /* clear the global sync object (in case MPI_Win_fence was called). the check above lets a
     * shared lock through while a lock_all epoch is active; in that case all_sync is still in
     * use as the lock_all sync object and must be left alone, otherwise a later unlock_all
     * would no longer find it locked. */
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type) {
        module->all_sync.type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    }

    /* create lock item */
    lock = ompi_osc_rdma_sync_allocate (module);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = target;
    lock->sync.lock.type = lock_type;
    lock->sync.lock.assert = assert;

    /* the sync object holds a reference to its single peer; it is dropped again in unlock */
    lock->peer_list.peer = peer;
    lock->num_peers = 1;
    OBJ_RETAIN(peer);

    /* with MPI_MODE_NOCHECK the caller guarantees no conflicting lock, so skip the remote lock */
    if (0 == (assert & MPI_MODE_NOCHECK)) {
        ret = ompi_osc_rdma_lock_atomic_internal (module, peer, lock);
    }

    if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
        ++module->passive_target_access_epoch;

        /* make the epoch count visible before the lock can be found in the table */
        opal_atomic_wmb ();

        OPAL_THREAD_SCOPED_LOCK(&module->lock, ompi_osc_rdma_module_lock_insert (module, lock));
    } else {
        /* NOTE(review): the peer reference retained above is not explicitly dropped here --
         * confirm whether releasing the sync object takes care of it. */
        OBJ_RELEASE(lock);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock %d complete", target);

    return ret;
}
/**
 * End a passive-target access epoch on a single target.
 *
 * Under the module lock: finds and removes the target's lock from the lock
 * table, completes its outstanding RDMA operations, releases the remote lock
 * (unless the lock was taken with MPI_MODE_NOCHECK), drops the peer reference
 * and decrements the passive-target epoch count. The sync object is handed
 * back with ompi_osc_rdma_sync_return() after the module lock is released.
 *
 * @param[in] target  rank that was locked
 * @param[in] win     window that was locked
 *
 * @returns result of the remote unlock, or OMPI_SUCCESS when it was skipped
 * @returns OMPI_ERR_RMA_SYNC if the target is not locked in this window
 */
int ompi_osc_rdma_unlock_atomic (int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *held_lock;
    ompi_osc_rdma_peer_t *peer;
    int rc = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock: %d, %s", target, win->w_name);

    held_lock = ompi_osc_rdma_module_lock_find (module, target, &peer);
    if (OPAL_UNLIKELY(NULL == held_lock)) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "target %d is not locked in window %s",
                         target, win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* take the lock out of the table first, then drain everything issued under it */
    ompi_osc_rdma_module_lock_remove (module, held_lock);
    ompi_osc_rdma_sync_rdma_complete (held_lock);

    /* a NOCHECK lock never acquired the remote lock, so there is nothing to release */
    if (0 == (held_lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        rc = ompi_osc_rdma_unlock_atomic_internal (module, peer, held_lock);
    }

    /* drop the reference to the peer that was retained when the lock was created */
    OBJ_RELEASE(peer);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock %d complete", target);

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    /* give the sync object back now that nothing references it */
    ompi_osc_rdma_sync_return (held_lock);

    return rc;
}
/**
 * Begin a passive-target access epoch on all targets of a window (shared lock).
 *
 * The module-wide all_sync object is turned into a lock-type sync covering
 * every rank of the module's communicator. Unless MPI_MODE_NOCHECK is
 * asserted, a shared hold is taken on the leader's global_lock by adding
 * 0x0000000100000000 to it (single-target exclusive locks add 1 to the same
 * word instead).
 *
 * @param[in] assert  assertion flags; MPI_MODE_NOCHECK skips the global lock acquisition
 * @param[in] win     window to lock
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if the window was created with no_locks or an
 *          epoch is already active on all_sync
 * @returns the error from ompi_osc_rdma_lock_acquire_shared if the global lock
 *          could not be taken (all_sync is reset in that case)
 */
int ompi_osc_rdma_lock_all_atomic (int assert, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all: %d, %s", assert, win->w_name);

    if (module->no_locks) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted to lock with no_locks set");
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);
    if (module->all_sync.epoch_active) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "attempted lock_all when active target epoch is %s "
                         "and lock all epoch is %s",
                         (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != module->all_sync.type && module->all_sync.epoch_active) ?
                         "active" : "inactive",
                         (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) ? "active" : "inactive");
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* set up lock */
    lock = &module->all_sync;

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_LOCK;
    lock->sync.lock.target = -1;
    lock->sync.lock.type = MPI_LOCK_SHARED;
    lock->sync.lock.assert = assert;
    lock->num_peers = ompi_comm_size (module->comm);

    lock->epoch_active = true;
    /* NTH: TODO -- like fence it might be a good idea to create an array to access all peers
     * without having to access the hash table. Such a change would likely increase performance
     * at the expense of memory usage. Ex. if a window has 1M peers then 8MB per process would
     * be needed for this array. */

    /* MPI_MODE_NOCHECK means the caller guarantees that no conflicting lock exists, so the
     * global lock only has to be taken when the assertion is NOT given (same polarity as the
     * single-target lock path). */
    if (0 == (assert & MPI_MODE_NOCHECK)) {
        /* increment the global shared lock */
        ret = ompi_osc_rdma_lock_acquire_shared (module, module->leader, 0x0000000100000000UL,
                                                 offsetof(ompi_osc_rdma_state_t, global_lock),
                                                 0x00000000ffffffffUL);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* acquisition failed: put all_sync back into its unused state */
        lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
        lock->num_peers = 0;
        lock->epoch_active = false;
    } else {
        ++module->passive_target_access_epoch;
    }

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "lock_all complete");

    return ret;
}

/**
 * End the passive-target access epoch started by ompi_osc_rdma_lock_all_atomic().
 *
 * Completes all RDMA operations outstanding on all_sync, gives back the hold on
 * the leader's global_lock (unless the epoch was opened with MPI_MODE_NOCHECK,
 * in which case it was never taken), and resets all_sync to its unused state.
 *
 * @param[in] win  window to unlock
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_RMA_SYNC if all_sync is not currently a lock-type sync
 */
int ompi_osc_rdma_unlock_all_atomic (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all: %s", win->w_name);

    OPAL_THREAD_LOCK(&module->lock);

    lock = &module->all_sync;
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK != lock->type) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "not locked in window %s", win->w_name);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* finish all outstanding fragments */
    ompi_osc_rdma_sync_rdma_complete (lock);

    /* must mirror the acquisition in lock_all: the global lock was only taken when
     * MPI_MODE_NOCHECK was not asserted */
    if (0 == (lock->sync.lock.assert & MPI_MODE_NOCHECK)) {
        /* decrement the master lock shared count */
        (void) ompi_osc_rdma_lock_release_shared (module, module->leader, -0x0000000100000000UL, offsetof (ompi_osc_rdma_state_t, global_lock));
    }

    lock->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
    lock->num_peers = 0;
    lock->epoch_active = false;

    --module->passive_target_access_epoch;

    opal_atomic_wmb ();

    OPAL_THREAD_UNLOCK(&module->lock);

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "unlock_all complete");

    return OMPI_SUCCESS;
}