1b564f62bd
This reverts commit ccaecf0fd6c862877e6a1e2643f95fa956c87769, reversing changes made to 6a19bf85dde5306f559f09952cf3919d97f52502.
967 строки
33 KiB
C
967 строки
33 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2010 IBM Corporation. All rights reserved.
|
|
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
|
|
#include "osc_rdma.h"
|
|
#include "osc_rdma_header.h"
|
|
#include "osc_rdma_data_move.h"
|
|
#include "osc_rdma_frag.h"
|
|
|
|
#include "mpi.h"
|
|
#include "opal/runtime/opal_progress.h"
|
|
#include "opal/threads/mutex.h"
|
|
#include "ompi/communicator/communicator.h"
|
|
#include "ompi/mca/osc/base/base.h"
|
|
#include "opal/include/opal_stdint.h"
|
|
|
|
/* target-side tracking of a lock request */
|
|
struct ompi_osc_rdma_pending_lock_t {
|
|
opal_list_item_t super;
|
|
int peer;
|
|
int lock_type;
|
|
uint64_t serial_number;
|
|
};
|
|
typedef struct ompi_osc_rdma_pending_lock_t ompi_osc_rdma_pending_lock_t;
|
|
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_lock_t, opal_list_item_t,
|
|
NULL, NULL);
|
|
|
|
|
|
/* origin-side tracking of a lock request */
|
|
struct ompi_osc_rdma_outstanding_lock_t {
|
|
opal_list_item_t super;
|
|
int target;
|
|
int32_t lock_acks_received;
|
|
int32_t unlock_acks_received;
|
|
int32_t flush_acks_received;
|
|
uint64_t serial_number;
|
|
int32_t type;
|
|
};
|
|
typedef struct ompi_osc_rdma_outstanding_lock_t ompi_osc_rdma_outstanding_lock_t;
|
|
OBJ_CLASS_INSTANCE(ompi_osc_rdma_outstanding_lock_t, opal_list_item_t,
|
|
NULL, NULL);
|
|
|
|
/* Forward declarations for helpers that are defined near the end of
 * this file but used by the self lock/unlock helpers below. */
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
                              int lock_type, uint64_t serial_number);
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module);
|
|
|
|
/**
|
|
* Find the first outstanding lock to a target.
|
|
*
|
|
* @param[in] module - OSC RDMA module
|
|
* @param[in] target - Target rank
|
|
*
|
|
* @returns an outstanding lock on success
|
|
*
|
|
* This function traverses the outstanding_locks list in the module
|
|
* looking for a lock that matches target. The caller must hold the
|
|
* module lock.
|
|
*/
|
|
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock (ompi_osc_rdma_module_t *module, int target)
|
|
{
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
|
|
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
|
|
if (lock->target == target) {
|
|
return lock;
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/**
 * Locate an outstanding lock by its serial number.
 *
 * @param[in] module        - OSC RDMA module
 * @param[in] serial_number - Serial number assigned when the lock was created
 *
 * @returns the first outstanding lock with that serial number, or NULL
 *
 * Walks module->outstanding_locks in list order.
 */
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_rdma_module_t *module, uint64_t serial_number)
{
    ompi_osc_rdma_outstanding_lock_t *match = NULL;
    ompi_osc_rdma_outstanding_lock_t *item;

    OPAL_LIST_FOREACH(item, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (serial_number == item->serial_number) {
            match = item;
            break;
        }
    }

    return match;
}
|
|
|
|
/**
 * Acquire the window lock on the local rank.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] lock   - Outstanding lock describing the request
 *
 * @returns OMPI_SUCCESS once the local lock is held, or the error
 *          returned by queue_lock() if the request could not be queued
 *
 * If the requested type is compatible with the current lock_status the
 * lock is taken immediately; otherwise the request is queued and this
 * function waits on the module condition until the lock is activated.
 * module->lock is handed to opal_condition_wait(), so the caller is
 * expected to hold it.
 */
static inline int ompi_osc_rdma_lock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    const int my_rank = ompi_comm_rank (module->comm);

    if ((MPI_LOCK_SHARED == lock->type && MPI_LOCK_EXCLUSIVE != module->lock_status) ||
        (MPI_LOCK_EXCLUSIVE == lock->type && 0 == module->lock_status)) {
        /* we can acquire the lock immediately */
        module->lock_status = lock->type;
        if (MPI_LOCK_SHARED == lock->type) {
            module->shared_count++;
        }

        lock->lock_acks_received++;
    } else {
        /* queue the lock; bail out on failure, otherwise the wait below
           would never be satisfied because nothing would ever activate
           the request */
        int ret = queue_lock (module, my_rank, lock->type, lock->serial_number);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }
    }

    /* If locking local, can't be non-blocking according to the
       standard.  We need to wait for the ack here. */
    while (0 == lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "local lock aquired"));

    return OMPI_SUCCESS;
}
|
|
|
|
/**
 * Release the window lock held on the local rank.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] lock   - Outstanding lock being released
 *
 * For a shared lock the shared holder count is decremented and the lock
 * only becomes free when that count reaches zero; any other lock type
 * frees the lock right away. This mirrors the release rule used by
 * ompi_osc_rdma_process_unlock() for remote holders. When the lock
 * becomes free, pending lock requests are activated. Finally the
 * release is recorded as one unlock acknowledgement on the lock.
 */
static inline void ompi_osc_rdma_unlock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    /* the previous condition was inverted for shared locks: it cleared
       lock_status while other shared holders remained and left it set
       when the last shared holder went away */
    if (MPI_LOCK_SHARED != lock->type || 0 == --module->shared_count) {
        module->lock_status = 0;
        ompi_osc_activate_next_lock (module);
    }

    /* need to ensure we make progress */
    opal_progress();

    lock->unlock_acks_received++;
}
|
|
|
|
/**
 * Send a lock request to a remote target.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] target - Rank to send the request to
 * @param[in] lock   - Outstanding lock supplying the type and serial number
 *
 * @returns OMPI_SUCCESS or the error from the control send / fragment flush
 *
 * The request is handed to ompi_osc_rdma_control_send() and the
 * target's fragments are then flushed so the request actually goes out.
 */
static inline int ompi_osc_rdma_lock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_lock_t request;
    int rc;

    /* build the lock request header */
    request.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ;
    request.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    request.serial_number = lock->serial_number;
    request.lock_type = lock->type;

    rc = ompi_osc_rdma_control_send (module, target, &request, sizeof (request));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    /* make sure the request gets sent, so we can start eager sending... */
    return ompi_osc_rdma_frag_flush_target (module, target);
}
|
|
|
|
/**
 * Send an unlock request to a remote target.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] target - Rank to send the request to
 * @param[in] lock   - Outstanding lock being released (supplies the type)
 *
 * @returns the result of ompi_osc_rdma_control_send()
 *
 * The request carries the number of fragments sent to the target during
 * this epoch, read from module->epoch_outgoing_frag_count[target].
 */
static inline int ompi_osc_rdma_unlock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_unlock_t request;

    request.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    request.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ;
    request.lock_type = lock->type;
    request.frag_count = module->epoch_outgoing_frag_count[target];

    /* send control message with unlock request and count */
    return ompi_osc_rdma_control_send (module, target, &request, sizeof (request));
}
|
|
|
|
|
|
|
|
/**
 * Start a passive-target lock on a single target rank.
 *
 * @param[in] lock_type - Lock type to request (stored in the lock's type field)
 * @param[in] target    - Rank to lock
 * @param[in] assert    - Assertion flags; MPI_MODE_NOCHECK skips the lock protocol
 * @param[in] win       - Window being locked
 *
 * @returns OMPI_SUCCESS on success; OMPI_ERR_RMA_SYNC if the window has
 *          no passive-target state or a start/complete group is active;
 *          OMPI_ERR_OUT_OF_RESOURCE if the lock item cannot be
 *          allocated; otherwise the error from the lock helper
 *
 * Creates an outstanding-lock item, marks the peer as being in an
 * access epoch and either sends a lock request to the target or takes
 * the local lock when target is this rank.
 */
int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    /* Check if no_locks is set. TODO: we also need to track whether we are in an
     * active target epoch. Fence can make this tricky to track. */
    if (NULL == module->passive_eager_send_active || module->sc_group) {
        return OMPI_ERR_RMA_SYNC;
    }

    assert(module->epoch_outgoing_frag_count[target] == 0);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: lock %d %d", target, lock_type));

    /* create the lock item before taking the module lock or touching
       any window state, so an allocation failure leaves nothing to
       undo (the old code returned with the module lock still held) */
    lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* delay all eager sends until we've heard back.. */
    OPAL_THREAD_LOCK(&module->lock);
    module->passive_eager_send_active[target] = false;
    module->passive_target_access_epoch = true;

    /* when the lock ack returns we will be in an access epoch with this peer */
    peer->access_epoch = true;

    lock->target = target;
    lock->lock_acks_received = 0;
    lock->unlock_acks_received = 0;
    lock->flush_acks_received = 0;
    lock->serial_number = module->lock_serial_number++;
    lock->type = lock_type;
    opal_list_append(&module->outstanding_locks, &lock->super);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        if (ompi_comm_rank (module->comm) != target) {
            ret = ompi_osc_rdma_lock_remote (module, target, lock);
        } else {
            ret = ompi_osc_rdma_lock_self (module, lock);
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto exit_error;
        }
    } else {
        /* no lock protocol requested: treat the lock as already acknowledged */
        lock->lock_acks_received = 1;
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;

 exit_error:
    /* take the item off the list while the module lock is still held;
       other code paths walk outstanding_locks under that lock */
    opal_list_remove_item(&module->outstanding_locks, &lock->super);
    OPAL_THREAD_UNLOCK(&module->lock);
    OBJ_RELEASE(lock);

    /* NOTE(review): the epoch flags set above are not rolled back on
       failure, matching the previous behaviour -- confirm whether they
       should be. */
    return ret;
}
|
|
|
|
|
|
/**
 * End the passive-target lock held on a single target rank.
 *
 * @param[in] target - Rank whose lock is being released
 * @param[in] win    - Window the lock belongs to
 *
 * @returns OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if no outstanding
 *          lock exists for target, or the error from sending the unlock
 *          request / flushing fragments
 *
 * For a remote target this waits for the lock ack, sends the unlock
 * request, flushes queued fragments and then waits until all outgoing
 * fragments are signalled and the unlock ack has been counted. For the
 * local rank the lock is released directly. On success the lock item
 * is removed from the outstanding list and released.
 */
int ompi_osc_rdma_unlock(int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock = NULL;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    lock = find_outstanding_lock (module, target);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        /* leave through cleanup so the module lock is released; the old
           code locked it a second time here instead of unlocking */
        ret = OMPI_ERR_RMA_SYNC;
        goto cleanup;
    }

    if (ompi_comm_rank (module->comm) != target) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: unlock %d, lock_acks_received = %d", target,
                             lock->lock_acks_received));

        /* wait until ack has arrived from target */
        while (0 == lock->lock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        ret = ompi_osc_rdma_unlock_remote (module, target, lock);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_rdma_frag_flush_target(module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* wait for all the requests and the unlock ack (meaning remote completion) */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
               0 == lock->unlock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: unlock of %d complete", target));
    } else {
        ompi_osc_rdma_unlock_self (module, lock);
    }

    /* the epoch with this target is over: reset the per-target state */
    module->passive_eager_send_active[target] = false;
    module->epoch_outgoing_frag_count[target] = 0;
    module->passive_target_access_epoch = false;

    peer->access_epoch = false;

    /* delete the lock */
    opal_list_remove_item (&module->outstanding_locks, &lock->super);
    OBJ_RELEASE(lock);

 cleanup:
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
|
|
|
|
|
|
int ompi_osc_rdma_lock_all(int assert, struct ompi_win_t *win)
|
|
{
|
|
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
|
int ret, my_rank = ompi_comm_rank (module->comm);
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
|
|
/* Check if no_locks is set. TODO: we also need to track whether we are in an active
|
|
* target epoch. Fence can make this tricky to track. */
|
|
if (NULL == module->passive_eager_send_active) {
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
/* delay all eager sends until we've heard back.. */
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
|
module->passive_eager_send_active[i] = false;
|
|
}
|
|
module->passive_target_access_epoch = true;
|
|
module->all_access_epoch = true;
|
|
|
|
/* create lock item */
|
|
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
|
|
lock->target = -1;
|
|
lock->lock_acks_received = 0;
|
|
lock->unlock_acks_received = 0;
|
|
lock->serial_number = module->lock_serial_number++;
|
|
lock->type = MPI_LOCK_SHARED;
|
|
opal_list_append(&module->outstanding_locks, &lock->super);
|
|
|
|
/* if nocheck is not specified, send a lock request to everyone
|
|
and wait for the local response */
|
|
if (0 != (assert & MPI_MODE_NOCHECK)) {
|
|
ret = ompi_osc_rdma_lock_self (module, lock);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
goto exit_error;
|
|
}
|
|
|
|
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
|
if (my_rank == i) {
|
|
continue;
|
|
}
|
|
|
|
ret = ompi_osc_rdma_lock_remote (module, i, lock);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
opal_list_remove_item(&module->outstanding_locks, &lock->super);
|
|
}
|
|
}
|
|
} else {
|
|
lock->lock_acks_received = ompi_comm_size(module->comm);
|
|
}
|
|
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
exit_error:
|
|
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
opal_list_remove_item(&module->outstanding_locks, &lock->super);
|
|
OBJ_RELEASE(lock);
|
|
|
|
/* return */
|
|
return ret;
|
|
}
|
|
|
|
|
|
int ompi_osc_rdma_unlock_all (struct ompi_win_t *win)
|
|
{
|
|
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
|
int my_rank = ompi_comm_rank (module->comm);
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_unlock_all entering..."));
|
|
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
|
|
lock = find_outstanding_lock (module, -1);
|
|
if (OPAL_UNLIKELY(NULL == lock)) {
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_unlock_all: not locked in window %s",
|
|
win->w_name));
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
/* wait for lock acks */
|
|
while (ompi_comm_size(module->comm) != lock->lock_acks_received) {
|
|
opal_condition_wait(&module->cond, &module->lock);
|
|
}
|
|
|
|
/* send unlock messages to all of my peers */
|
|
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
|
if (my_rank == i) {
|
|
continue;
|
|
}
|
|
|
|
ret = ompi_osc_rdma_unlock_remote (module, i, lock);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
goto cleanup;
|
|
}
|
|
}
|
|
|
|
/* unlock myself */
|
|
ompi_osc_rdma_unlock_self (module, lock);
|
|
|
|
/* start all sendreqs to target */
|
|
ret = ompi_osc_rdma_frag_flush_all(module);
|
|
if (OMPI_SUCCESS != ret) goto cleanup;
|
|
|
|
/* wait for all the requests and the unlock ack (meaning remote completion) */
|
|
while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
|
|
ompi_comm_size(module->comm) != lock->unlock_acks_received) {
|
|
opal_condition_wait(&module->cond, &module->lock);
|
|
}
|
|
|
|
/* reset all fragment counters */
|
|
memset (module->epoch_outgoing_frag_count, 0, ompi_comm_size(module->comm) * sizeof (module->epoch_outgoing_frag_count[0]));
|
|
memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * sizeof (module->passive_eager_send_active[0]));
|
|
|
|
opal_list_remove_item (&module->outstanding_locks, &lock->super);
|
|
OBJ_RELEASE(lock);
|
|
|
|
module->passive_target_access_epoch = false;
|
|
module->all_access_epoch = false;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_unlock_all complete"));
|
|
|
|
cleanup:
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
int ompi_osc_rdma_sync (struct ompi_win_t *win)
|
|
{
|
|
opal_progress();
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock,
|
|
int target)
|
|
{
|
|
ompi_osc_rdma_header_flush_t flush_req;
|
|
int peer_count, ret, flush_count;
|
|
int my_rank = ompi_comm_rank (module->comm);
|
|
|
|
if (-1 == lock->target) {
|
|
peer_count = ompi_comm_size(module->comm);
|
|
} else {
|
|
peer_count = 1;
|
|
}
|
|
|
|
/* wait until ack has arrived from target, since we need to be
|
|
able to eager send before we can transfer all the data... */
|
|
while (peer_count > lock->lock_acks_received) {
|
|
opal_condition_wait(&module->cond, &module->lock);
|
|
}
|
|
|
|
lock->flush_acks_received = 0;
|
|
|
|
flush_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ;
|
|
flush_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
|
|
flush_req.serial_number = lock->serial_number;
|
|
|
|
if (-1 == target) {
|
|
/* NTH: no local flush */
|
|
flush_count = ompi_comm_size(module->comm) - 1;
|
|
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
|
|
if (i == my_rank) {
|
|
continue;
|
|
}
|
|
|
|
flush_req.frag_count = module->epoch_outgoing_frag_count[i];
|
|
|
|
/* send control message with flush request and count */
|
|
ret = ompi_osc_rdma_control_send (module, i, &flush_req, sizeof (flush_req));
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
/* start all sendreqs to target */
|
|
ret = ompi_osc_rdma_frag_flush_target (module, i);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
}
|
|
} else {
|
|
flush_req.frag_count = module->epoch_outgoing_frag_count[target];
|
|
flush_count = 1;
|
|
/* send control message with flush request and count */
|
|
ret = ompi_osc_rdma_control_send (module, target, &flush_req, sizeof (flush_req));
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
/* start all sendreqs to target */
|
|
ret = ompi_osc_rdma_frag_flush_target (module, target);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
/* wait for all the requests and the flush ack (meaning remote completion) */
|
|
while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
|
|
flush_count != lock->flush_acks_received) {
|
|
opal_condition_wait(&module->cond, &module->lock);
|
|
}
|
|
|
|
if (-1 == target) {
|
|
memset (module->epoch_outgoing_frag_count, 0, peer_count * sizeof (module->epoch_outgoing_frag_count[0]));
|
|
} else {
|
|
module->epoch_outgoing_frag_count[target] = 0;
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
|
|
{
|
|
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
int ret;
|
|
|
|
assert (0 <= target);
|
|
|
|
/* flush is only allowed from within a passive target epoch */
|
|
if (!module->passive_target_access_epoch) {
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_flush starting..."));
|
|
|
|
if (ompi_comm_rank (module->comm) == target) {
|
|
/* nothing to flush */
|
|
opal_progress ();
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
|
|
lock = find_outstanding_lock (module, target);
|
|
if (NULL == lock) {
|
|
lock = find_outstanding_lock (module, -1);
|
|
}
|
|
if (OPAL_UNLIKELY(NULL == lock)) {
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_flush: target %d is not locked in window %s",
|
|
target, win->w_name));
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
ret = ompi_osc_rdma_flush_lock (module, lock, target);
|
|
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
|
|
{
|
|
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
/* flush is only allowed from within a passive target epoch */
|
|
if (!module->passive_target_access_epoch) {
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
if (OPAL_UNLIKELY(0 == opal_list_get_size (&module->outstanding_locks))) {
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_flush_all: no targets are locked in window %s",
|
|
win->w_name));
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_flush_all entering..."));
|
|
|
|
/* flush all locks */
|
|
OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
|
|
ret = ompi_osc_rdma_flush_lock (module, lock, lock->target);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_flush_all complete"));
|
|
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
|
|
{
|
|
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
|
int ret;
|
|
|
|
/* flush is only allowed from within a passive target epoch */
|
|
if (!module->passive_target_access_epoch) {
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
|
|
ret = ompi_osc_rdma_frag_flush_target(module, target);
|
|
if (OMPI_SUCCESS != ret) goto cleanup;
|
|
|
|
/* wait for all the requests */
|
|
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
|
|
opal_condition_wait(&module->cond, &module->lock);
|
|
}
|
|
|
|
cleanup:
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
|
|
{
|
|
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
/* flush is only allowed from within a passive target epoch */
|
|
if (!module->passive_target_access_epoch) {
|
|
return OMPI_ERR_RMA_SYNC;
|
|
}
|
|
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
|
|
ret = ompi_osc_rdma_frag_flush_all(module);
|
|
if (OMPI_SUCCESS != ret) goto cleanup;
|
|
|
|
/* wait for all the requests */
|
|
while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
|
|
opal_condition_wait(&module->cond, &module->lock);
|
|
}
|
|
|
|
cleanup:
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* target side operation to acknowledge to initiator side that the
|
|
lock is now held by the initiator */
|
|
static inline int activate_lock (ompi_osc_rdma_module_t *module, int requestor,
|
|
uint64_t serial_number)
|
|
{
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
|
|
if (ompi_comm_rank (module->comm) != requestor) {
|
|
ompi_osc_rdma_header_lock_ack_t lock_ack;
|
|
|
|
lock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK;
|
|
lock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
|
lock_ack.source = ompi_comm_rank(module->comm);
|
|
lock_ack.windx = ompi_comm_get_cid(module->comm);
|
|
lock_ack.serial_number = serial_number;
|
|
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: sending lock to %d", requestor));
|
|
|
|
/* we don't want to send any data, since we're the exposure
|
|
epoch only, so use an unbuffered send */
|
|
return ompi_osc_rdma_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
|
|
}
|
|
|
|
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: releasing local lock"));
|
|
|
|
lock = find_outstanding_lock (module, requestor);
|
|
if (NULL == lock) {
|
|
lock = find_outstanding_lock (module, -1);
|
|
if (OPAL_UNLIKELY(NULL == lock)) {
|
|
OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
|
|
"lock could not be located"));
|
|
}
|
|
}
|
|
|
|
lock->lock_acks_received++;
|
|
opal_condition_broadcast (&module->cond);
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
|
|
/* target side operation to create a pending lock request for a lock
|
|
request that could not be satisfied */
|
|
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
|
|
int lock_type, uint64_t serial_number)
|
|
{
|
|
ompi_osc_rdma_pending_lock_t *pending =
|
|
OBJ_NEW(ompi_osc_rdma_pending_lock_t);
|
|
if (NULL == pending) {
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
pending->peer = requestor;
|
|
pending->lock_type = lock_type;
|
|
pending->serial_number = serial_number;
|
|
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: queueing lock request from %d", requestor));
|
|
|
|
opal_list_append(&module->locks_pending, &pending->super);
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/**
 * Hand the lock to as many queued requests as can hold it together.
 *
 * @param[in] module - OSC RDMA module
 *
 * @returns OMPI_SUCCESS, or the error from the last activate_lock() call
 *
 * Pending requests are taken in queue order. Each shared request at
 * the head of the queue is granted. The first non-shared request is
 * granted only if the lock is currently free, and in either case ends
 * the scan. Granted requests are removed from module->locks_pending
 * and released.
 */
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module) {
    ompi_osc_rdma_pending_lock_t *request, *following;
    int rc = OMPI_SUCCESS;

    /* release any other pending locks we can */
    OPAL_LIST_FOREACH_SAFE(request, following, &module->locks_pending,
                           ompi_osc_rdma_pending_lock_t) {
        if (MPI_LOCK_SHARED != request->lock_type) {
            if (0 == module->lock_status) {
                OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                                     "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_EXCLUSIVE to peer %d\n",
                                     request->peer));
                /* acquire exclusive lock */
                module->lock_status = MPI_LOCK_EXCLUSIVE;
                rc = activate_lock(module, request->peer, request->serial_number);
                opal_list_remove_item (&module->locks_pending, &request->super);
                OBJ_RELEASE(request);
            }

            /* whether or not the exclusive lock was granted we stop
               here: if it was, nobody else may hold the lock; if it
               was not, the shared holders have to finish first */
            break;
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_SHARED to peer %d\n",
                             request->peer));
        /* acquire shared lock */
        module->lock_status = MPI_LOCK_SHARED;
        module->shared_count++;
        rc = activate_lock(module, request->peer, request->serial_number);

        opal_list_remove_item (&module->locks_pending, &request->super);
        OBJ_RELEASE(request);

        if (OMPI_SUCCESS != rc) {
            break;
        }
    }

    return rc;
}
|
|
|
|
|
|
/* target side function called when the initiator sends a lock
|
|
request. Lock will either be activated and acknowledged or
|
|
queued. */
|
|
int ompi_osc_rdma_process_lock (ompi_osc_rdma_module_t* module, int source,
|
|
ompi_osc_rdma_header_lock_t* lock_header)
|
|
{
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_process_lock: processing lock request from %d. current lock state = %d, shared_count = %d",
|
|
source, module->lock_status, module->shared_count));
|
|
|
|
if (MPI_LOCK_SHARED == lock_header->lock_type) {
|
|
if (module->lock_status != MPI_LOCK_EXCLUSIVE) {
|
|
/* acquire shared lock */
|
|
module->lock_status = MPI_LOCK_SHARED;
|
|
module->shared_count++;
|
|
ret = activate_lock(module, source, lock_header->serial_number);
|
|
} else {
|
|
/* lock not available, queue */
|
|
ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
|
|
}
|
|
} else {
|
|
if (0 == module->lock_status) {
|
|
/* acquire exclusive lock */
|
|
module->lock_status = MPI_LOCK_EXCLUSIVE;
|
|
ret = activate_lock(module, source, lock_header->serial_number);
|
|
} else {
|
|
/* lock not available, queue */
|
|
ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
/* initiator-side function called when the target acks the lock
|
|
request. */
|
|
void ompi_osc_rdma_process_lock_ack (ompi_osc_rdma_module_t *module,
|
|
ompi_osc_rdma_header_lock_ack_t *lock_ack_header)
|
|
{
|
|
ompi_osc_rdma_outstanding_lock_t *lock, *next;
|
|
|
|
OPAL_LIST_FOREACH_SAFE(lock, next, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
|
|
if (lock->serial_number == lock_ack_header->serial_number) {
|
|
|
|
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: lock ack %d", lock_ack_header->source));
|
|
|
|
lock->lock_acks_received++;
|
|
module->passive_eager_send_active[lock_ack_header->source] = true;
|
|
return;
|
|
}
|
|
}
|
|
|
|
opal_output(ompi_osc_base_framework.framework_output,
|
|
"osc rdma: lock ack %d, %ld for unfindable lock request",
|
|
lock_ack_header->source, (unsigned long) lock_ack_header->serial_number);
|
|
}
|
|
|
|
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_flush_ack_t *flush_ack_header) {
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_process_unlock_ack: processing flush ack from %d for lock %" PRIu64,
|
|
source, flush_ack_header->serial_number));
|
|
|
|
/* NTH: need to verify that this will work as expected */
|
|
lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
|
|
assert (NULL != lock);
|
|
|
|
lock->flush_acks_received++;
|
|
|
|
opal_condition_broadcast(&module->cond);
|
|
}
|
|
|
|
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header) {
|
|
ompi_osc_rdma_outstanding_lock_t *lock;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_process_unlock_ack: processing unlock ack from %d",
|
|
source));
|
|
|
|
/* NTH: need to verify that this will work as expected */
|
|
lock = find_outstanding_lock (module, source);
|
|
if (NULL == lock) {
|
|
lock = find_outstanding_lock(module, -1);
|
|
assert (NULL != lock);
|
|
}
|
|
|
|
lock->unlock_acks_received++;
|
|
}
|
|
|
|
/**
|
|
* Process an unlock request.
|
|
*
|
|
* @param[in] module - OSC RDMA module
|
|
* @param[in] source - Source rank
|
|
* @param[in] unlock_header - Incoming unlock header
|
|
*
|
|
* This functions is the target-side functio for handling an unlock
|
|
* request. Once all pending operations from the target are complete
|
|
* this functions sends an unlock acknowledgement then attempts to
|
|
* active a pending lock if the lock becomes free.
|
|
*/
|
|
int ompi_osc_rdma_process_unlock (ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_unlock_t *unlock_header)
|
|
{
|
|
ompi_osc_rdma_header_unlock_ack_t unlock_ack;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_process_unlock entering (finished %d/%d)...",
|
|
module->passive_incoming_frag_count[source],
|
|
module->passive_incoming_frag_signal_count[source]));
|
|
|
|
/* we cannot block when processing an incoming request */
|
|
if (module->passive_incoming_frag_signal_count[source] !=
|
|
module->passive_incoming_frag_count[source]) {
|
|
return OMPI_ERR_WOULD_BLOCK;
|
|
}
|
|
|
|
unlock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK;
|
|
unlock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
|
|
|
ret = ompi_osc_rdma_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
module->passive_incoming_frag_signal_count[source] = 0;
|
|
module->passive_incoming_frag_count[source] = 0;
|
|
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
|
|
if (unlock_header->lock_type == MPI_LOCK_EXCLUSIVE || 0 == --module->shared_count) {
|
|
module->lock_status = 0;
|
|
|
|
ompi_osc_activate_next_lock (module);
|
|
}
|
|
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: finished processing unlock fragment"));
|
|
|
|
return ret;
|
|
}
|
|
|
|
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_flush_t *flush_header)
|
|
{
|
|
ompi_osc_rdma_header_flush_ack_t flush_ack;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"ompi_osc_rdma_process_flush entering (finished %d/%d)...",
|
|
module->passive_incoming_frag_count[source],
|
|
module->passive_incoming_frag_signal_count[source]));
|
|
|
|
/* we cannot block when processing an incoming request */
|
|
if (module->passive_incoming_frag_signal_count[source] !=
|
|
module->passive_incoming_frag_count[source]) {
|
|
return OMPI_ERR_WOULD_BLOCK;
|
|
}
|
|
|
|
module->passive_incoming_frag_signal_count[source] = 0;
|
|
module->passive_incoming_frag_count[source] = 0;
|
|
|
|
flush_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK;
|
|
flush_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
|
|
flush_ack.serial_number = flush_header->serial_number;
|
|
|
|
return ompi_osc_rdma_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack));
|
|
}
|