1fc9a55d08
This fixes a bug in r31029 which removes the use of the pml base request (also not a good way since cm doesn't use the base request). We now allocate a data structure (ugh) to determine the needed information. Tested with mtt/onesided. cmr=v1.7.5:ticket=trac:4379 This commit was SVN r31044. The following SVN revision numbers were found above: r31029 --> open-mpi/ompi@29e00f9161 The following Trac tickets were found above: Ticket 4379 --> https://svn.open-mpi.org/trac/ompi/ticket/4379
1598 lines
56 KiB
C
1598 lines
56 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2009-2011 Oracle and/or its affiliates. All rights reserved.
|
|
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
|
|
#include "osc_rdma.h"
|
|
#include "osc_rdma_header.h"
|
|
#include "osc_rdma_data_move.h"
|
|
#include "osc_rdma_obj_convert.h"
|
|
#include "osc_rdma_frag.h"
|
|
#include "osc_rdma_request.h"
|
|
|
|
#include "opal/threads/condition.h"
|
|
#include "opal/threads/mutex.h"
|
|
#include "opal/util/arch.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/sys/atomic.h"
|
|
#include "opal/align.h"
|
|
#include "ompi/mca/pml/pml.h"
|
|
#include "ompi/mca/pml/base/pml_base_sendreq.h"
|
|
#include "ompi/mca/btl/btl.h"
|
|
#include "ompi/mca/osc/base/base.h"
|
|
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
|
|
#include "ompi/datatype/ompi_datatype.h"
|
|
#include "ompi/op/op.h"
|
|
#include "ompi/memchecker.h"
|
|
|
|
/**
 * struct osc_rdma_accumulate_data_t:
 *
 * @short Data associated with an in-progress accumulation operation.
 *
 * Allocated with OBJ_NEW and passed to request completion callbacks as
 * their context. The class destructor releases the source buffer, the
 * datatype, and the operator once the accumulation is complete.
 */
struct osc_rdma_accumulate_data_t {
    /* list item superclass; completed objects are appended to
     * module->buffer_gc for later cleanup */
    opal_list_item_t super;
    /* module the accumulation belongs to */
    ompi_osc_rdma_module_t* module;
    /* destination address inside the local window */
    void *target;
    /* temporary source buffer (always malloc'd; freed by the destructor) */
    void *source;
    /* length of the source buffer in bytes */
    size_t source_len;
    /* proc of the origin process (used for datatype conversion on receive) */
    ompi_proc_t *proc;
    /* number of elements to accumulate */
    int count;
    /* origin rank (used to credit incoming completions) */
    int peer;
    /* accumulation datatype (retained at allocate, released by destructor) */
    ompi_datatype_t *datatype;
    /* accumulation operator (retained at allocate, released by destructor) */
    ompi_op_t *op;
    /* number of prerequisite requests that must complete before the
     * accumulation can be applied */
    int request_count;
};
typedef struct osc_rdma_accumulate_data_t osc_rdma_accumulate_data_t;
|
|
|
|
/* class constructor: start with no attached buffer, datatype, or operator;
 * the destructor relies on these NULL values to decide what to release */
static void osc_rdma_accumulate_data_constructor (osc_rdma_accumulate_data_t *acc_data)
{
    acc_data->op       = NULL;
    acc_data->datatype = NULL;
    acc_data->source   = NULL;
}
|
|
|
|
/* class destructor: release the resources attached by
 * osc_rdma_accumulate_allocate (source buffer, datatype, operator) */
static void osc_rdma_accumulate_data_destructor (osc_rdma_accumulate_data_t *acc_data)
{
    /* the source buffer is always alloc'd; free(NULL) is a no-op so the
     * previous NULL guard was redundant */
    free (acc_data->source);

    if (acc_data->datatype) {
        OBJ_RELEASE(acc_data->datatype);
    }

    if (acc_data->op) {
        OBJ_RELEASE(acc_data->op);
    }
}
|
|
|
|
/* declare and instantiate the accumulate-data class: an opal_list_item_t
 * subclass using the constructor/destructor defined above */
OBJ_CLASS_DECLARATION(osc_rdma_accumulate_data_t);
OBJ_CLASS_INSTANCE(osc_rdma_accumulate_data_t, opal_list_item_t, osc_rdma_accumulate_data_constructor,
                   osc_rdma_accumulate_data_destructor);
|
|
|
|
/**
 * osc_rdma_pending_acc_t:
 *
 * @short Keep track of accumulate and cswap operations that are
 * waiting on the accumulate lock.
 *
 * @long Since accumulate operations may take several steps to
 * complete we need to lock the accumulate lock until the operation
 * is complete. While the lock is held it is possible that additional
 * accumulate operations will arrive. This structure keeps track of
 * those operations.
 */
struct osc_rdma_pending_acc_t {
    /* list item superclass; queued on module->pending_acc */
    opal_list_item_t super;
    /* copy of the originating message header (acc, get_acc, or cswap) */
    ompi_osc_rdma_header_t header;
    /* origin rank */
    int source;
    /* malloc'd copy of any inline payload (freed by the destructor) */
    void *data;
    /* length of the inline payload in bytes (0 if none) */
    size_t data_len;
    /* accumulation datatype (retained at queue time, released by destructor) */
    ompi_datatype_t *datatype;
};
typedef struct osc_rdma_pending_acc_t osc_rdma_pending_acc_t;
|
|
|
|
/* class constructor: no payload and no datatype attached yet; the
 * destructor keys off these NULL values */
static void osc_rdma_pending_acc_constructor (osc_rdma_pending_acc_t *pending)
{
    pending->datatype = NULL;
    pending->data     = NULL;
}
|
|
|
|
/* class destructor: release the payload copy and the retained datatype */
static void osc_rdma_pending_acc_destructor (osc_rdma_pending_acc_t *pending)
{
    /* free(NULL) is harmless, so no guard is needed here */
    free (pending->data);

    if (NULL != pending->datatype) {
        OBJ_RELEASE(pending->datatype);
    }
}
|
|
|
|
/* declare and instantiate the pending-accumulate class: an opal_list_item_t
 * subclass using the constructor/destructor defined above */
OBJ_CLASS_DECLARATION(osc_rdma_pending_acc_t);
OBJ_CLASS_INSTANCE(osc_rdma_pending_acc_t, opal_list_item_t,
                   osc_rdma_pending_acc_constructor, osc_rdma_pending_acc_destructor);
/* end ompi_osc_rdma_pending_acc_t class */
|
|
|
|
/**
 * datatype_buffer_length:
 *
 * @short Determine the buffer size needed to hold count elements of datatype.
 *
 * @param[in] datatype - Element type
 * @param[in] count    - Element count
 *
 * @returns buflen Buffer length needed to hold count elements of datatype
 *
 * NOTE(review): the return codes of the two helper calls below are not
 * checked, and the size_t product is returned as int -- confirm that
 * callers only pass datatypes/counts small enough for this to be safe.
 */
static inline int datatype_buffer_length (ompi_datatype_t *datatype, int count)
{
    ompi_datatype_t *primitive_datatype = NULL;
    uint32_t primitive_count;
    size_t buflen;

    /* decompose the (possibly derived) datatype into its primitive type and
     * the number of primitive elements it contains */
    ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count);
    primitive_count *= count;

    /* figure out how big a buffer we need */
    ompi_datatype_type_size(primitive_datatype, &buflen);

    return buflen * primitive_count;
}
|
|
|
|
/**
|
|
* ompi_osc_rdma_control_send:
|
|
*
|
|
* @short send a control message as part of a fragment
|
|
*
|
|
* @param[in] module - OSC RDMA module
|
|
* @param[in] target - Target peer's rank
|
|
* @param[in] data - Data to send
|
|
* @param[in] len - Length of data
|
|
*
|
|
* @returns error OMPI error code or OMPI_SUCCESS
|
|
*
|
|
* @long "send" a control messages. Adds it to the active fragment, so the
|
|
* caller will still need to explicitly flush (either to everyone or
|
|
* to a target) before this is sent.
|
|
*/
|
|
int ompi_osc_rdma_control_send (ompi_osc_rdma_module_t *module, int target,
|
|
void *data, size_t len)
|
|
{
|
|
ompi_osc_rdma_frag_t *frag;
|
|
char *ptr;
|
|
int ret;
|
|
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
|
|
ret = ompi_osc_rdma_frag_alloc(module, target, len, &frag, &ptr);
|
|
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
|
|
memcpy (ptr, data, len);
|
|
|
|
ret = ompi_osc_rdma_frag_finish(module, frag);
|
|
}
|
|
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* completion callback for ompi_osc_rdma_control_send_unbuffered: signal the
 * completed send, free the temporary context buffer, and defer freeing of the
 * request to the garbage collector */
static int ompi_osc_rdma_control_send_unbuffered_cb (ompi_request_t *request)
{
    void *ctx = request->req_complete_cb_data;
    /* the module pointer was stored at the start of the context buffer */
    ompi_osc_rdma_module_t *module = *(ompi_osc_rdma_module_t **) ctx;

    /* mark this send as complete */
    mark_outgoing_completion (module);

    /* the context (module pointer + copied payload) is no longer needed */
    free (ctx);

    /* hand the request to the garbage-collection list */
    OPAL_THREAD_LOCK(&module->lock);
    opal_list_append (&module->request_gc, (opal_list_item_t *) request);
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
|
|
|
|
/**
 * ompi_osc_rdma_control_send_unbuffered:
 *
 * @short Send an unbuffered control message to a peer.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] target - Target rank
 * @param[in] data   - Data to send
 * @param[in] len    - Length of data
 *
 * @long Directly send a control message. This does not allocate a
 * fragment, so should only be used when sending other messages would
 * be erroneous (such as complete messages, when there may be queued
 * transactions from an overlapping post that has already heard back
 * from its peer). The buffer specified by data will be available
 * when this call returns.
 */
int ompi_osc_rdma_control_send_unbuffered(ompi_osc_rdma_module_t *module,
                                          int target, void *data, size_t len)
{
    void *ctx, *data_copy;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: sending unbuffered fragment to %d", target));

    /* allocate a temporary buffer for this send */
    ctx = malloc (sizeof(ompi_osc_rdma_module_t*) + len);
    if (OPAL_UNLIKELY(NULL == ctx)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* increment outgoing signal count. this send is not part of a passive epoch
     * so there it would be erroneous to increment the epoch counters. */
    ompi_osc_signal_outgoing (module, MPI_PROC_NULL, 1);

    /* store module pointer and data in the buffer; the callback recovers
     * the module from the front of the buffer */
    *(ompi_osc_rdma_module_t**)ctx = module;
    data_copy = (ompi_osc_rdma_module_t**)ctx + 1;
    memcpy (data_copy, data, len);

    /* NOTE(review): if this isend fails, ctx is leaked and the outgoing
     * signal above is never matched by a completion -- confirm whether
     * ompi_osc_rdma_isend_w_cb can fail without firing the callback. */
    return ompi_osc_rdma_isend_w_cb (data_copy, len, MPI_BYTE, target, OSC_RDMA_FRAG_TAG,
                                     module->comm, ompi_osc_rdma_control_send_unbuffered_cb, ctx);
}
|
|
|
|
/**
|
|
* datatype_create:
|
|
*
|
|
* @short Utility function that creates a new datatype from a packed
|
|
* description.
|
|
*
|
|
* @param[in] module - OSC RDMA module
|
|
* @param[in] peer - Peer rank
|
|
* @param[out] datatype - New datatype. Must be released with OBJ_RELEASE.
|
|
* @param[out] proc - Optional. Proc for peer.
|
|
* @param[inout] data - Pointer to a pointer where the description is stored. This
|
|
* pointer will be updated to the location after the packed
|
|
* description.
|
|
*/
|
|
static inline int datatype_create (ompi_osc_rdma_module_t *module, int peer, ompi_proc_t **proc, ompi_datatype_t **datatype, void **data)
|
|
{
|
|
ompi_datatype_t *new_datatype = NULL;
|
|
ompi_proc_t *peer_proc;
|
|
int ret = OMPI_SUCCESS;
|
|
|
|
do {
|
|
peer_proc = ompi_comm_peer_lookup(module->comm, peer);
|
|
if (OPAL_UNLIKELY(NULL == peer_proc)) {
|
|
OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output,
|
|
"%d: datatype_create: could not resolve proc pointer for peer %d",
|
|
ompi_comm_rank(module->comm),
|
|
peer));
|
|
ret = OMPI_ERROR;
|
|
break;
|
|
}
|
|
|
|
new_datatype = ompi_osc_base_datatype_create(peer_proc, data);
|
|
if (OPAL_UNLIKELY(NULL == new_datatype)) {
|
|
OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output,
|
|
"%d: datatype_create: could not resolve datatype for peer %d",
|
|
ompi_comm_rank(module->comm), peer));
|
|
ret = OMPI_ERROR;
|
|
}
|
|
} while (0);
|
|
|
|
*datatype = new_datatype;
|
|
if (proc) *proc = peer_proc;
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
 * process_put:
 *
 * @short Process a put w/ data message
 *
 * @param[in] module     - OSC RDMA module
 * @param[in] source     - Message source
 * @param[in] put_header - Message header + data
 *
 * @long Process a put message and copy the message data to the specified
 * memory region. Note, this function does not handle any bounds
 * checking at the moment.
 */
static inline int process_put(ompi_osc_rdma_module_t* module, int source,
                              ompi_osc_rdma_header_put_t* put_header)
{
    /* the packed datatype description and payload follow the header */
    char *data = (char*) (put_header + 1);
    ompi_proc_t *proc;
    struct ompi_datatype_t *datatype;
    size_t data_len;
    /* translate the displacement from the header into a local address */
    void *target = (unsigned char*) module->baseptr +
        ((unsigned long) put_header->displacement * module->disp_unit);
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "%d: process_put: received message from %d",
                         ompi_comm_rank(module->comm),
                         source));

    /* reconstruct the datatype; advances data past the packed description */
    ret = datatype_create (module, source, &proc, &datatype, (void **) &data);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* inline payload length = total length minus header + description */
    data_len = put_header->len - ((uintptr_t) data - (uintptr_t) put_header);

    osc_rdma_copy_on_recv (target, data, data_len, proc, put_header->count, datatype);

    OBJ_RELEASE(datatype);

    /* on success the total message length is returned to the caller */
    return put_header->len;
}
|
|
|
|
static inline int process_put_long(ompi_osc_rdma_module_t* module, int source,
|
|
ompi_osc_rdma_header_put_t* put_header)
|
|
{
|
|
char *data = (char*) (put_header + 1);
|
|
struct ompi_datatype_t *datatype;
|
|
void *target = (unsigned char*) module->baseptr +
|
|
((unsigned long) put_header->displacement * module->disp_unit);
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"%d: process_put_long: received message from %d",
|
|
ompi_comm_rank(module->comm),
|
|
source));
|
|
|
|
ret = datatype_create (module, source, NULL, &datatype, (void **) &data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
ret = ompi_osc_rdma_component_irecv (module, target,
|
|
put_header->count,
|
|
datatype, source,
|
|
put_header->tag,
|
|
module->comm);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
OPAL_OUTPUT_VERBOSE((1, ompi_osc_base_framework.framework_output,
|
|
"%d: process_put_long: irecv error: %d",
|
|
ompi_comm_rank(module->comm),
|
|
ret));
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
OBJ_RELEASE(datatype);
|
|
|
|
return put_header->len;
|
|
}
|
|
|
|
/**
|
|
* osc_rdma_incomming_req_omplete:
|
|
*
|
|
* @short Completion callback for a send/receive associate with an access
|
|
* epoch.
|
|
*
|
|
* @param[in] request - PML request with an OSC RMDA module as the callback data.
|
|
*
|
|
* @long This function is called when a send or recieve associated with an
|
|
* access epoch completes. When fired this function will increment the
|
|
* passive or active incoming count.
|
|
*/
|
|
static int osc_rdma_incomming_req_omplete (ompi_request_t *request)
|
|
{
|
|
ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) request->req_complete_cb_data;
|
|
int rank = MPI_PROC_NULL;
|
|
|
|
if (request->req_status.MPI_TAG & 0x01) {
|
|
rank = request->req_status.MPI_SOURCE;
|
|
}
|
|
|
|
mark_incoming_completion (module, rank);
|
|
|
|
/* put this request on the garbage colletion list */
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
opal_list_append (&module->request_gc, (opal_list_item_t *) request);
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* callback context for osc_rdma_get_post_send_cb: identifies the module and
 * the rank to credit when the matching send completes */
struct osc_rdma_get_post_send_cb_data_t {
    ompi_osc_rdma_module_t *module;
    /* rank to credit for incoming completion (MPI_PROC_NULL when the
     * operation belongs to an active-target epoch) */
    int peer;
};
|
|
|
|
/* completion callback for the send posted by osc_rdma_get_post_send: free
 * the malloc'd context, credit the completion, and GC the request */
static int osc_rdma_get_post_send_cb (ompi_request_t *request)
{
    struct osc_rdma_get_post_send_cb_data_t *cb_data =
        (struct osc_rdma_get_post_send_cb_data_t *) request->req_complete_cb_data;
    ompi_osc_rdma_module_t *module = cb_data->module;
    int peer = cb_data->peer;

    /* the context was allocated by osc_rdma_get_post_send */
    free (cb_data);

    /* count this as a completed incoming operation */
    mark_incoming_completion (module, peer);

    /* defer freeing of the request to the garbage collector */
    OPAL_THREAD_LOCK(&module->lock);
    opal_list_append (&module->request_gc, (opal_list_item_t *) request);
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
|
|
|
|
/**
|
|
* @short Post a send to match the remote receive for a get operation.
|
|
*
|
|
* @param[in] module - OSC RDMA module
|
|
* @param[in] source - Source buffer
|
|
* @param[in] count - Number of elements in the source buffer
|
|
* @param[in] datatype - Type of source elements.
|
|
* @param[in] peer - Remote process that has the receive posted
|
|
* @param[in] tag - Tag for the send
|
|
*
|
|
* @long This function posts a send to match the receive posted as part
|
|
* of a get operation. When this send is complete the get is considered
|
|
* complete at the target (this process).
|
|
*/
|
|
static int osc_rdma_get_post_send (ompi_osc_rdma_module_t *module, void *source, int count,
|
|
ompi_datatype_t *datatype, int peer, int tag)
|
|
{
|
|
struct osc_rdma_get_post_send_cb_data_t *data;
|
|
|
|
data = malloc (sizeof (*data));
|
|
if (OPAL_UNLIKELY(NULL == data)) {
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
data->module = module;
|
|
/* for incomming completion we need to know the peer (MPI_PROC_NULL if this is
|
|
* in an active target epoch) */
|
|
data->peer = (tag & 0x1) ? peer : MPI_PROC_NULL;
|
|
|
|
return ompi_osc_rdma_isend_w_cb (source, count, datatype, peer, tag, module->comm,
|
|
osc_rdma_get_post_send_cb, (void *) data);
|
|
}
|
|
|
|
/**
 * process_get:
 *
 * @short Process a get message from a remote peer
 *
 * @param[in] module     - OSC RDMA module
 * @param[in] target     - Peer process
 * @param[in] get_header - Incoming message header
 */
static inline int process_get (ompi_osc_rdma_module_t* module, int target,
                               ompi_osc_rdma_header_get_t* get_header)
{
    /* the packed datatype description follows the header */
    char *data = (char *) (get_header + 1);
    struct ompi_datatype_t *datatype;
    /* translate the displacement from the header into a local address */
    void *source = (unsigned char*) module->baseptr +
        ((unsigned long) get_header->displacement * module->disp_unit);
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "%d: process_get: received message from %d",
                         ompi_comm_rank(module->comm),
                         target));

    /* reconstruct the datatype; advances data past the packed description */
    ret = datatype_create (module, target, NULL, &datatype, (void **) &data);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* send get data */
    ret = osc_rdma_get_post_send (module, source, get_header->count, datatype, target, get_header->tag);

    OBJ_RELEASE(datatype);

    /* on success return the total message length consumed */
    return OMPI_SUCCESS == ret ? (int) get_header->len : ret;
}
|
|
|
|
/**
|
|
* osc_rdma_accumulate_buffer:
|
|
*
|
|
* @short Accumulate data into the target buffer.
|
|
*
|
|
* @param[in] target - Target buffer
|
|
* @param[in] source - Source buffer
|
|
* @param[in] source_len - Length of source buffer in bytes
|
|
* @param[in] proc - Source proc
|
|
* @param[in] count - Number of elements in target buffer
|
|
* @param[in] datatype - Type of elements in target buffer
|
|
* @param[in] op - Operation to be performed
|
|
*/
|
|
static inline int osc_rdma_accumulate_buffer (void *target, void *source, size_t source_len, ompi_proc_t *proc,
|
|
int count, ompi_datatype_t *datatype, ompi_op_t *op)
|
|
{
|
|
void *buffer = source;
|
|
int ret;
|
|
|
|
assert (NULL != target && NULL != source);
|
|
|
|
if (op == &ompi_mpi_op_replace.op) {
|
|
osc_rdma_copy_on_recv (target, source, source_len, proc, count, datatype);
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
|
if (proc->proc_arch != ompi_proc_local()->proc_arch) {
|
|
ompi_datatype_t *primitive_datatype = NULL;
|
|
uint32_t primitive_count;
|
|
size_t buflen;
|
|
|
|
ompi_osc_base_get_primitive_type_info(datatype, &primitive_datatype, &primitive_count);
|
|
primitive_count *= count;
|
|
|
|
/* figure out how big a buffer we need */
|
|
ompi_datatype_type_size(primitive_datatype, &buflen);
|
|
buflen *= primitive_count;
|
|
|
|
buffer = malloc (buflen);
|
|
if (OPAL_UNLIKELY(NULL == buffer)) {
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
osc_rdma_copy_on_recv (buffer, source, source_len, proc, count, datatype);
|
|
}
|
|
#endif
|
|
|
|
/* copy the data from the temporary buffer into the user window */
|
|
ret = ompi_osc_base_process_op(target, buffer, source_len, datatype,
|
|
count, op);
|
|
|
|
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
|
if (proc->proc_arch != ompi_proc_local()->proc_arch) {
|
|
free(buffer);
|
|
}
|
|
#endif
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* @short Create an accumulate data object.
|
|
*
|
|
* @param[in] module - RDMA OSC module
|
|
* @param[in] target - Target for the accumulation
|
|
* @param[in] source - Source of accumulate data. Must be allocated with malloc/calloc/etc
|
|
* @param[in] source_len - Length of the source buffer in bytes
|
|
* @param[in] proc - Source proc
|
|
* @param[in] count - Number of elements to accumulate
|
|
* @param[in] datatype - Datatype to accumulate
|
|
* @oaram[in] op - Operator
|
|
* @param[in] request_count - Number of prerequisite requests
|
|
* @param[out] acc_data_out - New accumulation data
|
|
*
|
|
* @long This function is used to create a copy of the data needed to perform an accumulation.
|
|
* This data should be provided to ompi_osc_rdma_isend_w_cb or ompi_osc_rdma_irecv_w_cb
|
|
* as the ctx parameter with accumulate_cb as the cb parameter.
|
|
*/
|
|
static int osc_rdma_accumulate_allocate (ompi_osc_rdma_module_t *module, int peer, void *target, void *source, size_t source_len,
|
|
ompi_proc_t *proc, int count, ompi_datatype_t *datatype, ompi_op_t *op,
|
|
int request_count, osc_rdma_accumulate_data_t **acc_data_out)
|
|
{
|
|
osc_rdma_accumulate_data_t *acc_data;
|
|
|
|
acc_data = OBJ_NEW(osc_rdma_accumulate_data_t);
|
|
if (OPAL_UNLIKELY(NULL == acc_data)) {
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
acc_data->module = module;
|
|
acc_data->peer = peer;
|
|
acc_data->target = target;
|
|
acc_data->source = source;
|
|
acc_data->source_len = source_len;
|
|
acc_data->proc = proc;
|
|
acc_data->count = count;
|
|
acc_data->datatype = datatype;
|
|
OBJ_RETAIN(datatype);
|
|
acc_data->op = op;
|
|
OBJ_RETAIN(op);
|
|
acc_data->request_count = request_count;
|
|
|
|
*acc_data_out = acc_data;
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/**
 * @short Execute the accumulate once the request counter reaches 0.
 *
 * @param[in] request - request
 *
 * The request should be created with ompi_osc_rdma_isend_w_cb or ompi_osc_rdma_irecv_w_cb
 * with ctx allocated by osc_rdma_accumulate_allocate. This callback will free the accumulate
 * data once the accumulation operation is complete.
 */
static int accumulate_cb (ompi_request_t *request)
{
    struct osc_rdma_accumulate_data_t *acc_data = (struct osc_rdma_accumulate_data_t *) request->req_complete_cb_data;
    ompi_osc_rdma_module_t *module = acc_data->module;
    int rank = MPI_PROC_NULL;
    int ret = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "accumulate_cb, request_count = %d", acc_data->request_count));

    /* an odd tag marks a passive-target message: credit completion to the
     * originating peer instead of the active-target counter */
    if (request->req_status.MPI_TAG & 0x01) {
        rank = acc_data->peer;
    }

    mark_incoming_completion (module, rank);

    /* the request counter is decremented under the module lock; only the
     * final prerequisite request performs the accumulation */
    OPAL_THREAD_LOCK(&module->lock);
    if (0 == --acc_data->request_count) {
        /* no more requests needed before the buffer can be accumulated */

        if (acc_data->source) {
            ret = osc_rdma_accumulate_buffer (acc_data->target, acc_data->source, acc_data->source_len,
                                              acc_data->proc, acc_data->count, acc_data->datatype, acc_data->op);
        }

        /* drop the accumulate lock */
        ompi_osc_rdma_accumulate_unlock (module);

        /* the buffer GC list eventually runs the acc_data destructor, which
         * frees the source buffer and releases the datatype/op */
        opal_list_append (&module->buffer_gc, &acc_data->super);
    }

    /* the request itself is freed later by the garbage collector */
    opal_list_append (&module->request_gc, (opal_list_item_t *) request);
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
|
|
|
|
|
|
/**
 * @short Queue an accumulate, get-accumulate, or compare-and-swap operation
 * that arrived while the accumulate lock was held.
 *
 * @param[in] module   - OSC RDMA module
 * @param[in] header   - Incoming operation header (acc, get_acc, or cswap)
 * @param[in] source   - Origin rank
 * @param[in] data     - Inline payload, if any (eager acc/gacc only)
 * @param[in] data_len - Length of the inline payload (0 if none)
 * @param[in] datatype - Accumulation datatype (retained here)
 *
 * @returns OMPI_SUCCESS or an OMPI error code
 */
static int ompi_osc_rdma_acc_op_queue (ompi_osc_rdma_module_t *module, ompi_osc_rdma_header_t *header, int source,
                                       char *data, size_t data_len, ompi_datatype_t *datatype)
{
    osc_rdma_pending_acc_t *pending_acc;

    pending_acc = OBJ_NEW(osc_rdma_pending_acc_t);
    if (OPAL_UNLIKELY(NULL == pending_acc)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    pending_acc->source = source;

    /* save any inline data (eager acc, gacc only) */
    pending_acc->data_len = data_len;

    if (data_len) {
        pending_acc->data = malloc (data_len);
        if (OPAL_UNLIKELY(NULL == pending_acc->data)) {
            /* fix: malloc was previously unchecked before the memcpy. the
             * constructor left datatype NULL, so releasing is safe here */
            OBJ_RELEASE(pending_acc);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        memcpy (pending_acc->data, data, data_len);
    }

    /* save the datatype */
    pending_acc->datatype = datatype;
    OBJ_RETAIN(datatype);

    /* save the header */
    switch (header->base.type) {
    case OMPI_OSC_RDMA_HDR_TYPE_ACC:
    case OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG:
        pending_acc->header.acc = header->acc;
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC:
    case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG:
        pending_acc->header.get_acc = header->get_acc;
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_CSWAP:
        pending_acc->header.cswap = header->cswap;
        break;
    default:
        /* it is a coding error if any other header types are queued this way */
        assert (0);
    }

    /* add to the pending acc queue */
    OPAL_THREAD_LOCK(&module->lock);
    opal_list_append (&module->pending_acc, &pending_acc->super);
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
|
|
|
|
/* completion callback for the receive posted by the MPI_REPLACE fast path of
 * ompi_osc_rdma_acc_long_start: the data lands directly in the window, so all
 * that remains is releasing the accumulate lock */
static int replace_cb (ompi_request_t *request)
{
    ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) request->req_complete_cb_data;

    /* unlock the accumulate lock */
    ompi_osc_rdma_accumulate_unlock (module);

    /* NOTE(review): unlike the other completion callbacks in this file, this
     * one does not call mark_incoming_completion and does not append the
     * request to module->request_gc -- confirm this is intentional. */
    return OMPI_SUCCESS;
}
|
|
|
|
/**
|
|
* ompi_osc_rdma_acc_start:
|
|
*
|
|
* @short Start an accumulate with data operation.
|
|
*
|
|
* @param[in] module - OSC RDMA module
|
|
* @param[in] source - Source rank
|
|
* @param[in] data - Accumulate data
|
|
* @param[in] data_len - Length of the accumulate data
|
|
* @param[in] datatype - Accumulation datatype
|
|
* @param[in] acc_header - Accumulate header
|
|
*
|
|
* The module's accumulation lock must be held before calling this
|
|
* function. It will release the lock when the operation is complete.
|
|
*/
|
|
static int ompi_osc_rdma_acc_start (ompi_osc_rdma_module_t *module, int source, void *data, size_t data_len,
|
|
ompi_datatype_t *datatype, ompi_osc_rdma_header_acc_t *acc_header)
|
|
{
|
|
void *target = (unsigned char*) module->baseptr +
|
|
((unsigned long) acc_header->displacement * module->disp_unit);
|
|
struct ompi_op_t *op = ompi_osc_base_op_create(acc_header->op);
|
|
ompi_proc_t *proc;
|
|
int ret;
|
|
|
|
proc = ompi_comm_peer_lookup(module->comm, source);
|
|
assert (NULL != proc);
|
|
|
|
ret = osc_rdma_accumulate_buffer (target, data, data_len, proc, acc_header->count,
|
|
datatype, op);
|
|
|
|
OBJ_RELEASE(op);
|
|
|
|
ompi_osc_rdma_accumulate_unlock (module);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
 * ompi_osc_rdma_acc_long_start:   (header previously misnamed ompi_osc_rdma_acc_start)
 *
 * @short Start a long accumulate operation.
 *
 * @param[in] module     - OSC RDMA module
 * @param[in] source     - Source rank
 * @param[in] datatype   - Accumulation datatype
 * @param[in] acc_header - Accumulate header
 *
 * The module's accumulation lock must be held before calling this
 * function. It will release the lock when the operation is complete.
 */
static int ompi_osc_rdma_acc_long_start (ompi_osc_rdma_module_t *module, int source, ompi_datatype_t *datatype,
                                         ompi_osc_rdma_header_acc_t *acc_header) {
    struct osc_rdma_accumulate_data_t *acc_data;
    size_t buflen;
    void *buffer;
    ompi_proc_t *proc;
    /* translate the displacement from the header into a local address */
    void *target = (unsigned char*) module->baseptr +
        ((unsigned long) acc_header->displacement * module->disp_unit);
    struct ompi_op_t *op = ompi_osc_base_op_create(acc_header->op);
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_acc_long_start starting..."));

    proc = ompi_comm_peer_lookup(module->comm, source);
    assert (NULL != proc);

    do {
        /* MPI_REPLACE needs no reduction: receive straight into the window;
         * replace_cb drops the accumulate lock on completion */
        if (op == &ompi_mpi_op_replace.op) {
            ret = ompi_osc_rdma_irecv_w_cb (target, acc_header->count, datatype, source,
                                            acc_header->tag, module->comm, NULL,
                                            replace_cb, module);
            break;
        }

        buflen = datatype_buffer_length (datatype, acc_header->count);

        /* allocate a temporary buffer to receive the accumulate data */
        buffer = malloc (buflen);
        if (OPAL_UNLIKELY(NULL == buffer)) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            break;
        }

        /* request_count = 1: the accumulation runs once the receive below
         * completes (see accumulate_cb) */
        ret = osc_rdma_accumulate_allocate (module, source, target, buffer, buflen, proc, acc_header->count,
                                            datatype, op, 1, &acc_data);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            free (buffer);
            break;
        }

        ret = ompi_osc_rdma_irecv_w_cb (buffer, acc_header->count, datatype, source, acc_header->tag,
                                        module->comm, NULL, accumulate_cb, acc_data);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* releasing acc_data also frees buffer via the class destructor */
            OBJ_RELEASE(acc_data);
        }
    } while (0);

    /* acc_data holds its own reference on op */
    OBJ_RELEASE(op);

    /* on error the completion callback will never fire, so drop the
     * accumulate lock here */
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ompi_osc_rdma_accumulate_unlock (module);
    }

    return ret;
}
|
|
|
|
/**
 * ompi_osc_rdma_gacc_start:
 *
 * @short Start a accumulate with data + get operation.
 *
 * @param[in] module         - OSC RDMA module
 * @param[in] source         - Source rank
 * @param[in] data           - Accumulate data. Must be allocated on the heap.
 * @param[in] data_len       - Length of the accumulate data
 * @param[in] datatype       - Accumulation datatype
 * @param[in] get_acc_header - Accumulate header
 *
 * The module's accumulation lock must be held before calling this
 * function. It will release the lock when the operation is complete.
 */
static int ompi_osc_rdma_gacc_start (ompi_osc_rdma_module_t *module, int source, void *data, size_t data_len,
                                     ompi_datatype_t *datatype, ompi_osc_rdma_header_get_acc_t *get_acc_header)
{
    /* translate the displacement from the header into a local address */
    void *target = (unsigned char*) module->baseptr +
        ((unsigned long) get_acc_header->displacement * module->disp_unit);
    struct ompi_op_t *op = ompi_osc_base_op_create(get_acc_header->op);
    struct osc_rdma_accumulate_data_t *acc_data;
    ompi_proc_t *proc;
    int ret;

    proc = ompi_comm_peer_lookup(module->comm, source);
    assert (NULL != proc);

    do {
        /* ownership of the heap buffer 'data' transfers to acc_data, whose
         * destructor frees it.
         * NOTE(review): if this allocate fails, 'data' is not freed here --
         * confirm whether the caller keeps ownership on failure. */
        ret = osc_rdma_accumulate_allocate (module, source, target, data, data_len, proc, get_acc_header->count,
                                            datatype, op, 1, &acc_data);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        /* send back the current target data; accumulate_cb applies the
         * accumulation when this send completes */
        ret = ompi_osc_rdma_isend_w_cb (target, get_acc_header->count, datatype, source, get_acc_header->tag,
                                        module->comm, accumulate_cb, acc_data);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            OBJ_RELEASE(acc_data);
        }
    } while (0);

    /* acc_data holds its own reference on op */
    OBJ_RELEASE(op);

    /* on error the completion callback will never fire, so drop the
     * accumulate lock here */
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        ompi_osc_rdma_accumulate_unlock (module);
    }

    return ret;
}
|
|
|
|
/**
|
|
* ompi_osc_rdma_gacc_long_start:
|
|
*
|
|
* @short Start a long accumulate + get operation.
|
|
*
|
|
* @param[in] module - OSC RDMA module
|
|
* @param[in] source - Source rank
|
|
* @param[in] datatype - Accumulation datatype
|
|
* @param[in] get_acc_header - Accumulate header
|
|
*
|
|
* The module's accumulation lock must be held before calling this
|
|
* function. It will release the lock when the operation is complete.
|
|
*/
|
|
static int ompi_osc_gacc_long_start (ompi_osc_rdma_module_t *module, int source, ompi_datatype_t *datatype,
|
|
ompi_osc_rdma_header_get_acc_t *get_acc_header)
|
|
{
|
|
void *target = (unsigned char*) module->baseptr +
|
|
((unsigned long) get_acc_header->displacement * module->disp_unit);
|
|
struct ompi_op_t *op = ompi_osc_base_op_create(get_acc_header->op);
|
|
struct osc_rdma_accumulate_data_t *acc_data;
|
|
ompi_request_t *recv_request;
|
|
ompi_proc_t *proc;
|
|
size_t buflen;
|
|
void *buffer;
|
|
int ret;
|
|
|
|
proc = ompi_comm_peer_lookup(module->comm, source);
|
|
assert (NULL != proc);
|
|
|
|
/* allocate a temporary buffer to receive the accumulate data */
|
|
buflen = datatype_buffer_length (datatype, get_acc_header->count);
|
|
|
|
do {
|
|
buffer = malloc (buflen);
|
|
if (OPAL_UNLIKELY(NULL == buffer)) {
|
|
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
|
break;
|
|
}
|
|
|
|
ret = osc_rdma_accumulate_allocate (module, source, target, buffer, buflen, proc, get_acc_header->count,
|
|
datatype, op, 2, &acc_data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
break;
|
|
}
|
|
|
|
ret = ompi_osc_rdma_irecv_w_cb (buffer, get_acc_header->count, datatype, source, get_acc_header->tag,
|
|
module->comm, &recv_request, accumulate_cb, acc_data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
OBJ_RELEASE(acc_data);
|
|
break;
|
|
}
|
|
|
|
ret = ompi_osc_rdma_isend_w_cb (target, get_acc_header->count, datatype, source, get_acc_header->tag,
|
|
module->comm, accumulate_cb, acc_data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
|
|
/* cancel the receive and free the accumulate data */
|
|
ompi_request_cancel (recv_request);
|
|
OBJ_RELEASE(acc_data);
|
|
break;
|
|
}
|
|
} while (0);
|
|
|
|
OBJ_RELEASE(op);
|
|
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
ompi_osc_rdma_accumulate_unlock (module);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
 * ompi_osc_rdma_cswap_start:
 *
 * @short Start a compare and swap operation
 *
 * @param[in] module       - OSC RDMA module
 * @param[in] source       - Source rank
 * @param[in] data         - Compare and swap data
 * @param[in] datatype     - Compare and swap datatype
 * @param[in] cswap_header - Compare and swap header
 *
 * @long The data buffer must be exactly twice the size of the datatype:
 * origin value first, compare value second.
 *
 * The module's accumulation lock must be held before calling this
 * function. It will release the lock when the operation is complete.
 */
static int ompi_osc_rdma_cswap_start (ompi_osc_rdma_module_t *module, int source, void *data, ompi_datatype_t *datatype,
                                      ompi_osc_rdma_header_cswap_t *cswap_header)
{
    /* translate the displacement from the header into a local address */
    void *target = (unsigned char*) module->baseptr +
        ((unsigned long) cswap_header->displacement * module->disp_unit);
    void *compare_addr, *origin_addr;
    size_t datatype_size;
    ompi_proc_t *proc;
    int ret;

    proc = ompi_comm_peer_lookup(module->comm, source);
    assert (NULL != proc);

    datatype_size = datatype->super.size;

    /* the payload carries the origin value followed by the compare value */
    origin_addr  = data;
    compare_addr = (void *)((uintptr_t) data + datatype_size);

    do {
        /* return the current target value to the origin before any
         * modification. no reason to do a non-blocking send here */
        ret = MCA_PML_CALL(send(target, 1, datatype, source, cswap_header->tag, MCA_PML_BASE_SEND_STANDARD,
                                module->comm));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        /* increment the incomming fragment count so it matches what is expected */
        mark_incoming_completion (module, (cswap_header->tag & 0x1) ? source : MPI_PROC_NULL);

        /* swap in the origin value only if the target matches the compare
         * value (byte-wise comparison) */
        if (0 == memcmp (target, compare_addr, datatype_size)) {
            osc_rdma_copy_on_recv (target, origin_addr, datatype_size, proc, 1, datatype);
        }
    } while (0);

    /* the operation (or its failure) is complete -- drop the accumulate lock */
    ompi_osc_rdma_accumulate_unlock (module);

    return ret;
}
|
|
|
|
/**
 * ompi_osc_rdma_progress_pending_acc:
 *
 * @short Progress one pending accumulation or compare and swap operation.
 *
 * @param[in] module - OSC RDMA module
 *
 * @long If the accumulation lock can be aquired progress one pending
 * accumulate or compare and swap operation.
 */
int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module)
{
    osc_rdma_pending_acc_t *pending_acc;
    int ret;

    /* try to aquire the lock. it will be unlocked when the accumulate or cswap
     * operation completes */
    if (ompi_osc_rdma_accumulate_trylock (module)) {
        return OMPI_SUCCESS;
    }

    pending_acc = (osc_rdma_pending_acc_t *) opal_list_remove_first (&module->pending_acc);
    if (OPAL_UNLIKELY(NULL == pending_acc)) {
        /* called without any pending accumulation operations */
        ompi_osc_rdma_accumulate_unlock (module);
        return OMPI_SUCCESS;
    }

    /* dispatch on the queued header type. each *_start function releases the
     * accumulation lock once the operation has completed */
    switch (pending_acc->header.base.type) {
    case OMPI_OSC_RDMA_HDR_TYPE_ACC:
        ret = ompi_osc_rdma_acc_start (module, pending_acc->source, pending_acc->data, pending_acc->data_len,
                                       pending_acc->datatype, &pending_acc->header.acc);
        /* short accumulate consumes the queued data synchronously, so it is
         * freed here (the other cases either queued no data or hand ownership
         * to the started operation -- TODO confirm for GET_ACC/CSWAP) */
        free (pending_acc->data);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG:
        ret = ompi_osc_rdma_acc_long_start (module, pending_acc->source, pending_acc->datatype,
                                            &pending_acc->header.acc);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC:
        ret = ompi_osc_rdma_gacc_start (module, pending_acc->source, pending_acc->data,
                                        pending_acc->data_len, pending_acc->datatype,
                                        &pending_acc->header.get_acc);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG:
        ret = ompi_osc_gacc_long_start (module, pending_acc->source, pending_acc->datatype,
                                        &pending_acc->header.get_acc);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_CSWAP:
        ret = ompi_osc_rdma_cswap_start (module, pending_acc->source, pending_acc->data,
                                         pending_acc->datatype, &pending_acc->header.cswap);
        break;
    default:
        ret = OMPI_ERROR;
        /* it is a coding error if this point is reached */
        assert (0);
    }

    /* clear data so the pending_acc destructor does not free memory that was
     * either freed above or passed to the started operation */
    pending_acc->data = NULL;
    OBJ_RELEASE(pending_acc);

    return ret;
}
|
|
|
|
static inline int process_acc (ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_acc_t *acc_header)
|
|
{
|
|
char *data = (char *) (acc_header + 1);
|
|
struct ompi_datatype_t *datatype;
|
|
uint64_t data_len;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"%d: process_acc: received message from %d",
|
|
ompi_comm_rank(module->comm),
|
|
source));
|
|
|
|
ret = datatype_create (module, source, NULL, &datatype, (void **) &data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
data_len = acc_header->len - ((char*) data - (char*) acc_header);
|
|
|
|
/* try to aquire the accumulate lock */
|
|
if (0 == ompi_osc_rdma_accumulate_trylock (module)) {
|
|
ret = ompi_osc_rdma_acc_start (module, source, data, data_len, datatype,
|
|
acc_header);
|
|
} else {
|
|
/* couldn't aquire the accumulate lock so queue up the accumulate operation */
|
|
ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) acc_header,
|
|
source, data, data_len, datatype);
|
|
}
|
|
|
|
/* Release datatype & op */
|
|
OBJ_RELEASE(datatype);
|
|
|
|
return (OMPI_SUCCESS == ret) ? (int) acc_header->len : ret;
|
|
}
|
|
|
|
static inline int process_acc_long (ompi_osc_rdma_module_t* module, int source,
|
|
ompi_osc_rdma_header_acc_t* acc_header)
|
|
{
|
|
char *data = (char *) (acc_header + 1);
|
|
struct ompi_datatype_t *datatype;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"%d: process_acc_long: received message from %d",
|
|
ompi_comm_rank(module->comm),
|
|
source));
|
|
|
|
ret = datatype_create (module, source, NULL, &datatype, (void **) &data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
if (0 == ompi_osc_rdma_accumulate_trylock (module)) {
|
|
ret = ompi_osc_rdma_acc_long_start (module, source, datatype, acc_header);
|
|
} else {
|
|
/* queue the operation */
|
|
ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) acc_header, source,
|
|
NULL, 0, datatype);
|
|
}
|
|
|
|
/* Release datatype & op */
|
|
OBJ_RELEASE(datatype);
|
|
|
|
return (OMPI_SUCCESS == ret) ? (int) acc_header->len : ret;
|
|
}
|
|
|
|
/* Process an incoming (short) get-accumulate request. The packed datatype
 * description and the inline accumulate data follow the header in the same
 * fragment. Returns bytes consumed on success, an error code otherwise. */
static inline int process_get_acc(ompi_osc_rdma_module_t *module, int source,
                                  ompi_osc_rdma_header_get_acc_t *get_acc_header)
{
    char *data = (char *) (get_acc_header + 1);
    struct ompi_datatype_t *datatype;
    void *buffer = NULL;
    uint64_t data_len;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "%d: process_get_acc: received message from %d",
                         ompi_comm_rank(module->comm),
                         source));

    /* rebuild the origin datatype; advances data past the packed description */
    ret = datatype_create (module, source, NULL, &datatype, (void **) &data);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* bytes of inline accumulate data remaining in this fragment */
    data_len = get_acc_header->len - ((char*) data - (char*) get_acc_header);

    if (0 == ompi_osc_rdma_accumulate_trylock (module)) {
        /* make a copy of the data since the buffer needs to be returned */
        if (data_len) {
            buffer = malloc (data_len);
            if (OPAL_UNLIKELY(NULL == buffer)) {
                OBJ_RELEASE(datatype);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            memcpy (buffer, data, data_len);
        }

        /* gacc_start owns buffer from here on -- presumably released by its
         * completion path; TODO confirm it is freed on gacc_start error too */
        ret = ompi_osc_rdma_gacc_start (module, source, buffer, data_len, datatype,
                                        get_acc_header);
    } else {
        /* queue the operation -- the queue code presumably copies the inline
         * data, which lives in the receive buffer; confirm against
         * ompi_osc_rdma_acc_op_queue */
        ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) get_acc_header,
                                          source, data, data_len, datatype);
    }

    /* Release datatype & op */
    OBJ_RELEASE(datatype);

    return (OMPI_SUCCESS == ret) ? (int) get_acc_header->len : ret;
}
|
|
|
|
static inline int process_get_acc_long(ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_get_acc_t *get_acc_header)
|
|
{
|
|
char *data = (char *) (get_acc_header + 1);
|
|
struct ompi_datatype_t *datatype;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"%d: process_acc: received message from %d",
|
|
ompi_comm_rank(module->comm),
|
|
source));
|
|
|
|
ret = datatype_create (module, source, NULL, &datatype, (void **) &data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
if (0 == ompi_osc_rdma_accumulate_trylock (module)) {
|
|
ret = ompi_osc_gacc_long_start (module, source, datatype, get_acc_header);
|
|
} else {
|
|
/* queue the operation */
|
|
ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) get_acc_header,
|
|
source, NULL, 0, datatype);
|
|
}
|
|
|
|
/* Release datatype & op */
|
|
OBJ_RELEASE(datatype);
|
|
|
|
return OMPI_SUCCESS == ret ? (int) get_acc_header->len : ret;
|
|
}
|
|
|
|
|
|
static inline int process_cswap (ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_cswap_t *cswap_header)
|
|
{
|
|
char *data = (char*) (cswap_header + 1);
|
|
struct ompi_datatype_t *datatype;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"%d: process_cswap: received message from %d",
|
|
ompi_comm_rank(module->comm),
|
|
source));
|
|
|
|
ret = datatype_create (module, source, NULL, &datatype, (void **) &data);
|
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
|
return ret;
|
|
}
|
|
|
|
if (0 == ompi_osc_rdma_accumulate_trylock (module)) {
|
|
ret = ompi_osc_rdma_cswap_start (module, source, data, datatype, cswap_header);
|
|
} else {
|
|
/* queue the operation */
|
|
ret = ompi_osc_rdma_acc_op_queue (module, (ompi_osc_rdma_header_t *) cswap_header, source,
|
|
data, 2 * datatype->super.size, datatype);
|
|
}
|
|
|
|
/* Release datatype */
|
|
OBJ_RELEASE(datatype);
|
|
|
|
return (OMPI_SUCCESS == ret) ? (int) cswap_header->len : ret;
|
|
}
|
|
|
|
static inline int process_complete (ompi_osc_rdma_module_t *module, int source,
|
|
ompi_osc_rdma_header_complete_t *complete_header)
|
|
{
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: process_complete got complete message from %d", source));
|
|
module->num_complete_msgs++;
|
|
|
|
return sizeof (*complete_header);
|
|
}
|
|
|
|
/* flush and unlock headers cannot be processed from the request callback
 * because some btls do not provide re-entrant progress functions. these
 * fragment will be progressed by the rdma component's progress function */
static inline int process_flush (ompi_osc_rdma_module_t *module, int source,
                                 ompi_osc_rdma_header_flush_t *flush_header)
{
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "process_flush header = {.frag_count = %d}", flush_header->frag_count));

    /* increase signal count by incoming frags */
    module->passive_incoming_frag_signal_count[source] += flush_header->frag_count;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "%d: process_flush: received message from %d. passive_incoming_frag_signal_count = %d, passive_incoming_frag_count = %d",
                         ompi_comm_rank(module->comm), source, module->passive_incoming_frag_signal_count[source], module->passive_incoming_frag_count[source]));

    ret = ompi_osc_rdma_process_flush (module, source, flush_header);
    if (OMPI_SUCCESS != ret) {
        /* could not complete the flush from this (possibly non-re-entrant)
         * context; queue it for the component progress function to retry */
        ompi_osc_rdma_pending_t *pending;

        /* NOTE(review): OBJ_NEW result is not NULL-checked -- an OOM here
         * would crash on the assignments below */
        pending = OBJ_NEW(ompi_osc_rdma_pending_t);
        pending->module = module;
        pending->source = source;
        pending->header.flush = *flush_header;

        OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
        opal_list_append (&mca_osc_rdma_component.pending_operations, &pending->super);
        OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);

        /* we now have to count the current fragment */
        module->passive_incoming_frag_signal_count[source]++;
    } else {
        /* need to account for the current fragment. NOTE(review): this
         * assigns -1 rather than decrementing -- presumably
         * ompi_osc_rdma_process_flush reset the count and the caller will
         * bump it for this fragment; confirm against that function */
        module->passive_incoming_frag_count[source] = -1;
    }

    return sizeof (*flush_header);
}
|
|
|
|
/* Handle an unlock request. Like flush, unlock cannot be fully processed
 * from the request callback (some btls lack re-entrant progress), so a
 * failed attempt is queued for the component progress function. */
static inline int process_unlock (ompi_osc_rdma_module_t *module, int source,
                                  ompi_osc_rdma_header_unlock_t *unlock_header)
{
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "process_unlock header = {.frag_count = %d}", unlock_header->frag_count));

    /* increase signal count by incoming frags */
    module->passive_incoming_frag_signal_count[source] += unlock_header->frag_count;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: processing unlock request from %d. frag count = %d, signal_count = %d, processed_count = %d",
                         source, unlock_header->frag_count, (int) module->passive_incoming_frag_signal_count[source],
                         (int) module->passive_incoming_frag_count[source]));

    ret = ompi_osc_rdma_process_unlock (module, source, unlock_header);
    if (OMPI_SUCCESS != ret) {
        /* unlock could not complete yet; queue it for later retry by the
         * component progress function */
        ompi_osc_rdma_pending_t *pending;

        /* NOTE(review): OBJ_NEW result is not NULL-checked -- an OOM here
         * would crash on the assignments below */
        pending = OBJ_NEW(ompi_osc_rdma_pending_t);
        pending->module = module;
        pending->source = source;
        pending->header.unlock = *unlock_header;

        OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
        opal_list_append (&mca_osc_rdma_component.pending_operations, &pending->super);
        OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);

        /* we now have to count the current fragment */
        module->passive_incoming_frag_signal_count[source]++;
    } else {
        /* need to account for the current fragment. NOTE(review): assigns -1
         * rather than decrementing -- presumably process_unlock reset the
         * count; confirm (same pattern as process_flush) */
        module->passive_incoming_frag_count[source] = -1;
    }

    return sizeof (*unlock_header);
}
|
|
|
|
/*
 * Do all the data movement associated with a fragment
 */
static inline int process_frag (ompi_osc_rdma_module_t *module,
                                ompi_osc_rdma_frag_header_t *frag)
{
    ompi_osc_rdma_header_t *header;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: process_frag: from %d, ops %d",
                         (int) frag->source, (int) frag->num_ops));

    /* operation headers are packed immediately after the fragment header */
    header = (ompi_osc_rdma_header_t *) (frag + 1);

    for (int i = 0 ; i < frag->num_ops ; ++i) {
        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                             "osc rdma: process_frag: type 0x%x. offset = %u", header->base.type,
                             (unsigned) ((uintptr_t)header - (uintptr_t)frag)));

        /* each process_* handler returns the number of bytes it consumed on
         * success (used below to advance to the next header) or an error code */
        switch (header->base.type) {
        case OMPI_OSC_RDMA_HDR_TYPE_PUT:
            ret = process_put(module, frag->source, &header->put);
            break;
        case OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG:
            ret = process_put_long(module, frag->source, &header->put);
            break;

        case OMPI_OSC_RDMA_HDR_TYPE_ACC:
            ret = process_acc(module, frag->source, &header->acc);
            break;
        case OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG:
            ret = process_acc_long (module, frag->source, &header->acc);
            break;

        case OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ:
            /* lock processing returns a status, not a byte count; translate
             * success into the header size consumed */
            ret = ompi_osc_rdma_process_lock(module, frag->source, &header->lock);
            if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
                ret = sizeof (header->lock);
            }
            break;
        case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ:
            ret = process_unlock(module, frag->source, &header->unlock);

            break;
        case OMPI_OSC_RDMA_HDR_TYPE_GET:
            ret = process_get (module, frag->source, &header->get);
            break;

        case OMPI_OSC_RDMA_HDR_TYPE_CSWAP:
            ret = process_cswap (module, frag->source, &header->cswap);
            break;

        case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC:
            ret = process_get_acc (module, frag->source, &header->get_acc);
            break;

        case OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG:
            ret = process_get_acc_long (module, frag->source, &header->get_acc);
            break;

        case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ:
            ret = process_flush (module, frag->source, &header->flush);
            break;

        case OMPI_OSC_RDMA_HDR_TYPE_COMPLETE:
            ret = process_complete (module, frag->source, &header->complete);
            break;

        default:
            opal_output(0, "Unsupported fragment type 0x%x\n", header->base.type);
            abort(); /* FIX ME */
        }
        if (ret <= 0) {
            opal_output(0, "Error processing fragment: %d", ret);
            abort(); /* FIX ME */
        }

        /* the next header will start on an 8-byte boundary. this is done to ensure
         * that the next header and the packed datatype is properly aligned */
        header = (ompi_osc_rdma_header_t *) OPAL_ALIGN(((uintptr_t) header + ret), 8, uintptr_t);
    }

    return OMPI_SUCCESS;
}
|
|
|
|
|
|
/* dispatch for callback on message completion */
static int ompi_osc_rdma_callback (ompi_request_t *request)
{
    ompi_osc_rdma_module_t *module = (ompi_osc_rdma_module_t *) request->req_complete_cb_data;
    ompi_osc_rdma_header_base_t *base_header =
        (ompi_osc_rdma_header_base_t *) module->incomming_buffer;
    size_t incomming_length = request->req_status._ucount;
    int source = request->req_status.MPI_SOURCE;

    /* this callback is invoked with ompi_request_lock held (it is re-taken
     * before returning below); drop it while processing the message so the
     * handlers can make PML calls */
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* every valid message carries at least a base header */
    assert(incomming_length >= sizeof(ompi_osc_rdma_header_base_t));

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "received rdma callback for fragment. source = %d, count = %u, type = 0x%x",
                         source, (unsigned) incomming_length, base_header->type));

    /* dispatch on the top-level message type */
    switch (base_header->type) {
    case OMPI_OSC_RDMA_HDR_TYPE_FRAG:
        process_frag(module, (ompi_osc_rdma_frag_header_t *) base_header);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_POST:
        (void) osc_rdma_incomming_post (module);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK:
        ompi_osc_rdma_process_lock_ack(module, (ompi_osc_rdma_header_lock_ack_t *) base_header);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK:
        ompi_osc_rdma_process_flush_ack (module, source, (ompi_osc_rdma_header_flush_ack_t *) base_header);
        break;
    case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK:
        ompi_osc_rdma_process_unlock_ack (module, source, (ompi_osc_rdma_header_unlock_ack_t *) base_header);
        break;
    default:
        /* unknown types are logged and ignored (no abort on this path) */
        OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                             "received unexpected message of type %x",
                             (int) base_header->type));
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "finished processing incomming messages"));

    /* restart the receive request */
    OPAL_THREAD_LOCK(&module->lock);

    /* count this fragment; passive-target fragments are charged to the
     * source rank, others to the active-target counter (MPI_PROC_NULL) */
    mark_incoming_completion (module, (base_header->flags & OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET) ?
                              source : MPI_PROC_NULL);

    /* free previously garbage-collected requests/buffers */
    osc_rdma_gc_clean (module);

    /* the completed request cannot be freed from its own callback; defer it
     * to the garbage collection list, then repost the wildcard receive */
    opal_list_append (&module->request_gc, (opal_list_item_t *) request);
    ompi_osc_rdma_frag_start_receive (module);

    OPAL_THREAD_UNLOCK(&module->lock);

    /* re-take ompi_request_lock for the PML completion code we return to */
    OPAL_THREAD_LOCK(&ompi_request_lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "finished posting receive request"));

    return OMPI_SUCCESS;
}
|
|
|
|
/* (Re)post the persistent wildcard receive for incoming fragments into the
 * module's incoming buffer. The request handle is stored in
 * module->frag_request and completion is dispatched to
 * ompi_osc_rdma_callback. */
int ompi_osc_rdma_frag_start_receive (ompi_osc_rdma_module_t *module)
{
    size_t recv_len = mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t);

    return ompi_osc_rdma_irecv_w_cb (module->incomming_buffer, recv_len, MPI_BYTE,
                                     OMPI_ANY_SOURCE, OSC_RDMA_FRAG_TAG, module->comm,
                                     &module->frag_request, ompi_osc_rdma_callback, module);
}
|
|
|
|
/* Post a non-blocking receive whose completion is counted against the
 * module's incoming fragment count. No request handle is returned to the
 * caller (the request is garbage collected by the completion callback).
 * NOTE(review): the callback identifier is misspelled upstream
 * ("incomming"/"omplete"); renaming it would require touching its
 * definition elsewhere in this file. */
int ompi_osc_rdma_component_irecv (ompi_osc_rdma_module_t *module, void *buf,
                                   size_t count, struct ompi_datatype_t *datatype,
                                   int src, int tag, struct ompi_communicator_t *comm)
{
    return ompi_osc_rdma_irecv_w_cb (buf, count, datatype, src, tag, comm, NULL,
                                     osc_rdma_incomming_req_omplete, module);
}
|
|
|
|
|
|
static int
|
|
isend_completion_cb(ompi_request_t *request)
|
|
{
|
|
ompi_osc_rdma_module_t *module =
|
|
(ompi_osc_rdma_module_t*) request->req_complete_cb_data;
|
|
|
|
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
|
|
"isend_completion_cb called"));
|
|
|
|
mark_outgoing_completion(module);
|
|
|
|
/* put this request on the garbage colletion list */
|
|
OPAL_THREAD_LOCK(&module->lock);
|
|
opal_list_append (&module->request_gc, (opal_list_item_t *) request);
|
|
OPAL_THREAD_UNLOCK(&module->lock);
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
|
|
int
|
|
ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
|
|
void *buf,
|
|
size_t count,
|
|
struct ompi_datatype_t *datatype,
|
|
int dest,
|
|
int tag,
|
|
struct ompi_communicator_t *comm)
|
|
{
|
|
return ompi_osc_rdma_isend_w_cb (buf, count, datatype, dest, tag, comm,
|
|
isend_completion_cb, module);
|
|
}
|
|
|
|
int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
|
|
ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx)
|
|
{
|
|
ompi_request_t *request;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: ompi_osc_rdma_isend_w_cb sending %d bytes to %d with tag %d",
|
|
count, target, tag));
|
|
|
|
ret = MCA_PML_CALL(isend_init(ptr, count, datatype, target, tag,
|
|
MCA_PML_BASE_SEND_STANDARD, comm, &request));
|
|
if (OMPI_SUCCESS != ret) {
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"error sending fragment. ret = %d", ret));
|
|
return ret;
|
|
}
|
|
|
|
request->req_complete_cb = cb;
|
|
request->req_complete_cb_data = ctx;
|
|
|
|
ret = MCA_PML_CALL(start(1, &request));
|
|
|
|
return ret;
|
|
}
|
|
|
|
int ompi_osc_rdma_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
|
|
ompi_communicator_t *comm, ompi_request_t **request_out,
|
|
ompi_request_complete_fn_t cb, void *ctx)
|
|
{
|
|
ompi_request_t *request;
|
|
int ret;
|
|
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"osc rdma: ompi_osc_rdma_irecv_w_cb receiving %d bytes from %d with tag %d",
|
|
count, target, tag));
|
|
|
|
ret = MCA_PML_CALL(irecv_init(ptr, count, datatype, target, tag, comm, &request));
|
|
if (OMPI_SUCCESS != ret) {
|
|
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
|
"error posting receive. ret = %d", ret));
|
|
return ret;
|
|
}
|
|
|
|
request->req_complete_cb = cb;
|
|
request->req_complete_cb_data = ctx;
|
|
if (request_out) {
|
|
*request_out = request;
|
|
}
|
|
|
|
ret = MCA_PML_CALL(start(1, &request));
|
|
|
|
return ret;
|
|
}
|