1
1

Merge pull request #335 from hjelmn/osc_updates

Osc updates
Этот коммит содержится в:
Nathan Hjelm 2015-01-07 11:16:55 -06:00
родитель 49069bc661 e68ed2876c
Коммит 81dc3a5db9
24 изменённых файлов: 2557 добавлений и 2652 удалений

Просмотреть файл

@ -47,6 +47,10 @@ static ompi_errcode_intern_t ompi_err_request;
static ompi_errcode_intern_t ompi_err_buffer;
static ompi_errcode_intern_t ompi_err_rma_sync;
static ompi_errcode_intern_t ompi_err_rma_shared;
static ompi_errcode_intern_t ompi_err_rma_attach;
static ompi_errcode_intern_t ompi_err_rma_range;
static ompi_errcode_intern_t ompi_err_rma_conflict;
static ompi_errcode_intern_t ompi_err_win;
static void ompi_errcode_intern_construct(ompi_errcode_intern_t* errcode);
static void ompi_errcode_intern_destruct(ompi_errcode_intern_t* errcode);
@ -210,6 +214,38 @@ int ompi_errcode_intern_init (void)
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_shared.index,
&ompi_err_rma_shared);
OBJ_CONSTRUCT(&ompi_err_rma_attach, ompi_errcode_intern_t);
ompi_err_rma_attach.code = OMPI_ERR_RMA_ATTACH;
ompi_err_rma_attach.mpi_code = MPI_ERR_RMA_ATTACH;
ompi_err_rma_attach.index = pos++;
strncpy(ompi_err_rma_attach.errstring, "OMPI_ERR_RMA_ATTACH", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_attach.index,
&ompi_err_rma_attach);
OBJ_CONSTRUCT(&ompi_err_rma_range, ompi_errcode_intern_t);
ompi_err_rma_range.code = OMPI_ERR_RMA_RANGE;
ompi_err_rma_range.mpi_code = MPI_ERR_RMA_RANGE;
ompi_err_rma_range.index = pos++;
strncpy(ompi_err_rma_range.errstring, "OMPI_ERR_RMA_RANGE", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_range.index,
&ompi_err_rma_range);
OBJ_CONSTRUCT(&ompi_err_rma_conflict, ompi_errcode_intern_t);
ompi_err_rma_conflict.code = OMPI_ERR_RMA_CONFLICT;
ompi_err_rma_conflict.mpi_code = MPI_ERR_RMA_CONFLICT;
ompi_err_rma_conflict.index = pos++;
strncpy(ompi_err_rma_conflict.errstring, "OMPI_ERR_RMA_CONFLICT", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_rma_conflict.index,
&ompi_err_rma_conflict);
OBJ_CONSTRUCT(&ompi_err_win, ompi_errcode_intern_t);
ompi_err_win.code = OMPI_ERR_WIN;
ompi_err_win.mpi_code = MPI_ERR_WIN;
ompi_err_win.index = pos++;
strncpy(ompi_err_win.errstring, "OMPI_ERR_WIN", OMPI_MAX_ERROR_STRING);
opal_pointer_array_set_item(&ompi_errcodes_intern, ompi_err_win.index,
&ompi_err_win);
ompi_errcode_intern_lastused=pos;
return OMPI_SUCCESS;
}
@ -235,6 +271,10 @@ int ompi_errcode_intern_finalize(void)
OBJ_DESTRUCT(&ompi_err_request);
OBJ_DESTRUCT(&ompi_err_rma_sync);
OBJ_DESTRUCT(&ompi_err_rma_shared);
OBJ_DESTRUCT(&ompi_err_rma_attach);
OBJ_DESTRUCT(&ompi_err_rma_range);
OBJ_DESTRUCT(&ompi_err_rma_conflict);
OBJ_DESTRUCT(&ompi_err_win);
OBJ_DESTRUCT(&ompi_errcodes_intern);
return OMPI_SUCCESS;

Просмотреть файл

@ -66,7 +66,11 @@ enum {
OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1,
OMPI_ERR_RMA_SYNC = OMPI_ERR_BASE - 2,
OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3
OMPI_ERR_RMA_SHARED = OMPI_ERR_BASE - 3,
OMPI_ERR_RMA_ATTACH = OMPI_ERR_BASE - 4,
OMPI_ERR_RMA_RANGE = OMPI_ERR_BASE - 5,
OMPI_ERR_RMA_CONFLICT = OMPI_ERR_BASE - 6,
OMPI_ERR_WIN = OMPI_ERR_BASE - 7,
};
#define OMPI_ERR_MAX (OMPI_ERR_BASE - 100)

Просмотреть файл

@ -3,51 +3,53 @@
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
#
# Additional copyrights may follow
#
#
# $HEADER$
#
rdma_sources = \
osc_rdma.h \
osc_rdma.c \
osc_rdma_comm.c \
osc_rdma_component.c \
osc_rdma_data_move.h \
osc_rdma_data_move.c \
osc_rdma_frag.h \
osc_rdma_frag.c \
osc_rdma_header.h \
osc_rdma_obj_convert.h \
osc_rdma_request.h \
osc_rdma_request.c \
osc_rdma_active_target.c \
osc_rdma_passive_target.c
pt2pt_sources = \
osc_pt2pt.h \
osc_pt2pt_module.c \
osc_pt2pt_comm.c \
osc_pt2pt_component.c \
osc_pt2pt_data_move.h \
osc_pt2pt_data_move.c \
osc_pt2pt_frag.h \
osc_pt2pt_frag.c \
osc_pt2pt_header.h \
osc_pt2pt_obj_convert.h \
osc_pt2pt_request.h \
osc_pt2pt_request.c \
osc_pt2pt_active_target.c \
osc_pt2pt_passive_target.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_ompi_osc_rdma_DSO
if MCA_BUILD_ompi_osc_pt2pt_DSO
component_noinst =
component_install = mca_osc_rdma.la
component_install = mca_osc_pt2pt.la
else
component_noinst = libmca_osc_rdma.la
component_noinst = libmca_osc_pt2pt.la
component_install =
endif
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_osc_rdma_la_SOURCES = $(rdma_sources)
mca_osc_rdma_la_LDFLAGS = -module -avoid-version
mca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
mca_osc_pt2pt_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_osc_rdma_la_SOURCES = $(rdma_sources)
libmca_osc_rdma_la_LDFLAGS = -module -avoid-version
libmca_osc_pt2pt_la_SOURCES = $(pt2pt_sources)
libmca_osc_pt2pt_la_LDFLAGS = -module -avoid-version

20
ompi/mca/osc/pt2pt/configure.m4 Обычный файл
Просмотреть файл

@ -0,0 +1,20 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# Copyright (c) 2014 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_osc_pt2pt_CONFIG([action-if-can-compile],
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_osc_pt2pt_CONFIG],[
# Register this component's Makefile with config.status so it is
# generated from Makefile.am at configure time.
AC_CONFIG_FILES([ompi/mca/osc/pt2pt/Makefile])
# No feature/compile test is required for the pt2pt component, so
# unconditionally expand the caller's action-if-can-compile argument.
[$1]
])dnl

Просмотреть файл

@ -19,8 +19,8 @@
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_H
#define OMPI_OSC_RDMA_H
#ifndef OMPI_OSC_PT2PT_H
#define OMPI_OSC_PT2PT_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
@ -39,13 +39,13 @@
#include "ompi/mca/bml/bml.h"
#include "ompi/memchecker.h"
#include "osc_rdma_header.h"
#include "osc_pt2pt_header.h"
BEGIN_C_DECLS
struct ompi_osc_rdma_frag_t;
struct ompi_osc_pt2pt_frag_t;
struct ompi_osc_rdma_component_t {
struct ompi_osc_pt2pt_component_t {
/** Extend the basic osc component interface */
ompi_osc_base_component_t super;
@ -58,46 +58,45 @@ struct ompi_osc_rdma_component_t {
/** module count */
int module_count;
/** free list of ompi_osc_rdma_frag_t structures */
opal_free_list_t frags;
/** free list of ompi_osc_pt2pt_frag_t structures */
ompi_free_list_t frags;
/** Free list of requests */
ompi_free_list_t requests;
/** RDMA component buffer size */
/** PT2PT component buffer size */
unsigned int buffer_size;
/** Lock for pending_operations */
opal_mutex_t pending_operations_lock;
/** List of operations that need to be processed */
opal_list_t pending_operations;
/** Is the progress function enabled? */
bool progress_enable;
/** List of requests that need to be freed */
opal_list_t request_gc;
/** List of buffers that need to be freed */
opal_list_t buffer_gc;
};
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
typedef struct ompi_osc_pt2pt_component_t ompi_osc_pt2pt_component_t;
struct ompi_osc_rdma_peer_t {
struct ompi_osc_pt2pt_peer_t {
/** Pointer to the current send fragment for each outgoing target */
struct ompi_osc_rdma_frag_t *active_frag;
struct ompi_osc_pt2pt_frag_t *active_frag;
/** Number of acks pending. New requests can not be sent out if there are
* acks pending (to fulfill the ordering constraints of accumulate) */
uint32_t num_acks_pending;
int32_t passive_incoming_frag_count;
bool access_epoch;
bool eager_send_active;
};
typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t;
typedef struct ompi_osc_pt2pt_peer_t ompi_osc_pt2pt_peer_t;
#define SEQ_INVALID 0xFFFFFFFFFFFFFFFFULL
/** Module structure. Exactly one of these is associated with each
RDMA window */
struct ompi_osc_rdma_module_t {
PT2PT window */
struct ompi_osc_pt2pt_module_t {
/** Extend the basic osc module interface */
ompi_osc_base_module_t super;
@ -127,12 +126,15 @@ struct ompi_osc_rdma_module_t {
opal_mutex_t acc_lock;
/** peer data */
ompi_osc_rdma_peer_t *peers;
ompi_osc_pt2pt_peer_t *peers;
/** Nmber of communication fragments started for this epoch, by
peer. Not in peer data to make fence more manageable. */
uint32_t *epoch_outgoing_frag_count;
/** Lock for queued_frags */
opal_mutex_t queued_frags_lock;
/** List of full communication buffers queued to be sent. Should
be maintained in order (at least in per-target order). */
opal_list_t queued_frags;
@ -152,9 +154,6 @@ struct ompi_osc_rdma_module_t {
/* Next incoming buffer count at which we want a signal on cond */
uint32_t active_incoming_frag_signal_count;
uint32_t *passive_incoming_frag_count;
uint32_t *passive_incoming_frag_signal_count;
/* Number of flush ack requests send since beginning of time */
uint64_t flush_ack_requested_count;
/* Number of flush ack replies received since beginning of
@ -171,8 +170,6 @@ struct ompi_osc_rdma_module_t {
/** Indicates the window is in an all access epoch (fence, lock_all) */
bool all_access_epoch;
bool *passive_eager_send_active;
/* ********************* PWSC data ************************ */
struct ompi_group_t *pw_group;
struct ompi_group_t *sc_group;
@ -189,9 +186,11 @@ struct ompi_osc_rdma_module_t {
/** Status of the local window lock. One of 0 (unlocked),
MPI_LOCK_EXCLUSIVE, or MPI_LOCK_SHARED. */
int lock_status;
/** number of peers who hold a shared lock on the local window */
int32_t shared_count;
int32_t lock_status;
/** lock for locks_pending list */
opal_mutex_t locks_pending_lock;
/** target side list of lock requests we couldn't satisfy yet */
opal_list_t locks_pending;
@ -210,29 +209,38 @@ struct ompi_osc_rdma_module_t {
/* enforce pscw matching */
/** list of unmatched post messages */
opal_list_t pending_posts;
};
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;
struct ompi_osc_rdma_pending_t {
/** Lock for garbage collection lists */
opal_mutex_t gc_lock;
/** List of requests that need to be freed */
opal_list_t request_gc;
/** List of buffers that need to be freed */
opal_list_t buffer_gc;
};
typedef struct ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_pt2pt_component_t mca_osc_pt2pt_component;
struct ompi_osc_pt2pt_pending_t {
opal_list_item_t super;
ompi_osc_rdma_module_t *module;
ompi_osc_pt2pt_module_t *module;
int source;
ompi_osc_rdma_header_t header;
ompi_osc_pt2pt_header_t header;
};
typedef struct ompi_osc_rdma_pending_t ompi_osc_rdma_pending_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_t);
typedef struct ompi_osc_pt2pt_pending_t ompi_osc_pt2pt_pending_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_t);
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module)
#define GET_MODULE(win) ((ompi_osc_pt2pt_module_t*) win->w_osc_module)
extern bool ompi_osc_rdma_no_locks;
extern bool ompi_osc_pt2pt_no_locks;
int ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len);
int ompi_osc_rdma_detach(struct ompi_win_t *win, void *base);
int ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len);
int ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base);
int ompi_osc_rdma_free(struct ompi_win_t *win);
int ompi_osc_pt2pt_free(struct ompi_win_t *win);
int ompi_osc_rdma_put(void *origin_addr,
int ompi_osc_pt2pt_put(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -241,7 +249,7 @@ int ompi_osc_rdma_put(void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
int ompi_osc_rdma_accumulate(void *origin_addr,
int ompi_osc_pt2pt_accumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -251,7 +259,7 @@ int ompi_osc_rdma_accumulate(void *origin_addr,
struct ompi_op_t *op,
struct ompi_win_t *win);
int ompi_osc_rdma_get(void *origin_addr,
int ompi_osc_pt2pt_get(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -260,7 +268,7 @@ int ompi_osc_rdma_get(void *origin_addr,
struct ompi_datatype_t *target_dt,
struct ompi_win_t *win);
int ompi_osc_rdma_compare_and_swap(void *origin_addr,
int ompi_osc_pt2pt_compare_and_swap(void *origin_addr,
void *compare_addr,
void *result_addr,
struct ompi_datatype_t *dt,
@ -268,7 +276,7 @@ int ompi_osc_rdma_compare_and_swap(void *origin_addr,
OPAL_PTRDIFF_TYPE target_disp,
struct ompi_win_t *win);
int ompi_osc_rdma_fetch_and_op(void *origin_addr,
int ompi_osc_pt2pt_fetch_and_op(void *origin_addr,
void *result_addr,
struct ompi_datatype_t *dt,
int target,
@ -276,7 +284,7 @@ int ompi_osc_rdma_fetch_and_op(void *origin_addr,
struct ompi_op_t *op,
struct ompi_win_t *win);
int ompi_osc_rdma_get_accumulate(void *origin_addr,
int ompi_osc_pt2pt_get_accumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_datatype,
void *result_addr,
@ -289,7 +297,7 @@ int ompi_osc_rdma_get_accumulate(void *origin_addr,
struct ompi_op_t *op,
struct ompi_win_t *win);
int ompi_osc_rdma_rput(void *origin_addr,
int ompi_osc_pt2pt_rput(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -299,7 +307,7 @@ int ompi_osc_rdma_rput(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_rget(void *origin_addr,
int ompi_osc_pt2pt_rget(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -309,7 +317,7 @@ int ompi_osc_rdma_rget(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_raccumulate(void *origin_addr,
int ompi_osc_pt2pt_raccumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_dt,
int target,
@ -320,7 +328,7 @@ int ompi_osc_rdma_raccumulate(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_rget_accumulate(void *origin_addr,
int ompi_osc_pt2pt_rget_accumulate(void *origin_addr,
int origin_count,
struct ompi_datatype_t *origin_datatype,
void *result_addr,
@ -334,51 +342,51 @@ int ompi_osc_rdma_rget_accumulate(void *origin_addr,
struct ompi_win_t *win,
struct ompi_request_t **request);
int ompi_osc_rdma_fence(int assert, struct ompi_win_t *win);
int ompi_osc_pt2pt_fence(int assert, struct ompi_win_t *win);
/* received a post message */
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source);
int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source);
int ompi_osc_rdma_start(struct ompi_group_t *group,
int ompi_osc_pt2pt_start(struct ompi_group_t *group,
int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_complete(struct ompi_win_t *win);
int ompi_osc_pt2pt_complete(struct ompi_win_t *win);
int ompi_osc_rdma_post(struct ompi_group_t *group,
int ompi_osc_pt2pt_post(struct ompi_group_t *group,
int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_wait(struct ompi_win_t *win);
int ompi_osc_pt2pt_wait(struct ompi_win_t *win);
int ompi_osc_rdma_test(struct ompi_win_t *win,
int ompi_osc_pt2pt_test(struct ompi_win_t *win,
int *flag);
int ompi_osc_rdma_lock(int lock_type,
int ompi_osc_pt2pt_lock(int lock_type,
int target,
int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_unlock(int target,
int ompi_osc_pt2pt_unlock(int target,
struct ompi_win_t *win);
int ompi_osc_rdma_lock_all(int assert,
int ompi_osc_pt2pt_lock_all(int assert,
struct ompi_win_t *win);
int ompi_osc_rdma_unlock_all(struct ompi_win_t *win);
int ompi_osc_pt2pt_unlock_all(struct ompi_win_t *win);
int ompi_osc_rdma_sync(struct ompi_win_t *win);
int ompi_osc_pt2pt_sync(struct ompi_win_t *win);
int ompi_osc_rdma_flush(int target,
int ompi_osc_pt2pt_flush(int target,
struct ompi_win_t *win);
int ompi_osc_rdma_flush_all(struct ompi_win_t *win);
int ompi_osc_rdma_flush_local(int target,
int ompi_osc_pt2pt_flush_all(struct ompi_win_t *win);
int ompi_osc_pt2pt_flush_local(int target,
struct ompi_win_t *win);
int ompi_osc_rdma_flush_local_all(struct ompi_win_t *win);
int ompi_osc_pt2pt_flush_local_all(struct ompi_win_t *win);
int ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info);
int ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used);
int ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info);
int ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used);
int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_component_irecv(ompi_osc_pt2pt_module_t *module,
void *buf,
size_t count,
struct ompi_datatype_t *datatype,
@ -386,7 +394,7 @@ int ompi_osc_rdma_component_irecv(ompi_osc_rdma_module_t *module,
int tag,
struct ompi_communicator_t *comm);
int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_component_isend(ompi_osc_pt2pt_module_t *module,
void *buf,
size_t count,
struct ompi_datatype_t *datatype,
@ -395,16 +403,16 @@ int ompi_osc_rdma_component_isend(ompi_osc_rdma_module_t *module,
struct ompi_communicator_t *comm);
/**
* ompi_osc_rdma_progress_pending_acc:
* ompi_osc_pt2pt_progress_pending_acc:
*
* @short Progress one pending accumulation or compare and swap operation.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long If the accumulation lock can be aquired progress one pending
* accumulate or compare and swap operation.
*/
int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
int ompi_osc_pt2pt_progress_pending_acc (ompi_osc_pt2pt_module_t *module);
/**
@ -412,7 +420,7 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
*
* @short Increment incoming completeion count.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] source - Passive target source or MPI_PROC_NULL (active target)
*
* @long This function incremements either the passive or active incoming counts.
@ -420,7 +428,7 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module);
* This function uses atomics if necessary so it is not necessary to hold
* the module lock before calling this function.
*/
static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int source)
static inline void mark_incoming_completion (ompi_osc_pt2pt_module_t *module, int source)
{
if (MPI_PROC_NULL == source) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
@ -431,11 +439,12 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
opal_condition_broadcast(&module->cond);
}
} else {
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"mark_incoming_completion marking passive incoming complete. source = %d, count = %d",
source, (int) module->passive_incoming_frag_count[source] + 1));
OPAL_THREAD_ADD32((int32_t *) (module->passive_incoming_frag_count + source), 1);
if (module->passive_incoming_frag_count[source] >= module->passive_incoming_frag_signal_count[source]) {
source, (int) peer->passive_incoming_frag_count + 1));
OPAL_THREAD_ADD32((int32_t *) &peer->passive_incoming_frag_count, 1);
if (0 == peer->passive_incoming_frag_count) {
opal_condition_broadcast(&module->cond);
}
}
@ -446,7 +455,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
*
* @short Increment outgoing count.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long This function is used to signal that an outgoing send is complete. It
* incrememnts only the outgoing fragment count and signals the module
@ -454,7 +463,7 @@ static inline void mark_incoming_completion (ompi_osc_rdma_module_t *module, int
* uses atomics if necessary so it is not necessary to hold the module
* lock before calling this function.
*/
static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module)
static inline void mark_outgoing_completion (ompi_osc_pt2pt_module_t *module)
{
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_count, 1);
if (module->outgoing_frag_count >= module->outgoing_frag_signal_count) {
@ -467,14 +476,14 @@ static inline void mark_outgoing_completion (ompi_osc_rdma_module_t *module)
*
* @short Increment outgoing signal counters.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] target - Passive target rank or MPI_PROC_NULL (active target)
* @param[in] count - Number of outgoing messages to signal.
*
* @long This function uses atomics if necessary so it is not necessary to hold
* the module lock before calling this function.
*/
static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int target, int count)
static inline void ompi_osc_signal_outgoing (ompi_osc_pt2pt_module_t *module, int target, int count)
{
OPAL_THREAD_ADD32((int32_t *) &module->outgoing_frag_signal_count, count);
if (MPI_PROC_NULL != target) {
@ -486,7 +495,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int
}
/**
* osc_rdma_copy_on_recv:
* osc_pt2pt_copy_on_recv:
*
* @short Helper function. Copies data from source to target through the
* convertor.
@ -502,7 +511,7 @@ static inline void ompi_osc_signal_outgoing (ompi_osc_rdma_module_t *module, int
* buffer. The copy is done with a convertor generated from proc,
* datatype, and count.
*/
static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc,
static inline void osc_pt2pt_copy_on_recv (void *target, void *source, size_t source_len, ompi_proc_t *proc,
int count, ompi_datatype_t *datatype)
{
opal_convertor_t convertor;
@ -530,7 +539,7 @@ static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t sou
}
/**
* osc_rdma_copy_for_send:
* osc_pt2pt_copy_for_send:
*
* @short: Helper function. Copies data from source to target through the
* convertor.
@ -546,7 +555,7 @@ static inline void osc_rdma_copy_on_recv (void *target, void *source, size_t sou
* buffer. The copy is done with a convertor generated from proc,
* datatype, and count.
*/
static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc,
static inline void osc_pt2pt_copy_for_send (void *target, size_t target_len, void *source, ompi_proc_t *proc,
int count, ompi_datatype_t *datatype)
{
opal_convertor_t convertor;
@ -567,7 +576,7 @@ static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void
}
/**
* osc_rdma_request_gc_clean:
* osc_pt2pt_request_gc_clean:
*
* @short Release finished PML requests and accumulate buffers.
*
@ -576,71 +585,77 @@ static inline void osc_rdma_copy_for_send (void *target, size_t target_len, void
* and buffers on the module's garbage collection lists and release then
* at a later time.
*/
static inline void osc_rdma_gc_clean (void)
static inline void osc_pt2pt_gc_clean (ompi_osc_pt2pt_module_t *module)
{
ompi_request_t *request;
opal_list_item_t *item;
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_LOCK(&module->gc_lock);
while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&mca_osc_rdma_component.request_gc))) {
while (NULL != (request = (ompi_request_t *) opal_list_remove_first (&module->request_gc))) {
OPAL_THREAD_UNLOCK(&module->gc_lock);
ompi_request_free (&request);
OPAL_THREAD_LOCK(&module->gc_lock);
}
while (NULL != (item = opal_list_remove_first (&mca_osc_rdma_component.buffer_gc))) {
while (NULL != (item = opal_list_remove_first (&module->buffer_gc))) {
OBJ_RELEASE(item);
}
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_UNLOCK(&module->gc_lock);
}
static inline void osc_rdma_gc_add_request (ompi_request_t *request)
static inline void osc_pt2pt_gc_add_request (ompi_osc_pt2pt_module_t *module, ompi_request_t *request)
{
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
opal_list_append (&mca_osc_rdma_component.request_gc, (opal_list_item_t *) request);
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
opal_list_append (&module->request_gc, (opal_list_item_t *) request));
}
static inline void osc_rdma_gc_add_buffer (opal_list_item_t *buffer)
static inline void osc_pt2pt_gc_add_buffer (ompi_osc_pt2pt_module_t *module, opal_list_item_t *buffer)
{
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
opal_list_append (&mca_osc_rdma_component.buffer_gc, buffer);
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_SCOPED_LOCK(&module->gc_lock,
opal_list_append (&module->buffer_gc, buffer));
}
#define OSC_RDMA_FRAG_TAG 0x10000
#define OSC_RDMA_FRAG_MASK 0x0ffff
static inline void osc_pt2pt_add_pending (ompi_osc_pt2pt_pending_t *pending)
{
OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.pending_operations_lock,
opal_list_append (&mca_osc_pt2pt_component.pending_operations, &pending->super));
}
#define OSC_PT2PT_FRAG_TAG 0x10000
#define OSC_PT2PT_FRAG_MASK 0x0ffff
/**
* get_tag:
*
* @short Get a send/recv tag for large memory operations.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long This function aquires a 16-bit tag for use with large memory operations. The
* tag will be odd or even depending on if this is in a passive target access
* or not.
*/
static inline int get_tag(ompi_osc_rdma_module_t *module)
static inline int get_tag(ompi_osc_pt2pt_module_t *module)
{
/* the LSB of the tag is used be the receiver to determine if the
message is a passive or active target (ie, where to mark
completion). */
int tmp = module->tag_counter + !!(module->passive_target_access_epoch);
module->tag_counter = (module->tag_counter + 2) & OSC_RDMA_FRAG_MASK;
module->tag_counter = (module->tag_counter + 2) & OSC_PT2PT_FRAG_MASK;
return tmp;
}
/**
* ompi_osc_rdma_accumulate_lock:
* ompi_osc_pt2pt_accumulate_lock:
*
* @short Internal function that spins until the accumulation lock has
* been aquired.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @returns 0
*
@ -648,9 +663,9 @@ static inline int get_tag(ompi_osc_rdma_module_t *module)
* behavior is only acceptable from a user-level call as blocking in a
* callback may cause deadlock. If a callback needs the accumulate lock and
* it is not available it should be placed on the pending_acc list of the
* module. It will be released by ompi_osc_rdma_accumulate_unlock().
* module. It will be released by ompi_osc_pt2pt_accumulate_unlock().
*/
static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
static inline int ompi_osc_pt2pt_accumulate_lock (ompi_osc_pt2pt_module_t *module)
{
while (opal_atomic_trylock (&module->accumulate_lock)) {
opal_progress ();
@ -660,11 +675,11 @@ static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
}
/**
* ompi_osc_rdma_accumulate_trylock:
* ompi_osc_pt2pt_accumulate_trylock:
*
* @short Try to aquire the accumulation lock.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @returns 0 if the accumulation lock was aquired
* @returns 1 if the lock was not available
@ -672,34 +687,34 @@ static inline int ompi_osc_rdma_accumulate_lock (ompi_osc_rdma_module_t *module)
* @long This function will try to aquire the accumulation lock. This function
* is safe to call from a callback.
*/
static inline int ompi_osc_rdma_accumulate_trylock (ompi_osc_rdma_module_t *module)
static inline int ompi_osc_pt2pt_accumulate_trylock (ompi_osc_pt2pt_module_t *module)
{
return opal_atomic_trylock (&module->accumulate_lock);
}
/**
* ompi_osc_rdma_accumulate_unlock:
* ompi_osc_pt2pt_accumulate_unlock:
*
* @short Unlock the accumulation lock and release a pending accumulation operation.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
*
* @long This function unlocks the accumulation lock and release a single pending
* accumulation operation if one exists. This function may be called recursively.
*/
static inline void ompi_osc_rdma_accumulate_unlock (ompi_osc_rdma_module_t *module)
static inline void ompi_osc_pt2pt_accumulate_unlock (ompi_osc_pt2pt_module_t *module)
{
opal_atomic_unlock (&module->accumulate_lock);
if (0 != opal_list_get_size (&module->pending_acc)) {
ompi_osc_rdma_progress_pending_acc (module);
ompi_osc_pt2pt_progress_pending_acc (module);
}
}
static inline bool ompi_osc_rdma_check_access_epoch (ompi_osc_rdma_module_t *module, int rank)
static inline bool ompi_osc_pt2pt_check_access_epoch (ompi_osc_pt2pt_module_t *module, int rank)
{
return module->all_access_epoch || module->peers[rank].access_epoch;
}
END_C_DECLS
#endif /* OMPI_OSC_RDMA_H */
#endif /* OMPI_OSC_PT2PT_H */

Просмотреть файл

@ -4,27 +4,27 @@
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_rdma_header.h"
#include "osc_rdma_data_move.h"
#include "osc_rdma_frag.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_data_move.h"
#include "osc_pt2pt_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
@ -33,19 +33,19 @@
#include "ompi/mca/osc/base/base.h"
/**
* ompi_osc_rdma_pending_post_t:
* ompi_osc_pt2pt_pending_post_t:
*
* Describes a post operation that was encountered outside its
* matching start operation.
*/
struct ompi_osc_rdma_pending_post_t {
struct ompi_osc_pt2pt_pending_post_t {
opal_list_item_t super;
int rank;
};
typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_post_t);
typedef struct ompi_osc_pt2pt_pending_post_t ompi_osc_pt2pt_pending_post_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_post_t);
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_post_t, opal_list_item_t, NULL, NULL);
static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
{
@ -64,13 +64,13 @@ static bool group_contains_proc (ompi_group_t *group, ompi_proc_t *proc)
}
static int*
get_comm_ranks(ompi_osc_rdma_module_t *module,
get_comm_ranks(ompi_osc_pt2pt_module_t *module,
ompi_group_t *sub_group)
{
int *ranks1 = NULL, *ranks2 = NULL;
bool success = false;
int i, ret;
ranks1 = malloc(sizeof(int) * ompi_group_size(sub_group));
if (NULL == ranks1) goto cleanup;
ranks2 = malloc(sizeof(int) * ompi_group_size(sub_group));
@ -82,7 +82,7 @@ get_comm_ranks(ompi_osc_rdma_module_t *module,
ret = ompi_group_translate_ranks(sub_group,
ompi_group_size(sub_group),
ranks1,
ranks1,
module->comm->c_local_group,
ranks2);
if (OMPI_SUCCESS != ret) goto cleanup;
@ -100,14 +100,14 @@ get_comm_ranks(ompi_osc_rdma_module_t *module,
}
int
ompi_osc_rdma_fence(int assert, ompi_win_t *win)
ompi_osc_pt2pt_fence(int assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
uint32_t incoming_reqs;
int ret = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: fence start"));
"osc pt2pt: fence start"));
/* can't enter an active target epoch when in a passive target epoch */
if (module->passive_target_access_epoch) {
@ -122,34 +122,36 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
/* short-circuit the noprecede case */
if (0 != (assert & MPI_MODE_NOPRECEDE)) {
ret = module->comm->c_coll.coll_barrier(module->comm,
module->comm->c_coll.coll_barrier_module);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence end (short circuit)"));
"osc pt2pt: fence end (short circuit)"));
return ret;
}
/* try to start all the requests. */
ret = ompi_osc_rdma_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup;
/* try to start all requests. */
ret = ompi_osc_pt2pt_frag_flush_all(module);
if (OMPI_SUCCESS != ret) {
return ret;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence done sending"));
"osc pt2pt: fence done sending"));
/* find out how much data everyone is going to send us. */
ret = module->comm->c_coll.coll_reduce_scatter_block (module->epoch_outgoing_frag_count,
&incoming_reqs, 1, MPI_UINT32_T,
MPI_SUM, module->comm,
module->comm->c_coll.coll_reduce_scatter_block_module);
if (OMPI_SUCCESS != ret) goto cleanup;
if (OMPI_SUCCESS != ret) {
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
}
OPAL_THREAD_LOCK(&module->lock);
bzero(module->epoch_outgoing_frag_count,
sizeof(uint32_t) * ompi_comm_size(module->comm));
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: fence expects %d requests",
"osc pt2pt: fence expects %d requests",
incoming_reqs));
/* set our complete condition for incoming requests */
@ -161,32 +163,31 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
opal_condition_wait(&module->cond, &module->lock);
}
ret = OMPI_SUCCESS;
module->active_incoming_frag_signal_count = 0;
if (assert & MPI_MODE_NOSUCCEED) {
/* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
* stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */
module->active_eager_send_active = false;
module->all_access_epoch = false;
}
cleanup:
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc rdma: fence end: %d", ret));
}
opal_condition_broadcast (&module->cond);
OPAL_THREAD_UNLOCK(&module->lock);
return ret;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"osc pt2pt: fence end: %d", ret));
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_start(ompi_group_t *group,
ompi_osc_pt2pt_start(ompi_group_t *group,
int assert,
ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_pending_post_t *pending_post, *next;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_pending_post_t *pending_post, *next;
int group_size;
int *ranks;
@ -203,13 +204,13 @@ ompi_osc_rdma_start(ompi_group_t *group,
OBJ_RETAIN(group);
ompi_group_increment_proc_count(group);
module->sc_group = group;
module->sc_group = group;
/* mark all procs in this group as being in an access epoch */
group_size = ompi_group_size (module->sc_group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start entering with group size %d...",
"ompi_osc_pt2pt_start entering with group size %d...",
group_size));
ranks = get_comm_ranks(module, module->sc_group);
@ -222,13 +223,17 @@ ompi_osc_rdma_start(ompi_group_t *group,
free (ranks);
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_rdma_pending_post_t) {
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_pt2pt_pending_post_t) {
ompi_proc_t *pending_proc = ompi_comm_peer_lookup (module->comm, pending_post->rank);
if (group_contains_proc (module->sc_group, pending_proc)) {
ompi_osc_pt2pt_peer_t *peer = module->peers + pending_post->rank;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Consumed unexpected post message from %d",
pending_post->rank));
++module->num_post_msgs;
peer->eager_send_active = true;
opal_list_remove_item (&module->pending_posts, &pending_post->super);
OBJ_RELEASE(pending_post);
}
@ -254,7 +259,7 @@ ompi_osc_rdma_start(ompi_group_t *group,
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start complete"));
"ompi_osc_pt2pt_start complete"));
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
@ -262,11 +267,11 @@ ompi_osc_rdma_start(ompi_group_t *group,
int
ompi_osc_rdma_complete(ompi_win_t *win)
ompi_osc_pt2pt_complete(ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_complete_t complete_req;
ompi_osc_rdma_peer_t *peer;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_header_complete_t complete_req;
ompi_osc_pt2pt_peer_t *peer;
int ret = OMPI_SUCCESS;
int i;
int *ranks = NULL;
@ -274,7 +279,7 @@ ompi_osc_rdma_complete(ompi_win_t *win)
int my_rank = ompi_comm_rank (module->comm);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete entering..."));
"ompi_osc_pt2pt_complete entering..."));
if (NULL == module->sc_group) {
return OMPI_ERR_RMA_SYNC;
@ -291,9 +296,10 @@ ompi_osc_rdma_complete(ompi_win_t *win)
"waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
opal_condition_wait(&module->cond, &module->lock);
}
OPAL_THREAD_UNLOCK(&module->lock);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete sending complete message"));
"ompi_osc_pt2pt_complete sending complete message"));
/* for each process in group, send a control message with number
of updates coming, then start all the requests. Note that the
@ -302,38 +308,39 @@ ompi_osc_rdma_complete(ompi_win_t *win)
At the same time, clean out the outgoing count for the next
round. */
OPAL_THREAD_UNLOCK(&module->lock);
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
if (my_rank == ranks[i]) {
/* shortcut for self */
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self complete"));
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self complete"));
module->num_complete_msgs++;
continue;
}
complete_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_COMPLETE;
complete_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
complete_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE;
complete_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]];
peer = module->peers + ranks[i];
peer->access_epoch = false;
ret = ompi_osc_rdma_control_send(module,
ret = ompi_osc_pt2pt_control_send(module,
ranks[i],
&complete_req,
sizeof(ompi_osc_rdma_header_complete_t));
sizeof(ompi_osc_pt2pt_header_complete_t));
if (OMPI_SUCCESS != ret) goto cleanup;
}
OPAL_THREAD_LOCK(&module->lock);
/* start all requests */
ret = ompi_osc_rdma_frag_flush_all(module);
ret = ompi_osc_pt2pt_frag_flush_all(module);
if (OMPI_SUCCESS != ret) goto cleanup;
OPAL_THREAD_LOCK(&module->lock);
/* zero the fragment counts here to ensure they are zerod */
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
peer = module->peers + ranks[i];
module->epoch_outgoing_frag_count[ranks[i]] = 0;
peer->eager_send_active = false;
}
/* wait for outgoing requests to complete. Don't wait for incoming, as
@ -347,14 +354,14 @@ ompi_osc_rdma_complete(ompi_win_t *win)
module->sc_group = NULL;
/* unlock here, as group cleanup can take a while... */
OPAL_THREAD_UNLOCK(&(module->lock));
OPAL_THREAD_UNLOCK(&module->lock);
/* phase 2 cleanup group */
ompi_group_decrement_proc_count(group);
OBJ_RELEASE(group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete complete"));
"ompi_osc_pt2pt_complete complete"));
free (ranks);
return OMPI_SUCCESS;
@ -362,21 +369,19 @@ ompi_osc_rdma_complete(ompi_win_t *win)
cleanup:
if (NULL != ranks) free(ranks);
OPAL_THREAD_UNLOCK(&(module->lock));
return ret;
}
int
ompi_osc_rdma_post(ompi_group_t *group,
ompi_osc_pt2pt_post(ompi_group_t *group,
int assert,
ompi_win_t *win)
{
int *ranks;
int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_post_t post_req;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_header_post_t post_req;
int my_rank = ompi_comm_rank(module->comm);
/* can't check for all access epoch here due to fence */
@ -385,7 +390,7 @@ ompi_osc_rdma_post(ompi_group_t *group,
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_post entering with group size %d...",
"ompi_osc_pt2pt_post entering with group size %d...",
ompi_group_size (group)));
/* save the group */
@ -422,19 +427,19 @@ ompi_osc_rdma_post(ompi_group_t *group,
/* shortcut for self */
if (my_rank == ranks[i]) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self post"));
osc_rdma_incoming_post (module, my_rank);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_pt2pt_complete self post"));
osc_pt2pt_incoming_post (module, my_rank);
continue;
}
post_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_POST;
post_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
post_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_POST;
post_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
post_req.windx = ompi_comm_get_cid(module->comm);
/* we don't want to send any data, since we're the exposure
epoch only, so use an unbuffered send */
ret = ompi_osc_rdma_control_send_unbuffered(module, ranks[i], &post_req,
sizeof(ompi_osc_rdma_header_post_t));
ret = ompi_osc_pt2pt_control_send_unbuffered(module, ranks[i], &post_req,
sizeof(ompi_osc_pt2pt_header_post_t));
if (OMPI_SUCCESS != ret) {
break;
}
@ -447,9 +452,9 @@ ompi_osc_rdma_post(ompi_group_t *group,
int
ompi_osc_rdma_wait(ompi_win_t *win)
ompi_osc_pt2pt_wait(ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_group_t *group;
if (NULL == module->pw_group) {
@ -457,7 +462,7 @@ ompi_osc_rdma_wait(ompi_win_t *win)
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait entering..."));
"ompi_osc_pt2pt_wait entering..."));
OPAL_THREAD_LOCK(&module->lock);
while (0 != module->num_complete_msgs ||
@ -476,17 +481,17 @@ ompi_osc_rdma_wait(ompi_win_t *win)
OBJ_RELEASE(group);
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait complete"));
"ompi_osc_pt2pt_wait complete"));
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_test(ompi_win_t *win,
int
ompi_osc_pt2pt_test(ompi_win_t *win,
int *flag)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
ompi_group_t *group;
int ret = OMPI_SUCCESS;
@ -500,11 +505,10 @@ ompi_osc_rdma_test(ompi_win_t *win,
OPAL_THREAD_LOCK(&(module->lock));
if (0 != module->num_complete_msgs ||
if (0 != module->num_complete_msgs ||
module->active_incoming_frag_count != module->active_incoming_frag_signal_count) {
*flag = 0;
ret = OMPI_SUCCESS;
goto cleanup;
} else {
*flag = 1;
@ -519,21 +523,21 @@ ompi_osc_rdma_test(ompi_win_t *win,
return OMPI_SUCCESS;
}
cleanup:
OPAL_THREAD_UNLOCK(&(module->lock));
return ret;
}
int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source)
int osc_pt2pt_incoming_post (ompi_osc_pt2pt_module_t *module, int source)
{
ompi_proc_t *source_proc = ompi_comm_peer_lookup (module->comm, source);
ompi_osc_pt2pt_peer_t *peer = module->peers + source;
OPAL_THREAD_LOCK(&module->lock);
/* verify that this proc is part of the current start group */
if (!module->sc_group || !group_contains_proc (module->sc_group, source_proc)) {
ompi_osc_rdma_pending_post_t *pending_post = OBJ_NEW(ompi_osc_rdma_pending_post_t);
ompi_osc_pt2pt_pending_post_t *pending_post = OBJ_NEW(ompi_osc_pt2pt_pending_post_t);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"received unexpected post message from %d. module->sc_group = %p, size = %d",
@ -547,6 +551,9 @@ int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source)
return OMPI_SUCCESS;
}
assert (!peer->eager_send_active);
peer->eager_send_active = true;
module->num_post_msgs++;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"received post message. num_post_msgs = %d", module->num_post_msgs));

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -5,19 +5,19 @@
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* reserved.
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -25,10 +25,10 @@
#include <string.h>
#include "osc_rdma.h"
#include "osc_rdma_data_move.h"
#include "osc_rdma_frag.h"
#include "osc_rdma_request.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_data_move.h"
#include "osc_pt2pt_frag.h"
#include "osc_pt2pt_request.h"
#include "opal/threads/condition.h"
#include "opal/threads/mutex.h"
@ -55,11 +55,11 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in
struct ompi_communicator_t *comm, struct ompi_info_t *info,
int flavor, int *model);
ompi_osc_rdma_component_t mca_osc_rdma_component = {
ompi_osc_pt2pt_component_t mca_osc_pt2pt_component = {
{ /* ompi_osc_base_component_t */
{ /* ompi_base_component_t */
OMPI_OSC_BASE_VERSION_3_0_0,
"rdma",
"pt2pt",
OMPI_MAJOR_VERSION, /* MCA component major version */
OMPI_MINOR_VERSION, /* MCA component minor version */
OMPI_RELEASE_VERSION, /* MCA component release version */
@ -80,51 +80,51 @@ ompi_osc_rdma_component_t mca_osc_rdma_component = {
};
ompi_osc_rdma_module_t ompi_osc_rdma_module_template = {
ompi_osc_pt2pt_module_t ompi_osc_pt2pt_module_template = {
{
NULL, /* shared_query */
ompi_osc_rdma_attach,
ompi_osc_rdma_detach,
ompi_osc_rdma_free,
ompi_osc_pt2pt_attach,
ompi_osc_pt2pt_detach,
ompi_osc_pt2pt_free,
ompi_osc_rdma_put,
ompi_osc_rdma_get,
ompi_osc_rdma_accumulate,
ompi_osc_rdma_compare_and_swap,
ompi_osc_rdma_fetch_and_op,
ompi_osc_rdma_get_accumulate,
ompi_osc_pt2pt_put,
ompi_osc_pt2pt_get,
ompi_osc_pt2pt_accumulate,
ompi_osc_pt2pt_compare_and_swap,
ompi_osc_pt2pt_fetch_and_op,
ompi_osc_pt2pt_get_accumulate,
ompi_osc_rdma_rput,
ompi_osc_rdma_rget,
ompi_osc_rdma_raccumulate,
ompi_osc_rdma_rget_accumulate,
ompi_osc_pt2pt_rput,
ompi_osc_pt2pt_rget,
ompi_osc_pt2pt_raccumulate,
ompi_osc_pt2pt_rget_accumulate,
ompi_osc_rdma_fence,
ompi_osc_pt2pt_fence,
ompi_osc_rdma_start,
ompi_osc_rdma_complete,
ompi_osc_rdma_post,
ompi_osc_rdma_wait,
ompi_osc_rdma_test,
ompi_osc_pt2pt_start,
ompi_osc_pt2pt_complete,
ompi_osc_pt2pt_post,
ompi_osc_pt2pt_wait,
ompi_osc_pt2pt_test,
ompi_osc_rdma_lock,
ompi_osc_rdma_unlock,
ompi_osc_rdma_lock_all,
ompi_osc_rdma_unlock_all,
ompi_osc_pt2pt_lock,
ompi_osc_pt2pt_unlock,
ompi_osc_pt2pt_lock_all,
ompi_osc_pt2pt_unlock_all,
ompi_osc_rdma_sync,
ompi_osc_rdma_flush,
ompi_osc_rdma_flush_all,
ompi_osc_rdma_flush_local,
ompi_osc_rdma_flush_local_all,
ompi_osc_pt2pt_sync,
ompi_osc_pt2pt_flush,
ompi_osc_pt2pt_flush_all,
ompi_osc_pt2pt_flush_local,
ompi_osc_pt2pt_flush_local_all,
ompi_osc_rdma_set_info,
ompi_osc_rdma_get_info
ompi_osc_pt2pt_set_info,
ompi_osc_pt2pt_get_info
}
};
bool ompi_osc_rdma_no_locks;
bool ompi_osc_pt2pt_no_locks;
/* look up parameters for configuring this window. The code first
looks in the info structure passed by the user, then through mca
@ -157,7 +157,7 @@ check_config_value_bool(char *key, ompi_info_t *info)
return result;
info_not_found:
param = mca_base_var_find("ompi", "osc", "rdma", key);
param = mca_base_var_find("ompi", "osc", "pt2pt", key);
if (0 > param) return false;
ret = mca_base_var_get_value(param, &flag_value, NULL, NULL);
@ -177,8 +177,8 @@ component_open(void)
static int
component_register(void)
{
ompi_osc_rdma_no_locks = false;
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version,
ompi_osc_pt2pt_no_locks = false;
(void) mca_base_component_var_register(&mca_osc_pt2pt_component.super.osc_version,
"no_locks",
"Enable optimizations available only if MPI_LOCK is "
"not used. "
@ -186,38 +186,39 @@ component_register(void)
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_osc_rdma_no_locks);
&ompi_osc_pt2pt_no_locks);
mca_osc_rdma_component.buffer_size = 8192;
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size",
mca_osc_pt2pt_component.buffer_size = 8192;
(void) mca_base_component_var_register (&mca_osc_pt2pt_component.super.osc_version, "buffer_size",
"Data transfers smaller than this limit may be coalesced before "
"being transferred (default: 8k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&mca_osc_rdma_component.buffer_size);
&mca_osc_pt2pt_component.buffer_size);
return OMPI_SUCCESS;
}
static int component_progress (void)
{
ompi_osc_rdma_pending_t *pending, *next;
int count = opal_list_get_size (&mca_osc_pt2pt_component.pending_operations);
ompi_osc_pt2pt_pending_t *pending, *next;
if (0 == opal_list_get_size (&mca_osc_rdma_component.pending_operations)) {
if (0 == count) {
return 0;
}
/* process one incoming request */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_rdma_component.pending_operations, ompi_osc_rdma_pending_t) {
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_operations_lock);
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_pt2pt_component.pending_operations, ompi_osc_pt2pt_pending_t) {
int ret;
switch (pending->header.base.type) {
case OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ:
ret = ompi_osc_rdma_process_flush (pending->module, pending->source,
case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ:
ret = ompi_osc_pt2pt_process_flush (pending->module, pending->source,
&pending->header.flush);
break;
case OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ:
ret = ompi_osc_rdma_process_unlock (pending->module, pending->source,
case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ:
ret = ompi_osc_pt2pt_process_unlock (pending->module, pending->source,
&pending->header.unlock);
break;
default:
@ -227,11 +228,11 @@ static int component_progress (void)
}
if (OMPI_SUCCESS == ret) {
opal_list_remove_item (&mca_osc_rdma_component.pending_operations, &pending->super);
opal_list_remove_item (&mca_osc_pt2pt_component.pending_operations, &pending->super);
OBJ_RELEASE(pending);
}
}
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_operations_lock);
return 1;
}
@ -242,23 +243,24 @@ component_init(bool enable_progress_threads,
{
int ret;
OBJ_CONSTRUCT(&mca_osc_rdma_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.pending_operations, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.request_gc, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.buffer_gc, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_rdma_component.modules,
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.modules,
opal_hash_table_t);
opal_hash_table_init(&mca_osc_rdma_component.modules, 2);
opal_hash_table_init(&mca_osc_pt2pt_component.modules, 2);
mca_osc_rdma_component.progress_enable = false;
mca_osc_rdma_component.module_count = 0;
mca_osc_pt2pt_component.progress_enable = false;
mca_osc_pt2pt_component.module_count = 0;
OBJ_CONSTRUCT(&mca_osc_rdma_component.frags, opal_free_list_t);
ret = opal_free_list_init(&mca_osc_rdma_component.frags,
sizeof(ompi_osc_rdma_frag_t),
OBJ_CLASS(ompi_osc_rdma_frag_t),
1, -1, 1);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.frags, ompi_free_list_t);
ret = ompi_free_list_init_new (&mca_osc_pt2pt_component.frags,
sizeof(ompi_osc_pt2pt_frag_t), 8,
OBJ_CLASS(ompi_osc_pt2pt_frag_t),
mca_osc_pt2pt_component.buffer_size +
sizeof (ompi_osc_pt2pt_frag_header_t),
8, 1, -1, 1, 0);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
"%s:%d: ompi_free_list_init failed: %d",
@ -266,10 +268,10 @@ component_init(bool enable_progress_threads,
return ret;
}
OBJ_CONSTRUCT(&mca_osc_rdma_component.requests, ompi_free_list_t);
ret = ompi_free_list_init(&mca_osc_rdma_component.requests,
sizeof(ompi_osc_rdma_request_t),
OBJ_CLASS(ompi_osc_rdma_request_t),
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.requests, ompi_free_list_t);
ret = ompi_free_list_init(&mca_osc_pt2pt_component.requests,
sizeof(ompi_osc_pt2pt_request_t),
OBJ_CLASS(ompi_osc_pt2pt_request_t),
0, -1, 32, NULL);
if (OMPI_SUCCESS != ret) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
@ -282,29 +284,28 @@ component_init(bool enable_progress_threads,
}
int
int
component_finalize(void)
{
size_t num_modules;
if (mca_osc_rdma_component.progress_enable) {
if (mca_osc_pt2pt_component.progress_enable) {
opal_progress_unregister (component_progress);
}
if (0 !=
(num_modules = opal_hash_table_get_size(&mca_osc_rdma_component.modules))) {
if (0 !=
(num_modules = opal_hash_table_get_size(&mca_osc_pt2pt_component.modules))) {
opal_output(ompi_osc_base_framework.framework_output,
"WARNING: There were %d Windows created but not freed.",
(int) num_modules);
}
OBJ_DESTRUCT(&mca_osc_rdma_component.frags);
OBJ_DESTRUCT(&mca_osc_rdma_component.modules);
OBJ_DESTRUCT(&mca_osc_rdma_component.lock);
OBJ_DESTRUCT(&mca_osc_rdma_component.requests);
OBJ_DESTRUCT(&mca_osc_rdma_component.pending_operations);
OBJ_DESTRUCT(&mca_osc_rdma_component.request_gc);
OBJ_DESTRUCT(&mca_osc_rdma_component.buffer_gc);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.frags);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.modules);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.lock);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.requests);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations_lock);
return OMPI_SUCCESS;
}
@ -326,27 +327,21 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
struct ompi_communicator_t *comm, struct ompi_info_t *info,
int flavor, int *model)
{
ompi_osc_rdma_module_t *module = NULL;
ompi_osc_pt2pt_module_t *module = NULL;
int ret;
char *name;
bool no_locks = false;
/* We don't support shared windows; that's for the sm onesided
component */
if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED;
if (check_config_value_bool("no_locks", info)) {
no_locks = true;
ompi_osc_rdma_no_locks = true;
}
/* create module structure with all fields initialized to zero */
module = (ompi_osc_rdma_module_t*)
calloc(1, sizeof(ompi_osc_rdma_module_t));
module = (ompi_osc_pt2pt_module_t*)
calloc(1, sizeof(ompi_osc_pt2pt_module_t));
if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
/* fill in the function pointer part */
memcpy(module, &ompi_osc_rdma_module_template,
memcpy(module, &ompi_osc_pt2pt_module_template,
sizeof(ompi_osc_base_module_t));
/* initialize the objects, so that always free in cleanup */
@ -354,10 +349,15 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
OBJ_CONSTRUCT(&module->cond, opal_condition_t);
OBJ_CONSTRUCT(&module->acc_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->queued_frags, opal_list_t);
OBJ_CONSTRUCT(&module->queued_frags_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->locks_pending, opal_list_t);
OBJ_CONSTRUCT(&module->locks_pending_lock, opal_mutex_t);
OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t);
OBJ_CONSTRUCT(&module->pending_acc, opal_list_t);
OBJ_CONSTRUCT(&module->pending_posts, opal_list_t);
OBJ_CONSTRUCT(&module->request_gc, opal_list_t);
OBJ_CONSTRUCT(&module->buffer_gc, opal_list_t);
OBJ_CONSTRUCT(&module->gc_lock, opal_mutex_t);
/* options */
/* FIX ME: should actually check this value... */
@ -385,14 +385,14 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
if (OMPI_SUCCESS != ret) goto cleanup;
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
"rdma component creating window with id %d",
"pt2pt component creating window with id %d",
ompi_comm_get_cid(module->comm)));
/* record my displacement unit. Always resolved at target */
module->disp_unit = disp_unit;
/* peer data */
module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_rdma_peer_t));
module->peers = calloc(ompi_comm_size(comm), sizeof(ompi_osc_pt2pt_peer_t));
if (NULL == module->peers) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
@ -405,20 +405,6 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
goto cleanup;
}
if (!no_locks) {
module->passive_incoming_frag_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
if (NULL == module->passive_incoming_frag_count) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
module->passive_incoming_frag_signal_count = calloc(ompi_comm_size(comm), sizeof(uint32_t));
if (NULL == module->passive_incoming_frag_signal_count) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* the statement below (from Brian) does not seem correct so disable active target on the
* window. if this end up being incorrect please revert this one change */
module->active_eager_send_active = false;
@ -429,43 +415,35 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
module->active_eager_send_active = true;
#endif
if (!no_locks) {
module->passive_eager_send_active = malloc(sizeof(bool) * ompi_comm_size(comm));
if (NULL == module->passive_eager_send_active) {
ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE;
goto cleanup;
}
}
/* lock data */
if (check_config_value_bool("no_locks", info)) {
win->w_flags |= OMPI_WIN_NO_LOCKS;
}
/* update component data */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
ret = opal_hash_table_set_value_uint32(&mca_osc_rdma_component.modules,
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.lock);
ret = opal_hash_table_set_value_uint32(&mca_osc_pt2pt_component.modules,
ompi_comm_get_cid(module->comm),
module);
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.lock);
if (OMPI_SUCCESS != ret) goto cleanup;
/* fill in window information */
*model = MPI_WIN_UNIFIED;
win->w_osc_module = (ompi_osc_base_module_t*) module;
asprintf(&name, "rdma window %d", ompi_comm_get_cid(module->comm));
asprintf(&name, "pt2pt window %d", ompi_comm_get_cid(module->comm));
ompi_win_set_name(win, name);
free(name);
/* sync memory - make sure all initialization completed */
opal_atomic_mb();
module->incoming_buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t));
module->incoming_buffer = malloc (mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t));
if (OPAL_UNLIKELY(NULL == module->incoming_buffer)) {
goto cleanup;
}
ret = ompi_osc_rdma_frag_start_receive (module);
ret = ompi_osc_pt2pt_frag_start_receive (module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto cleanup;
}
@ -476,30 +454,30 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
module->comm->c_coll.coll_barrier_module);
if (OMPI_SUCCESS != ret) goto cleanup;
if (!mca_osc_rdma_component.progress_enable) {
if (!mca_osc_pt2pt_component.progress_enable) {
opal_progress_register (component_progress);
mca_osc_rdma_component.progress_enable = true;
mca_osc_pt2pt_component.progress_enable = true;
}
OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
"done creating rdma window %d", ompi_comm_get_cid(module->comm)));
"done creating pt2pt window %d", ompi_comm_get_cid(module->comm)));
return OMPI_SUCCESS;
cleanup:
/* set the module so we properly cleanup */
win->w_osc_module = (ompi_osc_base_module_t*) module;
ompi_osc_rdma_free (win);
ompi_osc_pt2pt_free (win);
return ret;
}
int
ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
ompi_osc_pt2pt_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
{
ompi_osc_rdma_module_t *module =
(ompi_osc_rdma_module_t*) win->w_osc_module;
ompi_osc_pt2pt_module_t *module =
(ompi_osc_pt2pt_module_t*) win->w_osc_module;
/* enforce collectiveness... */
return module->comm->c_coll.coll_barrier(module->comm,
@ -508,7 +486,7 @@ ompi_osc_rdma_set_info(struct ompi_win_t *win, struct ompi_info_t *info)
int
ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
{
ompi_info_t *info = OBJ_NEW(ompi_info_t);
if (NULL == info) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
@ -518,4 +496,4 @@ ompi_osc_rdma_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)
return OMPI_SUCCESS;
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_t, opal_list_item_t, NULL, NULL);
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_t, opal_list_item_t, NULL, NULL);

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -4,7 +4,7 @@
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
@ -12,28 +12,28 @@
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_RDMA_DATA_MOVE_H
#define OMPI_MCA_OSC_RDMA_DATA_MOVE_H
#ifndef OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
#define OMPI_MCA_OSC_PT2PT_DATA_MOVE_H
#include "osc_rdma_header.h"
#include "osc_pt2pt_header.h"
int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_control_send(ompi_osc_pt2pt_module_t *module,
int target,
void *data,
size_t len);
/**
* ompi_osc_rdma_control_send_unbuffered:
* ompi_osc_pt2pt_control_send_unbuffered:
*
* @short Send an unbuffered control message to a peer.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] target - Target rank
* @param[in] data - Data to send
* @param[in] len - Length of data
@ -45,11 +45,11 @@ int ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module,
* from its peer). The buffer specified by data will be available
* when this call returns.
*/
int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module,
int ompi_osc_pt2pt_control_send_unbuffered (ompi_osc_pt2pt_module_t *module,
int target, void *data, size_t len);
/**
* ompi_osc_rdma_isend_w_cb:
* ompi_osc_pt2pt_isend_w_cb:
*
* @short Post a non-blocking send with a specified callback.
*
@ -66,11 +66,11 @@ int ompi_osc_rdma_control_send_unbuffered (ompi_osc_rdma_module_t *module,
* be called with the associated request. The context specified in ctx will be stored in
* the req_completion_cb_data member of the ompi_request_t for use by the callback.
*/
int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
int ompi_osc_pt2pt_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int target, int tag,
ompi_communicator_t *comm, ompi_request_complete_fn_t cb, void *ctx);
/**
* ompi_osc_rdma_irecv_w_cb:
* ompi_osc_pt2pt_irecv_w_cb:
*
* @short Post a non-blocking receive with a specified callback.
*
@ -89,49 +89,49 @@ int ompi_osc_rdma_isend_w_cb (void *ptr, int count, ompi_datatype_t *datatype, i
* request. The context specified in ctx will be stored in the req_completion_cb_data
* member of the ompi_request_t for use by the callback.
*/
int ompi_osc_rdma_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag,
int ompi_osc_pt2pt_irecv_w_cb (void *ptr, int count, ompi_datatype_t *datatype, int source, int tag,
ompi_communicator_t *comm, ompi_request_t **request_out,
ompi_request_complete_fn_t cb, void *ctx);
int ompi_osc_rdma_process_lock(ompi_osc_rdma_module_t* module,
int ompi_osc_pt2pt_process_lock(ompi_osc_pt2pt_module_t* module,
int source,
struct ompi_osc_rdma_header_lock_t* lock_header);
struct ompi_osc_pt2pt_header_lock_t* lock_header);
void ompi_osc_rdma_process_lock_ack(ompi_osc_rdma_module_t* module,
struct ompi_osc_rdma_header_lock_ack_t* lock_header);
void ompi_osc_pt2pt_process_lock_ack(ompi_osc_pt2pt_module_t* module,
struct ompi_osc_pt2pt_header_lock_ack_t* lock_header);
int ompi_osc_rdma_process_unlock(ompi_osc_rdma_module_t* module,
int ompi_osc_pt2pt_process_unlock(ompi_osc_pt2pt_module_t* module,
int source,
struct ompi_osc_rdma_header_unlock_t* lock_header);
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_flush_t *flush_header);
struct ompi_osc_pt2pt_header_unlock_t* lock_header);
int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_flush_t *flush_header);
/**
* ompi_osc_rdma_process_unlock_ack:
* ompi_osc_pt2pt_process_unlock_ack:
*
* @short Process an incoming unlock acknowledgement.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] source - Source rank
* @param[in] unlock_ack_header - Incoming unlock ack header
*/
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header);
void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header);
/**
* ompi_osc_rdma_process_flush_ack:
* ompi_osc_pt2pt_process_flush_ack:
*
* @short Process an incoming flush acknowledgement.
*
* @param[in] module - OSC RDMA module
* @param[in] module - OSC PT2PT module
* @param[in] source - Source rank
* @param[in] flush_ack_header - Incoming flush ack header
*/
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
ompi_osc_rdma_header_flush_ack_t *flush_ack_header);
void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header);
/**
* ompi_osc_rdma_frag_start_receive:
* ompi_osc_pt2pt_frag_start_receive:
*
* @short Start receiving fragments on the OSC module.
*
@ -140,6 +140,6 @@ void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source
* @long This function starts receiving eager fragments on the module. The current
* implementation uses the pml to transfer eager fragments.
*/
int ompi_osc_rdma_frag_start_receive (ompi_osc_rdma_module_t *module);
int ompi_osc_pt2pt_frag_start_receive (ompi_osc_pt2pt_module_t *module);
#endif

203
ompi/mca/osc/pt2pt/osc_pt2pt_frag.c Обычный файл
Просмотреть файл

@ -0,0 +1,203 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_frag.h"
#include "osc_pt2pt_data_move.h"
/* Object constructor: point the packing buffer at the free-list item's
 * payload memory so the frag can be reused straight from the free list. */
static void ompi_osc_pt2pt_frag_constructor (ompi_osc_pt2pt_frag_t *frag){
    frag->buffer = frag->super.ptr;
}

OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_frag_t, ompi_free_list_item_t,
                   ompi_osc_pt2pt_frag_constructor, NULL);
/* Completion callback for an outgoing fragment send.  Marks the outgoing
 * operation complete on the module, returns the frag to the component
 * free list, and defers destruction of the request to the garbage
 * collector (we cannot free a request from inside its own callback). */
static int frag_send_cb (ompi_request_t *request)
{
    ompi_osc_pt2pt_frag_t *frag =
        (ompi_osc_pt2pt_frag_t*) request->req_complete_cb_data;
    ompi_osc_pt2pt_module_t *module = frag->module;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag_send complete to %d, frag = %p, request = %p",
                         frag->target, (void *) frag, (void *) request));

    /* balances the ompi_osc_signal_outgoing() done when the send was posted */
    mark_outgoing_completion(module);
    OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.frags, &frag->super);

    /* put this request on the garbage colletion list */
    osc_pt2pt_gc_add_request (module, request);

    return OMPI_SUCCESS;
}
/* Post the non-blocking send of a (now complete) fragment to its target.
 * The send length is the number of bytes actually packed (top - buffer).
 * frag_send_cb() runs when the pml completes the send. */
static int
frag_send(ompi_osc_pt2pt_module_t *module,
          ompi_osc_pt2pt_frag_t *frag)
{
    int count;

    count = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag_send called to %d, frag = %p, count = %d",
                         frag->target, (void *) frag, count));

    /* we need to signal now that a frag is outgoing to ensure the count sent
     * with the unlock message is correct */
    /* NOTE(review): ompi_osc_pt2pt_frag_start() also signals outgoing for the
     * same frag before (possibly) calling this function — confirm the outgoing
     * count is not incremented twice on the immediate-send path. */
    ompi_osc_signal_outgoing (module, frag->target, 1);

    return ompi_osc_pt2pt_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_PT2PT_FRAG_TAG,
                                      module->comm, frag_send_cb, frag);
}
/* Start (send or queue) a completed fragment.  If eager sends are not yet
 * active for the peer and we are not in an all-ranks access epoch, the frag
 * is queued on module->queued_frags to be flushed later; otherwise it is
 * sent immediately via frag_send(). */
int
ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module,
                          ompi_osc_pt2pt_frag_t *frag)
{
    ompi_osc_pt2pt_peer_t *peer = module->peers + frag->target;
    int ret;

    /* the frag must be fully written (pending == 0) and must already have
     * been detached from the peer's active_frag slot */
    assert(0 == frag->pending && peer->active_frag != frag);

    /* we need to signal now that a frag is outgoing to ensure the count sent
     * with the unlock message is correct */
    /* NOTE(review): frag_send() signals outgoing for this frag again — verify
     * this is intentional and not a double count on the immediate-send path. */
    ompi_osc_signal_outgoing (module, frag->target, 1);

    /* if eager sends are not active, can't send yet, so buffer and
       get out... */
    if (!(peer->eager_send_active || module->all_access_epoch)) {
        OPAL_THREAD_SCOPED_LOCK(&module->queued_frags_lock,
                                opal_list_append(&module->queued_frags, (opal_list_item_t *) frag));
        return OMPI_SUCCESS;
    }

    ret = frag_send(module, frag);

    /* wake any thread waiting in opal_condition_wait on module progress */
    opal_condition_broadcast(&module->cond);

    return ret;
}
/**
 * ompi_osc_pt2pt_frag_flush_target:
 *
 * @short Flush the active and all queued fragments destined for one target.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] target - Target rank whose fragments should be sent
 *
 * @returns OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC if an operation is
 *          still writing into the active fragment (an RMA usage error), or
 *          the error returned by the failing send.
 */
int
ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target)
{
    ompi_osc_pt2pt_frag_t *next, *frag = module->peers[target].active_frag;
    int ret = OMPI_SUCCESS;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush target begin"));

    /* flush the active frag */
    if (NULL != frag) {
        if (1 != frag->pending) {
            /* communication going on while synchronizing; this is an rma usage bug */
            return OMPI_ERR_RMA_SYNC;
        }

        /* detach the frag from the peer before starting it; use the pointer
         * variant of cmpset for consistency with frag_flush_all (active_frag
         * is a pointer, not a 32-bit integer) */
        if (opal_atomic_cmpset_ptr (&module->peers[target].active_frag, frag, NULL)) {
            OPAL_THREAD_ADD32(&frag->pending, -1);
            ret = ompi_osc_pt2pt_frag_start(module, frag);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush target finished active frag"));

    /* walk through the pending list and send */
    OPAL_THREAD_LOCK(&module->queued_frags_lock);
    if (opal_list_get_size (&module->queued_frags)) {
        OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
            if (frag->target == target) {
                opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
                ret = frag_send(module, frag);
                /* bug fix: was "OMPI_SUCCESS != frag", which compared the frag
                 * pointer (always non-NULL here) instead of the send result —
                 * the loop aborted after the first queued frag and real send
                 * errors were silently dropped */
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    break;
                }
            }
        }
    }
    OPAL_THREAD_UNLOCK(&module->queued_frags_lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush target finished"));

    return ret;
}
/* Flush every peer's active fragment and then all queued fragments.
 * Returns OMPI_ERR_RMA_SYNC if any active frag still has writers pending
 * (communication during synchronization is an RMA usage error). */
int
ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module)
{
    int ret = OMPI_SUCCESS;
    int i;
    ompi_osc_pt2pt_frag_t *frag, *next;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush all begin"));

    /* flush the active frag */
    for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
        /* note: this inner declaration shadows the outer `frag`, which is
         * only used by the queued-frags loop below */
        ompi_osc_pt2pt_frag_t *frag = module->peers[i].active_frag;

        if (NULL != frag) {
            if (1 != frag->pending) {
                /* NOTE(review): this unlock has no matching lock in this
                 * function — presumably the caller holds module->lock on
                 * entry; confirm against the call sites. */
                OPAL_THREAD_UNLOCK(&module->lock);
                /* communication going on while synchronizing; this is a bug */
                return OMPI_ERR_RMA_SYNC;
            }

            /* another thread may race to detach the same frag; if we lose
             * the cmpset the winner will start it */
            if (!opal_atomic_cmpset_ptr (&module->peers[i].active_frag, frag, NULL)) {
                continue;
            }

            OPAL_THREAD_ADD32(&frag->pending, -1);
            ret = ompi_osc_pt2pt_frag_start(module, frag);
            if (OMPI_SUCCESS != ret) {
                return ret;
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush all finished active frag"));

    /* try to start all the queued frags */
    OPAL_THREAD_LOCK(&module->queued_frags_lock);
    if (opal_list_get_size (&module->queued_frags)) {
        OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_pt2pt_frag_t) {
            opal_list_remove_item(&module->queued_frags, (opal_list_item_t *) frag);
            ret = frag_send(module, frag);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }
        }
    }
    OPAL_THREAD_UNLOCK(&module->queued_frags_lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: frag flush all done"));

    return ret;
}

143
ompi/mca/osc/pt2pt/osc_pt2pt_frag.h Обычный файл
Просмотреть файл

@ -0,0 +1,143 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OSC_PT2PT_FRAG_H
#define OSC_PT2PT_FRAG_H
#include "ompi/communicator/communicator.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_request.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
/** Communication buffer for packing messages */
struct ompi_osc_pt2pt_frag_t {
    /* frags are allocated from the component's free list */
    ompi_free_list_item_t super;

    /* target rank of buffer */
    int target;
    /* packing area; points at super.ptr (set in the object constructor) */
    unsigned char *buffer;

    /* space remaining in buffer */
    size_t remain_len;

    /* start of unused space */
    char *top;

    /* Number of operations which have started writing into the frag, but not yet completed doing so */
    int32_t pending;
    /* header at the front of buffer; num_ops is updated as ops are packed */
    ompi_osc_pt2pt_frag_header_t *header;
    /* owning module, needed by the send-completion callback */
    ompi_osc_pt2pt_module_t *module;
};
typedef struct ompi_osc_pt2pt_frag_t ompi_osc_pt2pt_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_frag_t);

/* send or queue a completed frag (see osc_pt2pt_frag.c) */
extern int ompi_osc_pt2pt_frag_start(ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_frag_t *buffer);
/* flush active + queued frags for one target */
extern int ompi_osc_pt2pt_frag_flush_target(ompi_osc_pt2pt_module_t *module, int target);
/* flush active + queued frags for all targets */
extern int ompi_osc_pt2pt_frag_flush_all(ompi_osc_pt2pt_module_t *module);
/*
* Note: module lock must be held during this operation
*/
/**
 * ompi_osc_pt2pt_frag_alloc:
 *
 * @short Reserve request_len bytes in the target's active fragment,
 *        allocating and initializing a new fragment if the current one
 *        is missing or too full.
 *
 * @param[in]  module      - OSC PT2PT module
 * @param[in]  target      - Target rank the data will be sent to
 * @param[in]  request_len - Bytes requested (rounded up to 8-byte alignment)
 * @param[out] buffer      - Fragment the space was reserved in
 * @param[out] ptr         - Start of the reserved region
 *
 * @returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the request can never
 *          fit or no frag is available, or OMPI_ERR_TEMP_OUT_OF_RESOURCE if
 *          a freshly allocated frag is still too small.
 */
static inline int ompi_osc_pt2pt_frag_alloc(ompi_osc_pt2pt_module_t *module, int target,
                                            size_t request_len, ompi_osc_pt2pt_frag_t **buffer,
                                            char **ptr)
{
    ompi_osc_pt2pt_frag_t *curr = module->peers[target].active_frag;
    int ret;

    /* osc pt2pt headers can have 64-bit values. these will need to be aligned
     * on an 8-byte boundary on some architectures so we up align the allocation
     * size here. */
    request_len = OPAL_ALIGN(request_len, 8, size_t);

    if (request_len > mca_osc_pt2pt_component.buffer_size) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    OPAL_THREAD_LOCK(&module->lock);
    if (NULL == curr || curr->remain_len < request_len) {
        ompi_free_list_item_t *item = NULL;

        if (NULL != curr) {
            /* retire the current frag: no more space will be handed out */
            curr->remain_len = 0;
            module->peers[target].active_frag = NULL;
            opal_atomic_mb ();

            /* If there's something pending, the pending finish will
               start the buffer.  Otherwise, we need to start it now. */
            if (0 == OPAL_THREAD_ADD32(&curr->pending, -1)) {
                ret = ompi_osc_pt2pt_frag_start(module, curr);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    /* bug fix: must drop module->lock on this error path
                     * (it was leaked before, deadlocking the next caller) */
                    OPAL_THREAD_UNLOCK(&module->lock);
                    return ret;
                }
            }
        }

        OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.frags, item);
        if (OPAL_UNLIKELY(NULL == item)) {
            /* bug fix: must drop module->lock on this error path as well */
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* initialize the new active frag and its wire header */
        curr = module->peers[target].active_frag =
            (ompi_osc_pt2pt_frag_t*) item;

        curr->target = target;

        curr->header = (ompi_osc_pt2pt_frag_header_t*) curr->buffer;
        curr->top = (char*) (curr->header + 1);
        curr->remain_len = mca_osc_pt2pt_component.buffer_size;
        curr->module = module;
        curr->pending = 1;

        curr->header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_FRAG;
        curr->header->base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
        if (module->passive_target_access_epoch) {
            curr->header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
        }
        curr->header->source = ompi_comm_rank(module->comm);
        curr->header->num_ops = 0;
        curr->header->windx = ompi_comm_get_cid(module->comm);

        if (curr->remain_len < request_len) {
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
    }

    /* hand out the reserved region and account for it */
    *ptr = curr->top;
    *buffer = curr;

    curr->top += request_len;
    curr->remain_len -= request_len;
    OPAL_THREAD_UNLOCK(&module->lock);

    /* register this operation as a writer on the frag; the matching
     * frag_finish decrements pending and may start the send */
    OPAL_THREAD_ADD32(&curr->pending, 1);
    OPAL_THREAD_ADD32(&curr->header->num_ops, 1);

    return OMPI_SUCCESS;
}
/*
 * Note: module lock must be held for this operation
 *
 * Release this operation's hold on the fragment.  The last writer out
 * (pending reaches zero) is responsible for starting the send.
 */
static inline int ompi_osc_pt2pt_frag_finish(ompi_osc_pt2pt_module_t *module,
                                             ompi_osc_pt2pt_frag_t* buffer)
{
    const int32_t writers_left = OPAL_THREAD_ADD32(&buffer->pending, -1);

    return (0 == writers_left) ? ompi_osc_pt2pt_frag_start(module, buffer) : OMPI_SUCCESS;
}
#endif

189
ompi/mca/osc/pt2pt/osc_pt2pt_header.h Обычный файл
Просмотреть файл

@ -0,0 +1,189 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_PT2PT_HDR_H
#define OMPI_MCA_OSC_PT2PT_HDR_H
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
/* Wire-protocol message types carried in ompi_osc_pt2pt_header_base_t.type.
 * The *_LONG variants indicate the payload is sent separately rather than
 * packed inline — presumably for data larger than the eager limit; confirm
 * against the data-move code. */
enum ompi_osc_pt2pt_hdr_type_t {
    OMPI_OSC_PT2PT_HDR_TYPE_PUT          = 0x01,
    OMPI_OSC_PT2PT_HDR_TYPE_PUT_LONG     = 0x02,
    OMPI_OSC_PT2PT_HDR_TYPE_ACC          = 0x03,
    OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG     = 0x04,
    OMPI_OSC_PT2PT_HDR_TYPE_GET          = 0x05,
    OMPI_OSC_PT2PT_HDR_TYPE_CSWAP        = 0x06,
    OMPI_OSC_PT2PT_HDR_TYPE_CSWAP_LONG   = 0x07,
    OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC      = 0x08,
    OMPI_OSC_PT2PT_HDR_TYPE_GET_ACC_LONG = 0x09,
    OMPI_OSC_PT2PT_HDR_TYPE_COMPLETE     = 0x10,
    OMPI_OSC_PT2PT_HDR_TYPE_POST         = 0x11,
    OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ     = 0x12,
    OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK     = 0x13,
    OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ   = 0x14,
    OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK   = 0x15,
    OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ    = 0x16,
    OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK    = 0x17,
    OMPI_OSC_PT2PT_HDR_TYPE_FRAG         = 0x20,
};
typedef enum ompi_osc_pt2pt_hdr_type_t ompi_osc_pt2pt_hdr_type_t;

/* header flag bits (ompi_osc_pt2pt_header_base_t.flags) */
#define OMPI_OSC_PT2PT_HDR_FLAG_NBO            0x01  /* header is in network byte order */
#define OMPI_OSC_PT2PT_HDR_FLAG_VALID          0x02  /* header has been fully written */
#define OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET 0x04  /* sent within a passive-target epoch */
#define OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE 0x08  /* datatype description too large to inline */

/* Common prefix of every osc pt2pt protocol header. */
struct ompi_osc_pt2pt_header_base_t {
    /** fragment type. 8 bits */
    uint8_t type;

    /** fragment flags. 8 bits */
    uint8_t flags;
};
typedef struct ompi_osc_pt2pt_header_base_t ompi_osc_pt2pt_header_base_t;
/* Header for a put operation.  displacement is the offset into the target
 * window — presumably scaled by the window's disp_unit; confirm in the
 * data-move code. */
struct ompi_osc_pt2pt_header_put_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t tag;           /* tag used for any separate (long) data transfer */
    uint32_t count;         /* element count */
    uint64_t len;           /* payload length in bytes */
    uint64_t displacement;  /* displacement into the target window */
};
typedef struct ompi_osc_pt2pt_header_put_t ompi_osc_pt2pt_header_put_t;

/* Header for an accumulate operation; op identifies the reduction. */
struct ompi_osc_pt2pt_header_acc_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t tag;
    uint32_t count;
    uint32_t op;            /* MPI reduction op identifier */
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_acc_t ompi_osc_pt2pt_header_acc_t;

/* Header for a get operation. */
struct ompi_osc_pt2pt_header_get_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t tag;
    uint32_t count;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_get_t ompi_osc_pt2pt_header_get_t;

/* Header for a PSCW "complete" message; frag_count tells the target how
 * many fragments to expect before the epoch is complete. */
struct ompi_osc_pt2pt_header_complete_t {
    ompi_osc_pt2pt_header_base_t base;
    int frag_count;
};
typedef struct ompi_osc_pt2pt_header_complete_t ompi_osc_pt2pt_header_complete_t;

/* Header for a compare-and-swap operation. */
struct ompi_osc_pt2pt_header_cswap_t {
    ompi_osc_pt2pt_header_base_t base;

    uint16_t tag;

    uint32_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_pt2pt_header_cswap_t ompi_osc_pt2pt_header_cswap_t;

/* Header for a PSCW "post" message; windx identifies the target window. */
struct ompi_osc_pt2pt_header_post_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t windx;
};
typedef struct ompi_osc_pt2pt_header_post_t ompi_osc_pt2pt_header_post_t;
/* Lock request.  lock_ptr carries the origin's outstanding-lock pointer as a
 * 64-bit cookie so the ack can be matched back to the request (see
 * osc_pt2pt_passive_target.c). */
struct ompi_osc_pt2pt_header_lock_t {
    ompi_osc_pt2pt_header_base_t base;
    int32_t lock_type;      /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
    uint64_t lock_ptr;      /* origin-side lock cookie, echoed in the ack */
};
typedef struct ompi_osc_pt2pt_header_lock_t ompi_osc_pt2pt_header_lock_t;

/* Lock acknowledgement sent back to the origin. */
struct ompi_osc_pt2pt_header_lock_ack_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t windx;         /* cid of the communicator backing the window */
    uint32_t source;        /* rank of the acking process */
    uint64_t lock_ptr;      /* echo of the request's lock cookie */
};
typedef struct ompi_osc_pt2pt_header_lock_ack_t ompi_osc_pt2pt_header_lock_ack_t;

/* Unlock request; frag_count tells the target how many fragments were sent
 * during the epoch so it knows when remote completion is reached. */
struct ompi_osc_pt2pt_header_unlock_t {
    ompi_osc_pt2pt_header_base_t base;
    int32_t lock_type;
    uint32_t frag_count;
    uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_unlock_t ompi_osc_pt2pt_header_unlock_t;

/* Unlock acknowledgement (remote completion of the epoch's fragments). */
struct ompi_osc_pt2pt_header_unlock_ack_t {
    ompi_osc_pt2pt_header_base_t base;
    uint64_t lock_ptr;
};
typedef struct ompi_osc_pt2pt_header_unlock_ack_t ompi_osc_pt2pt_header_unlock_ack_t;

/* Flush request; serial_number matches the ack to the outstanding flush. */
struct ompi_osc_pt2pt_header_flush_t {
    ompi_osc_pt2pt_header_base_t base;
    uint32_t frag_count;
    uint64_t serial_number;
};
typedef struct ompi_osc_pt2pt_header_flush_t ompi_osc_pt2pt_header_flush_t;

/* Flush acknowledgement. */
struct ompi_osc_pt2pt_header_flush_ack_t {
    ompi_osc_pt2pt_header_base_t base;
    uint64_t serial_number;
};
typedef struct ompi_osc_pt2pt_header_flush_ack_t ompi_osc_pt2pt_header_flush_ack_t;
/* Header at the front of every eager fragment buffer (see
 * ompi_osc_pt2pt_frag_alloc, which fills these fields in). */
struct ompi_osc_pt2pt_frag_header_t {
    ompi_osc_pt2pt_header_base_t base;
    uint16_t windx;     /* cid of communicator backing window (our window id) */
    uint32_t source;    /* rank in window of source process */
    int32_t num_ops;    /* number of operations in this buffer */
    uint32_t pad;       /* ensure the fragment header is a multiple of 8 bytes */
};
typedef struct ompi_osc_pt2pt_frag_header_t ompi_osc_pt2pt_frag_header_t;

/* Discriminated view over all protocol headers; base.type selects which
 * member is valid. */
union ompi_osc_pt2pt_header_t {
    ompi_osc_pt2pt_header_base_t       base;
    ompi_osc_pt2pt_header_put_t        put;
    ompi_osc_pt2pt_header_acc_t        acc;
    ompi_osc_pt2pt_header_get_t        get;
    ompi_osc_pt2pt_header_complete_t   complete;
    ompi_osc_pt2pt_header_cswap_t      cswap;
    ompi_osc_pt2pt_header_post_t       post;
    ompi_osc_pt2pt_header_lock_t       lock;
    ompi_osc_pt2pt_header_lock_ack_t   lock_ack;
    ompi_osc_pt2pt_header_unlock_t     unlock;
    ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
    ompi_osc_pt2pt_header_flush_t      flush;
    ompi_osc_pt2pt_header_flush_ack_t  flush_ack;
    ompi_osc_pt2pt_frag_header_t       frag;
};
typedef union ompi_osc_pt2pt_header_t ompi_osc_pt2pt_header_t;
#endif /* OMPI_MCA_OSC_PT2PT_HDR_H */

Просмотреть файл

@ -4,23 +4,23 @@
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_pt2pt.h"
#include "opal/threads/mutex.h"
#include "opal/mca/btl/btl.h"
@ -31,25 +31,24 @@
int
ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len)
ompi_osc_pt2pt_attach(struct ompi_win_t *win, void *base, size_t len)
{
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_detach(struct ompi_win_t *win, void *base)
ompi_osc_pt2pt_detach(struct ompi_win_t *win, void *base)
{
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_free(ompi_win_t *win)
ompi_osc_pt2pt_free(ompi_win_t *win)
{
int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win);
opal_list_item_t *item;
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
if (NULL == module) {
return OMPI_SUCCESS;
@ -57,7 +56,7 @@ ompi_osc_rdma_free(ompi_win_t *win)
if (NULL != module->comm) {
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
"rdma component destroying window with id %d",
"pt2pt component destroying window with id %d",
ompi_comm_get_cid(module->comm));
/* finish with a barrier */
@ -67,43 +66,38 @@ ompi_osc_rdma_free(ompi_win_t *win)
}
/* remove from component information */
OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
opal_hash_table_remove_value_uint32(&mca_osc_rdma_component.modules,
ompi_comm_get_cid(module->comm));
OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);
OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
ompi_comm_get_cid(module->comm)));
}
win->w_osc_module = NULL;
OBJ_DESTRUCT(&module->outstanding_locks);
OBJ_DESTRUCT(&module->locks_pending);
OBJ_DESTRUCT(&module->locks_pending_lock);
OBJ_DESTRUCT(&module->acc_lock);
OBJ_DESTRUCT(&module->cond);
OBJ_DESTRUCT(&module->lock);
/* it is erroneous to close a window with active operations on it so we should
* probably produce an error here instead of cleaning up */
while (NULL != (item = opal_list_remove_first (&module->pending_acc))) {
OBJ_RELEASE(item);
}
OPAL_LIST_DESTRUCT(&module->pending_acc);
OPAL_LIST_DESTRUCT(&module->pending_posts);
OPAL_LIST_DESTRUCT(&module->queued_frags);
OBJ_DESTRUCT(&module->queued_frags_lock);
OBJ_DESTRUCT(&module->pending_acc);
while (NULL != (item = opal_list_remove_first (&module->pending_posts))) {
OBJ_RELEASE(item);
}
OBJ_DESTRUCT(&module->pending_posts);
osc_rdma_gc_clean ();
osc_pt2pt_gc_clean (module);
OPAL_LIST_DESTRUCT(&module->request_gc);
OPAL_LIST_DESTRUCT(&module->buffer_gc);
OBJ_DESTRUCT(&module->gc_lock);
if (NULL != module->peers) {
free(module->peers);
}
if (NULL != module->passive_eager_send_active) free(module->passive_eager_send_active);
if (NULL != module->passive_incoming_frag_count) free(module->passive_incoming_frag_count);
if (NULL != module->passive_incoming_frag_signal_count) free(module->passive_incoming_frag_signal_count);
if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);
if (NULL != module->frag_request) {
module->frag_request->req_complete_cb = NULL;
ompi_request_cancel (module->frag_request);

925
ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c Обычный файл
Просмотреть файл

@ -0,0 +1,925 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_header.h"
#include "osc_pt2pt_data_move.h"
#include "osc_pt2pt_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/include/opal_stdint.h"
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module, int source, int lock_type,
uint64_t serial_number);
/* target-side tracking of a lock request */
/* target-side tracking of a lock request */
struct ompi_osc_pt2pt_pending_lock_t {
    opal_list_item_t super;
    int peer;           /* rank that requested the lock */
    int lock_type;      /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
    uint64_t lock_ptr;  /* origin's lock cookie, echoed back in the ack */
};
typedef struct ompi_osc_pt2pt_pending_lock_t ompi_osc_pt2pt_pending_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_lock_t, opal_list_item_t,
                   NULL, NULL);
/* origin-side tracking of a lock request */
/* origin-side tracking of a lock request */
struct ompi_osc_pt2pt_outstanding_lock_t {
    opal_list_item_t super;
    int target;                     /* locked rank, or -1 for lock-all */
    int assert;                     /* MPI assert flags (e.g. MPI_MODE_NOCHECK) */
    bool flushing;                  /* a flush is in progress on this lock */
    int32_t lock_acks_received;     /* lock acks seen so far */
    int32_t unlock_acks_received;   /* unlock acks seen so far */
    int32_t flush_acks_received;    /* flush acks seen so far */
    uint64_t serial_number;         /* matches flush acks to this lock */
    int32_t type;                   /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
};
typedef struct ompi_osc_pt2pt_outstanding_lock_t ompi_osc_pt2pt_outstanding_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_outstanding_lock_t, opal_list_item_t,
                   NULL, NULL);
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module);
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor, int lock_type, uint64_t lock_ptr);
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
int target);
/**
* Find the first outstanding lock to a target.
*
* @param[in] module - OSC PT2PT module
* @param[in] target - Target rank
*
* @returns an outstanding lock on success
*
* This function traverses the outstanding_locks list in the module
* looking for a lock that matches target. The caller must hold the
* module lock.
*/
/**
 * Find the first outstanding lock to a target (single-threaded variant).
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] target - Target rank
 *
 * @returns the matching lock, or NULL if none exists
 *
 * Scans module->outstanding_locks for a lock whose target matches.
 * The caller must hold the module lock.
 */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_st (ompi_osc_pt2pt_module_t *module, int target)
{
    ompi_osc_pt2pt_outstanding_lock_t *item;

    OPAL_LIST_FOREACH(item, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
        if (target == item->target) {
            return item;
        }
    }

    return NULL;
}
/* Thread-safe wrapper around find_outstanding_lock_st(): takes the module
 * lock around the list scan.  Note the returned pointer is used after the
 * lock is dropped by callers — safe only while the lock item cannot be
 * concurrently freed. */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock (ompi_osc_pt2pt_module_t *module, int target)
{
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    OPAL_THREAD_LOCK(&module->lock);
    lock = find_outstanding_lock_st (module, target);
    OPAL_THREAD_UNLOCK(&module->lock);

    return lock;
}
/* Look up an outstanding lock by its flush serial number (thread-safe). */
static inline ompi_osc_pt2pt_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_pt2pt_module_t *module, uint64_t serial_number)
{
    ompi_osc_pt2pt_outstanding_lock_t *item, *match = NULL;

    /* hold the module lock while walking the list */
    OPAL_THREAD_LOCK(&module->lock);
    OPAL_LIST_FOREACH(item, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
        if (serial_number == item->serial_number) {
            match = item;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return match;
}
/* Acquire the passive-target lock on our own rank.  If the lock cannot be
 * acquired immediately it is queued, and — because a local lock must be
 * acquired before MPI_Win_lock returns per the standard — we block on the
 * condition variable until the ack arrives. */
static inline int ompi_osc_pt2pt_lock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    const int my_rank = ompi_comm_rank (module->comm);
    bool acquired = false;

    /* the lock's own address doubles as the 64-bit lock cookie */
    acquired = ompi_osc_pt2pt_lock_try_acquire (module, my_rank, lock->type, (uint64_t) (uintptr_t) lock);
    if (!acquired) {
        /* queue the lock */
        queue_lock (module, my_rank, lock->type, (uint64_t) (uintptr_t) lock);

        /* If locking local, can't be non-blocking according to the
           standard. We need to wait for the ack here. */
        OPAL_THREAD_LOCK(&module->lock);
        while (0 == lock->lock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "local lock aquired"));

    return OMPI_SUCCESS;
}
/* Release the passive-target lock held on our own rank and hand the lock to
 * the next queued requestor if this was the last holder.
 * NOTE(review): lock_status appears to encode exclusive locks as a negative
 * value (+1 restores it) and shared locks as a positive holder count (-1 per
 * release, next lock activated at zero) — confirm against
 * ompi_osc_pt2pt_lock_try_acquire. */
static inline void ompi_osc_pt2pt_unlock_self (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_unlock_self: unlocking myself. lock state = %d", module->lock_status));

    if (MPI_LOCK_EXCLUSIVE == lock->type) {
        OPAL_THREAD_ADD32(&module->lock_status, 1);
        ompi_osc_activate_next_lock (module);
    } else if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
        ompi_osc_activate_next_lock (module);
    }

    /* need to ensure we make progress */
    opal_progress();

    /* a self-unlock completes immediately; count its ack now */
    OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1);
}
/* Send a lock request to a remote target.  The lock's address is sent as
 * the cookie so the target's ack can be matched back to this lock.  The
 * request frag is flushed immediately so eager sending can begin as soon
 * as the ack arrives. */
static inline int ompi_osc_pt2pt_lock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    ompi_osc_pt2pt_header_lock_t lock_req;
    int ret;

    /* generate a lock request */
    lock_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_REQ;
    lock_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    lock_req.lock_type = lock->type;
    lock_req.lock_ptr = (uint64_t) (uintptr_t) lock;

    ret = ompi_osc_pt2pt_control_send (module, target, &lock_req, sizeof (lock_req));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* make sure the request gets sent, so we can start eager sending... */
    ret = ompi_osc_pt2pt_frag_flush_target (module, target);

    return ret;
}
/* Send an unlock request to a remote target.  The epoch's outgoing frag
 * count for the target is atomically fetched and reset (to -1 — presumably
 * a sentinel meaning "epoch closed"; confirm against the frag-count users)
 * and shipped with the request so the target knows when it has received
 * everything from this epoch. */
static inline int ompi_osc_pt2pt_unlock_remote (ompi_osc_pt2pt_module_t *module, int target, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    ompi_osc_pt2pt_header_unlock_t unlock_req;
    int32_t frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);

    unlock_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ;
    unlock_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    unlock_req.frag_count = frag_count;
    unlock_req.lock_type = lock->type;
    unlock_req.lock_ptr = (uint64_t) (uintptr_t) lock;

    /* send control message with unlock request and count */
    return ompi_osc_pt2pt_control_send (module, target, &unlock_req, sizeof (unlock_req));
}
/* Issue the actual lock messages for an outstanding lock.  target == -1
 * means lock-all: the local lock is taken first, then a remote lock request
 * goes to every other rank.  With MPI_MODE_NOCHECK no messages are sent at
 * all — the acks are simply counted as already received. */
static int ompi_osc_pt2pt_lock_internal_execute (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock)
{
    int my_rank = ompi_comm_rank (module->comm);
    int target = lock->target;
    int assert = lock->assert;
    int ret;

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        /* for lock-all (-1) this first branch takes the local lock */
        if (my_rank != target && target != -1) {
            ret = ompi_osc_pt2pt_lock_remote (module, target, lock);
        } else {
            ret = ompi_osc_pt2pt_lock_self (module, lock);
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* return */
            return ret;
        }

        if (-1 == target) {
            /* lock-all: request the lock from every other rank */
            for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
                if (my_rank == i) {
                    continue;
                }

                ret = ompi_osc_pt2pt_lock_remote (module, i, lock);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    return ret;
                }
            }
        }
    } else {
        /* NOCHECK: the user guarantees no conflicting locks, so pretend all
         * acks have already arrived */
        if (-1 == target) {
            lock->lock_acks_received = ompi_comm_size(module->comm);
        } else {
            lock->lock_acks_received = 1;
        }
    }

    return OMPI_SUCCESS;
}
/* Common implementation behind MPI_Win_lock / MPI_Win_lock_all
 * (target == -1 means lock-all).  Creates and registers the outstanding
 * lock, marks the passive-target access epoch, then issues the lock
 * messages via ompi_osc_pt2pt_lock_internal_execute(). */
static int ompi_osc_pt2pt_lock_internal (int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock;
    ompi_osc_pt2pt_peer_t *peer = NULL;
    int ret = OMPI_SUCCESS;

    if (-1 != target) {
        peer = module->peers + target;
    }

    /* Check if no_locks is set. TODO: we also need to track whether we are in an
     * active target epoch. Fence can make this tricky to track. */
    if (module->sc_group) {
        /* already in a PSCW access epoch: locking is an RMA usage error */
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: lock %d %d", target, lock_type));

    /* create lock item */
    lock = OBJ_NEW(ompi_osc_pt2pt_outstanding_lock_t);
    if (OPAL_UNLIKELY(NULL == lock)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    lock->target = target;
    lock->lock_acks_received = 0;
    lock->unlock_acks_received = 0;
    /* serial number identifies this lock for later flush matching */
    lock->serial_number = OPAL_THREAD_ADD64((int64_t *) &module->lock_serial_number, 1);
    lock->type = lock_type;
    lock->assert = assert;

    /* delay all eager sends until we've heard back.. */
    OPAL_THREAD_LOCK(&module->lock);

    /* check for conflicting lock */
    if (find_outstanding_lock_st (module, target)) {
        /* locking the same target twice without unlocking is erroneous */
        OBJ_RELEASE(lock);
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_CONFLICT;
    }

    /* when the lock ack returns we will be in an access epoch with this peer/all peers (target = -1) */
    if (-1 == target) {
        module->all_access_epoch = true;
    } else {
        peer->access_epoch = true;
    }
    module->passive_target_access_epoch = true;

    opal_list_append(&module->outstanding_locks, &lock->super);
    OPAL_THREAD_UNLOCK(&module->lock);

    ret = ompi_osc_pt2pt_lock_internal_execute (module, lock);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* roll back the registration on failure */
        OPAL_THREAD_SCOPED_LOCK(&module->lock,
                                opal_list_remove_item(&module->outstanding_locks, &lock->super));
        OBJ_RELEASE(lock);
    }

    return ret;
}
/*
 * Internal implementation of MPI_Win_unlock (target >= 0) and
 * MPI_Win_unlock_all (target == -1).
 *
 * Waits for the expected lock ack(s), sends unlock messages to the remote
 * peer(s), flushes any queued fragments, then waits for the unlock acks
 * (which signal remote completion) before clearing the access-epoch state
 * and releasing the lock object.
 */
static int ompi_osc_pt2pt_unlock_internal (int target, ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock = NULL;
    int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_pt2pt_peer_t *peer = NULL;
    int lock_acks_expected;
    int ret = OMPI_SUCCESS;

    /* unlock_all expects one ack per communicator member (self included) */
    if (-1 != target) {
        lock_acks_expected = 1;
        peer = module->peers + target;
    } else {
        lock_acks_expected = ompi_comm_size (module->comm);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_unlock_internal: unlocking target %d", target));

    OPAL_THREAD_LOCK(&module->lock);
    lock = find_outstanding_lock_st (module, target);
    if (OPAL_UNLIKELY(NULL == lock)) {
        /* no matching outstanding lock: caller is not in a lock epoch with
         * this target, which is an RMA synchronization error */
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    opal_list_remove_item (&module->outstanding_locks, &lock->super);

    /* wait until ack has arrived from target.
     * NOTE(review): with MPI_MODE_NOCHECK no lock messages appear to be
     * sent, so this wait presumably depends on lock_acks_received being
     * set elsewhere (ompi_osc_pt2pt_lock_internal_execute) — confirm. */
    while (lock->lock_acks_received != lock_acks_expected) {
        opal_condition_wait(&module->cond, &module->lock);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    if (lock->assert & MPI_MODE_NOCHECK) {
        /* no lock was actually taken; flush instead */
        ompi_osc_pt2pt_flush_lock (module, lock, target);
    } else if (my_rank != target) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc pt2pt: unlock %d, lock_acks_received = %d", target,
                             lock->lock_acks_received));

        if (-1 == target) {
            /* send unlock messages to all of my peers */
            for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
                if (my_rank == i) {
                    continue;
                }

                ret = ompi_osc_pt2pt_unlock_remote (module, i, lock);
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    return ret;
                }
            }
        } else {
            ret = ompi_osc_pt2pt_unlock_remote (module, target, lock);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        }

        /* start all sendreqs to target */
        if (-1 == target) {
            ret = ompi_osc_pt2pt_frag_flush_all (module);
        } else {
            ret = ompi_osc_pt2pt_frag_flush_target(module, target);
        }
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }

        /* wait for unlock acks. this signals remote completion of fragments */
        OPAL_THREAD_LOCK(&module->lock);
        while (lock->unlock_acks_received != lock_acks_expected) {
            opal_condition_wait(&module->cond, &module->lock);
        }
        OPAL_THREAD_UNLOCK(&module->lock);

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_unlock: unlock of %d complete", target));
    }

    /* self-unlock (also covers the self member of unlock_all) */
    if ((target == my_rank || target == -1) && !(lock->assert & MPI_MODE_NOCHECK)) {
        ompi_osc_pt2pt_unlock_self (module, lock);
    }

    /* epoch over: clear the passive-target state under the module lock */
    OPAL_THREAD_LOCK(&module->lock);
    if (-1 != target) {
        peer->access_epoch = false;
        module->passive_target_access_epoch = false;
    } else {
        module->passive_target_access_epoch = false;
        module->all_access_epoch = false;
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    OBJ_RELEASE(lock);

    return ret;
}
/* Implementation of MPI_Win_lock: lock a single target rank. */
int ompi_osc_pt2pt_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
    /* NOTE: the parameter named "assert" (MPI assertion flags) does not
     * interfere with the assert() macro invocation below — the macro still
     * expands normally since macro expansion ignores parameter names. */
    assert(target >= 0);

    return ompi_osc_pt2pt_lock_internal (lock_type, target, assert, win);
}
/* Implementation of MPI_Win_unlock: release the lock held on @target. */
int ompi_osc_pt2pt_unlock (int target, struct ompi_win_t *win)
{
    return ompi_osc_pt2pt_unlock_internal (target, win);
}
/* Implementation of MPI_Win_lock_all: a shared lock on all ranks,
 * signalled internally with target == -1. */
int ompi_osc_pt2pt_lock_all(int assert, struct ompi_win_t *win)
{
    return ompi_osc_pt2pt_lock_internal (MPI_LOCK_SHARED, -1, assert, win);
}
/* Implementation of MPI_Win_unlock_all: release the lock_all epoch
 * (target == -1 selects the all-peers lock). */
int ompi_osc_pt2pt_unlock_all (struct ompi_win_t *win)
{
    return ompi_osc_pt2pt_unlock_internal (-1, win);
}
/* Implementation of MPI_Win_sync.  Only drives communication progress;
 * presumably no extra memory barrier is required by this component's
 * memory model — TODO confirm. */
int ompi_osc_pt2pt_sync (struct ompi_win_t *win)
{
    opal_progress();
    return OMPI_SUCCESS;
}
/*
 * Flush all RMA operations covered by @lock to remote completion.
 *
 * Sends a FLUSH_REQ control message to each peer carrying the number of
 * fragments sent this epoch (atomically swapped out of
 * epoch_outgoing_frag_count), flushes the queued fragments, and then
 * waits for the matching flush acks.
 *
 * @param[in] module  OSC PT2PT module
 * @param[in] lock    outstanding lock covering the target(s)
 * @param[in] target  target rank, or -1 to flush every peer
 */
static int ompi_osc_pt2pt_flush_lock (ompi_osc_pt2pt_module_t *module, ompi_osc_pt2pt_outstanding_lock_t *lock,
                                      int target)
{
    ompi_osc_pt2pt_header_flush_t flush_req;
    int peer_count, ret, flush_count;
    int my_rank = ompi_comm_rank (module->comm);

    if (-1 == lock->target) {
        peer_count = ompi_comm_size(module->comm);
    } else {
        peer_count = 1;
    }

    /* wait until ack has arrived from target, since we need to be
       able to eager send before we can transfer all the data...
       NOTE(review): as written the loop exits as soon as EITHER all lock
       acks are in OR no other flush is in progress; if the intent is to
       wait until both conditions hold, this should be ||. Confirm. */
    OPAL_THREAD_LOCK(&module->lock);
    while (peer_count > lock->lock_acks_received && lock->flushing) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* claim the flush; only one flush per lock runs at a time */
    lock->flush_acks_received = 0;
    lock->flushing = true;
    OPAL_THREAD_UNLOCK(&module->lock);

    flush_req.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ;
    flush_req.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID | OMPI_OSC_PT2PT_HDR_FLAG_PASSIVE_TARGET;
    flush_req.serial_number = lock->serial_number;

    if (-1 == target) {
        /* NTH: no local flush */
        flush_count = ompi_comm_size(module->comm) - 1;
        for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
            if (i == my_rank) {
                continue;
            }

            /* atomically grab (and reset to -1) this peer's epoch fragment count */
            flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + i, -1);

            /* send control message with flush request and count */
            ret = ompi_osc_pt2pt_control_send (module, i, &flush_req, sizeof (flush_req));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }

            /* start all sendreqs to target */
            ret = ompi_osc_pt2pt_frag_flush_target (module, i);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        }
    } else {
        /* atomically grab (and reset to -1) the target's epoch fragment count */
        flush_req.frag_count = opal_atomic_swap_32 ((int32_t *) module->epoch_outgoing_frag_count + target, -1);
        flush_count = 1;

        /* send control message with flush request and count */
        ret = ompi_osc_pt2pt_control_send (module, target, &flush_req, sizeof (flush_req));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_pt2pt_frag_flush_target (module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }
    }

    /* wait for all the requests and the flush ack (meaning remote completion) */
    OPAL_THREAD_LOCK(&module->lock);
    while (flush_count != lock->flush_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* release the flush claim and wake any waiting flushers */
    lock->flushing = false;
    opal_condition_broadcast(&module->cond);
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/*
 * Implementation of MPI_Win_flush: complete all outstanding RMA operations
 * targeting @target, at both origin and target.
 *
 * Only valid inside a passive-target access epoch.  The flush is driven by
 * the outstanding lock for @target, falling back to the lock_all lock
 * (target -1) if no per-target lock exists.
 */
int ompi_osc_pt2pt_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock;
    int ret;

    assert (0 <= target);

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_flush starting..."));

    if (ompi_comm_rank (module->comm) == target) {
        /* nothing to flush */
        opal_progress ();
        return OMPI_SUCCESS;
    }

    /* look for a per-target lock first, then the lock_all lock */
    lock = find_outstanding_lock (module, target);
    if (NULL == lock) {
        lock = find_outstanding_lock (module, -1);
    }
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_flush: target %d is not locked in window %s",
                             target, win->w_name));
        ret = OMPI_ERR_RMA_SYNC;
    } else {
        ret = ompi_osc_pt2pt_flush_lock (module, lock, target);
    }

    return ret;
}
/*
 * Implementation of MPI_Win_flush_all: flush every outstanding lock in the
 * window to remote completion.
 *
 * Only valid inside a passive-target access epoch with at least one
 * outstanding lock.
 */
int ompi_osc_pt2pt_flush_all (struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_outstanding_lock_t *lock;
    int ret = OMPI_SUCCESS;

    /* flush is only allowed from within a passive target epoch */
    if (OPAL_UNLIKELY(!module->passive_target_access_epoch ||
                      0 == opal_list_get_size (&module->outstanding_locks))) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_pt2pt_flush_all: no targets are locked in window %s",
                             win->w_name));
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_flush_all entering..."));

    /* flush all locks.
     * NOTE(review): outstanding_locks is walked here without module->lock
     * held — confirm this is safe against concurrent lock/unlock. */
    OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_pt2pt_outstanding_lock_t) {
        ret = ompi_osc_pt2pt_flush_lock (module, lock, lock->target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_flush_all complete"));

    return ret;
}
/* Implementation of MPI_Win_flush_local: wait until every outgoing
 * fragment destined for @target has completed locally. */
int ompi_osc_pt2pt_flush_local (int target, struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    int rc;

    /* local flush is only valid inside a passive-target access epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* push out anything still queued for this target */
    rc = ompi_osc_pt2pt_frag_flush_target(module, target);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* block until every signalled outgoing fragment has completed */
    OPAL_THREAD_LOCK(&module->lock);
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
        opal_condition_wait(&module->cond, &module->lock);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/* Implementation of MPI_Win_flush_local_all: wait until every outgoing
 * fragment on this window has completed locally. */
int ompi_osc_pt2pt_flush_local_all (struct ompi_win_t *win)
{
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    int rc;

    /* local flush is only valid inside a passive-target access epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* push out everything still queued, to any target */
    rc = ompi_osc_pt2pt_frag_flush_all(module);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* block until every signalled outgoing fragment has completed */
    OPAL_THREAD_LOCK(&module->lock);
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
        opal_condition_wait(&module->cond, &module->lock);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;
}
/* Target-side operation to acknowledge to the initiator that the lock is
 * now held by the initiator.
 *
 * For a remote requestor an unbuffered LOCK_ACK control message is sent
 * back, echoing lock_ptr so the initiator can locate its outstanding-lock
 * object.  For a local (self) requestor the lock object is updated
 * directly and waiters are woken.
 *
 * @return OMPI_SUCCESS, the error code from the control send, or
 *         OMPI_ERROR if lock_ptr is NULL for a local lock.
 */
static inline int activate_lock (ompi_osc_pt2pt_module_t *module, int requestor,
                                 uint64_t lock_ptr)
{
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    if (ompi_comm_rank (module->comm) != requestor) {
        ompi_osc_pt2pt_header_lock_ack_t lock_ack;

        lock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_LOCK_ACK;
        lock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
        lock_ack.source = ompi_comm_rank(module->comm);
        lock_ack.windx = ompi_comm_get_cid(module->comm);
        lock_ack.lock_ptr = lock_ptr;

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc pt2pt: sending lock to %d", requestor));

        /* we don't want to send any data, since we're the exposure
           epoch only, so use an unbuffered send */
        return ompi_osc_pt2pt_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: releasing local lock"));

    lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ptr;
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
                             "lock could not be located"));
        /* BUG FIX: the original code fell through after this message and
         * dereferenced the NULL pointer below; return an error instead */
        return OMPI_ERROR;
    }

    OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
    opal_condition_broadcast (&module->cond);

    return OMPI_SUCCESS;
}
/* Target-side operation: remember a lock request that could not be
 * satisfied immediately so that it can be granted later by
 * ompi_osc_activate_next_lock(). */
static inline int queue_lock (ompi_osc_pt2pt_module_t *module, int requestor,
                              int lock_type, uint64_t lock_ptr)
{
    ompi_osc_pt2pt_pending_lock_t *pending_req = OBJ_NEW(ompi_osc_pt2pt_pending_lock_t);

    if (NULL == pending_req) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    pending_req->lock_ptr = lock_ptr;
    pending_req->lock_type = lock_type;
    pending_req->peer = requestor;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: queueing lock request from %d", requestor));

    /* append to the pending list under its dedicated lock */
    OPAL_THREAD_SCOPED_LOCK(&module->locks_pending_lock, opal_list_append(&module->locks_pending, &pending_req->super));

    return OMPI_SUCCESS;
}
/*
 * Try to acquire the local (target-side) window lock on behalf of @source.
 *
 * module->lock_status encodes the state: 0 = free, negative = held
 * exclusive, n > 0 = number of shared holders.  A shared request
 * CAS-increments the holder count unless an exclusive lock is held; an
 * exclusive request CASes 0 -> -1.
 *
 * @return true if the lock was acquired (the requestor is notified via
 *         activate_lock()), false if the request must be queued.
 */
static bool ompi_osc_pt2pt_lock_try_acquire (ompi_osc_pt2pt_module_t* module, int source, int lock_type, uint64_t lock_ptr)
{
    bool queue = false;

    if (MPI_LOCK_SHARED == lock_type) {
        int32_t lock_status = module->lock_status;
        /* CAS retry loop: other threads may race on lock_status */
        do {
            if (lock_status < 0) {
                /* exclusive lock currently held -> must queue */
                queue = true;
                break;
            }

            if (opal_atomic_cmpset_32 (&module->lock_status, lock_status, lock_status + 1)) {
                break;
            }

            /* CAS lost the race; reload and retry */
            lock_status = module->lock_status;
        } while (1);
    } else {
        /* exclusive lock: only obtainable when the lock is completely free */
        queue = !opal_atomic_cmpset_32 (&module->lock_status, 0, -1);
    }

    if (queue) {
        return false;
    }

    /* NOTE(review): activate_lock()'s return code is discarded here */
    activate_lock(module, source, lock_ptr);

    /* activated the lock */
    return true;
}
/* Grant as many queued lock requests as possible, in FIFO order.
 * Iteration stops at the first request that cannot be satisfied, which
 * preserves fairness for exclusive requests.  Called after a lock is
 * released. */
static int ompi_osc_activate_next_lock (ompi_osc_pt2pt_module_t *module) {
    /* release any other pending locks we can */
    ompi_osc_pt2pt_pending_lock_t *pending_lock, *next;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->locks_pending_lock);
    OPAL_LIST_FOREACH_SAFE(pending_lock, next, &module->locks_pending,
                           ompi_osc_pt2pt_pending_lock_t) {
        bool acquired = ompi_osc_pt2pt_lock_try_acquire (module, pending_lock->peer, pending_lock->lock_type,
                                                         pending_lock->lock_ptr);
        if (!acquired) {
            /* lock is busy again; keep this request queued */
            break;
        }

        opal_list_remove_item (&module->locks_pending, &pending_lock->super);
        OBJ_RELEASE(pending_lock);
    }
    OPAL_THREAD_UNLOCK(&module->locks_pending_lock);

    return ret;
}
/* Target-side function called when the initiator sends a lock request.
 * The lock is either activated and acknowledged immediately, or queued
 * for later activation.
 *
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the request had to
 *         be queued and the queue entry could not be allocated. */
int ompi_osc_pt2pt_process_lock (ompi_osc_pt2pt_module_t* module, int source,
                                 ompi_osc_pt2pt_header_lock_t* lock_header)
{
    bool acquired;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_lock: processing lock request from %d. current lock state = %d",
                         source, module->lock_status));

    acquired = ompi_osc_pt2pt_lock_try_acquire (module, source, lock_header->lock_type, lock_header->lock_ptr);
    if (!acquired) {
        /* BUG FIX: queue_lock() can fail with OMPI_ERR_OUT_OF_RESOURCE;
         * the original code silently discarded its return value */
        return queue_lock(module, source, lock_header->lock_type, lock_header->lock_ptr);
    }

    return OMPI_SUCCESS;
}
/* Initiator-side function called when the target acks the lock request.
 * Enables eager sends to the peer and wakes any thread waiting on
 * lock_acks_received. */
void ompi_osc_pt2pt_process_lock_ack (ompi_osc_pt2pt_module_t *module,
                                      ompi_osc_pt2pt_header_lock_ack_t *lock_ack_header)
{
    ompi_osc_pt2pt_peer_t *peer = module->peers + lock_ack_header->source;
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    /* BUG FIX: the verbose message previously said
     * "ompi_osc_pt2pt_process_unlock_ack" (copy/paste error) */
    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_lock_ack: processing lock ack from %d for lock %" PRIu64,
                         lock_ack_header->source, lock_ack_header->lock_ptr));

    /* the ack echoes back the pointer to our outstanding-lock object */
    lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) lock_ack_header->lock_ptr;
    assert (NULL != lock);

    /* no need to hold the lock to set this */
    peer->eager_send_active = true;

    OPAL_THREAD_ADD32(&lock->lock_acks_received, 1);
    opal_condition_broadcast(&module->cond);
}
/* Initiator-side function called when a target acknowledges a flush
 * request (remote completion).  The ack carries the serial number of the
 * flush so the matching outstanding lock can be found. */
void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int source,
                                       ompi_osc_pt2pt_header_flush_ack_t *flush_ack_header) {
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    /* BUG FIX: the verbose message previously said
     * "ompi_osc_pt2pt_process_unlock_ack" (copy/paste error) */
    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_flush_ack: processing flush ack from %d for lock %" PRIu64,
                         source, flush_ack_header->serial_number));

    /* NTH: need to verify that this will work as expected */
    lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
    assert (NULL != lock);

    OPAL_THREAD_ADD32(&lock->flush_acks_received, 1);
    opal_condition_broadcast(&module->cond);
}
/* Initiator-side function called when a target acknowledges an unlock
 * (remote completion of all fragments).  Disables eager sends to the peer
 * and wakes the thread waiting in ompi_osc_pt2pt_unlock_internal. */
void ompi_osc_pt2pt_process_unlock_ack (ompi_osc_pt2pt_module_t *module, int source,
                                        ompi_osc_pt2pt_header_unlock_ack_t *unlock_ack_header)
{
    ompi_osc_pt2pt_peer_t *peer = module->peers + source;
    ompi_osc_pt2pt_outstanding_lock_t *lock;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_unlock_ack: processing unlock ack from %d",
                         source));

    /* NTH: need to verify that this will work as expected
     * (cast normalized to uintptr_t for consistency with the lock-ack path) */
    lock = (ompi_osc_pt2pt_outstanding_lock_t *) (uintptr_t) unlock_ack_header->lock_ptr;
    assert (NULL != lock);

    peer->eager_send_active = false;

    /* BUG FIX: unlock_acks_received counts up from 0 and the waiter in
     * ompi_osc_pt2pt_unlock_internal compares it against
     * lock_acks_expected, so OPAL_THREAD_ADD32(..., 1) can never return 0
     * and the old "if (0 == ...)" guard suppressed the wakeup entirely.
     * Broadcast unconditionally; a spurious wakeup is harmless. */
    OPAL_THREAD_ADD32(&lock->unlock_acks_received, 1);
    opal_condition_broadcast(&module->cond);
}
/**
 * Process an unlock request.
 *
 * @param[in] module - OSC PT2PT module
 * @param[in] source - Source rank
 * @param[in] unlock_header - Incoming unlock header
 *
 * @return OMPI_SUCCESS, OMPI_ERR_WOULD_BLOCK if fragments from the source
 *         are still pending (caller should retry), or an error from the
 *         control send.
 *
 * This functions is the target-side function for handling an unlock
 * request. Once all pending operations from the target are complete
 * this functions sends an unlock acknowledgement then attempts to
 * active a pending lock if the lock becomes free.
 */
int ompi_osc_pt2pt_process_unlock (ompi_osc_pt2pt_module_t *module, int source,
                                   ompi_osc_pt2pt_header_unlock_t *unlock_header)
{
    ompi_osc_pt2pt_header_unlock_ack_t unlock_ack;
    ompi_osc_pt2pt_peer_t *peer = module->peers + source;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_unlock entering (passive_incoming_frag_count: %d)...",
                         peer->passive_incoming_frag_count));

    /* we cannot block when processing an incoming request */
    if (0 != peer->passive_incoming_frag_count) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    /* echo the initiator's lock pointer back in the ack so it can find
     * its outstanding-lock object */
    unlock_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_ACK;
    unlock_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
    unlock_ack.lock_ptr = unlock_header->lock_ptr;

    ret = ompi_osc_pt2pt_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* release the local lock. lock_status == -1 means an exclusive lock
     * is held (increment back to 0); otherwise decrement the shared-holder
     * count and activate a pending lock when it reaches 0 */
    if (-1 == module->lock_status) {
        OPAL_THREAD_ADD32(&module->lock_status, 1);
        ompi_osc_activate_next_lock (module);
    } else if (0 == OPAL_THREAD_ADD32(&module->lock_status, -1)) {
        ompi_osc_activate_next_lock (module);
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc pt2pt: finished processing unlock fragment"));

    return ret;
}
/* Target-side handler for a FLUSH_REQ message.  Once all pending
 * fragments from @source have been processed, a FLUSH_ACK echoing the
 * request's serial number is sent back to signal remote completion.
 *
 * @return OMPI_ERR_WOULD_BLOCK if fragments are still pending (caller
 *         should retry later), otherwise the result of the control send. */
int ompi_osc_pt2pt_process_flush (ompi_osc_pt2pt_module_t *module, int source,
                                  ompi_osc_pt2pt_header_flush_t *flush_header)
{
    ompi_osc_pt2pt_peer_t *peer = module->peers + source;
    ompi_osc_pt2pt_header_flush_ack_t flush_ack;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_pt2pt_process_flush entering (passive_incoming_frag_count: %d)...",
                         peer->passive_incoming_frag_count));

    /* we cannot block when processing an incoming request */
    if (0 != peer->passive_incoming_frag_count) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    flush_ack.base.type = OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_ACK;
    flush_ack.base.flags = OMPI_OSC_PT2PT_HDR_FLAG_VALID;
    flush_ack.serial_number = flush_header->serial_number;

    return ompi_osc_pt2pt_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack));
}

Просмотреть файл

@ -1,9 +1,12 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*
* Pending frags are fragments that have been received on the target,
@ -14,35 +17,35 @@
* message.
*/
#ifndef OSC_RDMA_PENDING_FRAG_H
#define OSC_RDMA_PENDING_FRAG_H
#ifndef OSC_PT2PT_PENDING_FRAG_H
#define OSC_PT2PT_PENDING_FRAG_H
/** Incoming fragment that has to be queued */
struct ompi_osc_rdma_pending_frag_t {
struct ompi_osc_pt2pt_pending_frag_t {
opal_list_item_t super;
/* This is a pointer to the top of the fragment (which is always
the header). Save as a header to make the casting a bit less
onerous during sequence number lookups. */
ompi_osc_rdma_frag_header_t *header;
ompi_osc_pt2pt_frag_header_t *header;
};
typedef struct ompi_osc_rdma_pending_frag_t ompi_osc_rdma_pending_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_frag_t);
typedef struct ompi_osc_pt2pt_pending_frag_t ompi_osc_pt2pt_pending_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_frag_t);
/*
* Note: module lock must be held during this operation
*/
static inline ompi_osc_rdma_pending_frag_t*
ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module,
static inline ompi_osc_pt2pt_pending_frag_t*
ompi_osc_pt2pt_pending_frag_create(ompi_osc_pt2pt_module_t *module,
void *ptr,
size_t size)
{
size_t total_size = sizeof(ompi_osc_rdma_pending_frag_t) + size;
ompi_osc_rdma_pending_frag_t *ret =
(ompi_osc_rdma_pending_frag_t*) malloc(total_size);
size_t total_size = sizeof(ompi_osc_pt2pt_pending_frag_t) + size;
ompi_osc_pt2pt_pending_frag_t *ret =
(ompi_osc_pt2pt_pending_frag_t*) malloc(total_size);
if (NULL == ret) return NULL;
OBJ_CONSTRUCT(&ret, ompi_osc_rdma_pending_frag_t);
OBJ_CONSTRUCT(&ret, ompi_osc_pt2pt_pending_frag_t);
memcpy(ret->header, ptr, size);
return ret;
@ -50,11 +53,11 @@ ompi_osc_rdma_pending_frag_create(ompi_osc_rdma_module_t *module,
/*
* Note: module lock must be held for this operation
* Note: module lock must be held for this operation
*/
static inline int
ompi_osc_rdma_pending_frag_destroy(ompi_osc_rdma_module_t *module,
ompi_osc_rdma_pending_frag_t* frag)
ompi_osc_pt2pt_pending_frag_destroy(ompi_osc_pt2pt_module_t *module,
ompi_osc_pt2pt_pending_frag_t* frag)
{
OBJ_DESTRUCT(&frag);
free(frag);

Просмотреть файл

@ -1,9 +1,12 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -14,8 +17,8 @@
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
#include "osc_rdma.h"
#include "osc_rdma_request.h"
#include "osc_pt2pt.h"
#include "osc_pt2pt_request.h"
static int
request_cancel(struct ompi_request_t *request, int complete)
@ -26,14 +29,14 @@ request_cancel(struct ompi_request_t *request, int complete)
static int
request_free(struct ompi_request_t **ompi_req)
{
ompi_osc_rdma_request_t *request =
(ompi_osc_rdma_request_t*) *ompi_req;
ompi_osc_pt2pt_request_t *request =
(ompi_osc_pt2pt_request_t*) *ompi_req;
if (true != request->super.req_complete) {
return MPI_ERR_REQUEST;
}
OMPI_OSC_RDMA_REQUEST_RETURN(request);
OMPI_OSC_PT2PT_REQUEST_RETURN(request);
*ompi_req = MPI_REQUEST_NULL;
@ -42,7 +45,7 @@ request_free(struct ompi_request_t **ompi_req)
static
void
request_construct(ompi_osc_rdma_request_t *request)
request_construct(ompi_osc_pt2pt_request_t *request)
{
request->super.req_type = OMPI_REQUEST_WIN;
request->super.req_status._cancelled = 0;
@ -50,7 +53,7 @@ request_construct(ompi_osc_rdma_request_t *request)
request->super.req_cancel = request_cancel;
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t,
OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_request_t,
ompi_request_t,
request_construct,
NULL);

Просмотреть файл

@ -4,46 +4,46 @@
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
#ifndef OMPI_OSC_RDMA_REQUEST_H
#define OMPI_OSC_RDMA_REQUEST_H
#ifndef OMPI_OSC_PT2PT_REQUEST_H
#define OMPI_OSC_PT2PT_REQUEST_H
#include "osc_rdma.h"
#include "osc_pt2pt.h"
#include "ompi/request/request.h"
#include "opal/util/output.h"
struct ompi_osc_rdma_request_t {
struct ompi_osc_pt2pt_request_t {
ompi_request_t super;
int type;
void *origin_addr;
int origin_count;
struct ompi_datatype_t *origin_dt;
ompi_osc_rdma_module_t* module;
int outstanding_requests;
ompi_osc_pt2pt_module_t* module;
int32_t outstanding_requests;
bool internal;
};
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
typedef struct ompi_osc_pt2pt_request_t ompi_osc_pt2pt_request_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_request_t);
/* REQUEST_ALLOC is only called from "top-level" functions (rdma_rput,
rdma_rget, etc.), so it's ok to spin here... */
#define OMPI_OSC_RDMA_REQUEST_ALLOC(win, req) \
/* REQUEST_ALLOC is only called from "top-level" functions (pt2pt_rput,
pt2pt_rget, etc.), so it's ok to spin here... */
#define OMPI_OSC_PT2PT_REQUEST_ALLOC(win, req) \
do { \
ompi_free_list_item_t *item; \
do { \
OMPI_FREE_LIST_GET_MT(&mca_osc_rdma_component.requests, item); \
OMPI_FREE_LIST_GET_MT(&mca_osc_pt2pt_component.requests, item); \
if (NULL == item) { \
opal_progress(); \
} \
} while (NULL == item); \
req = (ompi_osc_rdma_request_t*) item; \
req = (ompi_osc_pt2pt_request_t*) item; \
OMPI_REQUEST_INIT(&req->super, false); \
req->super.req_mpi_object.win = win; \
req->super.req_complete = false; \
@ -52,14 +52,14 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
req->internal = false; \
} while (0)
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \
#define OMPI_OSC_PT2PT_REQUEST_RETURN(req) \
do { \
OMPI_REQUEST_FINI(&(req)->super); \
OMPI_FREE_LIST_RETURN_MT(&mca_osc_rdma_component.requests, \
(ompi_free_list_item_t *) (req)); \
OMPI_REQUEST_FINI(&(req)->super); \
OMPI_FREE_LIST_RETURN_MT(&mca_osc_pt2pt_component.requests, \
(ompi_free_list_item_t *) (req)); \
} while (0)
static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *request, int mpi_error)
static inline void ompi_osc_pt2pt_request_complete (ompi_osc_pt2pt_request_t *request, int mpi_error)
{
if (!request->internal) {
request->super.req_status.MPI_ERROR = mpi_error;
@ -67,8 +67,8 @@ static inline void ompi_osc_rdma_request_complete (ompi_osc_rdma_request_t *requ
/* mark the request complete at the mpi level */
ompi_request_complete (&request->super, true);
} else {
OMPI_OSC_RDMA_REQUEST_RETURN (request);
OMPI_OSC_PT2PT_REQUEST_RETURN (request);
}
}
#endif /* OMPI_OSC_RDMA_REQUEST_H */
#endif /* OMPI_OSC_PT2PT_REQUEST_H */

Просмотреть файл

@ -1,26 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2013 Sandia National Laboratories. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_ompi_osc_rdma_POST_CONFIG(will_build)
# ----------------------------------------
# Only require the tag if we're actually going to be built, since bml
# is one of the ones frequently disabled for large installs.
AC_DEFUN([MCA_ompi_osc_rdma_POST_CONFIG], [
AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([BML])])
])dnl
# MCA_ompi_osc_rdma_CONFIG(action-if-can-compile,
# [action-if-cant-compile])
# ------------------------------------------------
# We can always build, unless we were explicitly disabled.
AC_DEFUN([MCA_ompi_osc_rdma_CONFIG],[
AC_CONFIG_FILES([ompi/mca/osc/rdma/Makefile])
[$1]
])dnl

Просмотреть файл

@ -1,197 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/pml/pml.h"
#include "osc_rdma.h"
#include "osc_rdma_frag.h"
#include "osc_rdma_data_move.h"
static void ompi_osc_rdma_frag_constructor (ompi_osc_rdma_frag_t *frag){
frag->buffer = malloc (mca_osc_rdma_component.buffer_size + sizeof (ompi_osc_rdma_frag_header_t));
assert (frag->buffer);
}
static void ompi_osc_rdma_frag_destructor (ompi_osc_rdma_frag_t *frag) {
if (NULL != frag->buffer) {
free (frag->buffer);
}
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_frag_t, opal_list_item_t,
ompi_osc_rdma_frag_constructor, ompi_osc_rdma_frag_destructor);
static int frag_send_cb (ompi_request_t *request)
{
ompi_osc_rdma_frag_t *frag =
(ompi_osc_rdma_frag_t*) request->req_complete_cb_data;
ompi_osc_rdma_module_t *module = frag->module;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag_send complete to %d, frag = %p, request = %p",
frag->target, (void *) frag, (void *) request));
mark_outgoing_completion(module);
OPAL_FREE_LIST_RETURN(&mca_osc_rdma_component.frags, &frag->super);
/* put this request on the garbage colletion list */
osc_rdma_gc_add_request (request);
return OMPI_SUCCESS;
}
static int
frag_send(ompi_osc_rdma_module_t *module,
ompi_osc_rdma_frag_t *frag)
{
int count;
count = (int)((uintptr_t) frag->top - (uintptr_t) frag->buffer);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag_send called to %d, frag = %p, count = %d",
frag->target, (void *) frag, count));
return ompi_osc_rdma_isend_w_cb (frag->buffer, count, MPI_BYTE, frag->target, OSC_RDMA_FRAG_TAG,
module->comm, frag_send_cb, frag);
}
int
ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module,
ompi_osc_rdma_frag_t *frag)
{
int ret;
assert(0 == frag->pending);
assert(module->peers[frag->target].active_frag != frag);
/* we need to signal now that a frag is outgoing to ensure the count sent
* with the unlock message is correct */
ompi_osc_signal_outgoing (module, frag->target, 1);
/* if eager sends are not active, can't send yet, so buffer and
get out... */
if (module->passive_target_access_epoch) {
if (!module->passive_eager_send_active[frag->target]) {
opal_list_append(&module->queued_frags, &frag->super);
return OMPI_SUCCESS;
}
} else {
if (!module->active_eager_send_active) {
opal_list_append(&module->queued_frags, &frag->super);
return OMPI_SUCCESS;
}
}
ret = frag_send(module, frag);
opal_condition_broadcast(&module->cond);
return ret;
}
int
ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target)
{
int ret = OMPI_SUCCESS;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush target begin"));
/* flush the active frag */
if (NULL != module->peers[target].active_frag) {
ompi_osc_rdma_frag_t *frag = module->peers[target].active_frag;
if (0 != frag->pending) {
/* communication going on while synchronizing; this is a bug */
return OMPI_ERR_RMA_SYNC;
}
module->peers[target].active_frag = NULL;
ret = ompi_osc_rdma_frag_start(module, frag);
if (OMPI_SUCCESS != ret) return ret;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush target finished active frag"));
/* walk through the pending list and send */
ompi_osc_rdma_frag_t *frag, *next;
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
if (frag->target == target) {
opal_list_remove_item(&module->queued_frags, &frag->super);
ret = frag_send(module, frag);
if (OMPI_SUCCESS != ret) return ret;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush target finished"));
return OMPI_SUCCESS;
}
int
ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module)
{
int ret = OMPI_SUCCESS;
int i;
ompi_osc_rdma_frag_t *frag, *next;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush all begin"));
/* flush the active frag */
for (i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (NULL != module->peers[i].active_frag) {
ompi_osc_rdma_frag_t *frag = module->peers[i].active_frag;
if (0 != frag->pending) {
/* communication going on while synchronizing; this is a bug */
return OMPI_ERR_RMA_SYNC;
}
module->peers[i].active_frag = NULL;
ret = ompi_osc_rdma_frag_start(module, frag);
if (OMPI_SUCCESS != ret) return ret;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush all finished active frag"));
/* try to start all the queued frags */
OPAL_LIST_FOREACH_SAFE(frag, next, &module->queued_frags, ompi_osc_rdma_frag_t) {
opal_list_remove_item(&module->queued_frags, &frag->super);
ret = frag_send(module, frag);
if (OMPI_SUCCESS != ret) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: failure for frag send: %d", ret));
return ret;
}
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: frag flush all done"));
return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,138 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OSC_RDMA_FRAG_H
#define OSC_RDMA_FRAG_H
#include "ompi/communicator/communicator.h"
#include "osc_rdma_header.h"
#include "osc_rdma_request.h"
#include "opal/align.h"
/** Communication buffer for packing messages */
struct ompi_osc_rdma_frag_t {
opal_list_item_t super;
/* target rank of buffer */
int target;
unsigned char *buffer;
/* space remaining in buffer */
size_t remain_len;
/* start of unused space */
char *top;
/* Number of operations which have started writing into the frag, but not yet completed doing so */
int pending;
ompi_osc_rdma_frag_header_t *header;
ompi_osc_rdma_module_t *module;
};
typedef struct ompi_osc_rdma_frag_t ompi_osc_rdma_frag_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_frag_t);
extern int ompi_osc_rdma_frag_start(ompi_osc_rdma_module_t *module, ompi_osc_rdma_frag_t *buffer);
extern int ompi_osc_rdma_frag_flush_target(ompi_osc_rdma_module_t *module, int target);
extern int ompi_osc_rdma_frag_flush_all(ompi_osc_rdma_module_t *module);
/*
 * Reserve request_len bytes of space in the active fragment for the given
 * target, allocating and initializing a fresh fragment when there is no
 * active fragment or the active one is too full.  On success *ptr points
 * at the reserved space and *buffer at the owning fragment.
 *
 * Note: module lock must be held during this operation
 */
static inline int ompi_osc_rdma_frag_alloc(ompi_osc_rdma_module_t *module, int target,
                                           size_t request_len, ompi_osc_rdma_frag_t **buffer,
                                           char **ptr)
{
    ompi_osc_rdma_frag_t *curr = module->peers[target].active_frag;
    int ret;

    /* osc rdma headers can have 64-bit values. these will need to be aligned
     * on an 8-byte boundary on some architectures so we up align the allocation
     * size here. */
    request_len = OPAL_ALIGN(request_len, 8, size_t);

    if (request_len > mca_osc_rdma_component.buffer_size) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (NULL == curr || curr->remain_len < request_len) {
        opal_free_list_item_t *item;

        if (NULL != curr) {
            curr->remain_len = 0;

            /* If there's something pending, the pending finish will
               start the buffer.  Otherwise, we need to start it now. */
            if (0 == curr->pending) {
                module->peers[target].active_frag = NULL;
                ret = ompi_osc_rdma_frag_start(module, curr);
                /* BUG FIX: the original discarded this return code (it was
                 * immediately overwritten by OPAL_FREE_LIST_GET), silently
                 * swallowing a failed fragment send */
                if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                    return ret;
                }
            }
        }

        /* grab a fresh fragment from the component free list */
        OPAL_FREE_LIST_GET(&mca_osc_rdma_component.frags,
                           item, ret);
        if (OMPI_SUCCESS != ret) return ret;
        curr = module->peers[target].active_frag =
            (ompi_osc_rdma_frag_t*) item;

        /* initialize the fragment and its embedded header */
        curr->target = target;

        curr->header = (ompi_osc_rdma_frag_header_t*) curr->buffer;
        curr->top = (char*) (curr->header + 1);
        curr->remain_len = mca_osc_rdma_component.buffer_size;
        curr->module = module;
        curr->pending = 0;

        curr->header->base.type = OMPI_OSC_RDMA_HDR_TYPE_FRAG;
        curr->header->base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
        if (module->passive_target_access_epoch) {
            curr->header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
        }
        curr->header->source = ompi_comm_rank(module->comm);
        curr->header->num_ops = 0;
        curr->header->windx = ompi_comm_get_cid(module->comm);

        if (curr->remain_len < request_len) {
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
    }

    /* carve the requested space out of the fragment */
    *ptr = curr->top;
    *buffer = curr;

    curr->top += request_len;
    curr->remain_len -= request_len;

    curr->pending++;
    curr->header->num_ops++;

    return OMPI_SUCCESS;
}
/*
 * Mark one pending operation on buffer as complete.  When the final
 * pending operation completes and the fragment has no remaining space,
 * the fragment is started (sent) immediately.
 *
 * Note: module lock must be held for this operation
 */
static inline int ompi_osc_rdma_frag_finish(ompi_osc_rdma_module_t *module,
                                            ompi_osc_rdma_frag_t* buffer)
{
    buffer->pending--;

    if (0 != buffer->pending || 0 != buffer->remain_len) {
        /* either operations are still writing into this fragment or it
         * still has room for more data -- nothing to do yet */
        return OMPI_SUCCESS;
    }

    if (OPAL_LIKELY(buffer == module->peers[buffer->target].active_frag)) {
        /* this is the active fragment. need to set the current fragment to null
         * or it will be started multiple times */
        module->peers[buffer->target].active_frag = NULL;
    }

    return ompi_osc_rdma_frag_start(module, buffer);
}
#endif

Просмотреть файл

@ -1,187 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_MCA_OSC_RDMA_HDR_H
#define OMPI_MCA_OSC_RDMA_HDR_H
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "opal/types.h"
/** Message type codes carried in ompi_osc_rdma_header_base_t.type */
enum ompi_osc_rdma_hdr_type_t {
    OMPI_OSC_RDMA_HDR_TYPE_PUT = 0x01,
    OMPI_OSC_RDMA_HDR_TYPE_PUT_LONG = 0x02,
    OMPI_OSC_RDMA_HDR_TYPE_ACC = 0x03,
    OMPI_OSC_RDMA_HDR_TYPE_ACC_LONG = 0x04,
    OMPI_OSC_RDMA_HDR_TYPE_GET = 0x05,
    OMPI_OSC_RDMA_HDR_TYPE_CSWAP = 0x06,
    OMPI_OSC_RDMA_HDR_TYPE_CSWAP_LONG = 0x07,
    OMPI_OSC_RDMA_HDR_TYPE_GET_ACC = 0x08,
    OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG = 0x09,
    OMPI_OSC_RDMA_HDR_TYPE_COMPLETE = 0x10,
    OMPI_OSC_RDMA_HDR_TYPE_POST = 0x11,
    OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ = 0x12,
    OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK = 0x13,
    OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ = 0x14,
    OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK = 0x15,
    OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ = 0x16,
    OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK = 0x17,
    OMPI_OSC_RDMA_HDR_TYPE_FRAG = 0x20,
};
typedef enum ompi_osc_rdma_hdr_type_t ompi_osc_rdma_hdr_type_t;

/* Flag bits carried in ompi_osc_rdma_header_base_t.flags */
#define OMPI_OSC_RDMA_HDR_FLAG_NBO            0x01  /* payload is in network byte order */
#define OMPI_OSC_RDMA_HDR_FLAG_VALID          0x02  /* header has been fully written */
#define OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET 0x04  /* sent within a passive-target epoch */
#define OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE 0x08  /* datatype description sent separately */

/** Common prefix shared by every osc/rdma header */
struct ompi_osc_rdma_header_base_t {
    /** fragment type. 8 bits */
    uint8_t type;
    /** fragment flags. 8 bits */
    uint8_t flags;
};
typedef struct ompi_osc_rdma_header_base_t ompi_osc_rdma_header_base_t;

/** Header for MPI_Put operations */
struct ompi_osc_rdma_header_put_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t tag;
    uint32_t count;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_put_t ompi_osc_rdma_header_put_t;

/** Header for MPI_Accumulate operations (op identifies the reduction) */
struct ompi_osc_rdma_header_acc_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t tag;
    uint32_t count;
    uint32_t op;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_acc_t ompi_osc_rdma_header_acc_t;

/** Header for MPI_Get operations */
struct ompi_osc_rdma_header_get_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t tag;
    uint32_t count;
    uint64_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_get_t ompi_osc_rdma_header_get_t;

/** Header announcing completion; frag_count lets the target account for
 *  all fragments sent during the epoch */
struct ompi_osc_rdma_header_complete_t {
    ompi_osc_rdma_header_base_t base;
    int frag_count;
};
typedef struct ompi_osc_rdma_header_complete_t ompi_osc_rdma_header_complete_t;

/** Header for MPI_Compare_and_swap operations */
struct ompi_osc_rdma_header_cswap_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t tag;
    uint32_t len;
    uint64_t displacement;
};
typedef struct ompi_osc_rdma_header_cswap_t ompi_osc_rdma_header_cswap_t;

/** Header for MPI_Win_post notification */
struct ompi_osc_rdma_header_post_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t windx;
};
typedef struct ompi_osc_rdma_header_post_t ompi_osc_rdma_header_post_t;

/** Lock request: serial_number is assigned by the origin and echoed in the ack */
struct ompi_osc_rdma_header_lock_t {
    ompi_osc_rdma_header_base_t base;
    int32_t lock_type;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_lock_t ompi_osc_rdma_header_lock_t;

/** Lock acknowledgment sent by the target back to the origin */
struct ompi_osc_rdma_header_lock_ack_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t windx;
    uint32_t source;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_lock_ack_t ompi_osc_rdma_header_lock_ack_t;

/** Unlock request: frag_count tells the target how many fragments to expect */
struct ompi_osc_rdma_header_unlock_t {
    ompi_osc_rdma_header_base_t base;
    int32_t lock_type;
    uint32_t frag_count;
};
typedef struct ompi_osc_rdma_header_unlock_t ompi_osc_rdma_header_unlock_t;

/** Unlock acknowledgment (no payload beyond the base header) */
struct ompi_osc_rdma_header_unlock_ack_t {
    ompi_osc_rdma_header_base_t base;
};
typedef struct ompi_osc_rdma_header_unlock_ack_t ompi_osc_rdma_header_unlock_ack_t;

/** Flush request: frag_count tells the target how many fragments to expect */
struct ompi_osc_rdma_header_flush_t {
    ompi_osc_rdma_header_base_t base;
    uint32_t frag_count;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_flush_t ompi_osc_rdma_header_flush_t;

/** Flush acknowledgment, matched to the request by serial_number */
struct ompi_osc_rdma_header_flush_ack_t {
    ompi_osc_rdma_header_base_t base;
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_header_flush_ack_t ompi_osc_rdma_header_flush_ack_t;

/** Header at the start of every communication buffer (fragment) */
struct ompi_osc_rdma_frag_header_t {
    ompi_osc_rdma_header_base_t base;
    uint16_t windx; /* cid of communicator backing window (our window id) */
    uint32_t source; /* rank in window of source process */
    uint16_t num_ops; /* number of operations in this buffer */
    uint16_t pad[3]; /* ensure the fragment header is a multiple of 8 bytes */
};
typedef struct ompi_osc_rdma_frag_header_t ompi_osc_rdma_frag_header_t;

/** Union of all header types; base.type discriminates the active member */
union ompi_osc_rdma_header_t {
    ompi_osc_rdma_header_base_t base;
    ompi_osc_rdma_header_put_t put;
    ompi_osc_rdma_header_acc_t acc;
    ompi_osc_rdma_header_get_t get;
    ompi_osc_rdma_header_complete_t complete;
    ompi_osc_rdma_header_cswap_t cswap;
    ompi_osc_rdma_header_post_t post;
    ompi_osc_rdma_header_lock_t lock;
    ompi_osc_rdma_header_lock_ack_t lock_ack;
    ompi_osc_rdma_header_unlock_t unlock;
    ompi_osc_rdma_header_unlock_ack_t unlock_ack;
    ompi_osc_rdma_header_flush_t flush;
    ompi_osc_rdma_header_flush_ack_t flush_ack;
    ompi_osc_rdma_frag_header_t frag;
};
typedef union ompi_osc_rdma_header_t ompi_osc_rdma_header_t;

#endif /* OMPI_MCA_OSC_RDMA_HDR_H */

Просмотреть файл

@ -1,47 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/*
 * utility functions for dealing with remote datatype and op structures
 */

/**
 * Convert a window index number into a module instance.
 *
 * Looks the window id (communicator cid) up in the component's module
 * hash table under the component lock.  Returns NULL (after logging)
 * when no module is registered for the given index.
 */
static inline ompi_osc_rdma_module_t*
ompi_osc_rdma_windx_to_module(uint32_t windx)
{
    ompi_osc_rdma_module_t *module = NULL;
    int rc;

    /* find the right module and dispatch */
    OPAL_THREAD_LOCK(&mca_osc_rdma_component.lock);
    rc = opal_hash_table_get_value_uint32(&mca_osc_rdma_component.modules,
                                          windx,
                                          (void**) (&module));
    OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.lock);

    if (OMPI_SUCCESS != rc) {
        opal_output(0, "Could not translate windx %d to a local MPI_Win instance",
                    windx);
        return NULL;
    }

    return module;
}

Просмотреть файл

@ -1,966 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "osc_rdma.h"
#include "osc_rdma_header.h"
#include "osc_rdma_data_move.h"
#include "osc_rdma_frag.h"
#include "mpi.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/mutex.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/include/opal_stdint.h"
/* target-side tracking of a lock request */
struct ompi_osc_rdma_pending_lock_t {
    opal_list_item_t super;
    /* rank that requested the lock */
    int peer;
    /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
    int lock_type;
    /* origin-assigned id echoed back in the lock ack */
    uint64_t serial_number;
};
typedef struct ompi_osc_rdma_pending_lock_t ompi_osc_rdma_pending_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_lock_t, opal_list_item_t,
                   NULL, NULL);

/* origin-side tracking of a lock request */
struct ompi_osc_rdma_outstanding_lock_t {
    opal_list_item_t super;
    /* target rank, or -1 for a lock-all epoch */
    int target;
    /* counts of acks received; condition-waited on by lock/unlock/flush */
    int32_t lock_acks_received;
    int32_t unlock_acks_received;
    int32_t flush_acks_received;
    /* id used to match acks to this request */
    uint64_t serial_number;
    /* MPI_LOCK_SHARED or MPI_LOCK_EXCLUSIVE */
    int32_t type;
};
typedef struct ompi_osc_rdma_outstanding_lock_t ompi_osc_rdma_outstanding_lock_t;
OBJ_CLASS_INSTANCE(ompi_osc_rdma_outstanding_lock_t, opal_list_item_t,
                   NULL, NULL);

/* forward declarations for functions defined later in this file */
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module);
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
                              int lock_type, uint64_t serial_number);
/**
 * Find the first outstanding lock to a target.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] target - Target rank
 *
 * @returns the matching outstanding lock, or NULL if none exists
 *
 * Walks the module's outstanding_locks list and returns the first
 * entry whose target matches.  The caller must hold the module lock.
 */
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock (ompi_osc_rdma_module_t *module, int target)
{
    ompi_osc_rdma_outstanding_lock_t *item;

    OPAL_LIST_FOREACH(item, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (target == item->target) {
            return item;
        }
    }

    return NULL;
}
/**
 * Find the outstanding lock whose serial number matches.
 *
 * Caller must hold the module lock.  Returns NULL when no outstanding
 * lock carries the given serial number.
 */
static inline ompi_osc_rdma_outstanding_lock_t *find_outstanding_lock_by_serial (ompi_osc_rdma_module_t *module, uint64_t serial_number)
{
    ompi_osc_rdma_outstanding_lock_t *item;

    OPAL_LIST_FOREACH(item, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (serial_number == item->serial_number) {
            return item;
        }
    }

    return NULL;
}
/* Acquire the passive-target lock on the local (self) window.  Grants the
 * lock immediately when compatible with the current local state (shared
 * requests are compatible unless an exclusive lock is held; exclusive
 * requests only when no lock is held), otherwise queues the request and
 * blocks on the module condition until the ack arrives.  Caller must hold
 * the module lock (opal_condition_wait releases it while sleeping). */
static inline int ompi_osc_rdma_lock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    const int my_rank = ompi_comm_rank (module->comm);

    if ((MPI_LOCK_SHARED == lock->type && MPI_LOCK_EXCLUSIVE != module->lock_status) ||
        (MPI_LOCK_EXCLUSIVE == lock->type && 0 == module->lock_status)) {
        /* we can acquire the lock immediately */
        module->lock_status = lock->type;
        if (MPI_LOCK_SHARED == lock->type) {
            /* track the number of concurrent shared holders */
            module->shared_count++;
        }

        lock->lock_acks_received++;
    } else {
        /* queue the lock */
        queue_lock (module, my_rank, lock->type, lock->serial_number);
    }

    /* If locking local, can't be non-blocking according to the
       standard. We need to wait for the ack here. */
    while (0 == lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "local lock aquired"));

    return OMPI_SUCCESS;
}
/* Release the local (self) passive-target lock.  The lock is released --
 * and the next queued lock activated -- unless this is a shared lock and
 * other sharers remain.  The && short-circuit ensures shared_count is only
 * decremented for shared locks.  Caller must hold the module lock. */
static inline void ompi_osc_rdma_unlock_self (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock)
{
    /* BUG FIX: the original condition tested (0 == --module->shared_count),
     * which kept the lock held when the LAST sharer released it and released
     * it while other sharers still held it.  The correct test keeps the lock
     * only while sharers remain (0 != --shared_count). */
    if (!(MPI_LOCK_SHARED == lock->type && 0 != --module->shared_count)) {
        module->lock_status = 0;
        ompi_osc_activate_next_lock (module);
    }

    /* need to ensure we make progress */
    opal_progress();

    lock->unlock_acks_received++;
}
/* Send a lock request to a remote target and flush the fragment so the
 * request actually goes out (the ack enables eager sending). */
static inline int ompi_osc_rdma_lock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_lock_t lock_req;
    int rc;

    /* fill in the lock request header */
    lock_req.base.type     = OMPI_OSC_RDMA_HDR_TYPE_LOCK_REQ;
    lock_req.base.flags    = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    lock_req.lock_type     = lock->type;
    lock_req.serial_number = lock->serial_number;

    rc = ompi_osc_rdma_control_send (module, target, &lock_req, sizeof (lock_req));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    /* make sure the request gets sent, so we can start eager sending... */
    return ompi_osc_rdma_frag_flush_target (module, target);
}
/* Send an unlock request to a remote target.  The request carries the
 * number of fragments sent this epoch so the target knows when all data
 * has arrived. */
static inline int ompi_osc_rdma_unlock_remote (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_outstanding_lock_t *lock)
{
    ompi_osc_rdma_header_unlock_t unlock_req;

    unlock_req.base.type  = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_REQ;
    unlock_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    unlock_req.lock_type  = lock->type;
    unlock_req.frag_count = module->epoch_outgoing_frag_count[target];

    /* send control message with unlock request and count */
    return ompi_osc_rdma_control_send (module, target, &unlock_req, sizeof (unlock_req));
}
/* MPI_Win_lock implementation: start a passive-target access epoch with
 * one target.  Unless MPI_MODE_NOCHECK is asserted, a lock request is
 * sent (or acquired locally for self) and, for the local case, the call
 * blocks until the lock is granted. */
int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    /* Check if no_locks is set. TODO: we also need to track whether we are in an
     * active target epoch. Fence can make this tricky to track. */
    if (NULL == module->passive_eager_send_active || module->sc_group) {
        return OMPI_ERR_RMA_SYNC;
    }

    assert(module->epoch_outgoing_frag_count[target] == 0);

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: lock %d %d", target, lock_type));

    /* delay all eager sends until we've heard back.. */
    OPAL_THREAD_LOCK(&module->lock);
    module->passive_eager_send_active[target] = false;
    module->passive_target_access_epoch = true;

    /* when the lock ack returns we will be in an access epoch with this peer */
    peer->access_epoch = true;

    /* create lock item */
    lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
    if (OPAL_UNLIKELY(NULL == lock)) {
        /* BUG FIX: the original returned here with module->lock still held */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    lock->target = target;
    lock->lock_acks_received = 0;
    lock->unlock_acks_received = 0;
    lock->serial_number = module->lock_serial_number++;
    lock->type = lock_type;
    opal_list_append(&module->outstanding_locks, &lock->super);

    if (0 == (assert & MPI_MODE_NOCHECK)) {
        if (ompi_comm_rank (module->comm) != target) {
            ret = ompi_osc_rdma_lock_remote (module, target, lock);
        } else {
            ret = ompi_osc_rdma_lock_self (module, lock);
        }

        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto exit_error;
        }
    } else {
        /* caller promises no conflicting locks; treat as immediately granted */
        lock->lock_acks_received = 1;
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return OMPI_SUCCESS;

exit_error:
    /* BUG FIX: remove the lock from the list while still holding the module
     * lock (the original dropped the lock first, then mutated the list) */
    opal_list_remove_item(&module->outstanding_locks, &lock->super);
    OPAL_THREAD_UNLOCK(&module->lock);
    OBJ_RELEASE(lock);

    /* return */
    return ret;
}
/* MPI_Win_unlock implementation: end the passive-target access epoch with
 * one target.  For a remote target this waits for the lock ack, sends the
 * unlock request, flushes all queued fragments, and waits for remote
 * completion (the unlock ack). */
int ompi_osc_rdma_unlock(int target, ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock = NULL;
    ompi_osc_rdma_peer_t *peer = module->peers + target;
    int ret = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&module->lock);

    lock = find_outstanding_lock (module, target);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: target %d is not locked in window %s",
                             target, win->w_name));
        /* BUG FIX: the original called OPAL_THREAD_LOCK here, acquiring the
         * mutex a second time and returning while holding it */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    if (ompi_comm_rank (module->comm) != target) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: unlock %d, lock_acks_received = %d", target,
                             lock->lock_acks_received));

        /* wait until ack has arrived from target */
        while (0 == lock->lock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        ret = ompi_osc_rdma_unlock_remote (module, target, lock);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_rdma_frag_flush_target(module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }

        /* wait for all the requests and the unlock ack (meaning remote completion) */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
               0 == lock->unlock_acks_received) {
            opal_condition_wait(&module->cond, &module->lock);
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock: unlock of %d complete", target));
    } else {
        ompi_osc_rdma_unlock_self (module, lock);
    }

    /* the epoch with this peer is over */
    module->passive_eager_send_active[target] = false;
    module->epoch_outgoing_frag_count[target] = 0;
    module->passive_target_access_epoch = false;
    peer->access_epoch = false;

    /* delete the lock */
    opal_list_remove_item (&module->outstanding_locks, &lock->super);
    OBJ_RELEASE(lock);

 cleanup:
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
int ompi_osc_rdma_lock_all(int assert, struct ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
int ret, my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_outstanding_lock_t *lock;
/* Check if no_locks is set. TODO: we also need to track whether we are in an active
* target epoch. Fence can make this tricky to track. */
if (NULL == module->passive_eager_send_active) {
return OMPI_ERR_RMA_SYNC;
}
/* delay all eager sends until we've heard back.. */
OPAL_THREAD_LOCK(&module->lock);
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
module->passive_eager_send_active[i] = false;
}
module->passive_target_access_epoch = true;
module->all_access_epoch = true;
/* create lock item */
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
lock->target = -1;
lock->lock_acks_received = 0;
lock->unlock_acks_received = 0;
lock->serial_number = module->lock_serial_number++;
lock->type = MPI_LOCK_SHARED;
opal_list_append(&module->outstanding_locks, &lock->super);
/* if nocheck is not specified, send a lock request to everyone
and wait for the local response */
if (0 != (assert & MPI_MODE_NOCHECK)) {
ret = ompi_osc_rdma_lock_self (module, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto exit_error;
}
for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
if (my_rank == i) {
continue;
}
ret = ompi_osc_rdma_lock_remote (module, i, lock);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
opal_list_remove_item(&module->outstanding_locks, &lock->super);
}
}
} else {
lock->lock_acks_received = ompi_comm_size(module->comm);
}
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
exit_error:
OPAL_THREAD_UNLOCK(&module->lock);
opal_list_remove_item(&module->outstanding_locks, &lock->super);
OBJ_RELEASE(lock);
/* return */
return ret;
}
/* MPI_Win_unlock_all implementation: end the lock-all passive-target
 * access epoch.  Waits for all lock acks, sends unlock requests to every
 * peer, flushes all outgoing fragments, and waits for remote completion. */
int ompi_osc_rdma_unlock_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    int my_rank = ompi_comm_rank (module->comm);
    ompi_osc_rdma_outstanding_lock_t *lock;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_unlock_all entering..."));

    OPAL_THREAD_LOCK(&module->lock);

    /* the lock-all epoch is tracked by the pseudo-target -1 */
    lock = find_outstanding_lock (module, -1);
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_unlock_all: not locked in window %s",
                             win->w_name));
        /* BUG FIX: the original called OPAL_THREAD_LOCK here, acquiring the
         * mutex a second time and returning while holding it */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    /* wait for lock acks */
    while (ompi_comm_size(module->comm) != lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* send unlock messages to all of my peers */
    for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
        if (my_rank == i) {
            continue;
        }

        ret = ompi_osc_rdma_unlock_remote (module, i, lock);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            goto cleanup;
        }
    }

    /* unlock myself */
    ompi_osc_rdma_unlock_self (module, lock);

    /* start all sendreqs to target */
    ret = ompi_osc_rdma_frag_flush_all(module);
    if (OMPI_SUCCESS != ret) goto cleanup;

    /* wait for all the requests and the unlock ack (meaning remote completion) */
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
           ompi_comm_size(module->comm) != lock->unlock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* reset all fragment counters */
    memset (module->epoch_outgoing_frag_count, 0, ompi_comm_size(module->comm) * sizeof (module->epoch_outgoing_frag_count[0]));
    memset (module->passive_eager_send_active, 0, ompi_comm_size(module->comm) * sizeof (module->passive_eager_send_active[0]));

    opal_list_remove_item (&module->outstanding_locks, &lock->super);
    OBJ_RELEASE(lock);

    module->passive_target_access_epoch = false;
    module->all_access_epoch = false;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_unlock_all complete"));

 cleanup:
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/* MPI_Win_sync implementation: nothing to reconcile in this component,
 * so simply poke the progress engine. */
int ompi_osc_rdma_sync (struct ompi_win_t *win)
{
    /* drive any outstanding communication forward */
    opal_progress();

    return OMPI_SUCCESS;
}
/* Flush all outstanding RMA operations covered by one outstanding lock.
 * target == -1 flushes every remote peer (lock-all epoch); otherwise only
 * the given rank is flushed.  Waits for the lock ack(s) first (eager send
 * must be enabled before data can be pushed), sends a flush request with
 * the per-peer fragment count, flushes the fragments, then blocks until
 * all sends signal completion and every expected flush ack arrives.
 * Caller must hold the module lock (condition waits release it). */
static int ompi_osc_rdma_flush_lock (ompi_osc_rdma_module_t *module, ompi_osc_rdma_outstanding_lock_t *lock,
                                     int target)
{
    ompi_osc_rdma_header_flush_t flush_req;
    int peer_count, ret, flush_count;
    int my_rank = ompi_comm_rank (module->comm);

    /* a lock-all lock (target -1) expects acks from every rank */
    if (-1 == lock->target) {
        peer_count = ompi_comm_size(module->comm);
    } else {
        peer_count = 1;
    }

    /* wait until ack has arrived from target, since we need to be
       able to eager send before we can transfer all the data... */
    while (peer_count > lock->lock_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* restart the ack count for this flush round */
    lock->flush_acks_received = 0;

    flush_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_REQ;
    flush_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID | OMPI_OSC_RDMA_HDR_FLAG_PASSIVE_TARGET;
    flush_req.serial_number = lock->serial_number;

    if (-1 == target) {
        /* NTH: no local flush */
        flush_count = ompi_comm_size(module->comm) - 1;
        for (int i = 0 ; i < ompi_comm_size(module->comm) ; ++i) {
            if (i == my_rank) {
                continue;
            }

            /* tell each peer how many fragments to expect this epoch */
            flush_req.frag_count = module->epoch_outgoing_frag_count[i];

            /* send control message with flush request and count */
            ret = ompi_osc_rdma_control_send (module, i, &flush_req, sizeof (flush_req));
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }

            /* start all sendreqs to target */
            ret = ompi_osc_rdma_frag_flush_target (module, i);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        }
    } else {
        flush_req.frag_count = module->epoch_outgoing_frag_count[target];
        flush_count = 1;
        /* send control message with flush request and count */
        ret = ompi_osc_rdma_control_send (module, target, &flush_req, sizeof (flush_req));
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }

        /* start all sendreqs to target */
        ret = ompi_osc_rdma_frag_flush_target (module, target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            return ret;
        }
    }

    /* wait for all the requests and the flush ack (meaning remote completion) */
    while (module->outgoing_frag_count != module->outgoing_frag_signal_count ||
           flush_count != lock->flush_acks_received) {
        opal_condition_wait(&module->cond, &module->lock);
    }

    /* everything sent this epoch has now been accounted for */
    if (-1 == target) {
        memset (module->epoch_outgoing_frag_count, 0, peer_count * sizeof (module->epoch_outgoing_frag_count[0]));
    } else {
        module->epoch_outgoing_frag_count[target] = 0;
    }

    return OMPI_SUCCESS;
}
/* MPI_Win_flush implementation: complete all outstanding RMA operations
 * to one target at both origin and target.  Only valid inside a passive
 * target epoch; flushing self is a no-op beyond driving progress. */
int ompi_osc_rdma_flush (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    int ret;

    assert (0 <= target);

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush starting..."));

    if (ompi_comm_rank (module->comm) == target) {
        /* nothing to flush */
        opal_progress ();
        return OMPI_SUCCESS;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* the target may be covered by a per-target lock or the lock-all lock */
    lock = find_outstanding_lock (module, target);
    if (NULL == lock) {
        lock = find_outstanding_lock (module, -1);
    }
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_flush: target %d is not locked in window %s",
                             target, win->w_name));
        /* BUG FIX: the original called OPAL_THREAD_LOCK here, acquiring the
         * mutex a second time and returning while holding it */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    ret = ompi_osc_rdma_flush_lock (module, lock, target);

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/* MPI_Win_flush_all implementation: complete all outstanding RMA
 * operations to every locked target. */
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_outstanding_lock_t *lock;
    int ret = OMPI_SUCCESS;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* BUG FIX: the original inspected outstanding_locks before acquiring
     * the module lock, racing with concurrent list updates */
    if (OPAL_UNLIKELY(0 == opal_list_get_size (&module->outstanding_locks))) {
        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "ompi_osc_rdma_flush_all: no targets are locked in window %s",
                             win->w_name));
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush_all entering..."));

    /* flush all locks */
    OPAL_LIST_FOREACH(lock, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        ret = ompi_osc_rdma_flush_lock (module, lock, lock->target);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_flush_all complete"));

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/* MPI_Win_flush_local implementation: wait until all outgoing fragments
 * for this target have locally completed.  Note: the component tracks
 * local completion globally, so this flushes all outgoing sends. */
int ompi_osc_rdma_flush_local (int target, struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    int ret;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    ret = ompi_osc_rdma_frag_flush_target(module, target);
    if (OMPI_SUCCESS == ret) {
        /* wait for all the requests */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/* MPI_Win_flush_local_all implementation: wait until every outgoing
 * fragment in the window has locally completed. */
int ompi_osc_rdma_flush_local_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    int ret = OMPI_SUCCESS;

    /* flush is only allowed from within a passive target epoch */
    if (!module->passive_target_access_epoch) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_THREAD_LOCK(&module->lock);

    ret = ompi_osc_rdma_frag_flush_all(module);
    if (OMPI_SUCCESS == ret) {
        /* wait for all the requests */
        while (module->outgoing_frag_count != module->outgoing_frag_signal_count) {
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/* target side operation to acknowledge to initiator side that the
   lock is now held by the initiator.  For a remote requestor an unbuffered
   lock-ack control message is sent; for a local request the matching
   outstanding lock is found and its ack count bumped, waking any waiter. */
static inline int activate_lock (ompi_osc_rdma_module_t *module, int requestor,
                                 uint64_t serial_number)
{
    ompi_osc_rdma_outstanding_lock_t *lock;

    if (ompi_comm_rank (module->comm) != requestor) {
        ompi_osc_rdma_header_lock_ack_t lock_ack;

        lock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_LOCK_ACK;
        lock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
        lock_ack.source = ompi_comm_rank(module->comm);
        lock_ack.windx = ompi_comm_get_cid(module->comm);
        lock_ack.serial_number = serial_number;

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: sending lock to %d", requestor));

        /* we don't want to send any data, since we're the exposure
           epoch only, so use an unbuffered send */
        return ompi_osc_rdma_control_send_unbuffered (module, requestor, &lock_ack, sizeof (lock_ack));
    }

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: releasing local lock"));

    /* the local request may be a per-target lock or the lock-all (-1) lock */
    lock = find_outstanding_lock (module, requestor);
    if (NULL == lock) {
        lock = find_outstanding_lock (module, -1);
    }
    if (OPAL_UNLIKELY(NULL == lock)) {
        OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output,
                             "lock could not be located"));
        /* BUG FIX: the original fell through after this message and
         * dereferenced the NULL lock pointer */
        return OMPI_ERROR;
    }

    lock->lock_acks_received++;
    opal_condition_broadcast (&module->cond);

    return OMPI_SUCCESS;
}
/* target side operation to create a pending lock request for a lock
   request that could not be satisfied */
static inline int queue_lock (ompi_osc_rdma_module_t *module, int requestor,
                              int lock_type, uint64_t serial_number)
{
    ompi_osc_rdma_pending_lock_t *pending;

    pending = OBJ_NEW(ompi_osc_rdma_pending_lock_t);
    if (NULL == pending) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* remember who asked, what kind of lock, and the id to echo back */
    pending->peer = requestor;
    pending->lock_type = lock_type;
    pending->serial_number = serial_number;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "osc rdma: queueing lock request from %d", requestor));

    opal_list_append(&module->locks_pending, &pending->super);

    return OMPI_SUCCESS;
}
/* Grant queued lock requests after the current lock was released.  All
 * leading shared requests are granted together; the first exclusive
 * request is granted only when the window is unlocked, and always stops
 * the scan (remaining requests must wait).  Caller must hold the module
 * lock (the locks_pending list is traversed and mutated). */
static int ompi_osc_activate_next_lock (ompi_osc_rdma_module_t *module) {
    /* release any other pending locks we can */
    ompi_osc_rdma_pending_lock_t *pending_lock, *next;
    int ret = OMPI_SUCCESS;

    OPAL_LIST_FOREACH_SAFE(pending_lock, next, &module->locks_pending,
                           ompi_osc_rdma_pending_lock_t) {
        if (MPI_LOCK_SHARED == pending_lock->lock_type) {
            OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                                 "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_SHARED to peer %d\n",
                                 pending_lock->peer));
            /* acquire shared lock */
            module->lock_status = MPI_LOCK_SHARED;
            module->shared_count++;
            ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number);

            opal_list_remove_item (&module->locks_pending, &pending_lock->super);
            OBJ_RELEASE(pending_lock);
        } else {
            if (0 == module->lock_status) {
                OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                                     "ompi_osc_activate_next_lock: release pending lock of type MPI_LOCK_EXCLUSIVE to peer %d\n",
                                     pending_lock->peer));
                /* acquire exclusive lock */
                module->lock_status = MPI_LOCK_EXCLUSIVE;
                ret = activate_lock(module, pending_lock->peer, pending_lock->serial_number);

                opal_list_remove_item (&module->locks_pending, &pending_lock->super);
                OBJ_RELEASE(pending_lock);
            }

            /* if the lock was acquired (ie, status was 0), then
               we're done.  If the lock was not acquired, we're
               also done, because all the shared locks have to
               finish first */
            break;
        }

        if (OMPI_SUCCESS != ret) {
            break;
        }
    }

    return ret;
}
/* target side function called when the initiator sends a lock
   request.  Lock will either be activated and acknowledged or
   queued. */
int ompi_osc_rdma_process_lock (ompi_osc_rdma_module_t* module, int source,
                                ompi_osc_rdma_header_lock_t* lock_header)
{
    int ret;

    OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_lock: processing lock request from %d. current lock state = %d, shared_count = %d",
                         source, module->lock_status, module->shared_count));

    if (MPI_LOCK_SHARED == lock_header->lock_type &&
        MPI_LOCK_EXCLUSIVE != module->lock_status) {
        /* a shared request is compatible with any state except exclusive */
        module->lock_status = MPI_LOCK_SHARED;
        module->shared_count++;
        ret = activate_lock(module, source, lock_header->serial_number);
    } else if (MPI_LOCK_SHARED != lock_header->lock_type &&
               0 == module->lock_status) {
        /* an exclusive request is only compatible with an unlocked window */
        module->lock_status = MPI_LOCK_EXCLUSIVE;
        ret = activate_lock(module, source, lock_header->serial_number);
    } else {
        /* lock not available, queue */
        ret = queue_lock(module, source, lock_header->lock_type, lock_header->serial_number);
    }

    return ret;
}
/* Initiator-side handler invoked when the target acknowledges a lock
 * request.  Matches the ack against the outstanding-lock list by serial
 * number; an unmatched ack is reported as an error. */
void ompi_osc_rdma_process_lock_ack (ompi_osc_rdma_module_t *module,
                                     ompi_osc_rdma_header_lock_ack_t *lock_ack_header)
{
    ompi_osc_rdma_outstanding_lock_t *outstanding, *tmp;

    /* find the outstanding request carrying this ack's serial number */
    OPAL_LIST_FOREACH_SAFE(outstanding, tmp, &module->outstanding_locks, ompi_osc_rdma_outstanding_lock_t) {
        if (outstanding->serial_number != lock_ack_header->serial_number) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
                             "osc rdma: lock ack %d", lock_ack_header->source));

        /* record the ack and re-enable eager sends toward this peer */
        outstanding->lock_acks_received++;
        module->passive_eager_send_active[lock_ack_header->source] = true;
        return;
    }

    /* no outstanding lock matched this acknowledgement */
    opal_output(ompi_osc_base_framework.framework_output,
                "osc rdma: lock ack %d, %ld for unfindable lock request",
                lock_ack_header->source, (unsigned long) lock_ack_header->serial_number);
}
/*
 * Initiator-side handler invoked when the target acknowledges a flush
 * request.  Looks up the outstanding lock by the ack's serial number,
 * counts the ack, and wakes any thread waiting on module->cond.
 *
 * @param[in] module            OSC RDMA module
 * @param[in] source            rank the ack came from (used for logging only)
 * @param[in] flush_ack_header  incoming flush-ack header
 *
 * Bug fix: the verbose message previously named the wrong function
 * ("ompi_osc_rdma_process_unlock_ack"), a copy-paste error that made
 * debug traces misleading.
 */
void ompi_osc_rdma_process_flush_ack (ompi_osc_rdma_module_t *module, int source,
                                      ompi_osc_rdma_header_flush_ack_t *flush_ack_header) {
    ompi_osc_rdma_outstanding_lock_t *lock;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_flush_ack: processing flush ack from %d for lock %" PRIu64,
                         source, flush_ack_header->serial_number));

    /* NTH: need to verify that this will work as expected */
    /* NOTE(review): with NDEBUG the assert vanishes and a stray/duplicate
     * ack would dereference NULL — relies on the protocol never producing
     * an unmatched flush ack; confirm. */
    lock = find_outstanding_lock_by_serial (module, flush_ack_header->serial_number);
    assert (NULL != lock);

    lock->flush_acks_received++;

    /* wake any thread blocked waiting for flush completion */
    opal_condition_broadcast(&module->cond);
}
/* Initiator-side handler invoked when the target acknowledges an unlock
 * request.  Locates the matching outstanding lock and counts the ack. */
void ompi_osc_rdma_process_unlock_ack (ompi_osc_rdma_module_t *module, int source,
                                       ompi_osc_rdma_header_unlock_ack_t *unlock_ack_header) {
    ompi_osc_rdma_outstanding_lock_t *outstanding;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_unlock_ack: processing unlock ack from %d",
                         source));

    /* NTH: need to verify that this will work as expected */
    /* prefer the per-peer entry; fall back to the entry keyed by -1
     * (presumably the lock-all target — TODO confirm) */
    outstanding = find_outstanding_lock (module, source);
    if (NULL == outstanding) {
        outstanding = find_outstanding_lock (module, -1);
        assert (NULL != outstanding);
    }

    outstanding->unlock_acks_received++;
}
/**
 * Process an unlock request.
 *
 * @param[in] module - OSC RDMA module
 * @param[in] source - Source rank
 * @param[in] unlock_header - Incoming unlock header
 *
 * This function is the target-side function for handling an unlock
 * request. Once all pending operations from the source are complete
 * this function sends an unlock acknowledgement then attempts to
 * activate a pending lock if the lock becomes free.
 */
/*
 * Target-side handler for an incoming unlock request.
 *
 * Returns OMPI_ERR_WOULD_BLOCK (so the caller can retry) until every
 * fragment signalled by the source has been processed.  Once complete it
 * sends an unlock ack, resets the per-peer fragment counters, releases the
 * window lock (immediately for MPI_LOCK_EXCLUSIVE, or when the last shared
 * holder leaves), and hands the lock to the next queued requester.
 *
 * @param[in] module         OSC RDMA module
 * @param[in] source         rank of the unlocking origin
 * @param[in] unlock_header  incoming unlock header
 *
 * @return OMPI_SUCCESS, OMPI_ERR_WOULD_BLOCK, or an error from the
 *         unbuffered control send
 *
 * Bug fix: the return code of ompi_osc_activate_next_lock() was silently
 * discarded.  The unlock ack has already gone out at that point so the
 * error cannot be propagated to the origin, but it is now reported instead
 * of being dropped on the floor.
 */
int ompi_osc_rdma_process_unlock (ompi_osc_rdma_module_t *module, int source,
                                  ompi_osc_rdma_header_unlock_t *unlock_header)
{
    ompi_osc_rdma_header_unlock_ack_t unlock_ack;
    int lock_ret;
    int ret;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_unlock entering (finished %d/%d)...",
                         module->passive_incoming_frag_count[source],
                         module->passive_incoming_frag_signal_count[source]));

    /* we cannot block when processing an incoming request */
    if (module->passive_incoming_frag_signal_count[source] !=
        module->passive_incoming_frag_count[source]) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    unlock_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_UNLOCK_ACK;
    unlock_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;

    ret = ompi_osc_rdma_control_send_unbuffered (module, source, &unlock_ack, sizeof (unlock_ack));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        return ret;
    }

    /* reset the per-peer fragment counters for the next passive epoch */
    module->passive_incoming_frag_signal_count[source] = 0;
    module->passive_incoming_frag_count[source] = 0;

    OPAL_THREAD_LOCK(&module->lock);
    /* exclusive unlock frees the lock outright; a shared unlock frees it
     * only when the last shared holder leaves */
    if (unlock_header->lock_type == MPI_LOCK_EXCLUSIVE || 0 == --module->shared_count) {
        module->lock_status = 0;
        lock_ret = ompi_osc_activate_next_lock (module);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != lock_ret)) {
            /* ack already sent — report the failure rather than lose it */
            opal_output(ompi_osc_base_framework.framework_output,
                        "osc rdma: error activating next pending lock");
        }
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "osc rdma: finished processing unlock fragment"));

    return ret;
}
/* Target-side handler for an incoming flush request.  Returns
 * OMPI_ERR_WOULD_BLOCK until every fragment signalled by the source has
 * been processed, then resets the per-peer counters and replies with a
 * flush ack echoing the request's serial number. */
int ompi_osc_rdma_process_flush (ompi_osc_rdma_module_t *module, int source,
                                 ompi_osc_rdma_header_flush_t *flush_header)
{
    ompi_osc_rdma_header_flush_ack_t flush_ack;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "ompi_osc_rdma_process_flush entering (finished %d/%d)...",
                         module->passive_incoming_frag_count[source],
                         module->passive_incoming_frag_signal_count[source]));

    /* we cannot block when processing an incoming request */
    if (module->passive_incoming_frag_count[source] !=
        module->passive_incoming_frag_signal_count[source]) {
        return OMPI_ERR_WOULD_BLOCK;
    }

    /* every signalled fragment has arrived; reset for the next epoch */
    module->passive_incoming_frag_count[source] = 0;
    module->passive_incoming_frag_signal_count[source] = 0;

    /* build the acknowledgement, echoing the request's serial number */
    flush_ack.base.type = OMPI_OSC_RDMA_HDR_TYPE_FLUSH_ACK;
    flush_ack.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
    flush_ack.serial_number = flush_header->serial_number;

    return ompi_osc_rdma_control_send_unbuffered (module, source, &flush_ack, sizeof (flush_ack));
}