openmpi/ompi/mca/osc/rdma/osc_rdma.h
Brian Barrett 2ec0c4f593 * Fix race condition in post/wait/start/complete synchronization where one
epoch's control data could overwrite the previous epoch's data because
    we were reusing data structures between PW and SC.  Instead, we now
    have explicit post_msg and complete_msg counters for completion.

    refs trac:354

  * Only register the rdma osc callback once, as it turns out that some
    btls (MX) do something more than update a table during the register
    call, and each register call sucks up valuable fragments...

This commit was SVN r11745.

The following Trac tickets were found above:
  Ticket 354 --> https://svn.open-mpi.org/trac/ompi/ticket/354
2006-09-21 19:57:57 +00:00
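
To make the first fix concrete, here is a minimal sketch (illustrative only, not code from this commit; the function and its setup are assumptions, while p2p_num_post_msgs, p2p_num_complete_msgs, and opal_progress() are real names in this tree):

    /* MPI_Win_start() is assumed to set p2p_num_post_msgs to the size
     * of the post group; each arriving "post" control message
     * atomically decrements it.  Because "complete" messages now drain
     * a separate counter (p2p_num_complete_msgs), a control message
     * that arrives early for the next epoch can no longer be absorbed
     * by this epoch's wait loop, which was possible when PW and SC
     * shared one data structure. */
    static void example_wait_for_posts(ompi_osc_rdma_module_t *module)
    {
        while (0 != module->p2p_num_post_msgs) {
            opal_progress();   /* drive the BTLs so callbacks can fire */
        }
    }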


/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#ifndef OMPI_OSC_RDMA_H
#define OMPI_OSC_RDMA_H

#include <assert.h>

#include "opal/class/opal_list.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"

#include "ompi/mca/osc/osc.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/win/win.h"
#include "ompi/communicator/communicator.h"

struct ompi_osc_rdma_component_t {
    /** Extend the basic osc component interface */
    ompi_osc_base_component_t super;

    /** store the state of progress threads for this instance of OMPI */
    bool p2p_c_have_progress_threads;

    /** lock access to data structures in the component structure */
    opal_mutex_t p2p_c_lock;

    /** List of ompi_osc_rdma_module_ts currently in existence.
        Needed so that received fragments can be dispatched to the
        correct module */
    opal_hash_table_t p2p_c_modules;

    /** free list of ompi_osc_rdma_sendreq_t structures */
    opal_free_list_t p2p_c_sendreqs;

    /** free list of ompi_osc_rdma_replyreq_t structures */
    opal_free_list_t p2p_c_replyreqs;

    /** free list of ompi_osc_rdma_longreq_t structures */
    opal_free_list_t p2p_c_longreqs;
};
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
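
/*
 * Illustrative sketch (not in the original header; kept out of the
 * build with #if 0): how a received fragment might be dispatched via
 * p2p_c_modules.  Keying the table by the window communicator's
 * context id is an assumption here; opal_hash_table_get_value_uint32()
 * and OPAL_THREAD_LOCK() are real interfaces in this tree.
 */
#if 0
static ompi_osc_rdma_module_t* example_module_lookup(uint32_t cid)
{
    void *value = NULL;
    OPAL_THREAD_LOCK(&mca_osc_rdma_component.p2p_c_lock);
    opal_hash_table_get_value_uint32(&mca_osc_rdma_component.p2p_c_modules,
                                     cid, &value);
    OPAL_THREAD_UNLOCK(&mca_osc_rdma_component.p2p_c_lock);
    return (ompi_osc_rdma_module_t*) value;
}
#endif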

struct ompi_osc_rdma_module_t {
    /** Extend the basic osc module interface */
    ompi_osc_base_module_t super;

    /** lock access to data structures in the current module */
    opal_mutex_t p2p_lock;

    /** lock for "atomic" window updates from reductions */
    opal_mutex_t p2p_acc_lock;

    /** pointer back to window */
    ompi_win_t *p2p_win;

    /** communicator created with this window */
    ompi_communicator_t *p2p_comm;

    /** list of ompi_osc_rdma_sendreq_t structures, which includes all
        requests for this access epoch that have not already been
        started.  p2p_lock must be held when modifying this field. */
    opal_list_t p2p_pending_sendreqs;

    /** list of int16_t counters for the number of requests to a
        particular rank in p2p_comm for this access epoch.  p2p_lock
        must be held when modifying this field. */
    short *p2p_num_pending_sendreqs;

    /** For MPI_Fence synchronization, the number of messages to send
        in the epoch.  For Start/Complete, the number of updates for
        this Complete.  For lock, the number of messages waiting for
        completion on the origin side.  Not protected by p2p_lock -
        must use atomic counter operations. */
    volatile int32_t p2p_num_pending_out;

    /** For MPI_Fence synchronization, the number of expected incoming
        messages.  For Post/Wait, the number of expected updates from
        complete.  For lock, the number of messages on the passive side
        we are waiting for.  Not protected by p2p_lock - must use
        atomic counter operations. */
    volatile int32_t p2p_num_pending_in;

    /** Number of "ping" messages from the remote post group we've
        received */
    volatile int32_t p2p_num_post_msgs;

    /** Number of "count" messages from the remote complete group
        we've received */
    volatile int32_t p2p_num_complete_msgs;

    /** cyclic counter for a unique tag for long messages.  Not
        protected by the p2p_lock - must use create_send_tag() to
        create a send tag. */
    volatile int32_t p2p_tag_counter;

    /** list of outstanding long messages that must be processed
        (ompi_osc_rdma_request_long).  Protected by p2p_lock. */
    opal_list_t p2p_long_msgs;

    /** copy of p2p_pending_sendreqs, presumably taken when an epoch's
        requests are started so that new requests can be queued for
        the following epoch */
    opal_list_t p2p_copy_pending_sendreqs;

    /** copy of p2p_num_pending_sendreqs matching the list above */
    short *p2p_copy_num_pending_sendreqs;

    /** if true, start sendreqs eagerly rather than queueing them
        until synchronization */
    bool p2p_eager_send;

    /* ********************* FENCE data ************************ */
    /* an array of <sizeof(p2p_comm)> ints, each containing the value
       1. */
    int *p2p_fence_coll_counts;

    /* an array of <sizeof(p2p_comm)> shorts, for use in experimenting
       with different synchronization costs */
    short *p2p_fence_coll_results;

    mca_osc_fence_sync_t p2p_fence_sync_type;

    /* ********************* PWSC data ************************ */
    struct ompi_group_t *p2p_pw_group;
    struct ompi_group_t *p2p_sc_group;

    /* ********************* LOCK data ************************ */
    int32_t p2p_lock_status; /* one of 0, MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED */
    int32_t p2p_shared_count;
    opal_list_t p2p_locks_pending;
    int32_t p2p_lock_received_ack;
};
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
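
/*
 * Illustrative sketch (not in the original header; kept out of the
 * build with #if 0): one plausible shape for the Wait side of
 * Post/Wait using the explicit counters above.  The real logic lives
 * in the osc_rdma synchronization code.
 */
#if 0
static inline void example_win_wait(ompi_osc_rdma_module_t *module)
{
    /* first collect a "complete" control message from every member of
       the access group... */
    while (0 != module->p2p_num_complete_msgs) opal_progress();
    /* ...then wait for the updates those messages announced */
    while (0 != module->p2p_num_pending_in) opal_progress();
}
#endif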

/*
 * Helper macro for grabbing the module structure from a window instance
 */
#if OMPI_ENABLE_DEBUG

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;

static inline ompi_osc_rdma_module_t* P2P_MODULE(struct ompi_win_t* win)
{
    ompi_osc_rdma_module_t *module =
        (ompi_osc_rdma_module_t*) win->w_osc_module;
    assert(module->p2p_win == win);
    return module;
}

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

#else

#define P2P_MODULE(win) ((ompi_osc_rdma_module_t*) (win)->w_osc_module)

#endif
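
/*
 * Typical use (illustrative): each module entry point below recovers
 * its state from the window handle, e.g.
 *
 *   int ompi_osc_rdma_module_fence(int assert, struct ompi_win_t *win)
 *   {
 *       ompi_osc_rdma_module_t *module = P2P_MODULE(win);
 *       ...
 *   }
 */
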
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;

/*
 * Component functions
 */
int ompi_osc_rdma_component_init(bool enable_progress_threads,
                                 bool enable_mpi_threads);

int ompi_osc_rdma_component_finalize(void);

int ompi_osc_rdma_component_query(struct ompi_win_t *win,
                                  struct ompi_info_t *info,
                                  struct ompi_communicator_t *comm);

int ompi_osc_rdma_component_select(struct ompi_win_t *win,
                                   struct ompi_info_t *info,
                                   struct ompi_communicator_t *comm);

/*
 * Module interface function types
 */
int ompi_osc_rdma_module_free(struct ompi_win_t *win);

int ompi_osc_rdma_module_put(void *origin_addr,
                             int origin_count,
                             struct ompi_datatype_t *origin_dt,
                             int target,
                             int target_disp,
                             int target_count,
                             struct ompi_datatype_t *target_dt,
                             struct ompi_win_t *win);

int ompi_osc_rdma_module_accumulate(void *origin_addr,
                                    int origin_count,
                                    struct ompi_datatype_t *origin_dt,
                                    int target,
                                    int target_disp,
                                    int target_count,
                                    struct ompi_datatype_t *target_dt,
                                    struct ompi_op_t *op,
                                    struct ompi_win_t *win);

int ompi_osc_rdma_module_get(void *origin_addr,
                             int origin_count,
                             struct ompi_datatype_t *origin_dt,
                             int target,
                             int target_disp,
                             int target_count,
                             struct ompi_datatype_t *target_dt,
                             struct ompi_win_t *win);

int ompi_osc_rdma_module_fence(int assert, struct ompi_win_t *win);

int ompi_osc_rdma_module_start(struct ompi_group_t *group,
                               int assert,
                               struct ompi_win_t *win);

int ompi_osc_rdma_module_complete(struct ompi_win_t *win);

int ompi_osc_rdma_module_post(struct ompi_group_t *group,
                              int assert,
                              struct ompi_win_t *win);

int ompi_osc_rdma_module_wait(struct ompi_win_t *win);

int ompi_osc_rdma_module_test(struct ompi_win_t *win,
                              int *flag);

int ompi_osc_rdma_module_lock(int lock_type,
                              int target,
                              int assert,
                              struct ompi_win_t *win);

int ompi_osc_rdma_module_unlock(int target,
                                struct ompi_win_t *win);

/*
 * passive side sync interface functions
 */
int ompi_osc_rdma_passive_lock(ompi_osc_rdma_module_t *module,
                               int32_t origin,
                               int32_t lock_type);

int ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module,
                                 int32_t origin,
                                 int32_t count);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* OMPI_OSC_RDMA_H */