fceabb2498
This is a fairly intrusive change, but outside of moving opal/event to opal/mca/event, the only changes involved were (a) changing all calls to opal_event functions to go through the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed, since they are now true opal_objects.

Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (I will be at a class for 5 days, and thus only sparingly available), so we can find and fix any problems.

The biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future: new versions can be inserted as a new component and tested in parallel with the current version until validated, and we can then remove the earlier version if we so choose. This is a statically built framework, like installdirs, so only one component will build at a time. There is no selection logic; the sole compiled component simply loads its function pointers into the opal_event struct.

I have gone through the code base and converted all the libevent calls I could find. However, I cannot compile nor test every environment, so it is quite likely that errors remain in the system. Please keep an eye open for two things:

1. Compile-time errors: these will be obvious, as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new).

2. Run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears to have become a typical practice to "init" an opal_event_t by simply using memset to zero it out. This will no longer work: you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit one.

There is also the issue of the new libevent "no recursion" behavior. As I described in a recent email, we will have to discuss this and figure out what, if anything, we need to do.

This commit was SVN r23925.
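For illustration, here is a minimal before/after sketch of the two conversion patterns above. The include path, variable names, and callback arguments are assumptions for this sketch; only opal_evtimer_new, opal_event.evtimer_new, the memset idiom, OBJ_NEW, and OBJ_CONSTRUCT come from the notes above.

#include "opal/mca/event/event.h"   /* assumed header for the new framework */

/* (1) Compile-time conversion: calls to the old wrapper functions
 *     become calls through the opal_event function-pointer struct
 *     loaded by the sole compiled component.
 *
 *     before:  ev = opal_evtimer_new(timeout_cb, ctx);
 *     after:   ev = opal_event.evtimer_new(timeout_cb, ctx);
 */

/* (2) Run-time conversion: opal_event_t is now a true opal_object,
 *     so zeroing it with memset no longer initializes it correctly. */
static void setup_event_example(void)
{
    opal_event_t ev;
    /* memset(&ev, 0, sizeof(ev));     -- no longer works; segfaults later */
    OBJ_CONSTRUCT(&ev, opal_event_t);  /* stack-allocated event */

    opal_event_t *evp = OBJ_NEW(opal_event_t);  /* heap-allocated event */
    (void)evp;
}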
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef OMPI_OSC_RDMA_H
#define OMPI_OSC_RDMA_H

#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_free_list.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/threads.h"

#include "ompi/win/win.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/osc/osc.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/bml/bml.h"

BEGIN_C_DECLS
struct ompi_osc_rdma_buffer_t {
    mca_btl_base_descriptor_t* descriptor;
    size_t remain_len;
    mca_bml_base_btl_t *bml_btl;
};
typedef struct ompi_osc_rdma_buffer_t ompi_osc_rdma_buffer_t;
struct ompi_osc_rdma_component_t {
    /** Extend the basic osc component interface */
    ompi_osc_base_component_t super;

    /** store the state of progress threads for this instance of OMPI */
    bool c_have_progress_threads;

    /** lock access to data structures in the component structure */
    opal_mutex_t c_lock;

    /** List of ompi_osc_rdma_module_ts currently in existence.
        Needed so that received fragments can be dispatched to the
        correct module */
    opal_hash_table_t c_modules;

    /** Lock for request management */
    opal_mutex_t c_request_lock;

    /** Condition variable for request management */
    opal_condition_t c_request_cond;

    /** free list of ompi_osc_rdma_sendreq_t structures */
    opal_free_list_t c_sendreqs;
    /** free list of ompi_osc_rdma_replyreq_t structures */
    opal_free_list_t c_replyreqs;
    /** free list of ompi_osc_rdma_longreq_t structures */
    opal_free_list_t c_longreqs;

    /** list of outstanding requests, of type ompi_osc_pt2pt_longreq_t */
    opal_list_t c_pending_requests;

#if OMPI_ENABLE_PROGRESS_THREADS
    opal_thread_t c_thread;
    bool c_thread_run;
#endif

    bool c_btl_registered;

    uint32_t c_sequence_number;
};
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
struct ompi_osc_rdma_btl_t {
    uint64_t peer_seg_key;
    mca_bml_base_btl_t *bml_btl;
    int rdma_order;
    int32_t num_sent;
};
typedef struct ompi_osc_rdma_btl_t ompi_osc_rdma_btl_t;


struct ompi_osc_rdma_peer_info_t {
    uint64_t peer_base;
    uint64_t peer_len;

    int peer_num_btls;
    volatile int peer_index_btls;
    ompi_osc_rdma_btl_t *peer_btls;

    int local_num_btls;
    mca_bml_base_btl_t **local_btls;
    mca_mpool_base_registration_t **local_registrations;
    mca_btl_base_descriptor_t **local_descriptors;
};
typedef struct ompi_osc_rdma_peer_info_t ompi_osc_rdma_peer_info_t;


struct ompi_osc_rdma_setup_info_t {
    volatile int32_t num_btls_callin;
    int32_t num_btls_expected;
    volatile int32_t num_btls_outgoing;
    opal_list_t *outstanding_btl_requests;
};
typedef struct ompi_osc_rdma_setup_info_t ompi_osc_rdma_setup_info_t;
struct ompi_osc_rdma_module_t {
    /** Extend the basic osc module interface */
    ompi_osc_base_module_t super;

    uint32_t m_sequence_number;

    /** lock access to data structures in the current module */
    opal_mutex_t m_lock;

    /** condition variable for access to current module */
    opal_condition_t m_cond;

    /** lock for "atomic" window updates from reductions */
    opal_mutex_t m_acc_lock;

    /** pointer back to window */
    ompi_win_t *m_win;

    /** communicator created with this window */
    ompi_communicator_t *m_comm;

    /** list of ompi_osc_rdma_sendreq_t structures; includes all
        requests for this access epoch that have not already been
        started.  m_lock must be held when modifying this field. */
    opal_list_t m_pending_sendreqs;

    /** list of unsigned int counters for the number of requests to a
        particular rank in m_comm for this access epoch.  m_lock
        must be held when modifying this field */
    unsigned int *m_num_pending_sendreqs;

    /** For MPI_Fence synchronization, the number of messages to send
        in the epoch.  For Start/Complete, the number of updates for
        this Complete.  For lock, the number of messages waiting for
        completion on the origin side.  Not protected by m_lock -
        must use atomic counter operations. */
    int32_t m_num_pending_out;

    /** For MPI_Fence synchronization, the number of expected incoming
        messages.  For Post/Wait, the number of expected updates from
        Complete.  For lock, the number of messages on the passive side
        we are waiting for.  Not protected by m_lock - must use
        atomic counter operations. */
    int32_t m_num_pending_in;

    /** Number of "ping" messages from the remote post group we've
        received */
    int32_t m_num_post_msgs;

    /** Number of "count" messages from the remote complete group
        we've received */
    int32_t m_num_complete_msgs;

    /** cyclic counter for a unique tag for long messages.  Not
        protected by the m_lock - must use create_send_tag() to
        create a send tag */
    volatile int32_t m_tag_counter;

    opal_list_t m_copy_pending_sendreqs;
    unsigned int *m_copy_num_pending_sendreqs;

    opal_list_t m_queued_sendreqs;

    /** start sending data eagerly */
    bool m_eager_send_active;
    bool m_eager_send_ok;

    /* RDMA data */
    bool m_use_rdma;
    bool m_rdma_wait_completion;
    ompi_osc_rdma_setup_info_t *m_setup_info;
    ompi_osc_rdma_peer_info_t *m_peer_info;
    int32_t m_rdma_num_pending;

    /*** buffering ***/
    bool m_use_buffers;
    ompi_osc_rdma_buffer_t *m_pending_buffers;

    /* ********************* FENCE data ************************ */
    /* an array of <sizeof(m_comm)> ints, each containing the value
       1. */
    int *m_fence_coll_counts;

    /* ********************* PWSC data ************************ */
    struct ompi_group_t *m_pw_group;
    struct ompi_group_t *m_sc_group;
    bool *m_sc_remote_active_ranks;
    int *m_sc_remote_ranks;

    /* ********************* LOCK data ************************ */
    int32_t m_lock_status;  /* one of 0, MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED */
    int32_t m_shared_count;
    opal_list_t m_locks_pending;
    opal_list_t m_unlocks_pending;
    int32_t m_lock_received_ack;
};
typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t;
OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component;


#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module)

/*
 * Component functions
 */

int ompi_osc_rdma_component_init(bool enable_progress_threads,
                                 bool enable_mpi_threads);

int ompi_osc_rdma_component_finalize(void);

int ompi_osc_rdma_component_query(struct ompi_win_t *win,
                                  struct ompi_info_t *info,
                                  struct ompi_communicator_t *comm);

int ompi_osc_rdma_component_select(struct ompi_win_t *win,
                                   struct ompi_info_t *info,
                                   struct ompi_communicator_t *comm);

int ompi_osc_rdma_component_progress(void);

int ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info);

/*
 * Module interface function types
 */
int ompi_osc_rdma_module_free(struct ompi_win_t *win);

int ompi_osc_rdma_module_put(void *origin_addr,
                             int origin_count,
                             struct ompi_datatype_t *origin_dt,
                             int target,
                             OPAL_PTRDIFF_TYPE target_disp,
                             int target_count,
                             struct ompi_datatype_t *target_dt,
                             struct ompi_win_t *win);

int ompi_osc_rdma_module_accumulate(void *origin_addr,
                                    int origin_count,
                                    struct ompi_datatype_t *origin_dt,
                                    int target,
                                    OPAL_PTRDIFF_TYPE target_disp,
                                    int target_count,
                                    struct ompi_datatype_t *target_dt,
                                    struct ompi_op_t *op,
                                    struct ompi_win_t *win);

int ompi_osc_rdma_module_get(void *origin_addr,
                             int origin_count,
                             struct ompi_datatype_t *origin_dt,
                             int target,
                             OPAL_PTRDIFF_TYPE target_disp,
                             int target_count,
                             struct ompi_datatype_t *target_dt,
                             struct ompi_win_t *win);

int ompi_osc_rdma_module_fence(int assert, struct ompi_win_t *win);

int ompi_osc_rdma_module_start(struct ompi_group_t *group,
                               int assert,
                               struct ompi_win_t *win);
int ompi_osc_rdma_module_complete(struct ompi_win_t *win);

int ompi_osc_rdma_module_post(struct ompi_group_t *group,
                              int assert,
                              struct ompi_win_t *win);

int ompi_osc_rdma_module_wait(struct ompi_win_t *win);

int ompi_osc_rdma_module_test(struct ompi_win_t *win,
                              int *flag);

int ompi_osc_rdma_module_lock(int lock_type,
                              int target,
                              int assert,
                              struct ompi_win_t *win);

int ompi_osc_rdma_module_unlock(int target,
                                struct ompi_win_t *win);

/*
 * passive side sync interface functions
 */
int ompi_osc_rdma_passive_lock(ompi_osc_rdma_module_t *module,
                               int32_t origin,
                               int32_t lock_type);

int ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module,
                                 int32_t origin,
                                 int32_t count);

int ompi_osc_rdma_passive_unlock_complete(ompi_osc_rdma_module_t *module);


END_C_DECLS

#endif /* OMPI_OSC_RDMA_H */