/* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2006 The Trustees of the University of Tennessee. * All rights reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef OMPI_OSC_RDMA_H #define OMPI_OSC_RDMA_H #include "opal/class/opal_list.h" #include "opal/class/opal_free_list.h" #include "opal/class/opal_hash_table.h" #include "ompi/win/win.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/osc/osc.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/bml/bml.h" BEGIN_C_DECLS struct ompi_osc_rdma_buffer_t { mca_btl_base_descriptor_t* descriptor; size_t remain_len; mca_bml_base_btl_t *bml_btl; }; typedef struct ompi_osc_rdma_buffer_t ompi_osc_rdma_buffer_t; struct ompi_osc_rdma_component_t { /** Extend the basic osc component interface */ ompi_osc_base_component_t super; /** store the state of progress threads for this instance of OMPI */ bool c_have_progress_threads; /** lock access to datastructures in the component structure */ opal_mutex_t c_lock; /** List of ompi_osc_rdma_module_ts currently in existance. Needed so that received fragments can be dispatched to the correct module */ opal_hash_table_t c_modules; /** Lock for request management */ opal_mutex_t c_request_lock; /** Condition variable for request management */ opal_condition_t c_request_cond; /** free list of ompi_osc_rdma_sendreq_t structures */ opal_free_list_t c_sendreqs; /** free list of ompi_osc_rdma_replyreq_t structures */ opal_free_list_t c_replyreqs; /** free list of ompi_osc_rdma_longreq_t structures */ opal_free_list_t c_longreqs; /** list of outstanding requests, of type ompi_osc_pt2pt_longreq_t */ opal_list_t c_pending_requests; #if OMPI_ENABLE_PROGRESS_THREADS opal_thread_t c_thread; bool c_thread_run; #endif bool c_btl_registered; uint32_t c_sequence_number; }; typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t; struct ompi_osc_rdma_btl_t { uint64_t peer_seg_key; mca_bml_base_btl_t *bml_btl; int rdma_order; int32_t num_sent; }; typedef struct ompi_osc_rdma_btl_t ompi_osc_rdma_btl_t; struct ompi_osc_rdma_peer_info_t { uint64_t peer_base; uint64_t peer_len; int peer_num_btls; volatile int peer_index_btls; ompi_osc_rdma_btl_t *peer_btls; int local_num_btls; mca_bml_base_btl_t **local_btls; mca_mpool_base_registration_t **local_registrations; mca_btl_base_descriptor_t **local_descriptors; }; typedef struct ompi_osc_rdma_peer_info_t ompi_osc_rdma_peer_info_t; struct ompi_osc_rdma_setup_info_t { volatile int32_t num_btls_callin; int32_t num_btls_expected; volatile int32_t num_btls_outgoing; opal_list_t *outstanding_btl_requests; }; typedef struct ompi_osc_rdma_setup_info_t ompi_osc_rdma_setup_info_t; struct ompi_osc_rdma_module_t { /** Extend the basic osc module interface */ ompi_osc_base_module_t super; uint32_t m_sequence_number; /** lock access to data structures in the current module */ opal_mutex_t m_lock; /** condition variable for access to current module */ opal_condition_t m_cond; /** lock for "atomic" window updates from reductions */ opal_mutex_t m_acc_lock; /** pointer back to window */ ompi_win_t *m_win; /** communicator created with this window */ ompi_communicator_t *m_comm; /** list of ompi_osc_rdma_sendreq_t structures, and includes all requests for this access epoch that have not already been started. m_lock must be held when modifying this field. */ opal_list_t m_pending_sendreqs; /** list of unsigned int counters for the number of requests to a particular rank in m_comm for this access epoc. m_lock must be held when modifying this field */ unsigned int *m_num_pending_sendreqs; /** For MPI_Fence synchronization, the number of messages to send in epoch. For Start/Complete, the number of updates for this Complete. For lock, the number of messages waiting for completion on on the origin side. Not protected by m_lock - must use atomic counter operations. */ int32_t m_num_pending_out; /** For MPI_Fence synchronization, the number of expected incoming messages. For Post/Wait, the number of expected updates from complete. For lock, the number of messages on the passive side we are waiting for. Not protected by m_lock - must use atomic counter operations. */ int32_t m_num_pending_in; /** Number of "ping" messages from the remote post group we've received */ int32_t m_num_post_msgs; /** Number of "count" messages from the remote complete group we've received */ int32_t m_num_complete_msgs; /** cyclic counter for a unique tage for long messages. Not protected by the m_lock - must use create_send_tag() to create a send tag */ volatile int32_t m_tag_counter; opal_list_t m_copy_pending_sendreqs; unsigned int *m_copy_num_pending_sendreqs; opal_list_t m_queued_sendreqs; /** start sending data eagerly */ bool m_eager_send_active; bool m_eager_send_ok; /* RDMA data */ bool m_use_rdma; bool m_rdma_wait_completion; ompi_osc_rdma_setup_info_t *m_setup_info; ompi_osc_rdma_peer_info_t *m_peer_info; int32_t m_rdma_num_pending; /*** buffering ***/ bool m_use_buffers; ompi_osc_rdma_buffer_t *m_pending_buffers; /* ********************* FENCE data ************************ */ /* an array of ints, each containing the value 1. */ int *m_fence_coll_counts; /* ********************* PWSC data ************************ */ struct ompi_group_t *m_pw_group; struct ompi_group_t *m_sc_group; bool *m_sc_remote_active_ranks; int *m_sc_remote_ranks; /* ********************* LOCK data ************************ */ int32_t m_lock_status; /* one of 0, MPI_LOCK_EXCLUSIVE, MPI_LOCK_SHARED */ int32_t m_shared_count; opal_list_t m_locks_pending; opal_list_t m_unlocks_pending; int32_t m_lock_received_ack; }; typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t; OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component; #define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module) /* * Component functions */ int ompi_osc_rdma_component_init(bool enable_progress_threads, bool enable_mpi_threads); int ompi_osc_rdma_component_finalize(void); int ompi_osc_rdma_component_query(struct ompi_win_t *win, struct ompi_info_t *info, struct ompi_communicator_t *comm); int ompi_osc_rdma_component_select(struct ompi_win_t *win, struct ompi_info_t *info, struct ompi_communicator_t *comm); int ompi_osc_rdma_component_progress(void); int ompi_osc_rdma_peer_info_free(ompi_osc_rdma_peer_info_t *peer_info); /* * Module interface function types */ int ompi_osc_rdma_module_free(struct ompi_win_t *win); int ompi_osc_rdma_module_put(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, int target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); int ompi_osc_rdma_module_accumulate(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, int target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win); int ompi_osc_rdma_module_get(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, int target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win); int ompi_osc_rdma_module_fence(int assert, struct ompi_win_t *win); int ompi_osc_rdma_module_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win); int ompi_osc_rdma_module_complete(struct ompi_win_t *win); int ompi_osc_rdma_module_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win); int ompi_osc_rdma_module_wait(struct ompi_win_t *win); int ompi_osc_rdma_module_test(struct ompi_win_t *win, int *flag); int ompi_osc_rdma_module_lock(int lock_type, int target, int assert, struct ompi_win_t *win); int ompi_osc_rdma_module_unlock(int target, struct ompi_win_t *win); /* * passive side sync interface functions */ int ompi_osc_rdma_passive_lock(ompi_osc_rdma_module_t *module, int32_t origin, int32_t lock_type); int ompi_osc_rdma_passive_unlock(ompi_osc_rdma_module_t *module, int32_t origin, int32_t count); int ompi_osc_rdma_passive_unlock_complete(ompi_osc_rdma_module_t *module); END_C_DECLS #endif /* OMPI_OSC_RDMA_H */