/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2004-2006 The Trustees of the University of Tennessee. * All rights reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef OMPI_OSC_RDMA_H #define OMPI_OSC_RDMA_H #include "ompi_config.h" #include "opal/class/opal_free_list.h" #include "opal/class/opal_hash_table.h" #include "opal/threads/threads.h" #include "opal/util/output.h" #include "opal/mca/shmem/shmem.h" #include "opal/mca/shmem/base/base.h" #include "ompi/win/win.h" #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/request/request.h" #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" #include "opal/mca/btl/btl.h" #include "ompi/memchecker.h" #include "ompi/op/op.h" #include "opal/align.h" #include "osc_rdma_types.h" #include "osc_rdma_sync.h" #include "osc_rdma_peer.h" #include "opal_stdint.h" /** * @brief osc rdma component structure */ struct ompi_osc_rdma_component_t { /** Extend the basic osc component interface */ ompi_osc_base_component_t super; /** lock access to modules */ opal_mutex_t lock; /** cid -> module mapping */ opal_hash_table_t modules; /** free list of ompi_osc_rdma_frag_t structures */ opal_free_list_t frags; /** Free list of requests */ opal_free_list_t requests; /** RDMA component buffer size */ unsigned int buffer_size; /** aggregation limit */ unsigned int aggregation_limit; /** List of requests that need to be freed */ opal_list_t request_gc; /** List of buffers that need to be freed */ opal_list_t buffer_gc; /** Maximum number of segments that can be attached to a dynamic window */ unsigned int max_attach; /** Default value of the no_locks info key for new windows */ bool no_locks; /** Priority of the osc/rdma component */ unsigned int priority; /** aggregation free list */ opal_free_list_t aggregate; }; typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t; struct ompi_osc_rdma_frag_t; /** * @brief osc rdma module structure * * Each MPI window is associated with a single osc module. This struct * stores the data relevant to the osc/rdma component. */ struct ompi_osc_rdma_module_t { /** Extend the basic osc module interface */ ompi_osc_base_module_t super; /** pointer back to MPI window */ struct ompi_win_t *win; /** Mutex lock protecting module data */ opal_mutex_t lock; /* window configuration */ /** value of same_disp_unit info key for this window */ bool same_disp_unit; /** value of same_size info key for this window */ bool same_size; /** window should have accumulate ordering... */ bool accumulate_ordering; /** passive-target synchronization will not be used in this window */ bool no_locks; /** flavor of this window */ int flavor; /** size of local window */ size_t size; /** Local displacement unit. */ int disp_unit; /** global leader */ ompi_osc_rdma_peer_t *leader; /** pointer to free on cleanup (may be NULL) */ void *free_after; /** local state structure (shared memory) */ ompi_osc_rdma_state_t *state; /** node-level communication data (shared memory) */ unsigned char *node_comm_info; /* only relevant on the lowest rank on each node (shared memory) */ ompi_osc_rdma_rank_data_t *rank_array; /** communicator created with this window. This is the cid used * in the component's modules mapping. */ ompi_communicator_t *comm; /* temporary communicators for window initialization */ ompi_communicator_t *local_leaders; ompi_communicator_t *shared_comm; /** node id of this rank */ int node_id; /** number of nodes */ int node_count; /** handle valid for local state (valid for local data for MPI_Win_allocate) */ mca_btl_base_registration_handle_t *state_handle; /** registration handle for the window base (only used for MPI_Win_create) */ mca_btl_base_registration_handle_t *base_handle; /** size of a region */ size_t region_size; /** size of the state structure */ size_t state_size; /** offset in the shared memory segment where the state array starts */ size_t state_offset; /* ********************* sync data ************************ */ /** global sync object (PSCW, fence, lock all) */ ompi_osc_rdma_sync_t all_sync; /** current group associate with pscw exposure epoch */ struct ompi_group_t *pw_group; /** list of unmatched post messages */ opal_list_t pending_posts; /* ********************* LOCK data ************************ */ /** number of outstanding locks */ osc_rdma_counter_t passive_target_access_epoch; /** origin side list of locks currently outstanding */ opal_hash_table_t outstanding_locks; /** array of locks (small jobs) */ ompi_osc_rdma_sync_t **outstanding_lock_array; /* ******************* peer storage *********************** */ /** hash table of allocated peers */ opal_hash_table_t peer_hash; /** array of allocated peers (small jobs) */ ompi_osc_rdma_peer_t **peer_array; /** lock for peer hash table/array */ opal_mutex_t peer_lock; /** BTL in use */ struct mca_btl_base_module_t *selected_btl; /** registered fragment used for locally buffered RDMA transfers */ struct ompi_osc_rdma_frag_t *rdma_frag; /** registration handles for dynamically attached regions. These are not stored * in the state structure as it is entirely local. */ ompi_osc_rdma_handle_t *dynamic_handles; /** shared memory segment. this segment holds this node's portion of the rank -> node * mapping array, node communication data (node_comm_info), state for all local ranks, * and data for all local ranks (MPI_Win_allocate only) */ void *segment_base; /** opal shared memory structure for the shared memory segment */ opal_shmem_ds_t seg_ds; /* performance values */ /** number of times a put had to be retried */ unsigned long put_retry_count; /** number of time a get had to be retried */ unsigned long get_retry_count; }; typedef struct ompi_osc_rdma_module_t ompi_osc_rdma_module_t; OMPI_MODULE_DECLSPEC extern ompi_osc_rdma_component_t mca_osc_rdma_component; #define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module) int ompi_osc_rdma_free (struct ompi_win_t *win); /* peer functions */ /** * @brief cache a peer object * * @param[in] module osc rdma module * @param[in] peer peer object to cache * * @returns OMPI_SUCCESS on success * @returns OMPI_ERR_OUT_OF_RESOURCE on failure */ int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer); /** * @brief check if a peer object is cached for a remote rank * * @param[in] module osc rdma module * @param[in] peer_id remote peer rank * * @returns peer object on success * @returns NULL if a peer object is not cached for the peer */ static inline ompi_osc_rdma_peer_t *ompi_osc_module_get_peer (ompi_osc_rdma_module_t *module, int peer_id) { if (NULL == module->peer_array) { ompi_osc_rdma_peer_t *peer = NULL; (void) opal_hash_table_get_value_uint32 (&module->peer_hash, peer_id, (void **) &peer); return peer; } return module->peer_array[peer_id]; } /** * @brief get the peer object for a remote rank * * @param[in] module osc rdma module * @param[in] peer_id remote peer rank */ static inline ompi_osc_rdma_peer_t *ompi_osc_rdma_module_peer (ompi_osc_rdma_module_t *module, int peer_id) { ompi_osc_rdma_peer_t *peer; peer = ompi_osc_module_get_peer (module, peer_id); if (NULL != peer) { return peer; } return ompi_osc_rdma_peer_lookup (module, peer_id); } /** * @brief check if this process has this process is in a passive target access epoch * * @param[in] module osc rdma module */ static inline bool ompi_osc_rdma_in_passive_epoch (ompi_osc_rdma_module_t *module) { return 0 != module->passive_target_access_epoch; } static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, void *ptr, size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file) { if (module->selected_btl->btl_register_mem) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)", ptr, (char *) ptr + size, size); *handle = module->selected_btl->btl_register_mem (module->selected_btl, endpoint, ptr, size, flags); if (OPAL_UNLIKELY(NULL == *handle)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, " "size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line); return OMPI_ERR_OUT_OF_RESOURCE; } } else { *handle = NULL; } return OMPI_SUCCESS; } #define ompi_osc_rdma_register(...) _ompi_osc_rdma_register(__VA_ARGS__, __LINE__, __FILE__) static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file) { if (handle) { module->selected_btl->btl_deregister_mem (module->selected_btl, handle); } } #define ompi_osc_rdma_deregister(...) _ompi_osc_rdma_deregister(__VA_ARGS__, __LINE__, __FILE__) static inline void ompi_osc_rdma_progress (ompi_osc_rdma_module_t *module) { opal_progress (); } /** * Find the first outstanding lock of the target. * * @param[in] module osc rdma module * @param[in] target target rank * @param[out] peer peer object associated with the target * * @returns an outstanding lock on success * * This function looks for an outstanding lock to the target. If a lock exists it is returned. */ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_lock_find (ompi_osc_rdma_module_t *module, int target, ompi_osc_rdma_peer_t **peer) { ompi_osc_rdma_sync_t *outstanding_lock = NULL; if (OPAL_LIKELY(NULL != module->outstanding_lock_array)) { outstanding_lock = module->outstanding_lock_array[target]; } else { (void) opal_hash_table_get_value_uint32 (&module->outstanding_locks, (uint32_t) target, (void **) &outstanding_lock); } if (NULL != outstanding_lock && peer) { *peer = outstanding_lock->peer_list.peer; } return outstanding_lock; } /** * Add an outstanding lock * * @param[in] module osc rdma module * @param[in] lock lock object * * This function inserts a lock object to the list of outstanding locks. The caller must be holding the module * lock. */ static inline void ompi_osc_rdma_module_lock_insert (struct ompi_osc_rdma_module_t *module, ompi_osc_rdma_sync_t *lock) { if (OPAL_LIKELY(NULL != module->outstanding_lock_array)) { module->outstanding_lock_array[lock->sync.lock.target] = lock; } else { (void) opal_hash_table_set_value_uint32 (&module->outstanding_locks, (uint32_t) lock->sync.lock.target, (void *) lock); } } /** * Remove an outstanding lock * * @param[in] module osc rdma module * @param[in] lock lock object * * This function removes a lock object to the list of outstanding locks. The caller must be holding the module * lock. */ static inline void ompi_osc_rdma_module_lock_remove (struct ompi_osc_rdma_module_t *module, ompi_osc_rdma_sync_t *lock) { if (OPAL_LIKELY(NULL != module->outstanding_lock_array)) { module->outstanding_lock_array[lock->sync.lock.target] = NULL; } else { (void) opal_hash_table_remove_value_uint32 (&module->outstanding_locks, (uint32_t) lock->sync.lock.target); } } /** * Lookup a synchronization object associated with the target * * @param[in] module osc rdma module * @param[in] target target rank * @param[out] peer peer object * * @returns NULL if the target is not locked, fenced, or part of a pscw sync * @returns synchronization object on success * * This function returns the synchronization object associated with an access epoch for * the target. If the target is not part of any current access epoch then NULL is returned. */ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_rdma_module_t *module, int target, struct ompi_osc_rdma_peer_t **peer) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "looking for synchronization object for target %d", target); switch (module->all_sync.type) { case OMPI_OSC_RDMA_SYNC_TYPE_NONE: if (!module->no_locks) { return ompi_osc_rdma_module_lock_find (module, target, peer); } return NULL; case OMPI_OSC_RDMA_SYNC_TYPE_FENCE: case OMPI_OSC_RDMA_SYNC_TYPE_LOCK: OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence/lock_all access epoch for target %d", target); /* fence epoch is now active */ module->all_sync.epoch_active = true; *peer = ompi_osc_rdma_module_peer (module, target); return &module->all_sync; case OMPI_OSC_RDMA_SYNC_TYPE_PSCW: if (ompi_osc_rdma_sync_pscw_peer (module, target, peer)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found PSCW access epoch target for %d", target); return &module->all_sync; } } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no access epoch found for target %d", target); return NULL; } /** * @brief complete all outstanding rdma operations to all peers * * @param[in] module osc rdma module */ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync) { ompi_osc_rdma_aggregation_t *aggregation, *next; if (opal_list_get_size (&sync->aggregations)) { OPAL_THREAD_SCOPED_LOCK(&sync->lock, OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) { ompi_osc_rdma_peer_aggregate_flush (aggregation->peer); }); } do { opal_progress (); } while (sync->outstanding_rdma); } /** * @brief check if an access epoch is active * * @param[in] module osc rdma module * * @returns true if any type of access epoch is active * @returns false otherwise * * This function is used to check for conflicting access epochs. */ static inline bool ompi_osc_rdma_access_epoch_active (ompi_osc_rdma_module_t *module) { return (module->all_sync.epoch_active || ompi_osc_rdma_in_passive_epoch (module)); } static inline void ompi_osc_rdma_aggregation_return (ompi_osc_rdma_aggregation_t *aggregation) { if (aggregation->sync) { opal_list_remove_item (&aggregation->sync->aggregations, (opal_list_item_t *) aggregation); } opal_free_list_return(&mca_osc_rdma_component.aggregate, (opal_free_list_item_t *) aggregation); } #endif /* OMPI_OSC_RDMA_H */