WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL. All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have expressed interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purpose. UTK, with support from Sandia, developed a version of Open MPI where the entire communication infrastructure has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to complete this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic. This commit was SVN r32317.
805 строки
26 KiB
805 строки
26 KiB
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* Additional copyrights may follow
#ifndef MCA_BCOL_H
#define MCA_BCOL_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/mca/sbgp/sbgp.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/op/op.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/patterns/net/netpatterns_knomial_tree.h"
#include "opal/util/show_help.h"
#include <limits.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
/* Forward declaration - please do not remove it */
struct ml_buffers_t;
struct mca_bcol_base_coll_fn_comm_attributes_t;
struct mca_bcol_base_coll_fn_invoke_attributes_t;
struct mca_bcol_base_coll_fn_desc_t;
#define NUM_MSG_RANGES 5
#define MSG_RANGE_INITIAL (1024)*12
#define MSG_RANGE_INC 10
/* Maximum size of a bcol's header. This allows us to correctly calculate the message
* thresholds. If the header of any bcol exceeds this value then increase this one
* to match. */
#define BCOL_HEADER_MAX 96
#define BCOL_HEAD_ALIGN 32 /* will turn into an MCA parameter after debug */
* Functions supported
enum bcol_coll {
/* blocking functions */
/* nonblocking functions */
/* New function - needed for intermediate steps */
typedef enum bcol_coll bcol_coll;
typedef enum bcol_elem_type {
} bcol_elem_type;
/* Callback type that takes a collective identifier (bcol_coll) and returns int.
 * It is the type of the component's coll_support_all_types member, which is
 * described there as "if bcol supports all possible data types for given
 * collective operation".
 * NOTE(review): the meaning of the int result is not visible here - confirm. */
typedef int (*mca_bcol_base_module_coll_support_all_types_fn_t)(bcol_coll coll_name);
/* Callback type that takes an operation, a datatype and a bcol_elem_type value
 * and returns int. It is the type of the component's coll_support member.
 * NOTE(review): presumably reports whether the op/dtype/elem_num combination
 * is supported - confirm the encoding of the arguments and of the result. */
typedef int (*mca_bcol_base_module_coll_support_fn_t)(int op, int dtype, bcol_elem_type elem_num);
* Collective function status
enum {
* Collective component initialization
* Initialize the given collective component. This function should
* initialize any component-level data. It will be called exactly
* once during MPI_INIT.
* @note The component framework is not lazily opened, so attempts
* should be made to minimize the amount of memory allocated during
* this function.
* @param[in] enable_progress_threads True if the component needs to
* support progress threads
* @param[in] enable_mpi_threads True if the component needs to
* @retval OMPI_SUCCESS Component successfully initialized
* @retval ORTE_ERROR An unspecified error occurred
typedef int (*mca_bcol_base_component_init_query_fn_t)
(bool enable_progress_threads, bool enable_mpi_threads);
* Query whether a component is available for the given sub-group
* Query whether the component is available for the given
* sub-group. If the component is available, an array of pointers should be
* allocated and returned (with refcount at 1). The module will not
* be used for collective operations until module_enable() is called
* on the module, but may be destroyed (via OBJ_RELEASE) either before
* or after module_enable() is called. If the module needs to release
* resources obtained during query(), it should do so in the module
* destructor.
* A component may provide NULL to this function to indicate it does
* not wish to run or return an error during module_enable().
* @note The communicator is available for point-to-point
* communication, but other functionality is not available during this
* phase of initialization.
* @param[in] sbgp Pointer to sub-group module.
* @param[out] priority Priority setting for component on
* this communicator
* @param[out] num_modules Number of modules that were generated
* for the sub-group module.
* @returns An array of pointers to initialized module structures if the component can
* provide modules with the requested functionality, or NULL if the
* component should not be used on the given communicator.
typedef struct mca_bcol_base_module_t **(*mca_bcol_base_component_comm_query_fn_t)
(mca_sbgp_base_module_t *sbgp, int *num_modules);
typedef int (*mca_bcol_barrier_init_fn_t)(struct mca_bcol_base_module_t *bcol_module,
mca_sbgp_base_module_t *sbgp_module);
* Macro for use in modules that are of type bcol v2.0.0
#define MCA_BCOL_BASE_VERSION_2_0_0 \
"bcol", 2, 0, 0
/* This is really an abstraction violation, but is the easiest way to get
* started. For memory management we need to know what bcol components
* have compatible memory management schemes. Such compatibility can
* be used to eliminate memory copies between levels in the collective
* operation hierarchy, by having the output buffer of one level be the
* input buffer to the next level
enum {
OMPI_DECLSPEC extern int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE];
OMPI_DECLSPEC extern int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE];
/* what are the input parameters ? too many void * pointers here */
typedef int (*bcol_register_mem_fn_t)(void *context_data, void *base,
size_t size, void **reg_desc);
/* deregistration function */
typedef int (*bcol_deregister_mem_fn_t)(void *context_data, void *reg_desc);
/* Bcol network context definition */
struct bcol_base_network_context_t {
opal_object_t super;
/* Context id - defined by upper layer, ML */
int context_id;
/* Any context information that the bcol wants to use */
void *context_data;
/* registration function */
bcol_register_mem_fn_t register_memory_fn;
/* deregistration function */
bcol_deregister_mem_fn_t deregister_memory_fn;
typedef struct bcol_base_network_context_t bcol_base_network_context_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(bcol_base_network_context_t);
*primitive function types
/* bcast */
enum {
/* small data function */
/* small data - dynamic decision making supported */
/* number of functions */
* BCOL instance.
/* no limit on fragment size - this supports using user buffers rather
* than library buffers
/* forward declaration */
struct coll_bcol_collective_description_t;
struct mca_bcol_base_component_2_0_0_t {
/** Base component description */
mca_base_component_t bcol_version;
/** Component initialization function */
mca_bcol_base_component_init_query_fn_t collm_init_query;
/** Query whether component is useable for given communicator */
mca_bcol_base_component_comm_query_fn_t collm_comm_query;
/** If bcol supports all possible data types */
mca_bcol_base_module_coll_support_fn_t coll_support;
/** If bcol supports all possible data types for given collective operation */
mca_bcol_base_module_coll_support_all_types_fn_t coll_support_all_types;
/** Use this flag to prevent multiple init_query calls
in case we have the same bcol on more than a single level */
bool init_done;
/** If collective calls with bcols of this type need to be ordered */
bool need_ordering;
/** MCA parameter: Priority of this component */
int priority;
/** Bcast function pointers */
struct coll_bcol_collective_description_t *
/** Number of network contexts - need this for resource management */
int n_net_contexts;
/** List of network contexts */
bcol_base_network_context_t **network_contexts;
* Fragmentation support
/** Minimum fragment size */
int min_frag_size;
/** Maximum fragment size */
int max_frag_size;
/** Supports direct use of user-buffers */
bool can_use_user_buffers;
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_2_0_0_t;
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_component_t);
/* forward declaration */
struct mca_coll_ml_descriptor_t;
struct mca_bcol_base_payload_buffer_desc_t;
struct mca_bcol_base_route_info_t;
/* Ordering bookkeeping for one collective fragment. Embedded in
 * bcol_function_args_t as its order_info member. */
typedef struct {
int order_num; /* Sequence number of the collective fragment */
int bcols_started; /* Number of bcols needing ordering that have been started so far */
int n_fns_need_ordering; /* Number of functions that are called for bcols needing ordering */
} mca_bcol_base_order_info_t;
/* structure that encapsulates information propagated amongst multiple
* fragments whereby completing the entire ensemble of fragments is
* necessary in order to complete the entire collective
struct bcol_fragment_descriptor_t {
/* start iterator */
int head;
/* end iterator */
int tail;
/* current iteration */
int start_iter;
/* number of full iterations this frag */
int num_iter;
/* end iter */
int end_iter;
typedef struct bcol_fragment_descriptor_t bcol_fragment_descriptor_t;
struct bcol_function_args_t {
/* full message sequence number */
int64_t sequence_num;
/* full message descriptor - single copy of fragment invariant
* parameters */
/* Pasha: We don't need this one for the new flow - remove it */
struct mca_coll_ml_descriptor_t *full_message_descriptor;
struct mca_bcol_base_route_info_t *root_route;
/* function status */
int function_status;
/* root, for rooted operations */
int root;
/* input buffer */
void *sbuf;
void *rbuf;
void *userbuf;
struct mca_bcol_base_payload_buffer_desc_t *src_desc;
struct mca_bcol_base_payload_buffer_desc_t *dst_desc;
/* ml buffer size */
uint32_t buffer_size;
/* index of buffer in ml payload cache */
int buffer_index;
int count;
struct ompi_datatype_t *dtype;
struct ompi_op_t *op;
int sbuf_offset;
int rbuf_offset;
/* for bcol opaque data */
void *bcol_opaque_data;
/* An output argument that will be used by BCOL function to tell ML that the result of the BCOL is in rbuf */
bool result_in_rbuf;
bool root_flag; /* True if the rank is root of operation */
bool need_dt_support; /* will trigger alternate code path for some colls */
int status; /* Used for non-blocking collective completion */
uint32_t frag_size; /* fragment size for large messages */
int hier_factor; /* factor used when bcast is invoked as a service function back down
* the tree in allgather for example, the pacl_len is not the actual
* len of the data needing bcasting
mca_bcol_base_order_info_t order_info;
bcol_fragment_descriptor_t frag_info;
struct mca_bcol_base_route_info_t {
int level;
int rank;
typedef struct mca_bcol_base_route_info_t mca_bcol_base_route_info_t;
struct mca_bcol_base_lmngr_block_t {
opal_list_item_t super;
struct mca_coll_ml_lmngr_t *lmngr;
void* base_addr;
typedef struct mca_bcol_base_lmngr_block_t mca_bcol_base_lmngr_block_t;
struct mca_bcol_base_memory_block_desc_t {
/* memory block for payload buffers */
struct mca_bcol_base_lmngr_block_t *block;
/* Address offset in bytes -- Indicates free memory in the block */
uint64_t block_addr_offset;
/* size of the memory block */
size_t size_block;
/* number of memory banks */
uint32_t num_banks;
/* number of buffers per bank */
uint32_t num_buffers_per_bank;
/* size of a payload buffer */
uint32_t size_buffer;
/* pointer to buffer descriptors initialized */
struct mca_bcol_base_payload_buffer_desc_t *buffer_descs;
/* index of the next free buffer in the block */
uint64_t next_free_buffer;
uint32_t *bank_release_counters;
/* Counter that defines what bank should be synchronized next
* since collectives could be completed out of order, we have to make
* sure that memory synchronization collectives started in order ! */
int memsync_counter;
/* This array of flags is used to signal that the bank is ready for recycling */
bool *ready_for_memsync;
/* This flag monitors whether the bank is open for usage. Usually we expect that the user
* will do the check only on buffer-zero allocation */
bool *bank_is_busy;
/* convenience typedef */
typedef struct mca_bcol_base_memory_block_desc_t mca_bcol_base_memory_block_desc_t;
typedef void (*mca_bcol_base_release_buff_fn_t)(struct mca_bcol_base_memory_block_desc_t *ml_memblock, uint32_t buff_id);
struct mca_bcol_base_payload_buffer_desc_t {
void *base_data_addr; /* buffer address */
void *data_addr; /* buffer address + header offset */
uint64_t generation_number; /* my generation */
uint64_t bank_index; /* my bank */
uint64_t buffer_index; /* my buff index */
/* convenience typedef */
typedef struct mca_bcol_base_payload_buffer_desc_t mca_bcol_base_payload_buffer_desc_t;
typedef struct bcol_function_args_t bcol_function_args_t;
/* The collective operation is defined by a series of collective operations
* invoked through a function pointer. Each function may be different,
* so will store the arguments in a struct and pass a pointer to the struct,
* and use this as a way to hide the different function signatures.
* @param[in] input_args Structure with function arguments
* @param[in] bcol_desc Component specific parameters
* @param[out] status return status of the function
* MCA_BCOL_COMPLETE - function completed
* MCA_BCOL_IN_PROGRESS - function incomplete
* @retval OMPI_SUCCESS successful completion
* @retval OMPI_ERROR function returned error
/* forward declaration */
struct mca_bcol_base_module_t;
/* collective function prototype - all functions have the same interface
* so that we can call them via a function pointer */
struct mca_bcol_base_function_t;
typedef int (*mca_bcol_base_module_collective_fn_primitives_t)
(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args);
typedef int (*mca_bcol_base_module_collective_init_fn_primitives_t)
(struct mca_bcol_base_module_t *bcol_module);
* function to query for collective function attributes
* @param attribute (IN) the attribute of interest
* @param algorithm_parameters (OUT) the value of attribute for this
* function. If this attribute is not supported,
* OMPI_ERR_NOT_FOUND is returned.
typedef int (*mca_bcol_get_collective_attributes)(int attribute,
void *algorithm_parameters);
/* data structure for tracking the relevant data needed for ml level
* algorithm construction (e.g., function selection), initialization, and
* usage.
struct coll_bcol_collective_description_t {
/* collective initiation function - first function called */
mca_bcol_base_module_collective_fn_primitives_t coll_fn;
/* collective progress function - first function called */
mca_bcol_base_module_collective_fn_primitives_t progress_fn;
/* collective attributes query function */
mca_bcol_get_collective_attributes get_attributes;
/* attributes supported - bit map */
uint64_t attribute;
typedef struct coll_bcol_collective_description_t
/* collective operation attributes */
enum {
/* supports dynamic decisions - e.g., do not need to have the collective
* operation fully defined before it can be started
/* number of attributes */
/* For rooted collectives,
* does the algorithm know its data source ?
enum {
enum {
/* gvm For selection logic */
struct mca_bcol_base_coll_fn_comm_attributes_t {
int bcoll_type;
int comm_size_min;
int comm_size_max;
int data_src;
int waiting_semantics;
typedef struct mca_bcol_base_coll_fn_comm_attributes_t
struct mca_bcol_base_coll_fn_invoke_attributes_t {
int bcol_msg_min;
int bcol_msg_max;
uint64_t datatype_bitmap; /* Max is OMPI_DATATYPE_MAX_PREDEFINED defined to be 45 */
uint32_t op_types_bitmap; /* bit map of optypes supported */
typedef struct mca_bcol_base_coll_fn_invoke_attributes_t
struct mca_bcol_base_coll_fn_desc_t {
opal_list_item_t super;
struct mca_bcol_base_coll_fn_comm_attributes_t *comm_attr;
struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attr;
mca_bcol_base_module_collective_fn_primitives_t coll_fn;
mca_bcol_base_module_collective_fn_primitives_t progress_fn;
typedef struct mca_bcol_base_coll_fn_desc_t mca_bcol_base_coll_fn_desc_t;
/* end selection logic */
typedef int (*mca_bcol_base_module_collective_init_fn_t)
(struct mca_bcol_base_module_t *bcol_module,
mca_sbgp_base_module_t *sbgp_module);
/* per communicator memory initialization function */
typedef int (*mca_bcol_module_mem_init)(struct ml_buffers_t *registered_buffers,
mca_bcol_base_component_t *module);
/* Initialize memory block - ml_memory_block initialization interface function
* Invoked at the ml level, used to pass bcol specific registration information
* for the "ml_memory_block"
* @param[in] ml_memory_block Pointer to the ml_memory_block. This struct
* contains bcol specific registration information and a call back function
* used for resource recycling.
* @param[in] reg_data bcol specific registration data.
* @returns On Success: OMPI_SUCCESS
* On Failure: OMPI_ERROR
/*typedef int (*mca_bcol_base_init_memory_fn_t)
(struct mca_bcol_base_memory_block_desc_t *ml_block, void *reg_data);*/
typedef int (*mca_bcol_base_init_memory_fn_t)
(struct mca_bcol_base_memory_block_desc_t *payload_block,
uint32_t data_offset,
struct mca_bcol_base_module_t *bcol,
void *reg_data);
typedef int (*mca_common_allgather_init_fn_t)
(struct mca_bcol_base_module_t *bcol_module);
typedef void (*mca_bcol_base_set_thresholds_fn_t)
(struct mca_bcol_base_module_t *bcol_module);
enum {
/* base module */
struct mca_bcol_base_module_t {
/* base coll component */
opal_object_t super;
/* bcol component (Pasha: Do we really need to cache the component?)*/
mca_bcol_base_component_t *bcol_component;
/* network context that is used by this bcol
only one context per bcol is allowed */
bcol_base_network_context_t *network_context;
/* We are going to use the context index a lot,
in order to decrease number of dereferences
we are caching the value on bcol */
int context_index;
/* Set of flags that describe features supported by bcol */
uint64_t supported_mode;
/* per communicator memory initialization function */
mca_bcol_module_mem_init init_module;
/* sub-grouping module partner */
mca_sbgp_base_module_t *sbgp_partner_module;
/* size of subgroup - cache this, so can have access when
* sbgp_partner_module no longer exists */
int size_of_subgroup;
/* sequence number offset - want to make sure that we start
* id'ing collectives with id 0, so we can have simple
* resource management.
int64_t squence_number_offset;
/* number of times to poll for operation completion before
* breaking out of a non-blocking collective operation
int n_poll_loops;
/* size of header that will go in data buff, should not include
* any info regarding alignment, let the ml level handle this
uint32_t header_size;
/* Each bcol is assigned a unique value
* see if we can get away with 16-bit id
int16_t bcol_id;
* Since mca_bcol_base_module_t is the only parameter which will be passed
* into the bcol_basesmuma_bcast_init(), add the flag to indicate whether
* the hdl-based algorithms will get enabled.
bool use_hdl;
* Collective function pointers
/* changing function signature - will replace bcol_functions */
mca_bcol_base_module_collective_fn_primitives_t bcol_function_table[BCOL_NUM_OF_FUNCTIONS];
/* Tables hold pointers to functions */
mca_bcol_base_module_collective_init_fn_primitives_t bcol_function_init_table[BCOL_NUM_OF_FUNCTIONS];
opal_list_t bcol_fns_table[BCOL_NUM_OF_FUNCTIONS];
struct mca_bcol_base_coll_fn_desc_t*
* Bcol interface function to pass bcol specific
* info and memory recycling call back
mca_bcol_base_init_memory_fn_t bcol_memory_init;
* netpatterns interface function, would like to invoke this
* on the ml level
mca_common_allgather_init_fn_t k_nomial_tree;
/* Each bcol caches a list which describes how many ranks
* are "below" each rank in this bcol
int *list_n_connected;
/* offsets for scatter/gather */
int hier_scather_offset;
/* Small message threshold for each collective */
int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS];
/* Set small_message_thresholds array */
mca_bcol_base_set_thresholds_fn_t set_small_msg_thresholds;
/* Pointer to the order counter on the upper layer,
used if the bcol needs to be ordered */
int *next_inorder;
typedef struct mca_bcol_base_module_t mca_bcol_base_module_t;
/* function description */
struct mca_bcol_base_function_t {
int fn_idx;
/* module */
struct mca_bcol_base_module_t *bcol_module;
* The following two parameters are used for bcol modules
* that want to do some optimizations based on the fact that
* n functions from the same bcol module are called in a row.
* For example, in the iboffload case, on the first call one
* will want to initialize the MWR, and start to instantiate
* it, but only post it at the end of the last call.
* The index of this function in a sequence of consecutive
* functions from the same bcol
int index_in_consecutive_same_bcol_calls;
/* number of times functions from this bcol are
* called in order
int n_of_this_type_in_a_row;
* number of times functions from this module are called in the
* collective operation.
int n_of_this_type_in_collective;
int index_of_this_type_in_collective;
typedef struct mca_bcol_base_function_t mca_bcol_base_function_t;
struct mca_bcol_base_descriptor_t {
ompi_free_list_item_t super;
/* Vasily: will be described in the future */
typedef struct mca_bcol_base_descriptor_t mca_bcol_base_descriptor_t;
/**
 * Compute the length of a buffer that holds count elements of dtype.
 *
 * The datatype's extent is obtained with ompi_datatype_get_extent() and
 * multiplied by count; the product is converted to size_t.
 *
 * @param[in] dtype Datatype whose extent is queried
 * @param[in] count Number of elements
 *
 * @returns extent * count, cast to size_t
 *
 * @note The status returned by ompi_datatype_get_extent() is not checked.
 * @note NOTE(review): a negative count or extent would wrap to a very large
 *       size_t value - confirm that callers never pass one.
 */
static inline __opal_attribute_always_inline__ size_t
mca_bcol_base_get_buff_length(ompi_datatype_t *dtype, int count)
{
    ptrdiff_t lb, extent;

    /* The call reports both values; only the extent is used below. */
    ompi_datatype_get_extent(dtype, &lb, &extent);

    return (size_t) (extent * count);
}
/* Compares the module's in-order counter (*module->next_inorder) with the
 * order_num stored in bcol_function_args->order_info.
 * NOTE(review): the body of the `if` is empty here, so as written the macro
 * has no effect when the two values differ - confirm whether a statement
 * (e.g. an early return) is missing.
 * NOTE(review): the expansion ends in `while (0);`, i.e. it supplies its own
 * terminating semicolon, so it cannot be used as the un-braced body of an
 * `if` that has an `else`. */
#define MCA_BCOL_CHECK_ORDER(module, bcol_function_args) \
do { \
if (*((module)->next_inorder) != \
(bcol_function_args)->order_info.order_num) { \
} \
} while (0);
/* Record that one more function needing ordering has been started.
 *
 * Increments order_info->bcols_started; when it then equals
 * order_info->n_fns_need_ordering, the counter that module->next_inorder
 * points to is advanced by one.
 *
 * module     - pointer to an object with an `int *next_inorder` member
 * order_info - pointer to an object with `bcols_started` and
 *              `n_fns_need_ordering` members; it is evaluated more than
 *              once, so it must not have side effects
 *
 * The expansion is a single statement without a trailing semicolon, so the
 * caller writes `MCA_BCOL_UPDATE_ORDER_COUNTER(m, oi);` and the macro can be
 * used safely as the un-braced body of an if/else. */
#define MCA_BCOL_UPDATE_ORDER_COUNTER(module, order_info)               \
    do {                                                                \
        (order_info)->bcols_started++;                                  \
        if ((order_info)->n_fns_need_ordering ==                        \
                (order_info)->bcols_started) {                          \
            ++(*((module)->next_inorder));                              \
        }                                                               \
    } while (0)
#if defined(c_plusplus) || defined(__cplusplus)
#endif /* MCA_BCOL_H */