1
1
openmpi/ompi/mca/bcol/bcol.h
Pavel Shamis b89f8fabc9 Adding Hierarchical Collectives project to the Open MPI trunk.
The project includes the following components and frameworks:
- ML Collective component
- NETPATTERNS and COMMPATTERNS common components
- BCOL framework
- SBGP framework

Note: By default the ML collective component is disabled. To enable the new
collectives, the user should raise the priority of the ml component (coll_ml_priority).

=============================================

Primary Contributors (in alphabetical order):

Ishai Rabinovich (Mellanox)
Joshua S. Ladd (ORNL / Mellanox)
Manjunath Gorentla Venkata (ORNL)
Mike Dubman (Mellanox)
Noam Bloch (Mellanox)
Pavel (Pasha) Shamis (ORNL / Mellanox)
Richard Graham (ORNL / Mellanox)
Vasily Filipov (Mellanox)

This commit was SVN r27078.
2012-08-16 19:11:35 +00:00

683 строки
22 KiB
C

/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_H
#define MCA_BCOL_H
#include "ompi_config.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/sbgp/sbgp.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/op/op.h"
#include "ompi/include/ompi/constants.h"
#include "ompi/mca/common/netpatterns/common_netpatterns_knomial_tree.h"
#include <limits.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Forward declaration - please do not remove it */
struct ml_memory_block_desc_t;
struct mca_coll_ml_module_t;
struct ml_buffers_t;
struct mca_bcol_base_coll_fn_comm_attributes_t;
struct mca_bcol_base_coll_fn_invoke_attributes_t;
struct mca_bcol_base_coll_fn_desc_t;
/* Number of message-size ranges. */
#define NUM_MSG_RANGES 5
/* Initial message-range value.  The whole replacement list is parenthesized
 * so that the macro behaves as a single operand in any expression
 * (e.g. x / MSG_RANGE_INITIAL or MSG_RANGE_INITIAL % y). */
#define MSG_RANGE_INITIAL ((1024) * 12)
/* Message-range increment. */
#define MSG_RANGE_INC 10
/* Threshold value meaning "no limit". */
#define BCOL_THRESHOLD_UNLIMITED (INT_MAX)
#define BCOL_HEAD_ALIGN 32 /* will turn into an MCA parameter after debug */
/*
 * Identifiers of the collective functions known to the bcol framework.
 * The enumerators are consecutive, starting at 0, and
 * BCOL_NUM_OF_FUNCTIONS is the number of identifiers.
 */
typedef enum bcol_coll {
    /* blocking functions */
    BCOL_ALLGATHER = 0,
    BCOL_ALLGATHERV,
    BCOL_ALLREDUCE,
    BCOL_ALLTOALL,
    BCOL_ALLTOALLV,
    BCOL_ALLTOALLW,
    BCOL_BARRIER,
    BCOL_BCAST,
    BCOL_EXSCAN,
    BCOL_GATHER,
    BCOL_GATHERV,
    BCOL_REDUCE,
    BCOL_REDUCE_SCATTER,
    BCOL_SCAN,
    BCOL_SCATTER,
    BCOL_SCATTERV,
    BCOL_FANIN,
    BCOL_FANOUT,

    /* nonblocking functions */
    BCOL_IALLGATHER = BCOL_FANOUT + 1,
    BCOL_IALLGATHERV,
    BCOL_IALLREDUCE,
    BCOL_IALLTOALL,
    BCOL_IALLTOALLV,
    BCOL_IALLTOALLW,
    BCOL_IBARRIER,
    BCOL_IBCAST,
    BCOL_IEXSCAN,
    BCOL_IGATHER,
    BCOL_IGATHERV,
    BCOL_IREDUCE,
    BCOL_IREDUCE_SCATTER,
    BCOL_ISCAN,
    BCOL_ISCATTER,
    BCOL_ISCATTERV,
    BCOL_IFANIN,
    BCOL_IFANOUT,
    BCOL_SYNC,

    /* New function - needed for intermediate steps */
    BCOL_REDUCE_TO_LEADER,

    /* count of the identifiers above; keep last */
    BCOL_NUM_OF_FUNCTIONS
} bcol_coll;
/* Element-count class passed to the per-type support query:
 * single-element or multi-element. */
typedef enum bcol_elem_type {
    BCOL_SINGLE_ELEM_TYPE = 0,
    BCOL_MULTI_ELEM_TYPE  = 1,
    BCOL_NUM_OF_ELEM_TYPES      /* count of the classes above; keep last */
} bcol_elem_type;
/* Support query keyed by collective: takes the collective identifier
 * coll_name and returns an int.
 * NOTE(review): the meaning of the return value is not defined in this
 * header - presumably non-zero means "all data types supported"; confirm
 * against the implementations. */
typedef int (*mca_bcol_base_module_coll_support_all_types_fn_t)(bcol_coll coll_name);
/* Support query keyed by operation / data type / element-count class.
 * NOTE(review): op and dtype are plain ints; presumably indices of the
 * reduction operation and of the predefined datatype - confirm. */
typedef int (*mca_bcol_base_module_coll_support_fn_t)(int op, int dtype, bcol_elem_type elem_num);
/*
 * Collective function status codes.
 *
 * The three values are defined as offsets just below OMPI_ERR_MAX.
 * NOTE(review): presumably chosen so they cannot collide with regular
 * OMPI return codes - confirm against ompi/constants.h.
 */
enum {
BCOL_FN_NOT_STARTED = (OMPI_ERR_MAX - 1),
BCOL_FN_STARTED = (OMPI_ERR_MAX - 2),
BCOL_FN_COMPLETE = (OMPI_ERR_MAX - 3)
};
/*
 * Reduction operation kinds.
 *
 * Originally this enum was placed in the ompi/op/op.h file.  It should be
 * moved back when we are ready to lobby for its inclusion.  Since we are
 * releasing only the bcast and barrier initially and this enum supports the
 * allreduce, we are not going to worry about it now.  Note that in the same
 * header, op.h, the struct "ompi_op_t" also has a field that we introduced,
 * "enum ompi_op_type op_type", which needs to be resolved as well.
 */
enum ompi_op_type {
    OMPI_OP_NULL = 0,
    OMPI_OP_MAX,
    OMPI_OP_MIN,
    OMPI_OP_SUM,
    OMPI_OP_PROD,
    OMPI_OP_LAND,
    OMPI_OP_BAND,
    OMPI_OP_LOR,
    OMPI_OP_BOR,
    OMPI_OP_LXOR,
    OMPI_OP_BXOR,
    OMPI_OP_MAXLOC,
    OMPI_OP_MINLOC,
    OMPI_OP_REPLACE,
    OMPI_OP_NUM_OF_TYPES    /* count of the kinds above; keep last */
};
/**
 * Collective component initialization
 *
 * Initialize the given collective component.  This function should
 * initialize any component-level data.  It will be called exactly
 * once during MPI_INIT.
 *
 * @note The component framework is not lazily opened, so attempts
 * should be made to minimize the amount of memory allocated during
 * this function.
 *
 * @param[in] enable_progress_threads True if the component needs to
 * support progress threads
 * @param[in] enable_mpi_threads True if the component needs to
 * support MPI_THREAD_MULTIPLE
 *
 * @retval ORTE_SUCCESS Component successfully initialized
 * @retval ORTE_ERROR An unspecified error occurred
 *
 * NOTE(review): the rest of this header documents OMPI_SUCCESS / OMPI_ERROR
 * as return codes; the ORTE_* names above look like a copy-paste leftover -
 * confirm which constants implementations actually return.
 */
typedef int (*mca_bcol_base_component_init_query_fn_t)
(bool enable_progress_threads, bool enable_mpi_threads);
/**
 * Query whether a component is available for the given sub-group
 *
 * Query whether the component is available for the given
 * sub-group.  If the component is available, an array of pointers should be
 * allocated and returned (with refcount at 1).  The module will not
 * be used for collective operations until module_enable() is called
 * on the module, but may be destroyed (via OBJ_RELEASE) either before
 * or after module_enable() is called.  If the module needs to release
 * resources obtained during query(), it should do so in the module
 * destructor.
 *
 * A component may provide NULL to this function to indicate it does
 * not wish to run or return an error during module_enable().
 *
 * @note The communicator is available for point-to-point
 * communication, but other functionality is not available during this
 * phase of initialization.
 *
 * @param[in] sbgp Pointer to sub-group module.
 * @param[out] num_modules Number of modules that were generated
 * for the sub-group module.
 *
 * NOTE(review): earlier revisions of this comment also listed a
 * "priority" output parameter; the signature below has no such parameter.
 *
 * @returns An array of pointers to initialized module structures if the
 * component can provide modules with the requested functionality, or NULL
 * if the component should not be used on the given communicator.
 */
typedef struct mca_bcol_base_module_t **(*mca_bcol_base_component_comm_query_fn_t)
(mca_sbgp_base_module_t *sbgp, int *num_modules);
/* Barrier initialization hook: receives the bcol module and the sub-group
 * module and returns an int.
 * NOTE(review): presumably an OMPI status code - confirm. */
typedef int (*mca_bcol_barrier_init_fn_t)(struct mca_bcol_base_module_t *bcol_module,
mca_sbgp_base_module_t *sbgp_module);
/*
 * Version initializer for the bcol framework, v2.0.0: expands to the MCA
 * base version followed by the framework name "bcol" and the version
 * numbers 2, 0, 0.
 */
#define MCA_BCOL_BASE_VERSION_2_0_0 \
MCA_BASE_VERSION_2_0_0, \
"bcol", 2, 0, 0
/*
 * bcol memory-management categories.
 *
 * This is really an abstraction violation, but it is the easiest way to get
 * started.  For memory management we need to know which bcol components
 * have compatible memory management schemes.  Such compatibility can be
 * used to eliminate memory copies between levels in the collective
 * operation hierarchy, by having the output buffer of one level be the
 * input buffer to the next level.
 */
enum {
    BCOL_SHARED_MEMORY_UMA    = 0,
    BCOL_SHARED_MEMORY_SOCKET = 1,
    BCOL_POINT_TO_POINT       = 2,
    BCOL_IB_OFFLOAD           = 3,
    BCOL_SIZE                 = 4   /* number of categories; keep last */
};
/* BCOL_SIZE x BCOL_SIZE tables declared here and defined elsewhere.
 * NOTE(review): presumably indexed by a pair of bcol memory-management
 * categories; the meaning of the stored values is not visible in this
 * header - confirm against the defining .c file. */
OMPI_DECLSPEC extern int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE];
OMPI_DECLSPEC extern int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE];
/* Memory registration function.
 * Takes the opaque context data, the base address and size of the region,
 * and an output slot (reg_desc) for the registration descriptor.
 * Open question from the original author: what are the input parameters?
 * There are too many void * pointers here. */
typedef int (*bcol_register_mem_fn_t)(void *context_data, void *base,
size_t size, void **reg_desc);
/* Memory deregistration function: takes the opaque context data and a
 * registration descriptor. */
typedef int (*bcol_deregister_mem_fn_t)(void *context_data, void *reg_desc);
/* Bcol network context definition: bundles a context id, opaque context
 * data and the memory (de)registration callbacks. */
struct bcol_base_network_context_t {
/* parent object */
opal_object_t super;
/* Context id - defined by upper layer, ML */
int context_id;
/* Any context information that the bcol wants to use */
void *context_data;
/* registration function */
bcol_register_mem_fn_t register_memory_fn;
/* deregistration function */
bcol_deregister_mem_fn_t deregister_memory_fn;
};
typedef struct bcol_base_network_context_t bcol_base_network_context_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(bcol_base_network_context_t);
/*
 * Primitive function types
 */

/* bcast primitives */
enum {
    BCOL_BCAST_SMALL_DATA         = 0,  /* small data function */
    BCOL_BCAST_SMALL_DATA_DYNAMIC = 1,  /* small data - dynamic decision making supported */
    BCOL_NUM_BCAST_FUNCTIONS      = 2   /* number of functions */
};
/**
* BCOL instance.
*/
/* No limit on fragment size - this supports using user buffers rather
 * than library buffers.
 * The negative value is parenthesized so the macro always expands to a
 * single operand, whatever expression it is used in.
 */
#define FRAG_SIZE_NO_LIMIT (-1)
/* forward declaration */
struct coll_bcol_collective_description_t;
/* bcol component descriptor, interface version 2.0.0. */
struct mca_bcol_base_component_2_0_0_t {
/** Base component description */
mca_base_component_t bcol_version;
/** Component initialization function */
mca_bcol_base_component_init_query_fn_t collm_init_query;
/** Query whether component is usable for the given sub-group; returns the
    array of modules created for it */
mca_bcol_base_component_comm_query_fn_t collm_comm_query;
/** Support query taking (op, dtype, elem_num).
    NOTE(review): originally described as "if bcol supports all possible
    data types" - confirm the intended semantics */
mca_bcol_base_module_coll_support_fn_t coll_support;
/** If bcol supports all possible data types for given collective operation */
mca_bcol_base_module_coll_support_all_types_fn_t coll_support_all_types;
/** Use this flag to prevent multiple init_query calls
    in case the same bcol is used on more than a single level */
bool init_done;
/** If collective calls with bcols of this type need to be ordered */
bool need_ordering;
/** MCA parameter: Priority of this component */
int priority;
/** Bcast function pointers, one slot per bcast primitive type */
struct coll_bcol_collective_description_t *
bcast_functions[BCOL_NUM_BCAST_FUNCTIONS];
/** Number of network contexts - need this for resource management */
int n_net_contexts;
/** List of network contexts */
bcol_base_network_context_t **network_contexts;
/*
 * Fragmentation support
 */
/** Minimum fragment size */
size_t min_frag_size;
/** Maximum fragment size.
    NOTE(review): signed, unlike min_frag_size - presumably so that it can
    hold FRAG_SIZE_NO_LIMIT; confirm */
int32_t max_frag_size;
/** Supports direct use of user-buffers */
int can_use_user_buffers;
/** Support pipelining */
int use_pipeline;
};
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_2_0_0_t;
/* Unversioned alias for the current component descriptor */
typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_component_t);
/* forward declaration */
struct mca_coll_ml_descriptor_t;
struct ml_payload_buffer_desc_t;
struct mca_coll_ml_route_info_t;
/* Bookkeeping used to keep ordered bcol functions in sequence. */
typedef struct {
    /* Seq num of collective fragment */
    int order_num;
    /* How many bcols need ordering have been started */
    int bcols_started;
    /* The number of functions are called for bcols need ordering */
    int n_fns_need_ordering;
} mca_bcol_base_order_info_t;
/* Argument bundle passed to every bcol collective primitive. */
struct bcol_function_args_t {
/* full message sequence number */
int64_t sequence_num;
/* full message descriptor - single copy of fragment invariant
 * parameters */
/* Pasha: We don't need this one for the new flow - remove it */
struct mca_coll_ml_descriptor_t *full_message_descriptor;
struct mca_coll_ml_route_info_t *root_route;
/* function status */
int function_status;
/* root, for rooted operations */
int root;
/* input buffer */
void *sbuf;
/* NOTE(review): presumably the output (receive) buffer - confirm */
void *rbuf;
/* NOTE(review): presumably the caller's own buffer - confirm */
void *userbuf;
struct ml_payload_buffer_desc_t *src_desc;
struct ml_payload_buffer_desc_t *dst_desc;
/* ml buffer size */
uint32_t buffer_size;
/* index of buffer in ml payload cache */
int buffer_index;
int count;
struct ompi_datatype_t *dtype;
struct ompi_op_t *op;
int sbuf_offset;
int rbuf_offset;
/* for bcol opaque data */
void *bcol_opaque_data;
/* An output argument that will be used by the BCOL function to tell ML that the result of the BCOL is in rbuf */
bool result_in_rbuf;
bool root_flag; /* True if the rank is root of operation */
int status; /* Used for non-blocking collective completion */
uint32_t frag_size; /* fragment size for large messages */
int hier_factor; /* factor used when bcast is invoked as a service function back down
 * the tree, in allgather for example; the pacl_len (sic - presumably
 * pack_len) is not the actual len of the data needing bcasting
 */
mca_bcol_base_order_info_t order_info;
};
typedef struct bcol_function_args_t bcol_function_args_t;
/* The collective operation is defined by a series of collective operations
 * invoked through a function pointer.  Each function may be different,
 * so we store the arguments in a struct and pass a pointer to the struct,
 * and use this as a way to hide the different function signatures.
 *
 * @param[in] input_args Structure with function arguments
 * @param[in] const_args Component specific parameters
 *
 * NOTE(review): earlier revisions of this comment described a "bcol_desc"
 * parameter, a "status" output and MCA_BCOL_COMPLETE / MCA_BCOL_IN_PROGRESS
 * values; none of these appear in this header.  The status codes defined
 * here are BCOL_FN_NOT_STARTED / BCOL_FN_STARTED / BCOL_FN_COMPLETE -
 * confirm which values implementations return.
 *
 * @retval OMPI_SUCCESS successful completion
 * @retval OMPI_ERROR function returned error
 */
/* forward declaration */
struct mca_bcol_base_module_t;
/* collective function prototype - all functions have the same interface
 * so that we can call them via a function pointer */
struct coll_ml_function_t;
typedef int (*mca_bcol_base_module_collective_fn_primitives_t)
(bcol_function_args_t *input_args, struct coll_ml_function_t *const_args);
/* per-collective initialization function, invoked with the bcol module */
typedef int (*mca_bcol_base_module_collective_init_fn_primitives_t)
(struct mca_bcol_base_module_t *bcol_module);
/**
 * Function to query for collective function attributes
 *
 * @param attribute (IN) the attribute of interest
 * @param algorithm_parameters (OUT) the value of attribute for this
 * function. If this attribute is not supported,
 * OMPI_ERR_NOT_FOUND is returned.
 */
typedef int (*mca_bcol_get_collective_attributes)(int attribute,
void *algorithm_parameters);
/* Data structure for tracking the relevant data needed for ml level
 * algorithm construction (e.g., function selection), initialization, and
 * usage.
 */
struct coll_bcol_collective_description_t {
/* collective initiation function - first function called */
mca_bcol_base_module_collective_fn_primitives_t coll_fn;
/* collective progress function */
mca_bcol_base_module_collective_fn_primitives_t progress_fn;
/* attribute query function */
mca_bcol_get_collective_attributes get_attributes;
/* attributes supported - bit map */
uint64_t attribute;
};
typedef struct coll_bcol_collective_description_t
coll_bcol_collective_description_t;
/* Collective operation attributes */
enum {
    /* Supports dynamic decisions - e.g., the collective operation does not
     * need to be fully defined before it can be started.
     */
    BCOL_ATTRIBUTE_DYNAMIC = 0,
    /* number of attributes */
    BCOL_NUM_ATTRIBUTES
};

/* For rooted collectives:
 * does the algorithm know its data source?
 */
enum {
    DATA_SRC_KNOWN   = 0,
    DATA_SRC_UNKNOWN = 1,
    DATA_SRC_TYPES   = 2    /* number of data-source kinds */
};

/* Waiting semantics */
enum {
    BLOCKING     = 0,
    NON_BLOCKING = 1
};
/* gvm: selection logic */

/* Communicator-level attributes describing a collective function. */
typedef struct mca_bcol_base_coll_fn_comm_attributes_t {
    int bcoll_type;
    int comm_size_min;
    int comm_size_max;
    int data_src;
    int waiting_semantics;
} mca_bcol_base_coll_fn_comm_attributes_t;

/* Invocation-level attributes describing a collective function. */
typedef struct mca_bcol_base_coll_fn_invoke_attributes_t {
    int bcol_msg_min;
    int bcol_msg_max;
    /* Max is OMPI_DATATYPE_MAX_PREDEFINED defined to be 45 */
    uint64_t datatype_bitmap;
    /* bit map of optypes supported */
    uint32_t op_types_bitmap;
} mca_bcol_base_coll_fn_invoke_attributes_t;
/* List item describing one collective function: its communicator-level and
 * invocation-level attributes plus the start and progress entry points. */
struct mca_bcol_base_coll_fn_desc_t {
/* list linkage */
opal_list_item_t super;
struct mca_bcol_base_coll_fn_comm_attributes_t *comm_attr;
struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attr;
mca_bcol_base_module_collective_fn_primitives_t coll_fn;
mca_bcol_base_module_collective_fn_primitives_t progress_fn;
};
typedef struct mca_bcol_base_coll_fn_desc_t mca_bcol_base_coll_fn_desc_t;
/* NOTE(review): unlike the other class declarations in this header, this one
 * is not prefixed with OMPI_DECLSPEC - confirm whether that is intentional. */
OBJ_CLASS_DECLARATION(mca_bcol_base_coll_fn_desc_t);
/* end selection logic */
/* collective initialization function taking the bcol module and its
 * sub-group module */
typedef int (*mca_bcol_base_module_collective_init_fn_t)
(struct mca_bcol_base_module_t *bcol_module,
mca_sbgp_base_module_t *sbgp_module);
/* per communicator memory initialization function.
 * NOTE(review): the second parameter is named "module" but its type is the
 * component type (mca_bcol_base_component_t) - confirm which is intended. */
typedef int (*mca_bcol_module_mem_init)(struct ml_buffers_t *registered_buffers,
mca_bcol_base_component_t *module);
/* Initialize memory block - ml_memory_block initialization interface function
 *
 * Invoked at the ml level, used to pass bcol specific registration information
 * for the "ml_memory_block"
 *
 * @param[in] ml_module Pointer to the ml module.
 * NOTE(review): the original text documented an "ml_memory_block" parameter
 * (a struct containing bcol specific registration information and a call back
 * function used for resource recycling); that matches the superseded
 * signature kept in the comment below, not the current one - presumably the
 * memory block is now reached through ml_module; confirm.
 *
 * @param[in] bcol_module Pointer to the bcol module.
 *
 * @param[in] reg_data bcol specific registration data.
 *
 * @returns On Success: OMPI_SUCCESS
 * On Failure: OMPI_ERROR
 *
 */
/* Superseded signature, kept for reference:
typedef int (*mca_bcol_base_init_memory_fn_t)
(struct ml_memory_block_desc_t *ml_block, void *reg_data);*/
typedef int (*mca_bcol_base_init_memory_fn_t)
(struct mca_coll_ml_module_t *ml_module,
struct mca_bcol_base_module_t *bcol_module,
void *reg_data);
/* netpatterns allgather/k-nomial initialization hook, invoked with the bcol
 * module */
typedef int (*mca_common_allgather_init_fn_t)
(struct mca_bcol_base_module_t *bcol_module);
/* hook that sets the small-message thresholds of the given bcol module */
typedef void (*mca_bcol_base_set_thresholds_fn_t)
(struct mca_bcol_base_module_t *bcol_module);
/* Feature flags; each one occupies a distinct bit so they can be OR-ed
 * together. */
enum {
    MCA_BCOL_BASE_ZERO_COPY                  = 0x1,
    MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG = 0x2,
    MCA_BCOL_BASE_NO_ML_BUFFER_FOR_BARRIER   = 0x4
};
/* base module */
struct mca_bcol_base_module_t {
/* parent object */
opal_object_t super;
/* bcol component (Pasha: Do we really need to cache the component?) */
mca_bcol_base_component_t *bcol_component;
/* network context that is used by this bcol;
   only one context per bcol is allowed */
bcol_base_network_context_t *network_context;
/* We are going to use the context index a lot;
   in order to decrease the number of dereferences
   (bcol->network_context->index)
   we are caching the value on the bcol */
int context_index;
/* Set of flags that describe features supported by bcol */
uint64_t supported_mode;
/* per communicator memory initialization function */
mca_bcol_module_mem_init init_module;
/* sub-grouping module partner */
mca_sbgp_base_module_t *sbgp_partner_module;
/* size of subgroup - cache this, so we can have access when
 * sbgp_partner_module no longer exists */
int size_of_subgroup;
/* sequence number offset - want to make sure that we start
 * id'ing collectives with id 0, so we can have simple
 * resource management.
 */
int64_t squence_number_offset;
/* number of times to poll for operation completion before
 * breaking out of a non-blocking collective operation
 */
int n_poll_loops;
/* size of header that will go in data buff, should not include
 * any info regarding alignment, let the ml level handle this
 */
uint32_t header_size;
/* Each bcol is assigned a unique value;
 * see if we can get away with a 16-bit id
 */
int16_t bcol_id;
/* FIXME:
 * Since mca_bcol_base_module_t is the only parameter which will be passed
 * into bcol_basesmuma_bcast_init(), this flag was added to indicate whether
 * the hdl-based algorithms will get enabled.
 */
bool use_hdl;
/*
 * Collective function pointers
 */
/* changing function signature - will replace bcol_functions */
mca_bcol_base_module_collective_fn_primitives_t bcol_function_table[BCOL_NUM_OF_FUNCTIONS];
/* Tables hold pointers to functions */
mca_bcol_base_module_collective_init_fn_primitives_t bcol_function_init_table[BCOL_NUM_OF_FUNCTIONS];
/* one list per collective.
 * NOTE(review): presumably holds mca_bcol_base_coll_fn_desc_t items - confirm */
opal_list_t bcol_fns_table[BCOL_NUM_OF_FUNCTIONS];
/* function descriptor lookup table, indexed by
 * [data source][2][collective][message range][op type][datatype].
 * NOTE(review): the dimension of size 2 is presumably BLOCKING /
 * NON_BLOCKING - confirm */
struct mca_bcol_base_coll_fn_desc_t*
filtered_fns_table[DATA_SRC_TYPES][2][BCOL_NUM_OF_FUNCTIONS][NUM_MSG_RANGES+1][OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED];
/*
 * Bcol interface function to pass bcol specific
 * info and memory recycling call back
 */
mca_bcol_base_init_memory_fn_t bcol_memory_init;
/*
 * netpatterns interface function, would like to invoke this
 * on the ml level
 */
mca_common_allgather_init_fn_t k_nomial_tree;
/* Each bcol caches a list which describes how many ranks
 * are "below" each rank in this bcol
 */
int *list_n_connected;
/* offsets for scatter/gather */
int hier_scather_offset;
/* Small message threshold for each collective */
int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS];
/* Set small_message_thresholds array */
mca_bcol_base_set_thresholds_fn_t set_small_msg_thresholds;
/* Pointer to the order counter on the upper layer,
   used if the bcol needs to be ordered */
int *next_inorder;
};
typedef struct mca_bcol_base_module_t mca_bcol_base_module_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_module_t);
/* bcol descriptor; currently only wraps a free-list item. */
struct mca_bcol_base_descriptor_t {
/* free-list linkage */
ompi_free_list_item_t super;
/* Vasily: will be described in the future */
};
typedef struct mca_bcol_base_descriptor_t mca_bcol_base_descriptor_t;
/*
 * Ordering guard for bcol functions.
 *
 * Compares the counter that (module)->next_inorder points to with
 * (bcol_function_args)->order_info.order_num.  When they differ, the
 * ENCLOSING function returns BCOL_FN_NOT_STARTED; otherwise execution
 * continues after the macro.
 *
 * The definition deliberately does not end with a semicolon: the caller
 * supplies it, so the macro behaves like a single statement and can be used
 * in an unbraced if/else.
 */
#define MCA_BCOL_CHECK_ORDER(module, bcol_function_args)            \
    do {                                                            \
        if (*((module)->next_inorder) !=                            \
            (bcol_function_args)->order_info.order_num) {           \
            return BCOL_FN_NOT_STARTED;                             \
        }                                                           \
    } while (0)
/*
 * Record that one more ordered bcol function has started.
 *
 * Increments (order_info)->bcols_started and, once it equals
 * (order_info)->n_fns_need_ordering, advances the counter that
 * (module)->next_inorder points to.
 *
 * The definition deliberately does not end with a semicolon: the caller
 * supplies it, so the macro behaves like a single statement and can be used
 * in an unbraced if/else.
 */
#define MCA_BCOL_UPDATE_ORDER_COUNTER(module, order_info)           \
    do {                                                            \
        (order_info)->bcols_started++;                              \
        if ((order_info)->n_fns_need_ordering ==                    \
            (order_info)->bcols_started) {                          \
            ++(*((module)->next_inorder));                          \
        }                                                           \
    } while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_BCOL_H */