/* * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #ifndef MCA_BCOL_H #define MCA_BCOL_H #include "ompi_config.h" #include "opal/class/opal_list.h" #include "opal/mca/mca.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/mpool/mpool.h" #include "ompi/mca/sbgp/sbgp.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/op/op.h" #include "ompi/include/ompi/constants.h" #include "ompi/patterns/net/netpatterns_knomial_tree.h" #include "opal/util/show_help.h" #include #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif /* Forward declaration - please do not remove it */ struct ml_memory_block_desc_t; struct mca_coll_ml_module_t; struct ml_buffers_t; struct mca_bcol_base_coll_fn_comm_attributes_t; struct mca_bcol_base_coll_fn_invoke_attributes_t; struct mca_bcol_base_coll_fn_desc_t; #define NUM_MSG_RANGES 5 #define MSG_RANGE_INITIAL (1024)*12 #define MSG_RANGE_INC 10 #define BCOL_THRESHOLD_UNLIMITED (INT_MAX) #define BCOL_HEAD_ALIGN 32 /* will turn into an MCA parameter after debug */ /* * Functions supported */ enum bcol_coll { /* blocking functions */ BCOL_ALLGATHER, BCOL_ALLGATHERV, BCOL_ALLREDUCE, BCOL_ALLTOALL, BCOL_ALLTOALLV, BCOL_ALLTOALLW, BCOL_BARRIER, BCOL_BCAST, BCOL_EXSCAN, BCOL_GATHER, BCOL_GATHERV, BCOL_REDUCE, BCOL_REDUCE_SCATTER, BCOL_SCAN, BCOL_SCATTER, BCOL_SCATTERV, BCOL_FANIN, BCOL_FANOUT, /* nonblocking functions */ BCOL_IALLGATHER, BCOL_IALLGATHERV, BCOL_IALLREDUCE, BCOL_IALLTOALL, BCOL_IALLTOALLV, BCOL_IALLTOALLW, BCOL_IBARRIER, BCOL_IBCAST, BCOL_IEXSCAN, BCOL_IGATHER, BCOL_IGATHERV, BCOL_IREDUCE, BCOL_IREDUCE_SCATTER, BCOL_ISCAN, BCOL_ISCATTER, BCOL_ISCATTERV, BCOL_IFANIN, BCOL_IFANOUT, BCOL_SYNC, /* New function - needed for intermediate steps */ BCOL_REDUCE_TO_LEADER, BCOL_NUM_OF_FUNCTIONS }; typedef enum bcol_coll bcol_coll; typedef enum bcol_elem_type { BCOL_SINGLE_ELEM_TYPE, BCOL_MULTI_ELEM_TYPE, BCOL_NUM_OF_ELEM_TYPES } bcol_elem_type; typedef int (*mca_bcol_base_module_coll_support_all_types_fn_t)(bcol_coll coll_name); typedef int (*mca_bcol_base_module_coll_support_fn_t)(int op, int dtype, bcol_elem_type elem_num); /* * Collective function status */ enum { BCOL_FN_NOT_STARTED = (OMPI_ERR_MAX - 1), BCOL_FN_STARTED = (OMPI_ERR_MAX - 2), BCOL_FN_COMPLETE = (OMPI_ERR_MAX - 3) }; /* Originally this enum was placed in ompi/op/op.h file. It should be moved back * when we are ready to lobby for its inclusion. Since we are releasing only the * bcast and barrier initially and this struct supports the allreduce, we are not * going to worry about it now. Note that in the same h-file, op.h, the struct "ompi_op_t" * also has a field that we introduced called "enum ompi_op_type op_type" that this needs to * be resolved also. */ enum ompi_op_type { OMPI_OP_NULL, OMPI_OP_MAX, OMPI_OP_MIN, OMPI_OP_SUM, OMPI_OP_PROD, OMPI_OP_LAND, OMPI_OP_BAND, OMPI_OP_LOR, OMPI_OP_BOR, OMPI_OP_LXOR, OMPI_OP_BXOR, OMPI_OP_MAXLOC, OMPI_OP_MINLOC, OMPI_OP_REPLACE, OMPI_OP_NUM_OF_TYPES }; /** * Collective component initialization * * Initialize the given collective component. This function should * initialize any component-level. data. It will be called exactly * once during MPI_INIT. * * @note The component framework is not lazily opened, so attempts * should be made to minimze the amount of memory allocated during * this function. * * @param[in] enable_progress_threads True if the component needs to * support progress threads * @param[in] enable_mpi_threads True if the component needs to * support MPI_THREAD_MULTIPLE * * @retval OMPI_SUCCESS Component successfully initialized * @retval ORTE_ERROR An unspecified error occurred */ typedef int (*mca_bcol_base_component_init_query_fn_t) (bool enable_progress_threads, bool enable_mpi_threads); /** * Query whether a component is available for the given sub-group * * Query whether the component is available for the given * sub-group. If the component is available, an array of pointers should be * allocated and returned (with refcount at 1). The module will not * be used for collective operations until module_enable() is called * on the module, but may be destroyed (via OBJ_RELEASE) either before * or after module_enable() is called. If the module needs to release * resources obtained during query(), it should do so in the module * destructor. * * A component may provide NULL to this function to indicate it does * not wish to run or return an error during module_enable(). * * @note The communicator is available for point-to-point * communication, but other functionality is not available during this * phase of initialization. * * @param[in] sbgp Pointer to sub-group module. * @param[out] priority Priority setting for component on * this communicator * @param[out] num_modules Number of modules that where generated * for the sub-group module. * * @returns An array of pointer to an initialized modules structures if the component can * provide a modules with the requested functionality or NULL if the * component should not be used on the given communicator. */ typedef struct mca_bcol_base_module_t **(*mca_bcol_base_component_comm_query_fn_t) (mca_sbgp_base_module_t *sbgp, int *num_modules); typedef int (*mca_bcol_barrier_init_fn_t)(struct mca_bcol_base_module_t *bcol_module, mca_sbgp_base_module_t *sbgp_module); /* * Macro for use in modules that are of type btl v2.0.0 */ #define MCA_BCOL_BASE_VERSION_2_0_0 \ MCA_BASE_VERSION_2_0_0, \ "bcol", 2, 0, 0 /* This is really an abstarction violation, but is the easiest way to get * started. For memory management we need to know what bcol components * have compatible memory management schemes. Such compatibility can * be used to eliminate memory copies between levels in the collective * operation hierarchy, by having the output buffer of one level be the * input buffer to the next level */ enum { BCOL_SHARED_MEMORY_UMA=0, BCOL_SHARED_MEMORY_SOCKET, BCOL_POINT_TO_POINT, BCOL_IB_OFFLOAD, BCOL_SIZE }; OMPI_DECLSPEC extern int bcol_mpool_compatibility[BCOL_SIZE][BCOL_SIZE]; OMPI_DECLSPEC extern int bcol_mpool_index[BCOL_SIZE][BCOL_SIZE]; /* what are the input parameters ? too many void * pointers here */ typedef int (*bcol_register_mem_fn_t)(void *context_data, void *base, size_t size, void **reg_desc); /* deregistration function */ typedef int (*bcol_deregister_mem_fn_t)(void *context_data, void *reg_desc); /* Bcol network context definition */ struct bcol_base_network_context_t { opal_object_t super; /* Context id - defined by upper layer, ML */ int context_id; /* Any context information that bcol what to use */ void *context_data; /* registration function */ bcol_register_mem_fn_t register_memory_fn; /* deregistration function */ bcol_deregister_mem_fn_t deregister_memory_fn; }; typedef struct bcol_base_network_context_t bcol_base_network_context_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(bcol_base_network_context_t); /* *primitive function types */ /* bcast */ enum { /* small data function */ BCOL_BCAST_SMALL_DATA, /* small data - dynamic decision making supported */ BCOL_BCAST_SMALL_DATA_DYNAMIC, /* number of functions */ BCOL_NUM_BCAST_FUNCTIONS }; /** * BCOL instance. */ /* no limit on fragment size - this supports using user buffers rather * than library buffers */ #define FRAG_SIZE_NO_LIMIT -1 /* forward declaration */ struct coll_bcol_collective_description_t; struct mca_bcol_base_component_2_0_0_t { /** Base component description */ mca_base_component_t bcol_version; /** Component initialization function */ mca_bcol_base_component_init_query_fn_t collm_init_query; /** Query whether component is useable for given communicator */ mca_bcol_base_component_comm_query_fn_t collm_comm_query; /** If bcol supports all possible data types */ mca_bcol_base_module_coll_support_fn_t coll_support; /** If bcol supports all possible data types for given collective operation */ mca_bcol_base_module_coll_support_all_types_fn_t coll_support_all_types; /** Use this flag to prevent init_query multiple calls in case we have the same bcol more than on a single level */ bool init_done; /** If collective calls with bcols of this type need to be ordered */ bool need_ordering; /** MCA parameter: Priority of this component */ int priority; /** Bcast function pointers */ struct coll_bcol_collective_description_t * bcast_functions[BCOL_NUM_BCAST_FUNCTIONS]; /** Number of network contexts - need this for resource management */ int n_net_contexts; /** List of network contexts */ bcol_base_network_context_t **network_contexts; /* * Fragmentation support */ /** Minimum fragement size */ size_t min_frag_size; /** Maximum fragment size */ int32_t max_frag_size; /** Supports direct use of user-buffers */ int can_use_user_buffers; /** Support pipelining */ int use_pipeline; }; typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_2_0_0_t; typedef struct mca_bcol_base_component_2_0_0_t mca_bcol_base_component_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_component_t); /* forward declaration */ struct mca_coll_ml_descriptor_t; struct ml_payload_buffer_desc_t; struct mca_coll_ml_route_info_t; typedef struct { int order_num; /* Seq num of collective fragment */ int bcols_started; /* How many bcols need ordering have been started */ int n_fns_need_ordering; /* The number of functions are called for bcols need ordering */ } mca_bcol_base_order_info_t; struct bcol_function_args_t { /* full message sequence number */ int64_t sequence_num; /* full message descriptor - single copy of fragment invariant * parameters */ /* Pasha: We don need this one for new flow - remove it */ struct mca_coll_ml_descriptor_t *full_message_descriptor; struct mca_coll_ml_route_info_t *root_route; /* function status */ int function_status; /* root, for rooted operations */ int root; /* input buffer */ void *sbuf; void *rbuf; void *userbuf; struct ml_payload_buffer_desc_t *src_desc; struct ml_payload_buffer_desc_t *dst_desc; /* ml buffer size */ uint32_t buffer_size; /* index of buffer in ml payload cache */ int buffer_index; int count; struct ompi_datatype_t *dtype; struct ompi_op_t *op; int sbuf_offset; int rbuf_offset; /* for bcol opaque data */ void *bcol_opaque_data; /* An output argument that will be used by BCOL funstion to tell ML that the result of the BCOL is in rbuf */ bool result_in_rbuf; bool root_flag; /* True if the rank is root of operation */ int status; /* Used for non-blocking collective completion */ uint32_t frag_size; /* fragment size for large messages */ int hier_factor; /* factor used when bcast is invoked as a service function back down * the tree in allgather for example, the pacl_len is not the actual * len of the data needing bcasting */ mca_bcol_base_order_info_t order_info; }; typedef struct bcol_function_args_t bcol_function_args_t; /* The collective operation is defined by a series of collective operations * invoked through a function pointer. Each function may be different, * so will store the arguments in a struct and pass a pointer to the struct, * and use this as a way to hide the different function signatures. * * @param[in] input_args Structure with function arguments * @param[in] bcol_desc Component specific paremeters * @param[out] status return status of the function * MCA_BCOL_COMPLETE - function completed * MCA_BCOL_IN_PROGRESS - function incomplete * * @retval OMPI_SUCCESS successful completion * @retval OMPI_ERROR function returned error */ /* forward declaration */ struct mca_bcol_base_module_t; /* collective function prototype - all functions have the same interface * so that we can call them via a function pointer */ struct coll_ml_function_t; typedef int (*mca_bcol_base_module_collective_fn_primitives_t) (bcol_function_args_t *input_args, struct coll_ml_function_t *const_args); typedef int (*mca_bcol_base_module_collective_init_fn_primitives_t) (struct mca_bcol_base_module_t *bcol_module); /** * function to query for collctive function attributes * * @param attribute (IN) the attribute of interest * @param algorithm_parameters (OUT) the value of attribute for this * function. If this attribute is not supported, * OMPI_ERR_NOT_FOUND is returned. */ typedef int (*mca_bcol_get_collective_attributes)(int attribute, void *algorithm_parameters); /* data structure for tracking the relevant data needed for ml level * algorithm construction (e.g., function selection), initialization, and * usage. */ struct coll_bcol_collective_description_t { /* collective initiation function - first functin called */ mca_bcol_base_module_collective_fn_primitives_t coll_fn; /* collective progress function - first functin called */ mca_bcol_base_module_collective_fn_primitives_t progress_fn; /* collective progress function - first functin called */ mca_bcol_get_collective_attributes get_attributes; /* attributes supported - bit map */ uint64_t attribute; }; typedef struct coll_bcol_collective_description_t coll_bcol_collective_description_t; /* collective operation attributes */ enum { /* supports dynamic decisions - e.g., do not need to have the collective * operation fully defined before it can be started */ BCOL_ATTRIBUTE_DYNAMIC, /* number of attributes */ BCOL_NUM_ATTRIBUTES }; /* For rooted collectives, * does the algorithm knows its data source ? */ enum { DATA_SRC_KNOWN=0, DATA_SRC_UNKNOWN, DATA_SRC_TYPES }; enum { BLOCKING, NON_BLOCKING }; /* gvm For selection logic */ struct mca_bcol_base_coll_fn_comm_attributes_t { int bcoll_type; int comm_size_min; int comm_size_max; int data_src; int waiting_semantics; }; typedef struct mca_bcol_base_coll_fn_comm_attributes_t mca_bcol_base_coll_fn_comm_attributes_t; struct mca_bcol_base_coll_fn_invoke_attributes_t { int bcol_msg_min; int bcol_msg_max; uint64_t datatype_bitmap; /* Max is OMPI_DATATYPE_MAX_PREDEFINED defined to be 45 */ uint32_t op_types_bitmap; /* bit map of optypes supported */ }; typedef struct mca_bcol_base_coll_fn_invoke_attributes_t mca_bcol_base_coll_fn_invoke_attributes_t; struct mca_bcol_base_coll_fn_desc_t { opal_list_item_t super; struct mca_bcol_base_coll_fn_comm_attributes_t *comm_attr; struct mca_bcol_base_coll_fn_invoke_attributes_t *inv_attr; mca_bcol_base_module_collective_fn_primitives_t coll_fn; mca_bcol_base_module_collective_fn_primitives_t progress_fn; }; typedef struct mca_bcol_base_coll_fn_desc_t mca_bcol_base_coll_fn_desc_t; OBJ_CLASS_DECLARATION(mca_bcol_base_coll_fn_desc_t); /* end selection logic */ typedef int (*mca_bcol_base_module_collective_init_fn_t) (struct mca_bcol_base_module_t *bcol_module, mca_sbgp_base_module_t *sbgp_module); /* per communicator memory initialization function */ typedef int (*mca_bcol_module_mem_init)(struct ml_buffers_t *registered_buffers, mca_bcol_base_component_t *module); /* Initialize memory block - ml_memory_block initialization interface function * * Invoked at the ml level, used to pass bcol specific registration information * for the "ml_memory_block" * * @param[in] ml_memory_block Pointer to the ml_memory_block. This struct * contains bcol specific registration information and a call back function * used for resource recycling. * * @param[in] reg_data bcol specific registration data. * * @returns On Success: OMPI_SUCCESS * On Failure: OMPI_ERROR * */ /*typedef int (*mca_bcol_base_init_memory_fn_t) (struct ml_memory_block_desc_t *ml_block, void *reg_data);*/ typedef int (*mca_bcol_base_init_memory_fn_t) (struct mca_coll_ml_module_t *ml_module, struct mca_bcol_base_module_t *bcol_module, void *reg_data); typedef int (*mca_common_allgather_init_fn_t) (struct mca_bcol_base_module_t *bcol_module); typedef void (*mca_bcol_base_set_thresholds_fn_t) (struct mca_bcol_base_module_t *bcol_module); enum { MCA_BCOL_BASE_ZERO_COPY = 1, MCA_BCOL_BASE_NO_ML_BUFFER_FOR_LARGE_MSG = 1 << 1, MCA_BCOL_BASE_NO_ML_BUFFER_FOR_BARRIER = 1 << 2 }; /* base module */ struct mca_bcol_base_module_t { /* base coll component */ opal_object_t super; /* bcol component (Pasha: Do we really need cache the component?)*/ mca_bcol_base_component_t *bcol_component; /* network context that is used by this bcol only one context per bcol is allowed */ bcol_base_network_context_t *network_context; /* We are going to use the context index a lot, int order to decrease number of dereferences bcol->network_context->index we are caching the value on bcol */ int context_index; /* Set of flags that describe features supported by bcol */ uint64_t supported_mode; /* per communicator memory initialization function */ mca_bcol_module_mem_init init_module; /* sub-grouping module partner */ mca_sbgp_base_module_t *sbgp_partner_module; /* size of subgroup - cache this, so can have access when * sbgp_partner_module no longer existes */ int size_of_subgroup; /* sequence number offset - want to make sure that we start * id'ing collectives with id 0, so we can have simple * resource management. */ int64_t squence_number_offset; /* number of times to poll for operation completion before * breaking out of a non-blocking collective operation */ int n_poll_loops; /* size of header that will go in data buff, should not include * any info regarding alignment, let the ml level handle this */ uint32_t header_size; /* Each bcol is assigned a unique value * see if we can get away with 16-bit id */ int16_t bcol_id; /*FIXME: * Since mca_bcol_base_module_t is the only parameter which will be passed * into the bcol_basesmuma_bcast_init(), add the flag to indicate whether * the hdl-based algorithms will get enabled. */ bool use_hdl; /* * Collective function pointers */ /* changing function signature - will replace bcol_functions */ mca_bcol_base_module_collective_fn_primitives_t bcol_function_table[BCOL_NUM_OF_FUNCTIONS]; /* Tables hold pointers to functions */ mca_bcol_base_module_collective_init_fn_primitives_t bcol_function_init_table[BCOL_NUM_OF_FUNCTIONS]; opal_list_t bcol_fns_table[BCOL_NUM_OF_FUNCTIONS]; struct mca_bcol_base_coll_fn_desc_t* filtered_fns_table[DATA_SRC_TYPES][2][BCOL_NUM_OF_FUNCTIONS][NUM_MSG_RANGES+1][OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED]; /* * Bcol interface function to pass bcol specific * info and memory recycling call back */ mca_bcol_base_init_memory_fn_t bcol_memory_init; /* * netpatterns interface function, would like to invoke this on * on the ml level */ mca_common_allgather_init_fn_t k_nomial_tree; /* Each bcol caches a list which describes how many ranks * are "below" each rank in this bcol */ int *list_n_connected; /* offsets for scatter/gather */ int hier_scather_offset; /* Small message threshold for each collective */ int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS]; /* Set small_message_thresholds array */ mca_bcol_base_set_thresholds_fn_t set_small_msg_thresholds; /* Pointer to the order counter on the upper layer, used if the bcol needs to be ordered */ int *next_inorder; }; typedef struct mca_bcol_base_module_t mca_bcol_base_module_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_bcol_base_module_t); struct mca_bcol_base_descriptor_t { ompi_free_list_item_t super; /* Vasily: will be described in the future */ }; typedef struct mca_bcol_base_descriptor_t mca_bcol_base_descriptor_t; #define MCA_BCOL_CHECK_ORDER(module, bcol_function_args) \ do { \ if (*((module)->next_inorder) != \ (bcol_function_args)->order_info.order_num) { \ return BCOL_FN_NOT_STARTED; \ } \ } while (0); #define MCA_BCOL_UPDATE_ORDER_COUNTER(module, order_info) \ do { \ (order_info)->bcols_started++; \ if ((order_info)->n_fns_need_ordering == \ (order_info)->bcols_started) { \ ++(*((module)->next_inorder)); \ } \ } while (0); #if defined(c_plusplus) || defined(__cplusplus) } #endif #endif /* MCA_BCOL_H */