2014-01-22 19:39:19 +04:00
|
|
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
2012-08-16 23:11:35 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
|
|
|
|
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
|
2014-01-22 19:39:19 +04:00
|
|
|
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
2014-03-28 03:25:31 +04:00
|
|
|
* Copyright (c) 2014 Research Organization for Information Science
|
|
|
|
* and Technology (RIST). All rights reserved.
|
2012-08-16 23:11:35 +04:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/** @file */
|
|
|
|
|
|
|
|
#ifndef MCA_COLL_ML_ML_H
|
|
|
|
#define MCA_COLL_ML_ML_H
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
|
|
|
|
|
|
|
#include "opal/mca/mca.h"
|
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/datatype/opal_convertor.h"
|
|
|
|
#include "opal/threads/mutex.h"
|
|
|
|
|
|
|
|
#include "ompi/mca/coll/coll.h"
|
|
|
|
#include "ompi/request/request.h"
|
|
|
|
#include "ompi/mca/bcol/bcol.h"
|
|
|
|
#include "ompi/mca/sbgp/sbgp.h"
|
|
|
|
#include "ompi/op/op.h"
|
George did the work and deserves all the credit for it. Ralph did the merge, and deserves whatever blame results from errors in it :-)
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down in OPAL
All the components required for inter-process communications are currently deeply integrated in the OMPI layer. Several groups/institutions have expressed interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purposes. UTK, with support from Sandia, developed a version of Open MPI where the entire communication infrastructure has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with few exceptions (mainly BTLs where I have no way of compiling/testing them). Thus, the completion of this RFC is tied to being able to complete this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.
This commit was SVN r32317.
2014-07-26 04:47:28 +04:00
|
|
|
#include "opal/class/ompi_free_list.h"
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
#include "coll_ml_lmngr.h"
|
|
|
|
#include "coll_ml_functions.h"
|
|
|
|
#include "coll_ml_colls.h"
|
|
|
|
#include "coll_ml_allocation.h"
|
|
|
|
#include "coll_ml_config.h"
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
|
|
|
|
|
|
|
/* Return-status constants (enumerators, not macros). */
enum {
    ML_OMPI_COMPLETE = 1,
    ML_OMPI_INCOMPLETE
};
|
|
|
|
|
|
|
|
/* Message-size classes; ML_NUM_MSG is the number of classes and is used
 * as an array dimension elsewhere in this header. */
enum {
    ML_SMALL_MSG,
    ML_LARGE_MSG,
    ML_NUM_MSG
};
|
|
|
|
|
|
|
|
/* ML collective IDs.  The blocking IDs come first, followed by the
 * nonblocking IDs in the same order; ML_NUM_OF_FUNCTIONS is the total
 * count and must stay last. */
enum {
    /* blocking functions */
    ML_ALLGATHER,
    ML_ALLGATHERV,
    ML_ALLREDUCE,
    ML_ALLTOALL,
    ML_ALLTOALLV,
    ML_ALLTOALLW,
    ML_BARRIER,
    ML_BCAST,
    ML_EXSCAN,
    ML_GATHER,
    ML_GATHERV,
    ML_REDUCE,
    ML_REDUCE_SCATTER,
    ML_SCAN,
    ML_SCATTER,
    ML_SCATTERV,
    ML_FANIN,
    ML_FANOUT,

    /* nonblocking functions */
    ML_IALLGATHER,
    ML_IALLGATHERV,
    ML_IALLREDUCE,
    ML_IALLTOALL,
    ML_IALLTOALLV,
    ML_IALLTOALLW,
    ML_IBARRIER,
    ML_IBCAST,
    ML_IEXSCAN,
    ML_IGATHER,
    ML_IGATHERV,
    ML_IREDUCE,
    ML_IREDUCE_SCATTER,
    ML_ISCAN,
    ML_ISCATTER,
    ML_ISCATTERV,
    ML_IFANIN,
    ML_IFANOUT,
    ML_NUM_OF_FUNCTIONS
};
|
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/* ML broadcast algorithms */
enum {
    COLL_ML_STATIC_BCAST,
    COLL_ML_SEQ_BCAST,
    COLL_ML_UNKNOWN_BCAST
};
|
|
|
|
|
2012-08-16 23:11:35 +04:00
|
|
|
struct mca_bcol_base_module_t;
|
|
|
|
|
|
|
|
/* collective function arguments - gives
|
|
|
|
* one function signature for calling all collective setup
|
|
|
|
* routines, with the initial call to a collective function having
|
|
|
|
* the context to access the right parts of the data structure.
|
|
|
|
* this information is used by each of the setup functions to
|
|
|
|
* setup the correct information for each of the functions in the
|
|
|
|
* hierarchy that will be called. */
|
|
|
|
|
|
|
|
/* RLG NOTE: Need to figure out what arguments to store here,
|
|
|
|
* and which ones directly in the message descriptor
|
|
|
|
*/
|
|
|
|
struct mpi_coll_fn_params_t {
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
ompi_communicator_t *comm;
|
|
|
|
int n_fanin_steps;
|
|
|
|
int n_fanout_steps;
|
|
|
|
int n_recursive_doubling_steps;
|
|
|
|
} ibarrier_recursive_doubling;
|
|
|
|
|
|
|
|
struct {
|
|
|
|
int root;
|
|
|
|
ompi_communicator_t *comm;
|
|
|
|
struct ompi_datatype_t *datatype;
|
|
|
|
} ibcast;
|
|
|
|
} coll_fn;
|
|
|
|
};
|
|
|
|
typedef struct mpi_coll_fn_params_t mpi_coll_fn_params_t;
|
|
|
|
|
|
|
|
/* Algorithm parameters needed by the setup function. */
struct mpi_coll_algorithm_params_t {
    /* one member per collective; only one is meaningful at a time */
    union {
        struct {
            int n_fanin_steps;
            int n_fanout_steps;
            int n_recursive_doubling_steps;
        } ibarrier_recursive_doubling;

        struct {
            int place_holder;
        } ibcast;
    } coll_fn;
};
typedef struct mpi_coll_algorithm_params_t mpi_coll_algorithm_params_t;
|
|
|
|
|
|
|
|
/* setup function - used to setup each segment (or fragment)
|
|
|
|
* to be processed
|
|
|
|
*/
|
|
|
|
struct mca_coll_ml_module_t;
|
|
|
|
struct mca_coll_ml_topology_t;
|
|
|
|
|
|
|
|
typedef int (*coll_fragment_comm_setup_fn)(struct mca_coll_ml_module_t *ml_module,
|
|
|
|
mpi_coll_fn_params_t *fn_params, mpi_coll_algorithm_params_t *algorithm_params);
|
|
|
|
/* full collective description */
|
|
|
|
struct coll_ml_collective_description_t {
|
|
|
|
/* number of temp buffers */
|
|
|
|
int n_buffers;
|
|
|
|
|
|
|
|
/* description size */
|
|
|
|
int n_functions;
|
|
|
|
|
|
|
|
/* collective setup function - called for every non-blocking
|
|
|
|
* function, and for each fragment of such a message
|
|
|
|
*/
|
|
|
|
coll_fragment_comm_setup_fn *coll_fn_setup_fn;
|
|
|
|
|
|
|
|
/* algorithm parameters */
|
|
|
|
mpi_coll_algorithm_params_t alg_params;
|
|
|
|
|
|
|
|
/* list of functions */
|
2014-02-07 23:15:45 +04:00
|
|
|
mca_bcol_base_function_t *functions;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/* function names - for debugging */
|
|
|
|
char **function_names;
|
|
|
|
|
|
|
|
/* Signalling collective completion */
|
|
|
|
bool completion_flag;
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct coll_ml_collective_description_t coll_ml_collective_description_t;
|
|
|
|
|
|
|
|
/* Utility data structure: per-rank properties.  Used as the element
 * type of sub_group_params_t::rank_data. */
struct rank_properties_t {
    int rank;
    int leaf;
    int num_of_ranks_represented;
};
typedef struct rank_properties_t rank_properties_t;
|
|
|
|
|
|
|
|
/* data structure for holding node information for the nodes of the
|
|
|
|
* hierarchical communications tree.
|
|
|
|
*/
|
|
|
|
struct sub_group_params_t {
|
|
|
|
/* rank of root in the communicator */
|
|
|
|
int root_rank_in_comm;
|
|
|
|
|
|
|
|
/* index in subgroup */
|
|
|
|
int root_index;
|
|
|
|
|
|
|
|
/* number of ranks in subgroup */
|
|
|
|
int n_ranks;
|
|
|
|
|
|
|
|
/* index of the first element in the subgroup. The
|
|
|
|
* assumption is that
|
|
|
|
* ranks for all subgroups are stored in a single
|
|
|
|
* linear array
|
|
|
|
*/
|
|
|
|
int index_of_first_element;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* level in the hierarchy - subgroups at the same
|
2014-01-22 19:39:19 +04:00
|
|
|
* level don't overlap. May not be the same as the
|
|
|
|
* sbgp level.
|
2012-08-16 23:11:35 +04:00
|
|
|
*/
|
|
|
|
int level_in_hierarchy;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Information on the ranks in the subgroup. This includes
|
|
|
|
* the rank, and wether or not the rank is a source/sink of
|
|
|
|
* of data in this subgroup, or just a "pass through".
|
|
|
|
*/
|
|
|
|
rank_properties_t *rank_data;
|
|
|
|
|
|
|
|
/* level one index - for example,
|
|
|
|
for( i = 0; i < level_one_index; i++) will loop
|
|
|
|
through all level one subgroups, this is significant
|
|
|
|
since level one is a disjoint partitioning of all ranks
|
|
|
|
i.e. all ranks appear once and only once at level one
|
|
|
|
*/
|
|
|
|
int level_one_index;
|
|
|
|
};
|
|
|
|
typedef struct sub_group_params_t sub_group_params_t;
|
|
|
|
|
|
|
|
/* function to setup information on the order of a given bcol within
|
|
|
|
* a specific ML-level algorithm.
|
|
|
|
*/
|
|
|
|
int mca_coll_ml_setup_scratch_vals(mca_coll_ml_compound_functions_t *func_list,
|
|
|
|
int *scratch_indx, int *scratch_num, int n_hiers);
|
|
|
|
|
|
|
|
/* driver for setting up collective communication description */
|
|
|
|
|
|
|
|
int ml_coll_schedule_setup(struct mca_coll_ml_module_t *ml_module);
|
|
|
|
|
|
|
|
int ml_coll_up_and_down_hier_setup(
|
|
|
|
struct mca_coll_ml_module_t *ml_module,
|
|
|
|
struct mca_coll_ml_topology_t *topo_info,
|
|
|
|
int up_function_idx,
|
|
|
|
int top_function_idx,
|
|
|
|
int down_function_idx,
|
|
|
|
int collective);
|
|
|
|
|
|
|
|
int ml_coll_barrier_constant_group_data_setup(
|
|
|
|
struct mca_coll_ml_topology_t *topo_info,
|
|
|
|
mca_coll_ml_collective_operation_description_t *schedule);
|
|
|
|
|
|
|
|
/* Per-collective hierarchy setup routines, with the matching cleanup
 * routine where one is declared. */

/* barrier */
int ml_coll_hier_barrier_setup(struct mca_coll_ml_module_t *ml_module);

/* allreduce */
int ml_coll_hier_allreduce_setup(struct mca_coll_ml_module_t *ml_module);
int ml_coll_hier_allreduce_setup_new(struct mca_coll_ml_module_t *ml_module);
void ml_coll_hier_allreduce_cleanup_new(struct mca_coll_ml_module_t *ml_module);

/* alltoall */
int ml_coll_hier_alltoall_setup(struct mca_coll_ml_module_t *ml_module);
int ml_coll_hier_alltoall_setup_new(struct mca_coll_ml_module_t *ml_module);

/* allgather */
int ml_coll_hier_allgather_setup(struct mca_coll_ml_module_t *ml_module);
void ml_coll_hier_allgather_cleanup(struct mca_coll_ml_module_t *ml_module);

/* gather */
int ml_coll_hier_gather_setup(struct mca_coll_ml_module_t *ml_module);

/* broadcast */
int ml_coll_hier_bcast_setup(struct mca_coll_ml_module_t *ml_module);
void ml_coll_hier_bcast_cleanup(struct mca_coll_ml_module_t *ml_module);

/* reduce */
int ml_coll_hier_reduce_setup(struct mca_coll_ml_module_t *ml_module);
void ml_coll_hier_reduce_cleanup(struct mca_coll_ml_module_t *ml_module);

/* scatter */
int ml_coll_hier_scatter_setup(struct mca_coll_ml_module_t *ml_module);
|
|
|
|
|
|
|
|
/* alltoall */
|
|
|
|
int mca_coll_ml_alltoall(void *sbuf, int scount,
|
|
|
|
struct ompi_datatype_t *sdtype,
|
|
|
|
void* rbuf, int rcount,
|
|
|
|
struct ompi_datatype_t *rdtype,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
int mca_coll_ml_alltoall_nb(void *sbuf, int scount,
|
|
|
|
struct ompi_datatype_t *sdtype,
|
|
|
|
void* rbuf, int rcount,
|
|
|
|
struct ompi_datatype_t *rdtype,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
|
|
|
|
/* allgather */
|
|
|
|
int mca_coll_ml_allgather(void *sbuf, int scount,
|
|
|
|
struct ompi_datatype_t *sdtype,
|
|
|
|
void* rbuf, int rcount,
|
|
|
|
struct ompi_datatype_t *rdtype,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/* non-blocking allgather */
|
|
|
|
int mca_coll_ml_allgather_nb(void *sbuf, int scount,
|
|
|
|
struct ompi_datatype_t *sdtype,
|
|
|
|
void* rbuf, int rcount,
|
|
|
|
struct ompi_datatype_t *rdtype,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/* gather */
|
|
|
|
int mca_coll_ml_gather(void *sbuf, int scount,
|
|
|
|
struct ompi_datatype_t *sdtype,
|
|
|
|
void* rbuf, int rcount,
|
|
|
|
struct ompi_datatype_t *rdtype,
|
|
|
|
int root,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/* Nonblocking barrier setup */
int ml_coll_hier_nonblocking_barrier_setup(struct mca_coll_ml_module_t *ml_module,
                                           struct mca_coll_ml_topology_t *topo_info);

/* Memory synchronization collective setup */
int ml_coll_memsync_setup(struct mca_coll_ml_module_t *ml_module);
|
|
|
|
|
|
|
|
/* Fragment descriptor */
|
|
|
|
struct mca_coll_ml_descriptor_t;
|
|
|
|
struct mca_coll_ml_fragment_t {
|
|
|
|
opal_list_item_t super;
|
|
|
|
|
|
|
|
struct mca_coll_ml_descriptor_t *full_msg_descriptor;
|
|
|
|
int offset; /*offset for progress pointer*/
|
|
|
|
int length; /*fragment length I assume*/
|
|
|
|
opal_convertor_t convertor; /*convertor for copy/pack data*/
|
|
|
|
|
|
|
|
/* current function index */
|
|
|
|
int current_fn_index;
|
|
|
|
|
|
|
|
/* array of function arguments */
|
|
|
|
struct bcol_function_args_t *fn_args;
|
|
|
|
|
|
|
|
};
|
|
|
|
typedef struct mca_coll_ml_fragment_t mca_coll_ml_fragment_t;
|
|
|
|
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_fragment_t);
|
|
|
|
|
|
|
|
/* Buffer-index sentinel.  Parenthesized so the negative value stays a
 * single operand wherever the macro is expanded. */
#define MCA_COLL_ML_NO_BUFFER (-1)
|
|
|
|
|
|
|
|
/* Record the ML buffer on a collective operation descriptor: stores
 * `index` as the buffer index and `desc` as both the fragment's buffer
 * descriptor and the source descriptor, then resets hier_factor to 1
 * and need_dt_support to false.
 *
 * `coll_op` is evaluated five times and `desc` twice, so pass
 * side-effect-free expressions.  All arguments are parenthesized in the
 * expansion. */
#define MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, index, desc)     \
    do {                                                                \
        (coll_op)->variable_fn_params.buffer_index = (index);           \
        (coll_op)->fragment_data.buffer_desc = (desc);                  \
        /* pasha - why we duplicate it ? */                             \
        (coll_op)->variable_fn_params.src_desc = (desc);                \
        (coll_op)->variable_fn_params.hier_factor = 1;                  \
        (coll_op)->variable_fn_params.need_dt_support = false;          \
    } while (0)
|
|
|
|
|
|
|
|
/*Full message descriptor*/
|
|
|
|
struct mca_coll_ml_descriptor_t {
|
|
|
|
ompi_request_t super; /*base request*/
|
|
|
|
struct ompi_datatype_t *datatype; /*ompi datatype*/
|
|
|
|
size_t count; /*count of user datatype elements*/
|
2014-01-24 21:56:52 +04:00
|
|
|
uint32_t sequence_num; /*sequence number for collective operation*/
|
2012-08-16 23:11:35 +04:00
|
|
|
size_t frags_limit; /*upper limit on # of fragments*/
|
|
|
|
size_t frags_start; /*number of fragments started*/
|
|
|
|
|
|
|
|
/*number of fragments completed*/
|
|
|
|
size_t frags_complete;
|
|
|
|
|
|
|
|
/* number of fragments needed to process this message */
|
|
|
|
size_t n_fragments;
|
|
|
|
|
|
|
|
volatile bool free_resource; /*signals release resource*/
|
|
|
|
|
|
|
|
/*pointer to reduction operation, e.g. MPI_MIN - need to handle
|
|
|
|
* user defined functions also */
|
|
|
|
/* ompi_predefined_op_t *operation; */
|
|
|
|
|
|
|
|
/*pointer to a communication schedule, data struct undefined*/
|
|
|
|
struct coll_ml_collective_description_t *local_comm_description;
|
|
|
|
|
|
|
|
/* fragment descriptor - we always have a fragment descriptor
|
|
|
|
* if we get a full message descriptor. Optimization for
|
|
|
|
* small messages */
|
|
|
|
mca_coll_ml_fragment_t fragment;
|
|
|
|
/* The ML memory buffer index that should consist the send and
|
|
|
|
recv information
|
|
|
|
if the index is -1, it means no buffer was allocated */
|
|
|
|
uint64_t buffer_index;
|
|
|
|
};
|
|
|
|
typedef struct mca_coll_ml_descriptor_t mca_coll_ml_descriptor_t;
|
|
|
|
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_descriptor_t);
|
|
|
|
|
|
|
|
/* sbgp and bcol module pairs */
|
|
|
|
struct hierarchy_pairs {
|
|
|
|
mca_sbgp_base_module_t *subgroup_module;
|
|
|
|
struct mca_bcol_base_module_t **bcol_modules;
|
|
|
|
int num_bcol_modules;
|
|
|
|
int bcol_index;
|
|
|
|
mca_bcol_base_component_t *bcol_component;
|
|
|
|
};
|
|
|
|
typedef struct hierarchy_pairs hierarchy_pairs;
|
|
|
|
|
|
|
|
/* list of ranks in each group */
|
|
|
|
struct ml_level_t {
|
|
|
|
int n_modules;
|
|
|
|
hierarchy_pairs *modules;
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct ml_level_t ml_level_t;
|
|
|
|
|
|
|
|
/* Topology kinds; COLL_ML_TOPO_MAX is the number of kinds and is used
 * as an array dimension elsewhere in this header, so it must stay last. */
enum {
    COLL_ML_HR_FULL,             /* Full hierarchy topology, all bcols and sbgps attend in discovery */
    COLL_ML_HR_ALLREDUCE,
    COLL_ML_HR_NBS,              /* All hierarchy except base socket */
    COLL_ML_HR_SINGLE_PTP,       /* Single flat ptp hierarchy */
    COLL_ML_HR_SINGLE_IBOFFLOAD, /* Single flat iboffload hierarchy */
    COLL_ML_TOPO_MAX
};
|
|
|
|
|
|
|
|
/* Topology-hierarchy discovery function */
struct mca_coll_ml_module_t; /* forward declaration for the function */

typedef int (* mca_coll_topo_discovery_fn_t)
    (struct mca_coll_ml_module_t *ml_module, int n_hierarchies);
|
|
|
|
|
|
|
|
/* Status of a topology entry. */
typedef enum {
    COLL_ML_TOPO_DISABLED = 0,
    COLL_ML_TOPO_ENABLED = 1
} topo_status_t;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Structure to hold the sm coll component. First it holds the
|
|
|
|
* base coll component, and then holds a bunch of
|
|
|
|
* sm-coll-component-specific stuff (e.g., current MCA param
|
|
|
|
* values).
|
|
|
|
*/
|
|
|
|
struct mca_coll_ml_component_t {
|
|
|
|
/** Base coll component */
|
|
|
|
mca_coll_base_component_2_0_0_t super;
|
|
|
|
|
|
|
|
/** MCA parameter: Priority of this component */
|
|
|
|
int ml_priority;
|
|
|
|
|
|
|
|
/** MCA parameter: subgrouping components to use */
|
|
|
|
char *subgroups_string;
|
|
|
|
|
|
|
|
/** MCA parameter: basic collectives components to use */
|
|
|
|
char *bcols_string;
|
|
|
|
|
|
|
|
/** verbosity level */
|
|
|
|
int verbose;
|
|
|
|
|
|
|
|
/** max of communicators available to run ML */
|
2013-03-28 01:09:41 +04:00
|
|
|
unsigned int max_comm;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/** min size of comm to be available to run ML */
|
|
|
|
int min_comm_size;
|
|
|
|
|
|
|
|
/* base sequence number to use - the expectation is that
|
|
|
|
* this will be used as a basis for generating IDs for
|
|
|
|
* specific collective operations
|
|
|
|
*/
|
|
|
|
int64_t base_sequence_number;
|
|
|
|
|
|
|
|
/** memory pool */
|
|
|
|
mca_coll_ml_lmngr_t memory_manager;
|
|
|
|
|
|
|
|
/* We need it because some bcols cannot
|
|
|
|
support all possible allreduce data types */
|
|
|
|
bool need_allreduce_support;
|
|
|
|
|
|
|
|
int use_knomial_allreduce;
|
|
|
|
|
|
|
|
/* use hdl_framework */
|
|
|
|
bool use_hdl_bcast;
|
|
|
|
|
2013-08-27 20:36:54 +04:00
|
|
|
/* Enable / Disable fragmentation (0 - off, 1 - on, 2 - auto) */
|
|
|
|
int enable_fragmentation;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/* Broadcast algorithm */
|
|
|
|
int bcast_algorithm;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/* frag size that is used by list memory_manager */
|
|
|
|
size_t lmngr_block_size;
|
|
|
|
|
|
|
|
/* alignment that is used by list memory_manager */
|
|
|
|
size_t lmngr_alignment;
|
|
|
|
|
|
|
|
/* list size for memory_manager */
|
|
|
|
size_t lmngr_size;
|
|
|
|
|
|
|
|
/* number of payload memory banks */
|
|
|
|
int n_payload_mem_banks;
|
|
|
|
|
|
|
|
/* number of payload buffers per bank */
|
|
|
|
int n_payload_buffs_per_bank;
|
|
|
|
|
|
|
|
/* size of payload buffer */
|
2013-03-28 01:09:41 +04:00
|
|
|
unsigned long long payload_buffer_size;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/* pipeline depth for msg fragmentation */
|
|
|
|
int pipeline_depth;
|
|
|
|
|
|
|
|
/* Free list tunings */
|
|
|
|
int free_list_init_size;
|
|
|
|
|
|
|
|
int free_list_grow_size;
|
|
|
|
|
|
|
|
int free_list_max_size;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* queues for asynchronous collective progress
|
|
|
|
*/
|
|
|
|
/* tasks that have not started, either because dependencies are not
|
|
|
|
* statisfied, or resources are lacking
|
|
|
|
*/
|
|
|
|
opal_list_t pending_tasks;
|
|
|
|
opal_mutex_t pending_tasks_mutex;
|
|
|
|
|
|
|
|
/* active incomplete tasks */
|
|
|
|
opal_list_t active_tasks;
|
|
|
|
opal_mutex_t active_tasks_mutex;
|
|
|
|
|
|
|
|
/* sequential collectives to progress */
|
|
|
|
opal_list_t sequential_collectives;
|
|
|
|
opal_mutex_t sequential_collectives_mutex;
|
|
|
|
|
|
|
|
bool progress_is_busy;
|
|
|
|
|
|
|
|
/* Temporary hack for IMB test - not all bcols have allgather */
|
2013-03-28 01:09:41 +04:00
|
|
|
bool disable_allgather;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/* Temporary hack for IMB test - not all bcols have alltoall */
|
2013-03-28 01:09:41 +04:00
|
|
|
bool disable_alltoall;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/* Disable Reduce */
|
|
|
|
bool disable_reduce;
|
|
|
|
|
2012-08-16 23:11:35 +04:00
|
|
|
/* Brucks alltoall mca and other params */
|
|
|
|
int use_brucks_smsg_alltoall;
|
|
|
|
|
|
|
|
mca_coll_topo_discovery_fn_t topo_discovery_fn[COLL_ML_TOPO_MAX];
|
|
|
|
|
|
|
|
/* Configure file for collectives */
|
|
|
|
char *config_file_name;
|
|
|
|
|
|
|
|
per_collective_configuration_t coll_config[ML_NUM_OF_FUNCTIONS][ML_NUM_MSG];
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Convenience typedef
|
|
|
|
*/
|
|
|
|
typedef struct mca_coll_ml_component_t mca_coll_ml_component_t;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Global component instance
|
|
|
|
*/
|
|
|
|
OMPI_MODULE_DECLSPEC extern mca_coll_ml_component_t mca_coll_ml_component;
|
|
|
|
|
|
|
|
/* Per-entry layout information: an offset, a level-one index and a
 * leader flag.  Referenced by mca_coll_ml_topology_t::hier_layout_info. */
struct mca_coll_ml_leader_offset_info_t {
    size_t offset;
    int level_one_index;
    bool leader;
};
typedef struct mca_coll_ml_leader_offset_info_t mca_coll_ml_leader_offset_info_t;
|
|
|
|
|
|
|
|
/* Topolody data structure */
|
|
|
|
struct mca_coll_ml_topology_t {
|
|
|
|
topo_status_t status; /* 0 - enabled , 1 - disabled */
|
|
|
|
/* information on the selected groups - needed for collective
|
|
|
|
** algorithms */
|
|
|
|
int32_t global_lowest_hier_group_index;
|
|
|
|
int32_t global_highest_hier_group_index;
|
|
|
|
int number_of_all_subgroups;
|
|
|
|
int n_levels;
|
|
|
|
/* bcols bits that describe supported features/modes */
|
|
|
|
uint64_t all_bcols_mode;
|
2014-02-07 23:15:45 +04:00
|
|
|
mca_bcol_base_route_info_t *route_vector;
|
2012-08-16 23:11:35 +04:00
|
|
|
coll_ml_collective_description_t *hierarchical_algorithms[BCOL_NUM_OF_FUNCTIONS];
|
|
|
|
sub_group_params_t *array_of_all_subgroups;
|
|
|
|
/* (sbgp, bcol) pairs */
|
|
|
|
hierarchy_pairs *component_pairs;
|
|
|
|
/* ordering of ranks when I am the root of the operation.
|
|
|
|
* This ordering guarantees that data need to be re-ordered
|
|
|
|
* only at the first or last step in rooted operations,
|
|
|
|
* depending on whether the opearation is a scatter or
|
|
|
|
* gather operation.
|
|
|
|
*/
|
|
|
|
int *sort_list;
|
|
|
|
mca_coll_ml_leader_offset_info_t *hier_layout_info;
|
|
|
|
/* are ranks laid out contiguously */
|
|
|
|
bool ranks_contiguous;
|
|
|
|
struct ordering_info_t {
|
|
|
|
int next_inorder;
|
|
|
|
int next_order_num;
|
|
|
|
int num_bcols_need_ordering;
|
|
|
|
} topo_ordering_info;
|
|
|
|
};
|
|
|
|
typedef struct mca_coll_ml_topology_t mca_coll_ml_topology_t;
|
|
|
|
|
|
|
|
struct mca_coll_ml_bcol_list_item_t {
|
|
|
|
opal_list_item_t super;
|
|
|
|
mca_bcol_base_module_t *bcol_module;
|
|
|
|
};
|
|
|
|
typedef struct mca_coll_ml_bcol_list_item_t mca_coll_ml_bcol_list_item_t;
|
|
|
|
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_bcol_list_item_t);
|
|
|
|
|
|
|
|
#define MCA_COLL_MAX_NUM_COLLECTIVES 40 /* upper bound; the exact number of collectives was not recorded */
#define MCA_COLL_MAX_NUM_SUBTYPES 15    /* Maximum number of algorithms per collective */
|
|
|
|
|
|
|
|
struct mca_coll_ml_module_t {
|
|
|
|
/* base structure */
|
|
|
|
mca_coll_base_module_t super;
|
|
|
|
|
|
|
|
/* ML module status - 0 was not initialized, 1 - was initialized */
|
|
|
|
bool initialized;
|
|
|
|
/* communicator */
|
|
|
|
struct ompi_communicator_t *comm;
|
|
|
|
|
|
|
|
/* reference convertor */
|
|
|
|
opal_convertor_t *reference_convertor;
|
|
|
|
|
|
|
|
mca_coll_ml_topology_t topo_list[COLL_ML_TOPO_MAX];
|
|
|
|
|
|
|
|
/* Collectives - Topology map */
|
|
|
|
int collectives_topology_map
|
|
|
|
[MCA_COLL_MAX_NUM_COLLECTIVES][MCA_COLL_MAX_NUM_SUBTYPES];
|
|
|
|
|
|
|
|
/* largest number of function calls for the collective routines.
|
|
|
|
* This is used to allocate resources */
|
|
|
|
int max_fn_calls;
|
|
|
|
|
|
|
|
/* collective sequence number - unique id for barrier type operations */
|
2014-01-24 21:56:52 +04:00
|
|
|
int32_t no_data_collective_sequence_num;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/* collective sequence number - unique id for each collective */
|
2014-01-24 21:56:52 +04:00
|
|
|
int32_t collective_sequence_num;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/** ompi free list of full message descriptors **/
|
|
|
|
ompi_free_list_t message_descriptors;
|
|
|
|
|
|
|
|
/** ompi free list of message fragment descriptors **/
|
|
|
|
ompi_free_list_t fragment_descriptors;
|
|
|
|
|
|
|
|
/** pointer to the payload memory block **/
|
2014-02-07 23:15:45 +04:00
|
|
|
struct mca_bcol_base_memory_block_desc_t *payload_block;
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/** the maximum size of collective function description */
|
|
|
|
int max_dag_size;
|
|
|
|
|
|
|
|
/** data used to initialize coll_ml_collective_descriptors */
|
|
|
|
struct coll_desc_init {
|
|
|
|
int max_dag_size;
|
|
|
|
size_t max_n_bytes_per_proc_total;
|
|
|
|
mca_coll_base_module_t *bcol_base_module;
|
|
|
|
} coll_desc_init_data;
|
|
|
|
|
|
|
|
/** collective operation descriptor free list - used to manage a single
|
|
|
|
* collective operation. */
|
|
|
|
ompi_free_list_t coll_ml_collective_descriptors;
|
|
|
|
|
|
|
|
/** multiple function collective operation support */
|
|
|
|
/** broadcast */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_bcast_functions[ML_NUM_BCAST_FUNCTIONS];
|
|
|
|
|
|
|
|
/* bcast size selection criteria - cutoff for the largest size of
|
|
|
|
* data for which to apply the specified collective operation.
|
|
|
|
* This gives us the ability to choose algorithm based on size */
|
|
|
|
size_t bcast_cutoff_size[ML_N_DATASIZE_BINS];
|
|
|
|
|
|
|
|
/** Allreduce functions */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_allreduce_functions[ML_NUM_ALLREDUCE_FUNCTIONS];
|
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/** Reduce functions */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_reduce_functions[ML_NUM_REDUCE_FUNCTIONS];
|
|
|
|
|
|
|
|
|
2012-08-16 23:11:35 +04:00
|
|
|
/** scatter */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_scatter_functions[ML_NUM_SCATTER_FUNCTIONS];
|
|
|
|
|
|
|
|
/** alltoall */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_alltoall_functions[ML_NUM_ALLTOALL_FUNCTIONS];
|
|
|
|
|
|
|
|
/** allgather */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_allgather_functions[ML_NUM_ALLGATHER_FUNCTIONS];
|
|
|
|
|
|
|
|
/** gather */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_gather_functions[ML_NUM_GATHER_FUNCTIONS];
|
|
|
|
|
|
|
|
/** Barrier */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_barrier_function;
|
|
|
|
|
|
|
|
/** ML Memory Syncronization collective operation */
|
|
|
|
mca_coll_ml_collective_operation_description_t *
|
|
|
|
coll_ml_memsync_function;
|
|
|
|
|
|
|
|
/** The table of allreduce functions for specific type and op **/
|
|
|
|
bool allreduce_matrix[OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED][BCOL_NUM_OF_ELEM_TYPES];
|
|
|
|
|
|
|
|
/* data offset from ML */
|
|
|
|
int32_t data_offset;
|
|
|
|
|
|
|
|
int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS];
|
|
|
|
|
|
|
|
/* fragmenation parameters */
|
|
|
|
int use_user_buffers;
|
|
|
|
uint64_t fragment_size;
|
|
|
|
uint32_t ml_fragment_size;
|
|
|
|
|
|
|
|
/* Bcast index table. Pasha: Do we need to define something more generic ?
|
|
|
|
the table x 2 (large/small)*/
|
|
|
|
int bcast_fn_index_table[2];
|
|
|
|
|
|
|
|
/* List of pointer to bcols that have been initilized and used.
|
|
|
|
* So far we use it only for ML memory management */
|
|
|
|
opal_list_t active_bcols_list;
|
|
|
|
|
|
|
|
/* Buffer size required for Bruck's algorithm */
|
|
|
|
int brucks_buffer_threshold_const;
|
|
|
|
|
|
|
|
/* log comm size */
|
|
|
|
/* We require this for alltoall algorithm */
|
|
|
|
int log_comm_size;
|
|
|
|
/* On this list we keep coll_op descriptors that were not
|
|
|
|
* be able to start, since no ml buffers were available */
|
|
|
|
opal_list_t waiting_for_memory_list;
|
2014-01-22 19:39:19 +04:00
|
|
|
|
|
|
|
/* fallback collectives */
|
|
|
|
mca_coll_base_comm_coll_t fallback;
|
2012-08-16 23:11:35 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct mca_coll_ml_module_t mca_coll_ml_module_t;
|
|
|
|
OBJ_CLASS_DECLARATION(mca_coll_ml_module_t);
|
|
|
|
|
|
|
|
|
|
|
|
/* Query to see if the component is available for use, and can
 * satisfy the thread and progress requirements.
 */
int mca_coll_ml_init_query(bool enable_progress_threads,
                           bool enable_mpi_threads);
|
|
|
|
|
|
|
|
/* query to see if the module is available for use on the given
|
|
|
|
* communicator, and if so, what it's priority is. This is where
|
|
|
|
* the backing shared-memory file is created.
|
|
|
|
*/
|
|
|
|
mca_coll_base_module_t *
|
|
|
|
mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority);
|
|
|
|
|
|
|
|
/* Barrier - blocking */
|
|
|
|
int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/* Barrier - non-blocking */
|
|
|
|
int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/* Allreduce with EXTRA TOPO using - blocking */
|
2012-08-16 23:11:35 +04:00
|
|
|
int mca_coll_ml_allreduce_dispatch(void *sbuf, void *rbuf, int count,
|
|
|
|
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
|
|
|
|
struct ompi_communicator_t *comm, mca_coll_base_module_t *module);
|
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/* Allreduce with EXTRA TOPO using - Non-blocking */
|
|
|
|
int mca_coll_ml_allreduce_dispatch_nb(void *sbuf, void *rbuf, int count,
|
|
|
|
ompi_datatype_t *dtype, ompi_op_t *op,
|
|
|
|
ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
2012-08-16 23:11:35 +04:00
|
|
|
/* Allreduce - blocking */
|
2014-01-22 19:39:19 +04:00
|
|
|
int mca_coll_ml_allreduce(void *sbuf, void *rbuf, int count,
|
2012-08-16 23:11:35 +04:00
|
|
|
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/* Allreduce - Non-blocking */
|
|
|
|
int mca_coll_ml_allreduce_nb(void *sbuf, void *rbuf, int count,
|
|
|
|
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
2012-08-16 23:11:35 +04:00
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
/* Reduce - Blocking */
|
2012-08-16 23:11:35 +04:00
|
|
|
int mca_coll_ml_reduce(void *sbuf, void *rbuf, int count,
|
|
|
|
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
|
2014-01-22 19:39:19 +04:00
|
|
|
int root, struct ompi_communicator_t *comm,
|
2012-08-16 23:11:35 +04:00
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
2014-01-22 19:39:19 +04:00
|
|
|
int mca_coll_ml_reduce_nb(void *sbuf, void *rbuf, int count,
|
|
|
|
struct ompi_datatype_t *dtype, struct ompi_op_t *op,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/* NOTE(review): judging by the name and the bank_index parameter, this
 * presumably synchronizes one memory bank of the ML module - confirm against
 * the implementation. Returns an int status code. */
int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *module, int bank_index);
|
|
|
|
|
|
|
|
|
2012-08-16 23:11:35 +04:00
|
|
|
/* NOTE(review): presumably progresses the message that owns the given
 * fragment descriptor - confirm against the implementation. Returns an int
 * status code. */
int coll_ml_progress_individual_message(mca_coll_ml_fragment_t *frag_descriptor);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* the ml entry point for the broadcast function
|
|
|
|
*/
|
|
|
|
int mca_coll_ml_parallel_bcast(void *buf, int count, struct ompi_datatype_t *dtype,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
int mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t *dtype,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype_t *dtype,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The ml function interface for non-blocking routines
|
|
|
|
*/
|
|
|
|
int mca_coll_ml_bcast_unknown_root_nb(void *buf, int count, struct ompi_datatype_t *dtype,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
int mca_coll_ml_bcast_known_root_nb(void *buf, int count, struct ompi_datatype_t *dtype,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
OMPI_DECLSPEC int mca_coll_ml_bcast_unknown_root_with_frags_nb(void *buf, int count,
|
|
|
|
struct ompi_datatype_t *dtype,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
ompi_request_t **req, mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/* This routine sets up a sequential hierarchical scatter algorithm. The
|
|
|
|
* assumptions are that each rank knows in which sub-group that data will show
|
|
|
|
* up first, and that the scatter is executed sequentially, one subgroup at a
|
|
|
|
* time. This is needed, when the full collective needs to be specified before
|
|
|
|
* the collective operation starts up. The algorithm handles all data sizes
|
|
|
|
* and data types.
|
|
|
|
*/
|
|
|
|
|
|
|
|
OMPI_DECLSPEC int mca_coll_ml_scatter_sequential(
|
|
|
|
void *sbuf, int scount, struct ompi_datatype_t *sdtype,
|
|
|
|
void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
|
|
|
|
int root, struct ompi_communicator_t *comm,
|
|
|
|
mca_coll_base_module_t *module);
|
|
|
|
|
|
|
|
/* Disabled declarations: the #if 0 guard keeps them out of the build. */
#if 0
int mca_coll_ml_bcast_small_dynamic_root(void *buf, int count,
                                         struct ompi_datatype_t *dtype,
                                         int root,
                                         struct ompi_communicator_t *comm,
                                         mca_coll_base_module_t *module);
int mca_coll_ml_bcast_small_known_root(void *buf, int count,
                                       struct ompi_datatype_t *dtype,
                                       int root,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module);
#endif
|
|
|
|
|
|
|
|
/* Topology discovery function */
|
|
|
|
|
|
|
|
int mca_coll_ml_fulltree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|
|
|
int n_hierarchies);
|
|
|
|
int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|
|
|
int n_hierarchies);
|
|
|
|
int mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|
|
|
int n_hierarchies);
|
|
|
|
int mca_coll_ml_fulltree_ptp_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|
|
|
int n_hierarchies);
|
|
|
|
int mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
|
|
|
|
int n_hierarchies);
|
|
|
|
|
|
|
|
void mca_coll_ml_allreduce_matrix_init(mca_coll_ml_module_t *ml_module,
|
|
|
|
const mca_bcol_base_component_2_0_0_t *bcol_component);
|
2014-01-22 19:39:19 +04:00
|
|
|
/**
 * printf-style helper that writes a formatted message to stderr.
 *
 * @param fmt  printf-style format string, followed by its arguments
 * @return the value returned by vfprintf(): the number of characters
 *         written, or a negative value on output error
 */
static inline int mca_coll_ml_err(const char* fmt, ...)
{
    int n_written;
    va_list ap;

    va_start(ap, fmt);
    n_written = vfprintf(stderr, fmt, ap);
    va_end(ap);

    return n_written;
}
|
|
|
|
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/*
 * Print an error message through mca_coll_ml_err(), prefixed with the node
 * name, the printable process name, and the file, line and function of the
 * call site, and terminated by a newline.
 *
 * `args` must be a complete parenthesized printf-style argument list,
 * e.g. ML_ERROR(("rc = %d", rc)).
 */
#define ML_ERROR(args)                                          \
    do {                                                        \
        mca_coll_ml_err("[%s]%s[%s:%d:%s] COLL-ML ",            \
                        ompi_process_info.nodename,             \
                        OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),     \
                        __FILE__, __LINE__, __func__);          \
        mca_coll_ml_err args;                                   \
        mca_coll_ml_err("\n");                                  \
    } while (0)
|
2012-08-16 23:11:35 +04:00
|
|
|
|
|
|
|
/*
 * Verbose logging. When OPAL_ENABLE_DEBUG is set, the message is printed
 * through mca_coll_ml_err() (same prefix as ML_ERROR) if the component's
 * verbose setting is at least `level`; otherwise the macro expands to
 * nothing and its arguments are not evaluated.
 *
 * `level` is parenthesized so that an expression argument (e.g. a
 * conditional expression) is compared as a whole.
 * `args` must be a complete parenthesized printf-style argument list,
 * e.g. ML_VERBOSE(10, ("value = %d", v)).
 */
#if OPAL_ENABLE_DEBUG
#define ML_VERBOSE(level, args)                                     \
    do {                                                            \
        if (mca_coll_ml_component.verbose >= (level)) {             \
            mca_coll_ml_err("[%s]%s[%s:%d:%s] COLL-ML ",            \
                            ompi_process_info.nodename,             \
                            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),     \
                            __FILE__, __LINE__, __func__);          \
            mca_coll_ml_err args;                                   \
            mca_coll_ml_err("\n");                                  \
        }                                                           \
    } while (0)
#else
#define ML_VERBOSE(level, args)
#endif
|
|
|
|
|
|
|
|
/*
 * Evaluates to true when both bcol pointers are non-NULL and their
 * components carry the same mca_component_name, false otherwise.
 *
 * The names are compared with strcmp(), which is equivalent to the earlier
 * "same length and equal over that length" check. Both macro arguments are
 * parenthesized so that expression arguments are handled correctly; note
 * that each argument is still evaluated more than once.
 */
#define IS_BCOL_TYPE_IDENTICAL(bcol1, bcol2)                                                 \
    ( ( (NULL != (bcol1) && NULL != (bcol2)) &&                                              \
        /* check that the component names are identical */                                   \
        (0 == strcmp(((mca_base_component_t *)((bcol1)->bcol_component))->mca_component_name, \
                     ((mca_base_component_t *)((bcol2)->bcol_component))->mca_component_name)) \
      ) ? true : false )
|
|
|
|
|
|
|
|
/* Yields the first entry (index 0) of the bcol_modules array belonging to
 * the component pair at position `indx` of `module`. */
#define GET_BCOL(module, indx) ((module)->component_pairs[(indx)].bcol_modules[0])
|
|
|
|
|
|
|
|
/*
 * Looks up the entry of bcol's filtered_fns_table selected by
 * [DATA_SRC_KNOWN][NON_BLOCKING][BCOL_SYNC][1][0][0].
 * NOTE(review): the meaning of the trailing literal indices 1, 0, 0 is not
 * visible here - confirm against the table's declaration.
 */
#define GET_BCOL_SYNC_FN(bcol) \
    ((bcol)->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING][BCOL_SYNC][1][0][0])
|
|
|
|
|
|
|
|
/* Allocator macros */
|
|
|
|
/* Flat index of `buffer` within bank `bank`, with `nbuffs` buffers per bank:
 * bank * nbuffs + buffer. Every argument is parenthesized so that expression
 * arguments (e.g. BUFFER_INDEX(b + 1, n, i)) are evaluated as a whole. */
#define BUFFER_INDEX(bank,nbuffs,buffer) ((bank) * (nbuffs) + (buffer))
|
|
|
|
|
|
|
|
/*
 * Size of the next fragment for `op`: the smaller of
 *   - the bytes not yet scheduled (n_bytes_total - n_bytes_scheduled of the
 *     fragment's message descriptor), and
 *   - the module's small_message_thresholds[coll], converted to size_t.
 * `op` is evaluated several times.
 */
#define ML_GET_FRAG_SIZE(op, coll)                                          \
    ((op)->fragment_data.message_descriptor->n_bytes_total -                \
     (op)->fragment_data.message_descriptor->n_bytes_scheduled <            \
         (size_t) OP_ML_MODULE((op))->small_message_thresholds[coll]        \
     ? (op)->fragment_data.message_descriptor->n_bytes_total -              \
       (op)->fragment_data.message_descriptor->n_bytes_scheduled            \
     : (size_t) OP_ML_MODULE((op))->small_message_thresholds[coll])
|
|
|
|
|
|
|
|
/**
 * Abort the MPI process in case of a fatal error.
 *
 * @param message  text associated with the abort
 *                 (NOTE(review): presumably reported before aborting - confirm)
 */
void mca_coll_ml_abort_ml(char *message);
|
|
|
|
|
|
|
|
/*
 * Fill op->variable_fn_params for a broadcast:
 *   - sequence_num  takes the result of OPAL_THREAD_ADD32 on the module's
 *                   collective_sequence_num with an increment of 1;
 *   - count, dtype, sbuf_offset, rbuf_offset, frag_size and sbuf are copied
 *     from the corresponding arguments;
 *   - buffer_index and src_desc are taken from b_desc.
 *
 * All macro arguments are parenthesized so that expression arguments
 * (e.g. `ops + 1` for op) expand correctly. b_desc is evaluated twice.
 */
#define ML_SET_VARIABLE_PARAMS_BCAST(op, ml, cnt, datatype, b_desc,             \
                                     s_offset, r_offset, frag_len, buf)         \
do {                                                                            \
    (op)->variable_fn_params.sequence_num =                                     \
        OPAL_THREAD_ADD32(&((ml)->collective_sequence_num), 1);                 \
    (op)->variable_fn_params.count = (cnt);                                     \
    (op)->variable_fn_params.dtype = (datatype);                                \
    (op)->variable_fn_params.buffer_index = (b_desc)->buffer_index;             \
    (op)->variable_fn_params.src_desc = (b_desc);                               \
    (op)->variable_fn_params.sbuf_offset = (s_offset);                          \
    (op)->variable_fn_params.rbuf_offset = (r_offset);                          \
    (op)->variable_fn_params.frag_size = (frag_len);                            \
    (op)->variable_fn_params.sbuf = (buf);                                      \
} while (0)
|
|
|
|
|
|
|
|
/*
 * Reset a collective operation descriptor to its basic initial state:
 *   - stores the schedule and clears process_fn;
 *   - initializes full_message with the total byte count and the user
 *     source/destination addresses, and zeroes every other counter, extent,
 *     offset, datatype and the fragment launcher;
 *   - marks the sequential routine as SEQ_TASK_NOT_STARTED at function 0;
 *   - points fragment_data at full_message, with the given offset into the
 *     user buffer, fragment_size = total_bytes and current_coll_op = -1.
 *
 * All macro arguments are parenthesized so that expression arguments
 * (e.g. `ops + 1` for op) expand correctly. `op` and `total_bytes` are
 * evaluated more than once.
 */
#define MCA_COLL_ML_OP_BASIC_SETUP(op, total_bytes, offset_into_user_buff, src, dst, collective_schedule) \
    do {                                                                            \
        (op)->coll_schedule = (collective_schedule);                                \
        (op)->process_fn = NULL;                                                    \
        (op)->full_message.n_bytes_total = (total_bytes);                           \
        (op)->full_message.n_bytes_delivered = 0;                                   \
        (op)->full_message.n_bytes_scheduled = 0;                                   \
        (op)->full_message.dest_user_addr = (dst);                                  \
        (op)->full_message.src_user_addr = (src);                                   \
        (op)->full_message.n_active = 0;                                            \
        (op)->full_message.n_bytes_per_proc_total = 0;                              \
        (op)->full_message.send_count = 0;                                          \
        (op)->full_message.recv_count = 0;                                          \
        (op)->full_message.send_extent = 0;                                         \
        (op)->full_message.recv_extent = 0;                                         \
        (op)->full_message.offset_into_send_buffer = 0;                             \
        (op)->full_message.offset_into_recv_buffer = 0;                             \
        (op)->full_message.send_data_type = 0;                                      \
        (op)->full_message.recv_data_type = 0;                                      \
        (op)->full_message.fragment_launcher = 0;                                   \
        (op)->sequential_routine.current_active_bcol_fn = 0;                        \
        (op)->sequential_routine.current_bcol_status = SEQ_TASK_NOT_STARTED;        \
                                                                                    \
        (op)->fragment_data.offset_into_user_buffer = (offset_into_user_buff);      \
        /* Pasha, is it constant ? what to put here */                              \
        (op)->fragment_data.fragment_size = (total_bytes);                          \
        (op)->fragment_data.message_descriptor = &(op)->full_message;               \
        (op)->fragment_data.current_coll_op = -1;                                   \
    } while (0)
|
|
|
|
|
|
|
|
/* This routine re-orders and packs user data. The assumption is that
|
|
|
|
* there is per-process data, the amount of data is the same for all * ranks,
|
|
|
|
* and the user data is contigous.
|
|
|
|
*/
|
|
|
|
int mca_coll_ml_pack_reorder_contiguous_data(
|
|
|
|
mca_coll_ml_collective_operation_progress_t *coll_op);
|
|
|
|
|
|
|
|
/* This routine re-orders and packs user data. The assumption is that
|
|
|
|
* there is per-process data, the amount of data is the same for all * ranks,
|
|
|
|
* and the user data is noncontigous.
|
|
|
|
*/
|
|
|
|
int mca_coll_ml_pack_reorder_noncontiguous_data(
|
|
|
|
mca_coll_ml_collective_operation_progress_t *coll_op);
|
|
|
|
|
|
|
|
END_C_DECLS
|
|
|
|
|
|
|
|
|
|
|
|
#endif /* MCA_COLL_ML_ML_H */
|