/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/** @file */

#ifndef MCA_COLL_ML_ML_H
#define MCA_COLL_ML_ML_H

#include "ompi_config.h"

/* used by the inline helpers and macros below */
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/threads/mutex.h"

#include "orte/runtime/orte_globals.h"

#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/sbgp/sbgp.h"
#include "ompi/op/op.h"
#include "ompi/class/ompi_free_list.h"

#include "coll_ml_lmngr.h"
#include "coll_ml_functions.h"
#include "coll_ml_colls.h"
#include "coll_ml_allocation.h"
#include "coll_ml_config.h"

BEGIN_C_DECLS

/* return status codes */
enum {
    ML_OMPI_COMPLETE = 1,
    ML_OMPI_INCOMPLETE
};

/* message size classes */
enum {
    ML_SMALL_MSG,
    ML_LARGE_MSG,
    ML_NUM_MSG
};

/* ML collective IDs */
enum {
    /* blocking functions */
    ML_ALLGATHER,
    ML_ALLGATHERV,
    ML_ALLREDUCE,
    ML_ALLTOALL,
    ML_ALLTOALLV,
    ML_ALLTOALLW,
    ML_BARRIER,
    ML_BCAST,
    ML_EXSCAN,
    ML_GATHER,
    ML_GATHERV,
    ML_REDUCE,
    ML_REDUCE_SCATTER,
    ML_SCAN,
    ML_SCATTER,
    ML_SCATTERV,
    ML_FANIN,
    ML_FANOUT,

    /* non-blocking functions */
    ML_IALLGATHER,
    ML_IALLGATHERV,
    ML_IALLREDUCE,
    ML_IALLTOALL,
    ML_IALLTOALLV,
    ML_IALLTOALLW,
    ML_IBARRIER,
    ML_IBCAST,
    ML_IEXSCAN,
    ML_IGATHER,
    ML_IGATHERV,
    ML_IREDUCE,
    ML_IREDUCE_SCATTER,
    ML_ISCAN,
    ML_ISCATTER,
    ML_ISCATTERV,
    ML_IFANIN,
    ML_IFANOUT,
    ML_NUM_OF_FUNCTIONS
};

struct mca_bcol_base_module_t;

/* function description */
struct coll_ml_function_t {
    int fn_idx;

    /* module */
    struct mca_bcol_base_module_t *bcol_module;

    /*
     * The following two parameters are used by bcol modules
     * that want to do some optimizations based on the fact that
     * n functions from the same bcol module are called in a row.
     * For example, in the iboffload case, on the first call one
     * will want to initialize the MWR and start to instantiate
     * it, but only post it at the end of the last call.
     * This is the index of this function in a sequence of
     * consecutive functions from the same bcol.
     */
    int index_in_consecutive_same_bcol_calls;

    /* number of times functions from this bcol are
     * called in a row
     */
    int n_of_this_type_in_a_row;

    /*
     * Number of times functions from this module are called in the
     * collective operation, and the index of this function in that
     * sequence.
     */
    int n_of_this_type_in_collective;
    int index_of_this_type_in_collective;
};
typedef struct coll_ml_function_t coll_ml_function_t;
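
/* Illustrative sketch (not part of the API): how a bcol module might use
 * the consecutive-call bookkeeping above. The function below is
 * hypothetical and kept disabled; only the two fields it reads are real.
 */
#if 0
static void example_consume(coll_ml_function_t *fn)
{
    if (0 == fn->index_in_consecutive_same_bcol_calls) {
        /* first of a run of calls into the same bcol:
         * initialize shared state (e.g. the MWR in the iboffload case) */
    }
    if (fn->index_in_consecutive_same_bcol_calls ==
            fn->n_of_this_type_in_a_row - 1) {
        /* last call of the run: post the accumulated work */
    }
}
#endif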

/* Collective function arguments. This gives one function signature for
 * calling all collective setup routines, with the initial call to a
 * collective function having the context to access the right parts of the
 * data structure. This information is used by each of the setup functions
 * to set up the correct information for each of the functions in the
 * hierarchy that will be called. */

/* RLG NOTE: Need to figure out what arguments to store here,
 * and which ones directly in the message descriptor
 */
struct mpi_coll_fn_params_t {
    union {
        struct {
            ompi_communicator_t *comm;
            int n_fanin_steps;
            int n_fanout_steps;
            int n_recursive_doubling_steps;
        } ibarrier_recursive_doubling;

        struct {
            int root;
            ompi_communicator_t *comm;
            struct ompi_datatype_t *datatype;
        } ibcast;
    } coll_fn;
};
typedef struct mpi_coll_fn_params_t mpi_coll_fn_params_t;

/* algorithm parameters needed by the setup function */
struct mpi_coll_algorithm_params_t {
    union {
        struct {
            int n_fanin_steps;
            int n_fanout_steps;
            int n_recursive_doubling_steps;
        } ibarrier_recursive_doubling;

        struct {
            int place_holder;
        } ibcast;
    } coll_fn;
};
typedef struct mpi_coll_algorithm_params_t mpi_coll_algorithm_params_t;

/* setup function - used to set up each segment (or fragment)
 * to be processed
 */
struct mca_coll_ml_module_t;
struct mca_coll_ml_topology_t;

typedef int (*coll_fragment_comm_setup_fn)(struct mca_coll_ml_module_t *ml_module,
        mpi_coll_fn_params_t *fn_params, mpi_coll_algorithm_params_t *algorithm_params);

/* full collective description */
struct coll_ml_collective_description_t {
    /* number of temp buffers */
    int n_buffers;

    /* description size */
    int n_functions;

    /* collective setup function - called for every non-blocking
     * function, and for each fragment of such a message
     */
    coll_fragment_comm_setup_fn *coll_fn_setup_fn;

    /* algorithm parameters */
    mpi_coll_algorithm_params_t alg_params;

    /* list of functions */
    coll_ml_function_t *functions;

    /* function names - for debugging */
    char **function_names;

    /* signaling collective completion */
    bool completion_flag;
};

typedef struct coll_ml_collective_description_t coll_ml_collective_description_t;

/* utility data structure */
struct rank_properties_t {
    int rank;
    int leaf;
    int n_connected_subgroups;
    int *list_connected_subgroups;
    int num_of_ranks_represented;
};
typedef struct rank_properties_t rank_properties_t;

/* data structure for holding node information for the nodes of the
 * hierarchical communication tree
 */
struct sub_group_params_t {
    /* rank of root in the communicator */
    int root_rank_in_comm;

    /* index in subgroup */
    int root_index;

    /* number of ranks in subgroup */
    int n_ranks;

    /* Index of the first element in the subgroup. The assumption
     * is that ranks for all subgroups are stored in a single
     * linear array.
     */
    int index_of_first_element;

    /*
     * level in the hierarchy - subgroups at the same level
     * don't overlap
     */
    int level_in_hierarchy;

    /*
     * connected nodes
     */
    int n_connected_nodes;
    int *list_connected_nodes;

    /*
     * Information on the ranks in the subgroup. This includes
     * the rank, and whether or not the rank is a source/sink of
     * data in this subgroup, or just a "pass through".
     */
    rank_properties_t *rank_data;

    /*
     * temp list of ranks
     */
    int *list_ranks;

    /* Level one index: for example,
     * for (i = 0; i < level_one_index; i++) will loop through
     * all level-one subgroups. This is significant since level
     * one is a disjoint partitioning of all ranks, i.e. all
     * ranks appear once and only once at level one.
     */
    int level_one_index;
};
typedef struct sub_group_params_t sub_group_params_t;
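
/* Illustrative sketch (not part of the API): walking the disjoint
 * level-one partitioning described above. Hypothetical code, kept
 * disabled; it assumes the subgroups are stored in a single linear
 * array, as noted in the comments.
 */
#if 0
static int example_count_level_one_ranks(sub_group_params_t *all_subgroups,
                                         int level_one_index)
{
    int i, n = 0;
    /* every rank appears once and only once at level one, so this
     * sums to the communicator size */
    for (i = 0; i < level_one_index; i++) {
        n += all_subgroups[i].n_ranks;
    }
    return n;
}
#endif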

/* function to set up information on the order of a given bcol within
 * a specific ML-level algorithm
 */
int mca_coll_ml_setup_scratch_vals(mca_coll_ml_compound_functions_t *func_list,
        int *scratch_indx, int *scratch_num, int n_hiers);

/* driver for setting up the collective communication description */
int ml_coll_schedule_setup(struct mca_coll_ml_module_t *ml_module);

int ml_coll_up_and_down_hier_setup(
        struct mca_coll_ml_module_t *ml_module,
        struct mca_coll_ml_topology_t *topo_info,
        int up_function_idx,
        int top_function_idx,
        int down_function_idx,
        int collective);

int ml_coll_barrier_constant_group_data_setup(
        struct mca_coll_ml_topology_t *topo_info,
        mca_coll_ml_collective_operation_description_t *schedule);

/* barrier */
int ml_coll_hier_barrier_setup(struct mca_coll_ml_module_t *ml_module);

/* allreduce */
int ml_coll_hier_allreduce_setup(struct mca_coll_ml_module_t *ml_module);
int ml_coll_hier_allreduce_setup_new(struct mca_coll_ml_module_t *ml_module);

/* alltoall */
int ml_coll_hier_alltoall_setup(struct mca_coll_ml_module_t *ml_module);
int ml_coll_hier_alltoall_setup_new(struct mca_coll_ml_module_t *ml_module);

/* allgather */
int ml_coll_hier_allgather_setup(struct mca_coll_ml_module_t *ml_module);

/* gather */
int ml_coll_hier_gather_setup(struct mca_coll_ml_module_t *ml_module);

/* broadcast */
int ml_coll_hier_bcast_setup(struct mca_coll_ml_module_t *ml_module);

/* reduce */
int ml_coll_hier_reduce_setup(struct mca_coll_ml_module_t *ml_module);

/* scatter */
int ml_coll_hier_scatter_setup(struct mca_coll_ml_module_t *ml_module);

/* alltoall */
int mca_coll_ml_alltoall(void *sbuf, int scount,
        struct ompi_datatype_t *sdtype,
        void *rbuf, int rcount,
        struct ompi_datatype_t *rdtype,
        struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

int mca_coll_ml_alltoall_nb(void *sbuf, int scount,
        struct ompi_datatype_t *sdtype,
        void *rbuf, int rcount,
        struct ompi_datatype_t *rdtype,
        struct ompi_communicator_t *comm,
        ompi_request_t **req,
        mca_coll_base_module_t *module);

/* allgather */
int mca_coll_ml_allgather(void *sbuf, int scount,
        struct ompi_datatype_t *sdtype,
        void *rbuf, int rcount,
        struct ompi_datatype_t *rdtype,
        struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

/* non-blocking allgather */
int mca_coll_ml_allgather_nb(void *sbuf, int scount,
        struct ompi_datatype_t *sdtype,
        void *rbuf, int rcount,
        struct ompi_datatype_t *rdtype,
        struct ompi_communicator_t *comm,
        ompi_request_t **req,
        mca_coll_base_module_t *module);

/* gather */
int mca_coll_ml_gather(void *sbuf, int scount,
        struct ompi_datatype_t *sdtype,
        void *rbuf, int rcount,
        struct ompi_datatype_t *rdtype,
        int root,
        struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

/* non-blocking barrier */
int ml_coll_hier_nonblocking_barrier_setup(struct mca_coll_ml_module_t *ml_module, struct mca_coll_ml_topology_t *topo_info);

/* memory synchronization collective setup */
int ml_coll_memsync_setup(struct mca_coll_ml_module_t *ml_module);

/* fragment descriptor */
struct mca_coll_ml_descriptor_t;
struct mca_coll_ml_fragment_t {
    opal_list_item_t super;

    struct mca_coll_ml_descriptor_t *full_msg_descriptor;
    int offset;                 /* offset for the progress pointer */
    int length;                 /* fragment length */
    opal_convertor_t convertor; /* convertor for copying/packing data */

    /* current function index */
    int current_fn_index;

    /* array of function arguments */
    struct bcol_function_args_t *fn_args;
};
typedef struct mca_coll_ml_fragment_t mca_coll_ml_fragment_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_fragment_t);

#define MCA_COLL_ML_NO_BUFFER -1

#define MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, index, desc)  \
do {                                                                 \
    (coll_op)->variable_fn_params.buffer_index = index;              \
    (coll_op)->fragment_data.buffer_desc = desc;                     \
    /* pasha - why do we duplicate it? */                            \
    (coll_op)->variable_fn_params.src_desc = desc;                   \
    (coll_op)->variable_fn_params.hier_factor = 1;                   \
} while (0)

/* full message descriptor */
struct mca_coll_ml_descriptor_t {
    ompi_request_t super;             /* base request */
    struct ompi_datatype_t *datatype; /* ompi datatype */
    size_t count;                     /* count of user datatype elements */
    uint64_t sequence_num;            /* sequence number for the collective operation */
    size_t frags_limit;               /* upper limit on # of fragments */
    size_t frags_start;               /* number of fragments started */

    /* number of fragments completed */
    size_t frags_complete;

    /* number of fragments needed to process this message */
    size_t n_fragments;

    volatile bool free_resource;      /* signals resource release */

    /* Pointer to the reduction operation, e.g. MPI_MIN - need to handle
     * user-defined functions also */
    /* ompi_predefined_op_t *operation; */

    /* pointer to a communication schedule; data struct undefined */
    struct coll_ml_collective_description_t *local_comm_description;

    /* Fragment descriptor - we always have a fragment descriptor
     * if we get a full message descriptor. Optimization for
     * small messages. */
    mca_coll_ml_fragment_t fragment;

    /* The ML memory buffer index that should contain the send and
     * receive information. If the index is -1, no buffer was allocated. */
    uint64_t buffer_index;
};
typedef struct mca_coll_ml_descriptor_t mca_coll_ml_descriptor_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_descriptor_t);

/* sbgp and bcol module pairs */
struct hierarchy_pairs {
    mca_sbgp_base_module_t *subgroup_module;
    struct mca_bcol_base_module_t **bcol_modules;
    int num_bcol_modules;
    int bcol_index;
    mca_bcol_base_component_t *bcol_component;
};
typedef struct hierarchy_pairs hierarchy_pairs;

/* list of ranks in each group */
struct ml_level_t {
    int n_modules;
    hierarchy_pairs *modules;
};

typedef struct ml_level_t ml_level_t;

enum {
    COLL_ML_HR_FULL,             /* full hierarchy topology; all bcols and sbgps participate in discovery */
    COLL_ML_HR_ALLREDUCE,
    COLL_ML_HR_NBS,              /* full hierarchy except the base socket */
    COLL_ML_HR_SINGLE_PTP,       /* single flat ptp hierarchy */
    COLL_ML_HR_SINGLE_IBOFFLOAD, /* single flat iboffload hierarchy */
    COLL_ML_TOPO_MAX
};

/* topology-hierarchy discovery function */
struct mca_coll_ml_module_t; /* forward declaration for the function */

typedef int (*mca_coll_topo_discovery_fn_t)
        (struct mca_coll_ml_module_t *ml_module, int n_hierarchies);

typedef enum {
    COLL_ML_TOPO_DISABLED = 0,
    COLL_ML_TOPO_ENABLED = 1
} topo_status_t;

/**
 * Structure to hold the ml coll component. First it holds the
 * base coll component, and then holds a bunch of
 * ml-coll-component-specific stuff (e.g., current MCA parameter
 * values).
 */
struct mca_coll_ml_component_t {
    /** base coll component */
    mca_coll_base_component_2_0_0_t super;

    /** MCA parameter: priority of this component */
    int ml_priority;

    /** MCA parameter: number of levels */
    int ml_n_levels;

    /** MCA parameter: subgrouping components to use */
    char *subgroups_string;

    /** MCA parameter: basic collectives components to use */
    char *bcols_string;

    /** verbosity level */
    int verbose;

    /** maximum number of communicators available to run ML */
    uint32_t max_comm;

    /** minimum size of a communicator for ML to be available */
    int min_comm_size;

    /* Base sequence number to use - the expectation is that
     * this will be used as a basis for generating IDs for
     * specific collective operations.
     */
    int64_t base_sequence_number;

    /** memory pool */
    mca_coll_ml_lmngr_t memory_manager;

    /* We need it because some bcols cannot
     * support all possible allreduce data types */
    bool need_allreduce_support;

    int use_knomial_allreduce;

    /* use the global-knowledge bcast algorithm */
    bool use_static_bcast;

    /* use the hdl framework */
    bool use_hdl_bcast;

    /* enable/disable fragmentation */
    bool enable_fragmentation;

    /* use the sequential bcast algorithm */
    bool use_sequential_bcast;

    /* fragment size used by the list memory_manager */
    size_t lmngr_block_size;

    /* alignment used by the list memory_manager */
    size_t lmngr_alignment;

    /* list size for the memory_manager */
    size_t lmngr_size;

    /* number of payload memory banks */
    int n_payload_mem_banks;

    /* number of payload buffers per bank */
    int n_payload_buffs_per_bank;

    /* size of a payload buffer */
    size_t payload_buffer_size;

    /* pipeline depth for message fragmentation */
    int pipeline_depth;

    /* free list tunings */
    int free_list_init_size;

    int free_list_grow_size;

    int free_list_max_size;

    /*
     * queues for asynchronous collective progress
     */
    /* tasks that have not started, either because dependencies are
     * not satisfied, or because resources are lacking
     */
    opal_list_t pending_tasks;
    opal_mutex_t pending_tasks_mutex;

    /* active incomplete tasks */
    opal_list_t active_tasks;
    opal_mutex_t active_tasks_mutex;

    /* sequential collectives to progress */
    opal_list_t sequential_collectives;
    opal_mutex_t sequential_collectives_mutex;

    bool progress_is_busy;

    /* temporary hack for the IMB test - not all bcols have allgather */
    int disable_allgather;

    /* temporary hack for the IMB test - not all bcols have alltoall */
    int disable_alltoall;

    /* Bruck's alltoall MCA and other params */
    int use_brucks_smsg_alltoall;

    mca_coll_topo_discovery_fn_t topo_discovery_fn[COLL_ML_TOPO_MAX];

    /* configuration file for collectives */
    char *config_file_name;

    per_collective_configuration_t coll_config[ML_NUM_OF_FUNCTIONS][ML_NUM_MSG];
};
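
/* A minimal usage sketch (an assumption, not part of this header): the
 * tunables above are registered as MCA parameters by the component's
 * register/open code, typically under a "coll_ml_" prefix, e.g.:
 *
 *   mpirun --mca coll_ml_priority 90 --mca coll_ml_verbose 10 ./app
 *
 * The exact parameter names are defined where the component registers
 * them, not in this header.
 */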

/**
 * Convenience typedef
 */
typedef struct mca_coll_ml_component_t mca_coll_ml_component_t;

/**
 * Global component instance
 */
OMPI_MODULE_DECLSPEC extern mca_coll_ml_component_t mca_coll_ml_component;

struct mca_coll_ml_route_info_t {
    int level;
    int rank;
};
typedef struct mca_coll_ml_route_info_t mca_coll_ml_route_info_t;

struct mca_coll_ml_leader_offset_info_t {
    size_t offset;
    int level_one_index;
    bool leader;
};
typedef struct mca_coll_ml_leader_offset_info_t mca_coll_ml_leader_offset_info_t;

/* topology data structure */
struct mca_coll_ml_topology_t {
    topo_status_t status; /* COLL_ML_TOPO_DISABLED (0) or COLL_ML_TOPO_ENABLED (1) */
    /* information on the selected groups - needed for collective
     * algorithms */
    int32_t global_lowest_hier_group_index;
    int32_t global_highest_hier_group_index;
    int number_of_all_subgroups;
    int n_levels;
    /* bcol bits that describe supported features/modes */
    uint64_t all_bcols_mode;
    mca_coll_ml_route_info_t *route_vector;
    coll_ml_collective_description_t *hierarchical_algorithms[BCOL_NUM_OF_FUNCTIONS];
    sub_group_params_t *array_of_all_subgroups;
    /* (sbgp, bcol) pairs */
    hierarchy_pairs *component_pairs;
    /* Ordering of ranks when I am the root of the operation.
     * This ordering guarantees that data needs to be reordered
     * only at the first or last step of rooted operations,
     * depending on whether the operation is a scatter or a
     * gather operation.
     */
    int *sort_list;
    mca_coll_ml_leader_offset_info_t *hier_layout_info;
    /* are ranks laid out contiguously */
    bool ranks_contiguous;
    struct ordering_info_t {
        int next_inorder;
        int next_order_num;
        int num_bcols_need_ordering;
    } topo_ordering_info;
};
typedef struct mca_coll_ml_topology_t mca_coll_ml_topology_t;

struct mca_coll_ml_bcol_list_item_t {
    opal_list_item_t super;
    mca_bcol_base_module_t *bcol_module;
};
typedef struct mca_coll_ml_bcol_list_item_t mca_coll_ml_bcol_list_item_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_ml_bcol_list_item_t);

#define MCA_COLL_MAX_NUM_COLLECTIVES 40 /* ... I do not remember exactly how many collectives we have */
#define MCA_COLL_MAX_NUM_SUBTYPES    15 /* maximum number of algorithms per collective */

struct mca_coll_ml_module_t {
    /* base structure */
    mca_coll_base_module_t super;

    /* whether the ML module has been initialized */
    bool initialized;
    /* communicator */
    struct ompi_communicator_t *comm;

    /* reference convertor */
    opal_convertor_t *reference_convertor;

    mca_coll_ml_topology_t topo_list[COLL_ML_TOPO_MAX];

    /* collectives-to-topology map */
    int collectives_topology_map
            [MCA_COLL_MAX_NUM_COLLECTIVES][MCA_COLL_MAX_NUM_SUBTYPES];

    /* Largest number of function calls for the collective routines.
     * This is used to allocate resources. */
    int max_fn_calls;

    /* collective sequence number - unique id for barrier-type operations */
    int64_t no_data_collective_sequence_num;

    /* collective sequence number - unique id for each collective */
    int64_t collective_sequence_num;

    /** ompi free list of full message descriptors **/
    ompi_free_list_t message_descriptors;

    /** ompi free list of message fragment descriptors **/
    ompi_free_list_t fragment_descriptors;

    /** pointer to the payload memory block **/
    struct ml_memory_block_desc_t *payload_block;

    /** the maximum size of a collective function description */
    int max_dag_size;

    /** data used to initialize coll_ml_collective_descriptors */
    struct coll_desc_init {
        int max_dag_size;
        size_t max_n_bytes_per_proc_total;
        mca_coll_base_module_t *bcol_base_module;
    } coll_desc_init_data;

    /** collective operation descriptor free list - used to manage a single
     * collective operation. */
    ompi_free_list_t coll_ml_collective_descriptors;

    /** multiple-function collective operation support */
    /** broadcast */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_bcast_functions[ML_NUM_BCAST_FUNCTIONS];

    /* Bcast size selection criteria - cutoff for the largest size of
     * data for which to apply the specified collective operation.
     * This gives us the ability to choose the algorithm based on size. */
    size_t bcast_cutoff_size[ML_N_DATASIZE_BINS];

    /** allreduce functions */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_allreduce_functions[ML_NUM_ALLREDUCE_FUNCTIONS];

    /** scatter */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_scatter_functions[ML_NUM_SCATTER_FUNCTIONS];

    /** alltoall */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_alltoall_functions[ML_NUM_ALLTOALL_FUNCTIONS];

    /** allgather */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_allgather_functions[ML_NUM_ALLGATHER_FUNCTIONS];

    /** gather */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_gather_functions[ML_NUM_GATHER_FUNCTIONS];

    /** barrier */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_barrier_function;

    /** ML memory synchronization collective operation */
    mca_coll_ml_collective_operation_description_t *
        coll_ml_memsync_function;

    /** table of allreduce support for each specific type and op **/
    bool allreduce_matrix[OMPI_OP_NUM_OF_TYPES][OMPI_DATATYPE_MAX_PREDEFINED][BCOL_NUM_OF_ELEM_TYPES];

    /* data offset from ML */
    int32_t data_offset;

    int small_message_thresholds[BCOL_NUM_OF_FUNCTIONS];

    /* fragmentation parameters */
    int use_user_buffers;
    uint64_t fragment_size;
    uint32_t ml_fragment_size;

    /* for the carto graph */
    /* opal_carto_graph_t *sm_graph; */
    /* opal_carto_graph_t *ib_graph; */
    /* Bcast index table. Pasha: do we need to define something more generic?
     * The table is x 2 (large/small). */
    int bcast_fn_index_table[2];

    /* List of pointers to bcols that have been initialized and used.
     * So far we use it only for ML memory management. */
    opal_list_t active_bcols_list;

    /* buffer size required for Bruck's algorithm */
    int brucks_buffer_threshold_const;

    /* log of the communicator size;
     * we require this for the alltoall algorithm */
    int log_comm_size;
    /* On this list we keep coll_op descriptors that could not
     * start because no ML buffers were available. */
    opal_list_t waiting_for_memory_list;
};

typedef struct mca_coll_ml_module_t mca_coll_ml_module_t;
OBJ_CLASS_DECLARATION(mca_coll_ml_module_t);

/* query to see if the component is available for use, and can
 * satisfy the thread and progress requirements
 */
int mca_coll_ml_init_query(bool enable_progress_threads,
        bool enable_mpi_threads);

/* Query to see if the module is available for use on the given
 * communicator, and if so, what its priority is. This is where
 * the backing shared-memory file is created.
 */
mca_coll_base_module_t *
mca_coll_ml_comm_query(struct ompi_communicator_t *comm, int *priority);

/* barrier - blocking */
int mca_coll_ml_barrier_intra(struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

/* barrier - non-blocking */
int mca_coll_ml_ibarrier_intra(struct ompi_communicator_t *comm,
        ompi_request_t **req,
        mca_coll_base_module_t *module);

int mca_coll_ml_allreduce_dispatch(void *sbuf, void *rbuf, int count,
        struct ompi_datatype_t *dtype, struct ompi_op_t *op,
        struct ompi_communicator_t *comm, mca_coll_base_module_t *module);

/* allreduce - blocking */
int mca_coll_ml_allreduce_intra(void *sbuf, void *rbuf, int count,
        struct ompi_datatype_t *dtype, struct ompi_op_t *op,
        struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

int mca_coll_ml_memsync_intra(mca_coll_ml_module_t *module, int bank_index);

/* reduce - blocking */
int mca_coll_ml_reduce(void *sbuf, void *rbuf, int count,
        struct ompi_datatype_t *dtype, struct ompi_op_t *op,
        int root,
        struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

int coll_ml_progress_individual_message(mca_coll_ml_fragment_t *frag_descriptor);

/*
 * the ML entry points for the broadcast function
 */
int mca_coll_ml_parallel_bcast(void *buf, int count, struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);
int mca_coll_ml_parallel_bcast_nb(void *buf, int count, struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        ompi_request_t **req,
        mca_coll_base_module_t *module);
int mca_coll_ml_bcast_sequential_root(void *buf, int count, struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

/*
 * the ML function interface for non-blocking routines
 */
int mca_coll_ml_bcast_unknown_root_nb(void *buf, int count, struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        ompi_request_t **req,
        mca_coll_base_module_t *module);

int mca_coll_ml_bcast_known_root_nb(void *buf, int count, struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        ompi_request_t **req,
        mca_coll_base_module_t *module);
OMPI_DECLSPEC int mca_coll_ml_bcast_unknown_root_with_frags_nb(void *buf, int count,
        struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        ompi_request_t **req, mca_coll_base_module_t *module);

/* This routine sets up a sequential hierarchical scatter algorithm. The
 * assumptions are that each rank knows in which subgroup the data will show
 * up first, and that the scatter is executed sequentially, one subgroup at a
 * time. This is needed when the full collective must be specified before
 * the collective operation starts up. The algorithm handles all data sizes
 * and data types.
 */
OMPI_DECLSPEC int mca_coll_ml_scatter_sequential(
        void *sbuf, int scount, struct ompi_datatype_t *sdtype,
        void *rbuf, int rcount, struct ompi_datatype_t *rdtype,
        int root, struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

#if 0
int mca_coll_ml_bcast_small_dynamic_root(void *buf, int count, struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);
int mca_coll_ml_bcast_small_known_root(void *buf, int count, struct ompi_datatype_t *dtype,
        int root, struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);
#endif

/* topology discovery functions */
int mca_coll_ml_fulltree_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
        int n_hierarchies);
int mca_coll_ml_allreduce_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
        int n_hierarchies);
int mca_coll_ml_fulltree_exclude_basesmsocket_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
        int n_hierarchies);
int mca_coll_ml_fulltree_ptp_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
        int n_hierarchies);
int mca_coll_ml_fulltree_iboffload_only_hierarchy_discovery(mca_coll_ml_module_t *ml_module,
        int n_hierarchies);

void mca_coll_ml_allreduce_matrix_init(mca_coll_ml_module_t *ml_module,
        const mca_bcol_base_component_2_0_0_t *bcol_component);

static inline int mca_coll_ml_err(const char* fmt, ...)
{
    va_list list;
    int ret;

    va_start(list, fmt);
    ret = vfprintf(stderr, fmt, list);
    va_end(list);
    return ret;
}

#define ML_ERROR(args)                                \
do {                                                  \
    mca_coll_ml_err("[%s]%s[%s:%d:%s] COLL-ML ",      \
            orte_process_info.nodename,               \
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),       \
            __FILE__, __LINE__, __func__);            \
    mca_coll_ml_err args;                             \
    mca_coll_ml_err("\n");                            \
} while(0)
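
/* Usage note: the macro forwards "args" verbatim to the variadic helper,
 * so call sites wrap the format string and its arguments in an extra set
 * of parentheses, e.g.:
 *
 *   ML_ERROR(("Failed to start memory sync, rc = %d", rc));
 */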

#if OPAL_ENABLE_DEBUG
#define ML_VERBOSE(level, args)                           \
do {                                                      \
    if (mca_coll_ml_component.verbose >= level) {         \
        mca_coll_ml_err("[%s]%s[%s:%d:%s] COLL-ML ",      \
                orte_process_info.nodename,               \
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),       \
                __FILE__, __LINE__, __func__);            \
        mca_coll_ml_err args;                             \
        mca_coll_ml_err("\n");                            \
    }                                                     \
} while(0)
#else
#define ML_VERBOSE(level, args)
#endif

#define IS_BCOL_TYPE_IDENTICAL(bcol1, bcol2)                                                            \
    ( (NULL != bcol1 && NULL != bcol2) &&                                                               \
      ( /* check if the lengths are the same */                                                         \
        (strlen(((mca_base_component_t *)((bcol1)->bcol_component))->mca_component_name) ==             \
         strlen(((mca_base_component_t *)((bcol2)->bcol_component))->mca_component_name))               \
        && /* check if the strings are identical */                                                     \
        (0 == strncmp(((mca_base_component_t *)((bcol1)->bcol_component))->mca_component_name,          \
                      ((mca_base_component_t *)((bcol2)->bcol_component))->mca_component_name,          \
                      strlen(((mca_base_component_t *)((bcol2)->bcol_component))->mca_component_name))) \
      ) ? true : false)

#define GET_BCOL(module, indx) ((module)->component_pairs[(indx)].bcol_modules[0])
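
/* Illustrative sketch (not part of the API): IS_BCOL_TYPE_IDENTICAL
 * compares two bcol modules by component name, which makes it usable for
 * detecting runs of hierarchy levels served by the same bcol type.
 * Hypothetical code, kept disabled.
 */
#if 0
static bool example_same_bcol(mca_coll_ml_topology_t *topo, int i)
{
    return IS_BCOL_TYPE_IDENTICAL(GET_BCOL(topo, i), GET_BCOL(topo, i + 1));
}
#endif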

#define GET_BCOL_SYNC_FN(bcol) ((bcol)->filtered_fns_table[DATA_SRC_KNOWN][NON_BLOCKING] \
                                                          [BCOL_SYNC][1][0][0])

/* allocator macros */
#define BUFFER_INDEX(bank, nbuffs, buffer) ((bank) * (nbuffs) + (buffer))

#define ML_GET_FRAG_SIZE(op, coll)                                  \
    ((op)->fragment_data.message_descriptor->n_bytes_total -        \
     (op)->fragment_data.message_descriptor->n_bytes_scheduled <    \
     (size_t) OP_ML_MODULE((op))->small_message_thresholds[coll] ?  \
     (op)->fragment_data.message_descriptor->n_bytes_total -        \
     (op)->fragment_data.message_descriptor->n_bytes_scheduled :    \
     (size_t) OP_ML_MODULE((op))->small_message_thresholds[coll])
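
/* Worked example: ML_GET_FRAG_SIZE evaluates to
 *   min(n_bytes_total - n_bytes_scheduled, small_message_thresholds[coll]).
 * With n_bytes_total = 10000, n_bytes_scheduled = 8192 and a threshold of
 * 4096, the next fragment is 10000 - 8192 = 1808 bytes; with nothing
 * scheduled yet it would be capped at the 4096-byte threshold.
 */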

static inline __opal_attribute_always_inline__ int ml_fls(int num)
{
    int i = 1;
    int j = 0;

    if (0 == num) {
        return 0;
    }

    while (i < num) {
        i *= 2;
        j++;
    }

    if (i > num) {
        j--;
    }

    return j;
}
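
/* Note: ml_fls() is a "find last set" helper; for num > 0 it returns
 * floor(log2(num)), e.g. ml_fls(1) == 0, ml_fls(8) == 3, ml_fls(9) == 3,
 * and ml_fls(0) == 0 by convention.
 */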

/* abort the MPI process in case of a fatal error */
void mca_coll_ml_abort_ml(char *message);

static inline __opal_attribute_always_inline__
int mca_coll_ml_buffer_recycling(mca_coll_ml_collective_operation_progress_t *ml_request)
{
    mca_coll_ml_module_t *ml_module = (mca_coll_ml_module_t *)ml_request->coll_module;
    ml_memory_block_desc_t *ml_memblock = ml_module->payload_block;
    uint64_t bank_index = ml_request->fragment_data.buffer_desc->bank_index;
    int rc;

    opal_atomic_add(&ml_memblock->bank_release_counters[bank_index], 1);

    /* check if the bank is ready for recycling */
    if (ml_memblock->bank_release_counters[bank_index] ==
            ml_memblock->num_buffers_per_bank) {
        ml_memblock->ready_for_memsync[bank_index] = true;

        ML_VERBOSE(10, ("Sync count %d, bank %d", ml_memblock->memsync_counter, bank_index));
        assert(ml_memblock->bank_is_busy);
        if (ml_memblock->memsync_counter == (int)bank_index) {
            while (ml_memblock->ready_for_memsync[ml_memblock->memsync_counter]) {
                ML_VERBOSE(10, ("Calling for service barrier: ml_buffer_index - %d %d %d == %d.\n",
                            ml_request->fragment_data.buffer_desc->buffer_index,
                            ml_memblock->memsync_counter,
                            ml_memblock->bank_release_counters[ml_memblock->memsync_counter],
                            ml_memblock->num_buffers_per_bank));
                /* set the ready flag back to false (unready) */
                ml_memblock->ready_for_memsync[ml_memblock->memsync_counter] = false;

                rc = mca_coll_ml_memsync_intra(ml_module, ml_memblock->memsync_counter);
                if (OMPI_SUCCESS != rc) {
                    ML_ERROR(("Failed to start memory sync !!!"));
                    return rc;
                }

                opal_atomic_add(&ml_memblock->memsync_counter, 1);
                if (ml_memblock->memsync_counter == (int)ml_memblock->num_banks) {
                    ml_memblock->memsync_counter = 0;
                }
                ML_VERBOSE(10, ("After service barrier."));
            }
        } else {
            ML_VERBOSE(10, ("Out of order %d\n", ml_memblock->memsync_counter));
        }
    }

    return OMPI_SUCCESS;
}
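
/* Note on the recycling protocol above: every returned buffer bumps its
 * bank's release counter; once a bank is fully released it is flagged
 * ready_for_memsync, and banks are handed to mca_coll_ml_memsync_intra()
 * strictly in order (memsync_counter wraps around num_banks), so the
 * memory-sync barriers are issued in the same order on all ranks.
 */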

static inline __opal_attribute_always_inline__ int coll_ml_fragment_completion_processing(
        mca_coll_ml_collective_operation_progress_t *coll_op)
{
    /* local variables */
    int ret = OMPI_SUCCESS;
    size_t bytes_in_this_frag;
    struct full_message_t *full_msg_desc = coll_op->fragment_data.message_descriptor;
    bool ready_to_release = true, out_of_resource = false;

    ML_VERBOSE(10, ("Coll_op %p processing completion", coll_op));
    /* call the unpack/pack function */
    if (OPAL_LIKELY(NULL != coll_op->process_fn)) {
        ret = coll_op->process_fn(coll_op);
        switch (ret) {
            case OMPI_SUCCESS:
                ML_VERBOSE(10, ("unpack done"));
                ready_to_release = true;
                break;
            case ORTE_ERR_NO_MATCH_YET:
                ML_VERBOSE(10, ("unexpected packet"));
                ready_to_release = false;
                break;
            default:
                ML_ERROR(("Error, unexpected error code %d", ret));
                return ret;
        }
    }

    bytes_in_this_frag = coll_op->fragment_data.fragment_size;

    ML_VERBOSE(10, ("Delivered %d bytes in frag %d total %d",
                full_msg_desc->n_bytes_delivered,
                bytes_in_this_frag,
                full_msg_desc->n_bytes_total));

    /* check for full message completion */
    if (full_msg_desc->n_bytes_delivered + bytes_in_this_frag ==
            full_msg_desc->n_bytes_total) {
        /* message complete - account for the last fragment and
         * mark the message complete
         */
        full_msg_desc->n_bytes_delivered += bytes_in_this_frag;

        /* decrement the number of active fragments */
        full_msg_desc->n_active--;

        ML_VERBOSE(10, ("Signaling completion"));

        /* here we need to be sure that we point to the first fragment only */
        ompi_request_complete(&(coll_op->fragment_data.message_descriptor->super), true);
        coll_op->fragment_data.message_descriptor->super.req_status.MPI_ERROR = OMPI_SUCCESS;
    } else {
        assert(NULL != coll_op->fragment_data.buffer_desc);
        /* update the number of bytes delivered */
        full_msg_desc->n_bytes_delivered += bytes_in_this_frag;
        /* decrement the number of active fragments */
        full_msg_desc->n_active--;
        /* here we need to start the next fragment */
        ML_VERBOSE(10, ("Launch frags for %p", coll_op));
        if (full_msg_desc->n_bytes_scheduled < full_msg_desc->n_bytes_total) {
            ret = coll_op->fragment_data.message_descriptor->fragment_launcher(coll_op);
            if (OPAL_UNLIKELY(OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret)) {
                out_of_resource = true;
            } else if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                ML_VERBOSE(10, ("Failed to launch fragment"));
                return ret;
            }
        }
    }

    if (ready_to_release) {
        /* Check if we have to recycle memory.
         * Note: it is safe to recycle ML buffers since the ML buffer data
         * was already unpacked to the user buffer.
         */
        if (NULL != coll_op->fragment_data.buffer_desc) {
            ret = mca_coll_ml_buffer_recycling(coll_op);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                return ret;
            }
        }
        /* If this is not fragment 0, return the fragment to the free list.
         * Fragment 0 will be returned in mca_ml_request_free(), which
         * is called from the MPI wait() and test() routines.
         * We can recover the pointer to the fragment descriptor from
         * the MPI-level request object, which is the first element
         * in the fragment descriptor.
         */
        ML_VERBOSE(10, ("Master ? %p %d", coll_op, coll_op->fragment_data.offset_into_user_buffer));
        if (0 != coll_op->fragment_data.offset_into_user_buffer &&
                !out_of_resource) {
            /* non-zero offset ==> this is not fragment 0 */
            CHECK_AND_RECYCLE(coll_op);
        }
    }

    /* return */
    return OMPI_SUCCESS;
}

/* task completion */
static inline __opal_attribute_always_inline__ int coll_ml_task_dependency_processing(
        mca_coll_ml_task_status_t *task)
{
    /* update dependencies */
    mca_coll_ml_collective_operation_progress_t *my_schedule_instance =
        task->ml_coll_operation;
    int n_dependent_tasks = task->rt_num_dependent_tasks;
    int dep_task;

    for (dep_task = 0; dep_task < n_dependent_tasks; dep_task++) {
        int task_index;
        task_index = task->rt_dependent_task_indecies[dep_task];
        my_schedule_instance->dag_description.status_array[task_index].n_dep_satisfied++;
    }

    /* return */
    return OMPI_SUCCESS;
}

/* Collective task completion processing -
 * "task" may be removed from the list in this routine.
 * Thread safety is assumed to be handled outside this routine.
 */
static inline __opal_attribute_always_inline__ int mca_coll_ml_task_completion_processing(
        mca_coll_ml_task_status_t **task_status_g, opal_list_t *list)
{
    /* local variables */
    int ret = OMPI_SUCCESS;
    mca_coll_ml_task_status_t *task_status = *task_status_g;

    mca_coll_ml_collective_operation_progress_t *coll_op =
        task_status->ml_coll_operation;

    /* Pasha: Since all our collectives so far use the root flag,
     * I am replacing the call to a custom callback function with
     * setting root_flag. If we see that we need some custom
     * functionality, we will enable it later.
     */

    task_status->ml_coll_operation->variable_fn_params.root_flag = true;

#if 0
    /* process the task completion function,
     * if any was defined */
    if (OPAL_LIKELY(NULL != task_status->task_comp_fn)) {
        ret = task_status->task_comp_fn(task_status);
        if (ret != OMPI_SUCCESS) {
            return ret;
        }
    }
#endif

    /* update dependencies */
    ret = coll_ml_task_dependency_processing(task_status);
    if (ret != OMPI_SUCCESS) {
        ML_VERBOSE(3, ("Failed to coll_ml_task_dependency_processing"));
        return ret;
    }

    /* process the task completion function,
     * if any was defined */
    if (OPAL_LIKELY(NULL != task_status->task_comp_fn)) {
        ret = task_status->task_comp_fn(task_status);
        if (ret != OMPI_SUCCESS) {
            ML_VERBOSE(3, ("Failed to task_comp_fn"));
            return ret;
        }
    }

    /* Remove the descriptor from the incomplete list
     * (Pasha: if the list was provided).
     * No need to put this on any new list - it is already associated
     * with the mca_coll_ml_collective_operation_progress_t descriptor.
     */
    if (NULL != list) {
        (*task_status_g) = (mca_coll_ml_task_status_t *)
            opal_list_remove_item(list, (opal_list_item_t *)(task_status));
    }

    /* update the completion counter */
    coll_op->dag_description.num_tasks_completed++;

    if (coll_op->dag_description.num_tasks_completed ==
            coll_op->coll_schedule->n_fns) {
        /* the actual fragment descriptor is not on any list, as
         * we can get at it from the task descriptors
         */
        ret = coll_ml_fragment_completion_processing(coll_op);
        if (OMPI_SUCCESS != ret) {
            ML_VERBOSE(3, ("Failed to coll_ml_fragment_completion_processing"));
            return ret;
        }
    }

    /* return */
    return ret;
}

static inline __opal_attribute_always_inline__ int mca_coll_ml_generic_collectives_append_to_queue(
        mca_coll_ml_collective_operation_progress_t *op_prog,
        mca_coll_ml_task_setup_fn_t task_setup)
{
    int fn_index;
    mca_coll_ml_collective_operation_description_t *op_desc =
        op_prog->coll_schedule;
    mca_coll_ml_compound_functions_t *func = NULL;
    mca_coll_ml_task_status_t *task_status = NULL;
    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    ML_VERBOSE(9, ("Calling mca_coll_ml_generic_collectives_append_to_queue"));

    /* initialize all tasks before we start them */
    for (fn_index = 0; fn_index < op_desc->n_fns; fn_index++) {
        func = &op_desc->component_functions[fn_index];
        task_status = &op_prog->dag_description.status_array[fn_index];

        ML_VERBOSE(9, ("Processing function index %d", fn_index));

        assert(NULL != func);

        /* initialize the task status */
        task_status->n_dep_satisfied = 0; /* start from zero */
        task_status->bcol_fn = func->bcol_function;
        /* set up run-time parameters */
        /* Pasha: do we need the if protection? */
        if (OPAL_LIKELY(NULL != task_setup)) {
            task_setup(task_status, fn_index, func);
        }

        /* The pointer to the operation progress is supposed to be set
         * at construction time. Just making sure that it is ok. */
        assert(task_status->ml_coll_operation == op_prog);

        /* We assume that all pointers to functions are defined, so
         * there is no reason to check for NULL. */
        assert(NULL != func->bcol_function->coll_fn);

        /* in order to preserve ordering on all ranks we have to append to the tail */
        /* TBD: need to review the way we launch fragments */
        ML_VERBOSE(9, ("The task %p dependency is %d, appending it on the pending list",
                    (void *)task_status, func->num_dependencies));
        OPAL_THREAD_LOCK(&(mca_coll_ml_component.pending_tasks_mutex));
        opal_list_append(&cm->pending_tasks, (opal_list_item_t *)task_status);
        OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.pending_tasks_mutex));
    }

    ML_VERBOSE(9, ("Collective was launched!"));
    return OMPI_SUCCESS;
}

static inline __opal_attribute_always_inline__ int mca_coll_ml_generic_collectives_launcher(
        mca_coll_ml_collective_operation_progress_t *op_prog,
        mca_coll_ml_task_setup_fn_t task_setup)
{
    int fn_index;
    int rc, ret;
    mca_coll_ml_collective_operation_description_t *op_desc =
        op_prog->coll_schedule;
    mca_coll_ml_compound_functions_t *func = NULL;
    mca_coll_ml_task_status_t *task_status = NULL;
    mca_coll_ml_component_t *cm = &mca_coll_ml_component;

    ML_VERBOSE(9, ("Calling mca_coll_ml_generic_collectives_launcher"));

    /* initialize all tasks before we start them */
    for (fn_index = 0; fn_index < op_desc->n_fns; fn_index++) {
        func = &op_desc->component_functions[fn_index];
        task_status = &op_prog->dag_description.status_array[fn_index];

        ML_VERBOSE(9, ("Processing function index %d", fn_index));

        assert(NULL != func);

        /* initialize the task status */
        task_status->n_dep_satisfied = 0; /* start from zero */
        /* task_status->my_index_in_coll_schedule = fn_index;
           pasha: the value is set during init */
        task_status->bcol_fn = func->bcol_function;
        /* Pasha: disabling support for custom completion functions
        task_status->task_comp_fn = func->task_comp_fn;
        */

        /* set up run-time parameters */
        /* Pasha: do we need the if protection? */
        if (OPAL_LIKELY(NULL != task_setup)) {
            task_setup(task_status, fn_index, func);
        }

        /* The pointer to the operation progress is supposed to be set
         * at construction time. Just making sure that it is ok. */
        assert(task_status->ml_coll_operation == op_prog);
        /* task status is done */

        /* launch the task and put it on the corresponding list (if required) */

        /* We assume that all pointers to functions are defined, so
         * there is no reason to check for NULL. */
        assert(NULL != func->bcol_function->coll_fn);
    }

    /* try to start the startable tasks */
    for (fn_index = 0; fn_index < op_desc->n_fns; fn_index++) {
        func = &op_desc->component_functions[fn_index];
        task_status = &op_prog->dag_description.status_array[fn_index];
        /* fire the collective immediately if it has no dependencies */
        if (0 == task_status->rt_num_dependencies) {
            rc = func->bcol_function->coll_fn(&op_prog->variable_fn_params,
                    /* Pasha: need to update the prototype of the func;
                       right now it is an ugly hack for compilation */
                    (struct coll_ml_function_t *)&func->constant_group_data);
            switch (rc) {
                case BCOL_FN_NOT_STARTED:
                    /* put it on the pending list */
                    ML_VERBOSE(9, ("Call to bcol collective returned BCOL_FN_NOT_STARTED, putting the task on the pending list"));
                    OPAL_THREAD_LOCK(&(mca_coll_ml_component.pending_tasks_mutex));
                    opal_list_append(&cm->pending_tasks, (opal_list_item_t *)task_status);
                    OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.pending_tasks_mutex));
                    break;
                case BCOL_FN_STARTED:
                    /* put it on the active list */
                    ML_VERBOSE(9, ("Call to bcol collective returned BCOL_FN_STARTED, putting the task on the active list"));
                    OPAL_THREAD_LOCK(&(mca_coll_ml_component.active_tasks_mutex));
                    opal_list_append(&cm->active_tasks, (opal_list_item_t *)task_status);
                    OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.active_tasks_mutex));
                    break;
                case BCOL_FN_COMPLETE:
                    /* the task is done! Let's start the relevant dependencies */
                    ML_VERBOSE(9, ("Call to bcol collective returned BCOL_FN_COMPLETE"));
                    /* the task does not belong to any list yet, so passing NULL */
                    ret = mca_coll_ml_task_completion_processing(&task_status, NULL);
                    if (OMPI_SUCCESS != ret) {
                        ML_VERBOSE(9, ("Failed to mca_coll_ml_task_completion_processing"));
                        return ret;
                    }
                    break;
                default:
                    ML_ERROR(("Unknown exit status %d", rc));
                    return OMPI_ERROR;
            }
        } else {
            /* the task depends on others; put it on the pending list */
            ML_VERBOSE(9, ("The task %p dependency is %d, putting it on the pending list",
                        (void *)task_status, func->num_dependencies));
            OPAL_THREAD_LOCK(&(mca_coll_ml_component.pending_tasks_mutex));
            opal_list_append(&cm->pending_tasks, (opal_list_item_t *)task_status);
            OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.pending_tasks_mutex));
        }
    }
    ML_VERBOSE(9, ("Collective was launched!"));
    return OMPI_SUCCESS;
}
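
/* Dispatch summary for the launcher above: a dependency-free task is fired
 * immediately and routed by the bcol return code - BCOL_FN_NOT_STARTED goes
 * to pending_tasks, BCOL_FN_STARTED to active_tasks, and BCOL_FN_COMPLETE
 * straight into task completion processing; tasks with outstanding
 * dependencies are appended to pending_tasks without being fired.
 */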

#define ML_SET_VARIABLE_PARAMS_BCAST(op, ml, cnt, datatype, b_desc,  \
        s_offset, r_offset, frag_len, buf)                           \
do {                                                                 \
    op->variable_fn_params.sequence_num =                            \
        OPAL_THREAD_ADD64(&((ml)->collective_sequence_num), 1);      \
    op->variable_fn_params.count = cnt;                              \
    op->variable_fn_params.dtype = datatype;                         \
    op->variable_fn_params.buffer_index = (b_desc)->buffer_index;    \
    op->variable_fn_params.src_desc = (b_desc);                      \
    op->variable_fn_params.sbuf_offset = s_offset;                   \
    op->variable_fn_params.rbuf_offset = r_offset;                   \
    op->variable_fn_params.frag_size = frag_len;                     \
    op->variable_fn_params.sbuf = buf;                               \
} while (0)

#define MCA_COLL_ML_OP_BASIC_SETUP(op, total_bytes, offset_into_user_buff, src, dst, collective_schedule) \
do {                                                                     \
    op->coll_schedule = collective_schedule;                             \
    op->process_fn = NULL;                                               \
    op->full_message.n_bytes_total = total_bytes;                        \
    op->full_message.n_bytes_delivered = 0;                              \
    op->full_message.n_bytes_scheduled = 0;                              \
    op->full_message.dest_user_addr = dst;                               \
    op->full_message.src_user_addr = src;                                \
    op->full_message.n_active = 0;                                       \
    op->full_message.n_bytes_per_proc_total = 0;                         \
    op->full_message.send_count = 0;                                     \
    op->full_message.recv_count = 0;                                     \
    op->full_message.send_extent = 0;                                    \
    op->full_message.recv_extent = 0;                                    \
    op->full_message.offset_into_send_buffer = 0;                        \
    op->full_message.offset_into_recv_buffer = 0;                        \
    op->full_message.send_data_type = 0;                                 \
    op->full_message.recv_data_type = 0;                                 \
    op->full_message.fragment_launcher = 0;                              \
    op->sequential_routine.current_active_bcol_fn = 0;                   \
    op->sequential_routine.current_bcol_status = SEQ_TASK_NOT_STARTED;   \
                                                                         \
    op->fragment_data.offset_into_user_buffer = offset_into_user_buff;   \
    /* Pasha: is it constant? what to put here? */                       \
    op->fragment_data.fragment_size = total_bytes;                       \
    op->fragment_data.message_descriptor = &op->full_message;            \
    op->fragment_data.current_coll_op = -1;                              \
} while (0)

static inline __opal_attribute_always_inline__ mca_coll_ml_collective_operation_progress_t *
mca_coll_ml_alloc_op_prog_single_frag_dag(
        mca_coll_ml_module_t *ml_module,
        mca_coll_ml_collective_operation_description_t *coll_schedule,
        void *src, void *dst, size_t total_bytes,
        size_t offset_into_user_buffer
        )
{
    int rc;
    ompi_free_list_item_t *item;
    mca_coll_ml_collective_operation_progress_t *coll_op = NULL;
    ompi_request_t *req;

    /* Blocking call on fragment allocation (maybe we want to make it non-blocking?) */
    OMPI_FREE_LIST_WAIT(&(ml_module->coll_ml_collective_descriptors),
            item,
            rc);

    coll_op = (mca_coll_ml_collective_operation_progress_t *) item;
    ML_VERBOSE(10, (">>> Allocating coll op %p", coll_op));
    assert(NULL != coll_op);
    assert(coll_op->dag_description.status_array[0].item.opal_list_item_refcount == 0);
    req = &(coll_op->full_message.super);

    OMPI_REQUEST_INIT(req, false);
    /* Mark the request ACTIVE. It is critical for MPI_Test(). */
    req->req_state = OMPI_REQUEST_ACTIVE;
    req->req_status._cancelled = 0;
    req->req_status.MPI_ERROR = OMPI_SUCCESS;

    MCA_COLL_ML_OP_BASIC_SETUP(coll_op, total_bytes,
            offset_into_user_buffer, src, dst, coll_schedule);

    /* we do not set sequential data, since this is not a sequential call */
    coll_op->dag_description.num_tasks_completed = 0;

    /* the release reference counter has to be zero */
    assert(0 == coll_op->pending);

    return coll_op;
}

static inline __opal_attribute_always_inline__ mca_coll_ml_collective_operation_progress_t *
mca_coll_ml_duplicate_op_prog_single_frag_dag(
        mca_coll_ml_module_t *ml_module,
        mca_coll_ml_collective_operation_progress_t *old_op)
{
    mca_coll_ml_collective_operation_progress_t *new_op = NULL;

    new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module,
            ml_module->coll_ml_bcast_functions[old_op->fragment_data.current_coll_op],
            old_op->fragment_data.message_descriptor->dest_user_addr,
            old_op->fragment_data.message_descriptor->src_user_addr,
            old_op->fragment_data.message_descriptor->n_bytes_total,
            old_op->fragment_data.message_descriptor->n_bytes_scheduled);

    new_op->fragment_data.current_coll_op = old_op->fragment_data.current_coll_op;
    new_op->fragment_data.message_descriptor = old_op->fragment_data.message_descriptor;

    return new_op;
}

static inline __opal_attribute_always_inline__ mca_coll_ml_collective_operation_progress_t *
mca_coll_ml_alloc_op_prog_single_frag_seq(
        mca_coll_ml_module_t *ml_module,
        mca_coll_ml_collective_operation_description_t *coll_schedule,
        void *src, void *dst,
        size_t total_bytes,
        size_t offset_into_user_buffer
        )
{
    int rc;
    ompi_free_list_item_t *item;
    mca_coll_ml_collective_operation_progress_t *coll_op = NULL;

    /* Blocking call on fragment allocation (maybe we want to make it non-blocking?) */
    OMPI_FREE_LIST_WAIT(&(ml_module->coll_ml_collective_descriptors),
            item,
            rc);

    coll_op = (mca_coll_ml_collective_operation_progress_t *) item;

    assert(NULL != coll_op);

    MCA_COLL_ML_OP_BASIC_SETUP(coll_op, total_bytes,
            offset_into_user_buffer, src, dst, coll_schedule);

    /* set sequential data */
    /* pasha - do we have something to set? */

    return coll_op;
}

/* This routine reorders and packs user data. The assumptions are that
 * there is per-process data, the amount of data is the same for all ranks,
 * and the user data is contiguous.
 */
int mca_coll_ml_pack_reorder_contiguous_data(
        mca_coll_ml_collective_operation_progress_t *coll_op);

/* This routine reorders and packs user data. The assumptions are that
 * there is per-process data, the amount of data is the same for all ranks,
 * and the user data is noncontiguous.
 */
int mca_coll_ml_pack_reorder_noncontiguous_data(
        mca_coll_ml_collective_operation_progress_t *coll_op);

static inline __opal_attribute_always_inline__
void mca_coll_ml_convertor_get_send_frag_size(mca_coll_ml_module_t *ml_module,
        size_t *frag_size, struct full_message_t *message_descriptor)
{
    size_t ml_fragment_size = ml_module->ml_fragment_size;
    opal_convertor_t *dummy_convertor = &message_descriptor->dummy_convertor;

    /* the last fragment needs special handling */
    if (ml_fragment_size >
            message_descriptor->send_converter_bytes_packed) {
        *frag_size = message_descriptor->send_converter_bytes_packed;
        message_descriptor->send_converter_bytes_packed = 0;

        return;
    }

    *frag_size = ml_fragment_size;
    message_descriptor->dummy_conv_position += ml_fragment_size;

    opal_convertor_generic_simple_position(dummy_convertor, &message_descriptor->dummy_conv_position);
    *frag_size -= dummy_convertor->partial_length;

    message_descriptor->send_converter_bytes_packed -= (*frag_size);
}
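
/* Note on the helper above: for every fragment except the last it advances
 * a dummy convertor by ml_fragment_size and then trims the fragment by
 * partial_length so it ends on a datatype element boundary; the final
 * fragment simply takes whatever packed bytes remain.
 */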

END_C_DECLS

#endif /* MCA_COLL_ML_ML_H */