/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */

#ifndef MCA_COLL_TUNED_EXPORT_H
#define MCA_COLL_TUNED_EXPORT_H

#include "ompi_config.h"

#include "mpi.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"

/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"

/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"

/* need the forced user choice structures */
#include "coll_tuned_forced.h"

/*
 * Fixed index value for each collective operation.
 * COLLCOUNT is not a collective: it is the number of entries above and is
 * used in this file to size the per-collective arrays.
 */
typedef enum COLLTYPE {
    ALLGATHER = 0,
    ALLGATHERV,
    ALLREDUCE,
    ALLTOALL,
    ALLTOALLV,
    ALLTOALLW,
    BARRIER,
    BCAST,
    EXSCAN,
    GATHER,
    GATHERV,
    REDUCE,
    REDUCESCATTER,
    SCAN,
    SCATTER,
    SCATTERV,
    COLLCOUNT
} COLLTYPE_T;

/*
 * Argument lists, one per collective, defined as macros to simplify the
 * automatic inclusion of user-overriding decision functions.  Each macro
 * expands to the complete parameter list used by the prototypes of that
 * collective, so a prototype is written as  int fn(XXX_ARGS);
 */
#define ALLGATHER_ARGS                                                  \
    void *sbuf, int scount, struct ompi_datatype_t *sdtype,             \
    void *rbuf, int rcount, struct ompi_datatype_t *rdtype,             \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define ALLGATHERV_ARGS                                                 \
    void *sbuf, int scount, struct ompi_datatype_t *sdtype,             \
    void * rbuf, int *rcounts, int *disps,                              \
    struct ompi_datatype_t *rdtype,                                     \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define ALLREDUCE_ARGS                                                  \
    void *sbuf, void *rbuf, int count,                                  \
    struct ompi_datatype_t *dtype, struct ompi_op_t *op,                \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define ALLTOALL_ARGS                                                   \
    void *sbuf, int scount, struct ompi_datatype_t *sdtype,             \
    void* rbuf, int rcount, struct ompi_datatype_t *rdtype,             \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define ALLTOALLV_ARGS                                                  \
    void *sbuf, int *scounts, int *sdisps,                              \
    struct ompi_datatype_t *sdtype,                                     \
    void *rbuf, int *rcounts, int *rdisps,                              \
    struct ompi_datatype_t *rdtype,                                     \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define ALLTOALLW_ARGS                                                  \
    void *sbuf, int *scounts, int *sdisps,                              \
    struct ompi_datatype_t **sdtypes,                                   \
    void *rbuf, int *rcounts, int *rdisps,                              \
    struct ompi_datatype_t **rdtypes,                                   \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define BARRIER_ARGS                                                    \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define BCAST_ARGS                                                      \
    void *buff, int count, struct ompi_datatype_t *datatype, int root,  \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define EXSCAN_ARGS                                                     \
    void *sbuf, void *rbuf, int count,                                  \
    struct ompi_datatype_t *dtype, struct ompi_op_t *op,                \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define GATHER_ARGS                                                     \
    void *sbuf, int scount, struct ompi_datatype_t *sdtype,             \
    void *rbuf, int rcount, struct ompi_datatype_t *rdtype,             \
    int root,                                                           \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define GATHERV_ARGS                                                    \
    void *sbuf, int scount, struct ompi_datatype_t *sdtype,             \
    void *rbuf, int *rcounts, int *disps,                               \
    struct ompi_datatype_t *rdtype,                                     \
    int root,                                                           \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define REDUCE_ARGS                                                     \
    void *sbuf, void* rbuf, int count,                                  \
    struct ompi_datatype_t *dtype, struct ompi_op_t *op,                \
    int root,                                                           \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define REDUCESCATTER_ARGS                                              \
    void *sbuf, void *rbuf, int *rcounts,                               \
    struct ompi_datatype_t *dtype, struct ompi_op_t *op,                \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define SCAN_ARGS                                                       \
    void *sbuf, void *rbuf, int count,                                  \
    struct ompi_datatype_t *dtype, struct ompi_op_t *op,                \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define SCATTER_ARGS                                                    \
    void *sbuf, int scount, struct ompi_datatype_t *sdtype,             \
    void *rbuf, int rcount, struct ompi_datatype_t *rdtype,             \
    int root,                                                           \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module

#define SCATTERV_ARGS                                                   \
    void *sbuf, int *scounts, int *disps,                               \
    struct ompi_datatype_t *sdtype,                                     \
    void* rbuf, int rcount, struct ompi_datatype_t *rdtype,             \
    int root,                                                           \
    struct ompi_communicator_t *comm,                                   \
    struct mca_coll_base_module_1_1_0_t *module
/* end of the argument lists used for user-overriding decision functions */

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

    /*
     * Component-wide settings: these are the same across all modules and are
     * loaded at component query time.  They are only declared here; the
     * definitions live in a .c file of the component.
     * NOTE(review): the meaning of each value is inferred from its name
     * (output stream id, component priority, comm-size limit for request
     * preallocation, dynamic-rules switch and file name, initial tree/chain
     * fanout, initial max outstanding requests) - confirm where they are
     * registered.
     */
    extern int   ompi_coll_tuned_stream;
    extern int   ompi_coll_tuned_priority;
    extern int   ompi_coll_tuned_preallocate_memory_comm_size_limit;
    extern int   ompi_coll_tuned_use_dynamic_rules;
    extern char* ompi_coll_tuned_dynamic_rules_filename;
    extern int   ompi_coll_tuned_init_tree_fanout;
    extern int   ompi_coll_tuned_init_chain_fanout;
    extern int   ompi_coll_tuned_init_max_requests;

    /* forced algorithm choices; both arrays have one slot per COLLTYPE value */
    /* the indices to the MCA params so that modules can look them up at open / comm create time  */
    extern coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT];
    /* the actual max algorithm values (read-only), loaded at component open */
    extern int ompi_coll_tuned_forced_max_algorithms[COLLCOUNT];

    /*
     * coll API functions
     */

    /* API functions */

    /*
     * Component-level query.  Receives whether progress threads and MPI
     * threads are enabled and returns an int status.
     * NOTE(review): presumably an OMPI_* return code - confirm in the
     * implementation.
     */
    int ompi_coll_tuned_init_query(bool enable_progress_threads,
				   bool enable_mpi_threads);

    /*
     * Per-communicator query.  Takes the communicator and a pointer to an int
     * priority, and returns a pointer to a collective module.
     * NOTE(review): priority is presumably an output parameter and the return
     * value presumably NULL when the component declines - confirm.
     */
    struct mca_coll_base_module_1_1_0_t *
    ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority);

    /* API functions of decision functions and any implementations */

    /*
     * Note: this list gets long because each MPI collective needs four
     * decision prototypes: two communicator types (intra/inter) times two
     * decision types (fixed/dynamic).
     * We might cut down the decision prototypes by conditional compilation.
     */

    /*
     * Allgather.  Every entry point takes ALLGATHER_ARGS.
     *   intra / inter            : one variant per communicator type
     *   dec_fixed / dec_dynamic  : one decision function per decision type
     *   do_forced / do_this      : NOTE(review): presumably run the forced or
     *                              explicitly selected algorithm - confirm
     *   check_forced_init        : takes the forced-algorithm MCA parameter
     *                              indices for this collective
     * do_this additionally takes the algorithm id, fan in/out and segment
     * size.  The remaining prototypes are the individual algorithms.
     */
    int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_allgather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS);
    int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS);

    /*
     * Allgatherv.  Every entry point takes ALLGATHERV_ARGS.
     * Declared here: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms.
     * do_this additionally takes the algorithm id, fan in/out and segment
     * size.
     */
    int ompi_coll_tuned_allgatherv_intra_dec_fixed(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_allgatherv_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS);
    int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS);

    /*
     * Allreduce.  Every entry point takes ALLREDUCE_ARGS.
     * Declared here: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms.
     * do_this additionally takes the algorithm id, fan in/out and segment
     * size; ring_segmented additionally takes an unsigned 32-bit segsize.
     */
    int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
    int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
    int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS);

    /*
     * Alltoall.  Every entry point takes ALLTOALL_ARGS.
     * Declared here: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms.
     * do_this additionally takes the algorithm id, fan in/out, segment size
     * and max_requests; linear_sync additionally takes max_requests.
     */
    int ompi_coll_tuned_alltoall_intra_dec_fixed(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests);
    int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
    int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS);
    int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS);

    /*
     * Alltoallv.  Only the four decision prototypes are declared for this
     * collective (intra/inter communicator x fixed/dynamic decision); all take
     * ALLTOALLV_ARGS.
     */
    int ompi_coll_tuned_alltoallv_intra_dec_fixed(ALLTOALLV_ARGS);
    int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS);
    int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS);
    int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS);

    /*
     * Alltoallw.  Only the four decision prototypes are declared for this
     * collective (intra/inter communicator x fixed/dynamic decision); all take
     * ALLTOALLW_ARGS.
     */
    int ompi_coll_tuned_alltoallw_intra_dec_fixed(ALLTOALLW_ARGS);
    int ompi_coll_tuned_alltoallw_intra_dec_dynamic(ALLTOALLW_ARGS);
    int ompi_coll_tuned_alltoallw_inter_dec_fixed(ALLTOALLW_ARGS);
    int ompi_coll_tuned_alltoallw_inter_dec_dynamic(ALLTOALLW_ARGS);

    /*
     * Barrier.  Every entry point takes BARRIER_ARGS (communicator and module
     * only).
     * Declared here: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms.
     * do_this additionally takes the algorithm id, fan in/out and segment
     * size.
     */
    int ompi_coll_tuned_barrier_intra_dec_fixed(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_dec_dynamic(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_do_forced(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS);
    int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS);

    /*
     * Bcast.  Every entry point takes BCAST_ARGS.
     * intra_generic additionally takes a per-segment element count and the
     * tree to use (NOTE(review): presumably the shared engine behind the
     * tree-based algorithms below - confirm in the .c file).
     * Also declared: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms,
     * most of which take a segment size; chain also takes the number of
     * chains.
     */
    int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree );
    int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS);
    int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
    int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
    int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
    int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
    int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
    int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
    int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
    int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
    int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS);
    int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS);

    /*
     * Exscan.  Only the four decision prototypes are declared for this
     * collective (intra/inter communicator x fixed/dynamic decision); all take
     * EXSCAN_ARGS.
     */
    int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
    int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
    int ompi_coll_tuned_exscan_inter_dec_fixed(EXSCAN_ARGS);
    int ompi_coll_tuned_exscan_inter_dec_dynamic(EXSCAN_ARGS);

    /*
     * Gather.  Every entry point takes GATHER_ARGS.
     * Declared here: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms.
     * do_this additionally takes the algorithm id, fan in/out and segment
     * size; linear_sync additionally takes first_segment_size.
     */
    int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS);
    int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
    int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS);
    int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS);
    int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS);
    int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
    int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS);
    int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS);

    /*
     * Gatherv.  Only the four decision prototypes are declared for this
     * collective (intra/inter communicator x fixed/dynamic decision).
     * All of them take GATHERV_ARGS, i.e. the vector form with per-rank
     * receive counts and displacements.  Three of these prototypes previously
     * used the scalar GATHER_ARGS list, which does not match the gatherv
     * calling sequence.
     */
    int ompi_coll_tuned_gatherv_intra_dec_fixed(GATHERV_ARGS);
    int ompi_coll_tuned_gatherv_intra_dec_dynamic(GATHERV_ARGS);
    int ompi_coll_tuned_gatherv_inter_dec_fixed(GATHERV_ARGS);
    int ompi_coll_tuned_gatherv_inter_dec_dynamic(GATHERV_ARGS);

    /*
     * Reduce.  Every entry point takes REDUCE_ARGS.
     * reduce_generic additionally takes the tree to use, a per-segment element
     * count and the maximum number of outstanding requests
     * (NOTE(review): presumably the shared engine behind the tree-based
     * algorithms below - confirm in the .c file).
     * Also declared: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms,
     * most of which take a segment size and max_outstanding_reqs; chain also
     * takes the fanout.
     */
    int ompi_coll_tuned_reduce_generic( REDUCE_ARGS, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs );
    int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
    int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
    int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
    int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_outstanding_reqs);
    int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
    int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
    int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
    int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
    int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
    int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
    int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
    int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);

    /*
     * Reduce_scatter.  Every entry point takes REDUCESCATTER_ARGS.
     * Declared here: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms.
     * do_this additionally takes the algorithm id, fan in/out and segment
     * size.
     */
    int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
    int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
    int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS);
    int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
    int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
    int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);

    /* inter-communicator decision functions */
    int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS);
    int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS);
 
    /*
     * Scan.  Only the four decision prototypes are declared for this
     * collective (intra/inter communicator x fixed/dynamic decision); all take
     * SCAN_ARGS.
     */
    int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
    int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
    int ompi_coll_tuned_scan_inter_dec_fixed(SCAN_ARGS);
    int ompi_coll_tuned_scan_inter_dec_dynamic(SCAN_ARGS);

    /*
     * Scatter.  Every entry point takes SCATTER_ARGS.
     * Declared here: the intra/inter decision functions (fixed and dynamic),
     * the forced-algorithm entry points (do_forced, do_this,
     * check_forced_init) and the individual intra-communicator algorithms.
     * do_this additionally takes the algorithm id, fan in/out and segment
     * size.
     */
    int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
    int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
    int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS);
    int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
    int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
    int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS);
    int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS);
    int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS);
    int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS);

    /*
     * Scatterv.  Only the four decision prototypes are declared for this
     * collective (intra/inter communicator x fixed/dynamic decision); all take
     * SCATTERV_ARGS.
     */
    int ompi_coll_tuned_scatterv_intra_dec_fixed(SCATTERV_ARGS);
    int ompi_coll_tuned_scatterv_intra_dec_dynamic(SCATTERV_ARGS);
    int ompi_coll_tuned_scatterv_inter_dec_fixed(SCATTERV_ARGS);
    int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS);

    /*
     * Event hook taking an int state and returning an int status.
     * NOTE(review): the "ft" prefix presumably stands for fault tolerance
     * (checkpoint/restart notifications) - confirm against the implementation.
     */
    int mca_coll_tuned_ft_event(int state);


    /* Utility functions */

    /**
     * Release every request in an array.
     *
     * Calls ompi_request_free() on each of the first count entries of reqs,
     * in increasing index order.  The result of each call is ignored, and a
     * count of zero or less does nothing.
     *
     * @param reqs   array of request pointers
     * @param count  number of entries of reqs to release
     */
    static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
    {
        int idx = 0;

        while (idx < count) {
            ompi_request_free(&reqs[idx]);
            ++idx;
        }
    }

    /**
     * Component-level state of the tuned collective component.
     * The struct tag and the convenience typedef share the same name.
     */
    typedef struct mca_coll_tuned_component_t {
        /** Base coll component */
        mca_coll_base_component_1_1_0_t super;

        /** MCA parameter: priority of this component */
        int tuned_priority;

        /*
         * Global data the component has to keep.
         * Cached decision-table rules (moved here from the MCW module).
         */
        ompi_coll_alg_rule_t *all_base_rules;
    } mca_coll_tuned_component_t;

    /**
     * Global component instance (declaration only; the object is defined in
     * one of the component's .c files).
     */
    OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component;

    /*
     * Per-communicator data, i.e. one instance per module instance.
     * It holds preallocated request storage, cached topologies, and the rule
     * and forced-algorithm data used by the decision functions.
     * The struct tag and the convenience typedef share the same name.
     */
    typedef struct mca_coll_tuned_comm_t {
        /*
         * Standard data for requests and PML usage: space for requests is
         * created up front.  This does not affect basic, but in the wrong
         * context it can confuse a debugger.  Controlled by an MCA parameter.
         */
        ompi_request_t **mcct_reqs;
        int mcct_num_reqs;

        /*
         * Topology cache.  Each tree is kept together with the parameters it
         * was built for (root, and fanout where applicable) so that it can be
         * reused as long as they match and is regenerated and recached when
         * they change.
         */

        /* general tree with n fan out */
        ompi_coll_tree_t *cached_ntree;
        int cached_ntree_root;
        int cached_ntree_fanout;

        /* binary tree */
        ompi_coll_tree_t *cached_bintree;
        int cached_bintree_root;

        /* binomial tree */
        ompi_coll_tree_t *cached_bmtree;
        int cached_bmtree_root;

        /* in-order binomial tree */
        ompi_coll_tree_t *cached_in_order_bmtree;
        int cached_in_order_bmtree_root;

        /* chained tree (fanout followed by pipelines) */
        ompi_coll_tree_t *cached_chain;
        int cached_chain_root;
        int cached_chain_fanout;

        /* pipeline */
        ompi_coll_tree_t *cached_pipeline;
        int cached_pipeline_root;

        /* in-order binary tree (its root is always rank 0, so no root is cached) */
        ompi_coll_tree_t *cached_in_order_bintree;

        /*
         * Extra data required by the decision functions.
         * all_base_rules is stored only on MCW, all other communicators
         * reference it (it is moving to the component).
         */
        ompi_coll_alg_rule_t *all_base_rules;
        /* communicator rules for each MPI collective, for ONLY this comm size */
        ompi_coll_com_rule_t *com_rules[COLLCOUNT];

        /*
         * Forced-algorithm information, one entry per collective.  It is kept
         * on the module because it really is per communicator/module, not a
         * single shared copy as it used to be.
         */
        coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
    } mca_coll_tuned_comm_t;

    /*
     * Module instance of the tuned component: the base collective module
     * followed by a pointer to the per-communicator tuned data.
     */
    typedef struct mca_coll_tuned_module_t {
        mca_coll_base_module_1_1_0_t super;
        mca_coll_tuned_comm_t *tuned_data;
    } mca_coll_tuned_module_t;
    OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

/*
 * Make sure the binary tree cached on (TUNED_MODULE)->tuned_data was built
 * for ROOT.  If a tree is cached for the same root nothing happens; otherwise
 * any cached tree is destroyed and a new one is built with fanout 2 on
 * OMPI_COMM and cached together with ROOT.
 * ROOT is expanded more than once, so it must be free of side effects.  The
 * expansion declares a local named coll_comm.
 */
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT )                             \
do {                                                                                           \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data;                             \
    if( (coll_comm->cached_bintree)                                                            \
        && (coll_comm->cached_bintree_root == (ROOT)) ) {                                      \
        break; /* cached tree is still valid */                                                \
    }                                                                                          \
    if( coll_comm->cached_bintree ) { /* drop the stale binary tree */                         \
        ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bintree) );                     \
    }                                                                                          \
    coll_comm->cached_bintree = ompi_coll_tuned_topo_build_tree( 2, (OMPI_COMM), (ROOT) );     \
    coll_comm->cached_bintree_root = (ROOT);                                                   \
} while (0)

/*
 * Make sure the binomial tree cached on (TUNED_MODULE)->tuned_data was built
 * for ROOT.  If a tree is cached for the same root nothing happens; otherwise
 * any cached tree is destroyed and a new one is built on OMPI_COMM and cached
 * together with ROOT.
 * ROOT is expanded more than once, so it must be free of side effects.  The
 * expansion declares a local named coll_comm.
 */
#define COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT )                              \
do {                                                                                           \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data;                             \
    if( (coll_comm->cached_bmtree)                                                             \
        && (coll_comm->cached_bmtree_root == (ROOT)) ) {                                       \
        break; /* cached tree is still valid */                                                \
    }                                                                                          \
    if( coll_comm->cached_bmtree ) { /* drop the stale binomial tree */                        \
        ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bmtree) );                      \
    }                                                                                          \
    coll_comm->cached_bmtree = ompi_coll_tuned_topo_build_bmtree( (OMPI_COMM), (ROOT) );       \
    coll_comm->cached_bmtree_root = (ROOT);                                                    \
} while (0)

/*
 * Make sure the in-order binomial tree cached on (TUNED_MODULE)->tuned_data
 * was built for ROOT.  If a tree is cached for the same root nothing happens;
 * otherwise any cached tree is destroyed and a new one is built on OMPI_COMM
 * and cached together with ROOT.
 * ROOT is expanded more than once, so it must be free of side effects.  The
 * expansion declares a local named coll_comm.
 */
#define COLL_TUNED_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT )                     \
do {                                                                                           \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data;                             \
    if( (coll_comm->cached_in_order_bmtree)                                                    \
        && (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) {                              \
        break; /* cached tree is still valid */                                                \
    }                                                                                          \
    if( coll_comm->cached_in_order_bmtree ) { /* drop the stale in-order binomial tree */      \
        ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) );             \
    }                                                                                          \
    coll_comm->cached_in_order_bmtree =                                                        \
        ompi_coll_tuned_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) );                     \
    coll_comm->cached_in_order_bmtree_root = (ROOT);                                           \
} while (0)

/*
 * Make sure the pipeline cached on (TUNED_MODULE)->tuned_data was built for
 * ROOT.  If a pipeline is cached for the same root nothing happens; otherwise
 * any cached pipeline is destroyed and a new one is built as a chain with
 * fanout 1 on OMPI_COMM and cached together with ROOT.
 * ROOT is expanded more than once, so it must be free of side effects.  The
 * expansion declares a local named coll_comm.
 */
#define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, TUNED_MODULE, ROOT )                            \
do {                                                                                           \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data;                             \
    if( (coll_comm->cached_pipeline)                                                           \
        && (coll_comm->cached_pipeline_root == (ROOT)) ) {                                     \
        break; /* cached pipeline is still valid */                                            \
    }                                                                                          \
    if( coll_comm->cached_pipeline ) { /* drop the stale pipeline */                           \
        ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_pipeline) );                    \
    }                                                                                          \
    coll_comm->cached_pipeline = ompi_coll_tuned_topo_build_chain( 1, (OMPI_COMM), (ROOT) );   \
    coll_comm->cached_pipeline_root = (ROOT);                                                  \
} while (0)

/*
 * Make sure the chain cached on (TUNED_MODULE)->tuned_data was built for ROOT
 * and FANOUT.  If a chain is cached for the same root and fanout nothing
 * happens; otherwise any cached chain is destroyed and a new one is built on
 * OMPI_COMM and cached together with ROOT and FANOUT.
 * ROOT and FANOUT are expanded more than once, so they must be free of side
 * effects.  The expansion declares a local named coll_comm.
 */
#define COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, TUNED_MODULE, ROOT, FANOUT )                       \
do {                                                                                           \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data;                             \
    if( (coll_comm->cached_chain)                                                              \
        && (coll_comm->cached_chain_root == (ROOT))                                            \
        && (coll_comm->cached_chain_fanout == (FANOUT)) ) {                                    \
        break; /* cached chain is still valid */                                               \
    }                                                                                          \
    if( coll_comm->cached_chain ) { /* drop the stale chain */                                 \
        ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_chain) );                       \
    }                                                                                          \
    coll_comm->cached_chain = ompi_coll_tuned_topo_build_chain( (FANOUT), (OMPI_COMM), (ROOT) ); \
    coll_comm->cached_chain_root = (ROOT);                                                     \
    coll_comm->cached_chain_fanout = (FANOUT);                                                 \
} while (0)

/*
 * Build the in-order binary tree for OMPI_COMM and cache it on
 * (TUNED_MODULE)->tuned_data unless one is already cached.
 * The in-order binary tree topology is defined by the communicator size only,
 * so an existing tree never has to be destroyed and rebuilt.
 * The expansion declares a local named coll_comm.
 */
#define COLL_TUNED_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, TUNED_MODULE )          \
do {                                                                           \
    mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data;             \
    if( coll_comm->cached_in_order_bintree ) {                                 \
        break; /* already built */                                             \
    }                                                                          \
    coll_comm->cached_in_order_bintree =                                       \
        ompi_coll_tuned_topo_build_in_order_bintree((OMPI_COMM));              \
} while (0)

/**
 * Compute the best element count for one segment, i.e. the number of complete
 * datatypes of TYPELNG bytes that fit in SEGSIZE bytes.
 *
 * SEGCOUNT is both input and output: on entry it must hold the count passed
 * to the collective call.  It is only modified when SEGSIZE is at least one
 * datatype long and smaller than the whole message (TYPELNG * SEGCOUNT); in
 * that case it becomes SEGSIZE / TYPELNG, rounded up by one when the leftover
 * bytes exceed half a datatype.  Otherwise SEGCOUNT is left untouched.
 *
 * The arguments are expanded several times, so they must be free of side
 * effects.  The expansion is a single compound statement, which keeps an
 * "else" that follows the macro from binding to the macro's own "if".  It is
 * deliberately not a do/while(0) so that call sites compile whether or not
 * they add a semicolon after the macro.
 */
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT)        \
    {                                                                   \
        if( ((SEGSIZE) >= (TYPELNG)) &&                                 \
            ((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) {                  \
            size_t residual;                                            \
            (SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG));                  \
            residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG);              \
            if( residual > ((TYPELNG) >> 1) ) {                         \
                (SEGCOUNT)++;                                           \
            }                                                           \
        }                                                               \
    }

/**
 * Compute well-distributed block counts when the total count and the number
 * of blocks are fixed.
 *
 * Outputs (all three must be assignable lvalues):
 *   EARLY_BLOCK_COUNT - element count of each of the first SPLIT_INDEX blocks
 *   LATE_BLOCK_COUNT  - element count of every remaining block
 *   SPLIT_INDEX       - the block at which the count switches from the
 *                       "early" to the "late" value (COUNT % NUM_BLOCKS)
 * so that
 *   COUNT = SPLIT_INDEX * EARLY_BLOCK_COUNT +
 *           (NUM_BLOCKS - SPLIT_INDEX) * LATE_BLOCK_COUNT
 *
 * No error checking is performed: the caller must pass sensible values
 * (e.g. COUNT > NUM_BLOCKS, and NUM_BLOCKS must not be zero).
 *
 * Every argument is parenthesized, so expressions such as "a + b" may be
 * passed; they are expanded more than once and must be free of side effects.
 * The expansion is a single compound statement so that the macro behaves as
 * one statement under if/for/while.  It is deliberately not a do/while(0) so
 * that call sites compile whether or not they add a semicolon after the
 * macro.
 */
#define COLL_TUNED_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX,       \
                                       EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
    {                                                                        \
        (EARLY_BLOCK_COUNT) = (LATE_BLOCK_COUNT) = (COUNT) / (NUM_BLOCKS);   \
        (SPLIT_INDEX) = (COUNT) % (NUM_BLOCKS);                              \
        if (0 != (SPLIT_INDEX)) {                                            \
            (EARLY_BLOCK_COUNT) = (EARLY_BLOCK_COUNT) + 1;                   \
        }                                                                    \
    }


#endif /* MCA_COLL_TUNED_EXPORT_H */