openmpi/ompi/mca/bcol/basesmuma/bcol_basesmuma.h

/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory.  All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies.  All rights reserved.
 * Copyright (c) 2014 Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */


#ifndef MCA_BCOL_basesmuma_EXPORT_H
#define MCA_BCOL_basesmuma_EXPORT_H

#include "ompi_config.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_allocation.h"
#include "ompi/request/request.h"
#include "ompi/proc/proc.h"
#include "ompi/patterns/net/netpatterns.h"

#include "opal/mca/mca.h"
#include "opal/util/arch.h"
#include "opal/util/argv.h"
#include "opal/datatype/opal_datatype.h"
#include "opal/util/output.h"

#include "bcol_basesmuma_smcm.h"
BEGIN_C_DECLS

/*
 * Generic list element: an OPAL list item that carries one opaque pointer.
 * OBJ_CLASS_DECLARATION below declares the matching OPAL class.
 */
struct list_data_t  {
    opal_list_item_t super;  /* list linkage (base class) */
    void *data;              /* opaque payload pointer */
};
typedef struct list_data_t list_data_t;
OBJ_CLASS_DECLARATION(list_data_t);

/*
 * Macros for manipulating the 64-bit shared memory control word.
 * The 64-bit word is divided into four 16-bit fields:
 *
 *   | 48-63: src  |  32-47: index |  16-31: flag |  0-15: sequence number |
 *
 * Only the low 16 bits of the sequence number will be put in the header
 * space.  We will use the fact that the use of the shared buffers is
 * synchronous, and get the upper 48 bits from the local process space.
 */

/* Size in bytes used when padding structures to a cache line. */
#define BASESMUMA_CACHE_LINE_SIZE 128

/* Spelled-out shift operators, kept for existing users of these names. */
#define SHIFT_UP   <<
#define SHIFT_DOWN >>

/*
 * For every field <F> of the 64-bit control word:
 *   <F>_WIDTH        - number of bits in the field
 *   <F>_BASE         - bit position of the field's least significant bit
 *   FIELD_<F>_MASK   - mask of the field once shifted down to bit 0 (int)
 *   INPLACE_<F>_MASK - mask of the field at its position in the word (int64_t)
 *
 * The in-place masks are computed with an unsigned 64-bit shift and only
 * then converted to int64_t.  Shifting a signed value so that a 1 bit lands
 * in (or beyond) the sign bit - which the SRC field at bits 48-63 requires -
 * is undefined behavior in C; the unsigned shift is well defined and yields
 * the same bit pattern.
 */
#define SEQ_WIDTH  16
#define SEQ_BASE    0
#define FIELD_SEQ_MASK   ( ( 1 SHIFT_UP SEQ_WIDTH ) - 1 )
#define INPLACE_SEQ_MASK ( (int64_t)( (uint64_t)FIELD_SEQ_MASK SHIFT_UP SEQ_BASE ) )

#define FLAG_WIDTH 16
#define FLAG_BASE  16
#define FIELD_FLAG_MASK   ( ( 1 SHIFT_UP FLAG_WIDTH ) - 1 )
#define INPLACE_FLAG_MASK ( (int64_t)( (uint64_t)FIELD_FLAG_MASK SHIFT_UP FLAG_BASE ) )

#define INDX_WIDTH 16
#define INDX_BASE  32
#define FIELD_INDX_MASK   ( ( 1 SHIFT_UP INDX_WIDTH ) - 1 )
#define INPLACE_INDX_MASK ( (int64_t)( (uint64_t)FIELD_INDX_MASK SHIFT_UP INDX_BASE ) )

#define SRC_WIDTH  16
#define SRC_BASE   48
#define FIELD_SRC_MASK   ( ( 1 SHIFT_UP SRC_WIDTH ) - 1 )
#define INPLACE_SRC_MASK ( (int64_t)( (uint64_t)FIELD_SRC_MASK SHIFT_UP SRC_BASE ) )


/*
 * Extract one field from a packed control word.
 *
 *   INPUT       - expression holding the packed word
 *   OUTPUT      - lvalue that receives the extracted field
 *   OUTPUT_TYPE - type the extracted value is cast to
 *   FIELD_BASE  - <F>_BASE of the wanted field
 *   FIELD_MASK  - FIELD_<F>_MASK of the wanted field
 *
 * All value arguments are parenthesized so that compound expressions
 * (e.g. "a | b") are shifted and masked as a whole.
 */
#define EXTRACT_FLAG(INPUT, OUTPUT, OUTPUT_TYPE, FIELD_BASE, FIELD_MASK) \
        OUTPUT = (OUTPUT_TYPE) ( ((INPUT) SHIFT_DOWN (FIELD_BASE)) & (FIELD_MASK) )

/*
 * Merge one field into a packed control word.
 *
 *   INPUT              - value to store
 *   OUTPUT             - lvalue holding the packed word; read and written
 *   INPUT_TYPE         - type INPUT is first cast to
 *   OUTPUT_TYPE        - type of the packed word
 *   FIELD_BASE         - <F>_BASE of the target field
 *   INPLACE_FIELD_MASK - INPLACE_<F>_MASK of the target field
 *
 * The shifted, masked value is OR-ed into OUTPUT.  Other fields are left
 * untouched, but the target field is NOT cleared first: bits already set
 * in that field remain set.  OUTPUT is evaluated twice.
 */
#define STORE_FLAG(INPUT, OUTPUT, INPUT_TYPE, OUTPUT_TYPE, FIELD_BASE, INPLACE_FIELD_MASK ) \
        OUTPUT =  \
     ( \
         ( \
             /* shift the input field to the proper location */ \
             (OUTPUT_TYPE)( \
                 ((OUTPUT_TYPE)((INPUT_TYPE) (INPUT)))  \
                     SHIFT_UP (FIELD_BASE) ) \
             /* mask off the extra bits */ \
             & ((OUTPUT_TYPE)(INPLACE_FIELD_MASK))  \
         ) \
         /* store back to the OUTPUT field, w/o destroying other fields */ \
         | (OUTPUT) \
     )

/**
 * Structure to hold the basic shared memory bcol component.
 */
struct mca_bcol_basesmuma_component_t {
    /** Base coll component */
    mca_bcol_base_component_2_0_0_t super;

    /* management data for collectives with no user data */

    /** MCA parameter: number of memory banks */
    int basesmuma_num_mem_banks;

    /** MCA parameter: number of regions per memory bank */
    int basesmuma_num_regions_per_bank;

    /** MCA parameter: Number of simultaneous groups supported */
    int n_groups_supported;

    /* management data for collectives with user data (ud) - the memory
     * is actually obtained at the ML level
     */

    /** MCA parameter:  number of polling loops to run while waiting
     *  for children or parent to complete their work
     */
    int n_poll_loops;

    /* mpool size */
    size_t mpool_size;


    /* mpool inited - will use this to test whether or not the
     * shared memory has been inited
     */
    bool mpool_inited;

    /* shared memory control buffer - the control structures reside
     *   in shared memory */
    bcol_basesmuma_smcm_mmap_t *sm_ctl_structs;

    /* shared memory payload buffer
     */
    bcol_basesmuma_smcm_mmap_t *sm_payload_structs;

    /*
     * list of shared memory control structures
     */
    opal_list_t ctl_structures;


    /** opal list in which the list of peers that I am "connected" to is stored
     */
    opal_list_t sm_connections_list;

    /* opal list in which the list of payload peers that I am "connected" to
     * is stored
     */
    opal_list_t sm_payload_connections_list;

    /*
     * list of non-blocking admin barriers to progress, and the mutex
     * declared alongside it */
    opal_mutex_t nb_admin_barriers_mutex;
    opal_list_t nb_admin_barriers;

    /*
     * order of fan-in tree
     */
    int radix_fanin;

    /*
     * order of fan-out tree
     */
    int radix_fanout;

    /*
     * Order of read tree
     */
    int radix_read_tree;

    /*
     * order of reduction fan-out tree
     */
    int order_reduction_tree;

    /*
     * K-nomial tree radix
     */
    int k_nomial_radix;

    /*
     * K-ary scatter tree radix
     */
    int scatter_kary_radix;

    /*
     * number of polling loops
     * NOTE(review): n_poll_loops above is also described as a polling-loop
     * count; how the two differ is not visible in this header - confirm.
     */
    int num_to_probe;

	/*
	 * Portals addressing info
	 * void*: because wanted to keep portal library dependencies
	 * as local as possible
	 */
	void *portals_info;
	bool portals_init;

    /*
     * verbosity level (compared against the level argument of
     * BASESMUMA_VERBOSE)
     */
    int verbose;

    /*
     * control file name base string
     * (field name is spelled "clt", presumably meaning "ctl")
     */
    char *clt_base_fname;

    /*
     * data file name base string
     */
    char *payload_base_fname;

    /*
     * shared memory scratch space.  This is mapped at the end of the
     * segment of memory holding the control structures.
     */
    char *my_scratch_shared_memory;

    /*
     * size of scratch memory
     */
    size_t my_scratch_shared_memory_size;

    /* the offset will be the same for all ranks */
    size_t scratch_offset_from_base_ctl_file;
};

/*
 * printf-style helper: format a message and write it to stderr.
 *
 * fmt - printf format string; the variadic arguments must match it.
 *
 * Returns the value returned by vfprintf().
 */
static inline int mca_bcol_basesmuma_err(const char* fmt, ...)
{
    int n_written;
    va_list ap;

    va_start(ap, fmt);
    n_written = vfprintf(stderr, fmt, ap);
    va_end(ap);

    return n_written;
}

/*
 * BASESMUMA_VERBOSE(level, args) - debug trace macro.
 *
 *   level - message is printed only when the component's "verbose" field
 *           is >= level
 *   args  - a parenthesized printf-style argument list, e.g.
 *           BASESMUMA_VERBOSE(10, ("value %d", v));
 *
 * In debug builds the message is written through mca_bcol_basesmuma_err(),
 * prefixed with node name, process name, file, line and function, and
 * followed by a newline.  In non-debug builds the macro does nothing and
 * its arguments are not evaluated.
 *
 * "level" is parenthesized so that an expression argument is compared as a
 * whole, and both variants expand to exactly one statement so the macro can
 * be used safely in unbraced if/else bodies.
 */
#if OPAL_ENABLE_DEBUG
#define BASESMUMA_VERBOSE(level, args)                              \
do {                                                     \
    if(mca_bcol_basesmuma_component.verbose >= (level)) {       \
        mca_bcol_basesmuma_err("[%s]%s[%s:%d:%s] BCOL-BASESMUMA ",     \
                ompi_process_info.nodename,              \
                OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),      \
                __FILE__, __LINE__, __func__);           \
        mca_bcol_basesmuma_err args;                            \
        mca_bcol_basesmuma_err("\n");                           \
    }                                                    \
} while(0)
#else
#define BASESMUMA_VERBOSE(level, args) do { } while(0)
#endif


/**
 * Convenience typedef so the component structure can be named without the
 * "struct" keyword. */
typedef struct mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component_t;

#if 0
    /*
     * NOTE: everything up to the matching #endif is compiled out.
     * NOTE(review): the module structure later in this header uses
     * netpatterns_tree_node_t / netpatterns_pair_exchange_node_t, so these
     * local definitions are presumably superseded - confirm before reviving
     * or removing them.
     */
    /*
     * Implemented function index list
     */

    /* barrier */
    enum{
        FANIN_FAN_OUT_BARRIER_FN,
        RECURSIVE_DOUBLING_BARRIER_FN,
        N_BARRIER_FNS
    };

    /* reduce */
    enum{
        FANIN_REDUCE_FN,
        REDUCE_SCATTER_GATHER_FN,
        N_REDUCE_FNS
    };
    enum{
        SHORT_DATA_FN_REDUCE,
        LONG_DATA_FN_REDUCE,
        N_REDUCE_FNS_USED
    };

    /* all-reduce */
    enum{
        FANIN_FANOUT_ALLREDUCE_FN,
        REDUCE_SCATTER_ALLGATHER_FN,
        N_ALLREDUCE_FNS
    };
    enum{
        SHORT_DATA_FN_ALLREDUCE,
        LONG_DATA_FN_ALLREDUCE,
        N_ALLREDUCE_FNS_USED
    };


    /* enum for node type */
    enum{
        ROOT_NODE,
        LEAF_NODE,
        INTERIOR_NODE
    };


    /*
     * N-order tree node description
     */
    struct tree_node_t {
        /* my rank within the group */
        int my_rank;
        /* my node type - root, leaf, or interior */
        int my_node_type;
        /* number of nodes in the tree */
        int tree_size;
        /* number of parents (0/1) */
        int n_parents;
        /* number of children */
        int n_children;
        /* parent rank within the group */
        int parent_rank;
        /* children ranks within the group */
        int *children_ranks;
    };
    typedef struct tree_node_t tree_node_t;

    /*
     * Pair-wise data exchange
     */
    /* enum for node type */
    enum{
        EXCHANGE_NODE,
        EXTRA_NODE
    };

    struct pair_exchange_node_t {

	/* my rank within the group */
	int my_rank;

        /* number of nodes this node will exchange data with */
        int n_exchanges;

        /* ranks of nodes involved in data exchange */
        int *rank_exchanges;

        /* number of extra sources of data - outside largest power of 2 in
         *  this group */
        int n_extra_sources;

        /* rank of the extra source */
        int rank_extra_source;

        /* number of tags needed per stripe */
        int n_tags;

        /* log 2 of largest full power of 2 for this node set */
        int log_2;

        /* largest power of 2 that fits in this group */
        int n_largest_pow_2;

        /* node type */
        int node_type;

    };
    typedef struct pair_exchange_node_t pair_exchange_node_t;
#endif
    /*
     * descriptor for managing the admin nonblocking barrier routine.
     *   This is an sm internal routine, and assumes only 1 outstanding
     *   nb-barrier collective call per block.
     */
    /* forward declarations - only pointers to these types are used here */
    struct mca_bcol_basesmuma_module_t;
    struct sm_buffer_mgmt;

    struct sm_nbbar_desc_t {
        /* make sure we can put this on a list */
        opal_list_item_t super;

        /* phase of the collective operation - needed to know how to continue
         * progressing the nb-barrier
         * (presumably one of the NB_* phase constants declared later in this
         * header - confirm against the progress code) */
        int collective_phase;

        /* iteration to continue at */
        int recursive_dbl_iteration;

        /* pointer to the collective module this is associated with */
        struct mca_bcol_basesmuma_module_t *sm_module;

	    /* pointer to payload/control structs buffers */
        struct sm_buffer_mgmt *coll_buff;

        /* pool index */
        int pool_index;

        /* pointer to the ml_memory_block_desc_t structure
         * that is actually managing this registration.
         * This is meaningful when these control structures
         * are used in conjunction with the user payload
         * data that is allocated at the ml level.
         * Stored as void*, so users must cast it back.
         */
         void *ml_memory_block_descriptor;

    };
    typedef struct sm_nbbar_desc_t sm_nbbar_desc_t;

    /*
     * Barrier request objects
     */

    /* shared memory data structures */
    struct mca_bcol_basesmuma_nb_request_process_shared_mem_t {
        volatile uint64_t coll_index;
        /* flag used to indicate the status of this memory region */
        volatile uint64_t flag;
        volatile uint64_t index;

        /* padding: fills the rest of BASESMUMA_CACHE_LINE_SIZE bytes after
         * the three uint64_t members above */
        /* Note: need to change this so it takes less memory */
        char padding[BASESMUMA_CACHE_LINE_SIZE-3*sizeof(uint64_t)];
    };

    typedef struct mca_bcol_basesmuma_nb_request_process_shared_mem_t
        mca_bcol_basesmuma_nb_request_process_shared_mem_t;

    /* enum for the phase the nb barrier is in */
    enum{
        NB_BARRIER_INACTIVE,

        /* fan-in/fan-out */
        NB_BARRIER_FAN_IN,
        NB_BARRIER_FAN_OUT,

        /* recursive doubling */
        NB_PRE_PHASE,
        NB_RECURSIVE_DOUBLING,
        NB_POST_PHASE,

        /* done and not started are the same for all practical
         * purposes, as the init function always sets this flag
         */
        NB_BARRIER_DONE
    };


    /* forward declaration */
    struct mca_bcol_basesmuma_module_t;


    /* control segment for shared memory */
    struct mca_bcol_basesmuma_ctl_struct_t {
	    /* collective identifier */
	    volatile int64_t sequence_number;
	    volatile int64_t flag;
        volatile int64_t index;
        volatile int64_t offset;
        volatile int64_t offset_zip;


        /* used for non-blocking algorithms */
        int status;
        int active_requests;
        int iteration;

        int *src_ptr;

        int start;

        /* process private data */
        int starting_flag_value;

        /* experiment for large data colls */
        int n_sends;
        int length;


#ifdef __PORTALS_AVAIL__
		/* only present when built with __PORTALS_AVAIL__ defined, so the
		 * structure size depends on that setting */
		struct mca_bcol_basesmuma_portal_buf_addr_t portals_buf_addr;
#endif
	    /* padding */
        /* ok, no room to pad anymore - the padding member below is
         * commented out, so this structure is not cache-line padded */
	    /*char padding[BASESMUMA_CACHE_LINE_SIZE-5*sizeof(int64_t)-8*sizeof(int)];*/
    };
    typedef struct mca_bcol_basesmuma_ctl_struct_t mca_bcol_basesmuma_ctl_struct_t;


/* Second dimension of the flags[][] array and length of the
 * starting_flag_value[] array in mca_bcol_basesmuma_header_t. */
#define SM_BCOLS_MAX 2

    /* Indices into the first dimension of the header's flags[][] array;
     * NUM_SIGNAL_FLAGS is the number of entries and must stay last. */
    enum {
        ALLGATHER_FLAG,
        ALLREDUCE_FLAG,
        BARRIER_FANIN_FLAG,
        BARRIER_FANOUT_FLAG,
        BARRIER_RKING_FLAG,
        BCAST_FLAG,
        GATHER_FLAG,
        REDUCE_FLAG,
        NUM_SIGNAL_FLAGS
    };


    /* control region for colls with user data - shared memory */
    struct mca_bcol_basesmuma_header_t {
        /* collective identifier */
        volatile int64_t sequence_number;
        /* signalling flags, indexed [signal flag][j] with j < SM_BCOLS_MAX */
        volatile int8_t  flags[NUM_SIGNAL_FLAGS][SM_BCOLS_MAX];
        volatile int32_t src; /* src of bcast data for unknown root,
                                 bcol id for known root
                               */
        /* starting flag - hierarchies; one entry per j < SM_BCOLS_MAX */
        int8_t starting_flag_value[SM_BCOLS_MAX];

    };
    typedef struct mca_bcol_basesmuma_header_t mca_bcol_basesmuma_header_t;

    /* data needed for large messages */
    struct mca_bcol_basesmuma_large_msg_t {
        /* scatter allgather data */
        uint64_t offset;
        uint64_t n_sends;
        uint64_t length;

        /* portals data (placeholder - no members are declared for it) */

    };
    typedef struct mca_bcol_basesmuma_large_msg_t mca_bcol_basesmuma_large_msg_t;

    /* payload struct: pairs a header (control) pointer with a payload
     * pointer */
	struct mca_bcol_basesmuma_payload_t {

		/* base pointer to shared memory control structure */
		mca_bcol_basesmuma_header_t *ctl_struct;
        /* pointer to the payload data */
        void *payload;

	};

	typedef struct mca_bcol_basesmuma_payload_t mca_bcol_basesmuma_payload_t;


    /* memory bank memory management structure */
    struct mem_bank_management_t {

        /* generation counter */
        uint64_t bank_gen_counter;

        /* descriptor for the non-blocking barrier.  This is
         *  used to manage this bank of memory.
         */
        sm_nbbar_desc_t nb_barrier_desc;

        /* the number of buffers that are not in use, and are
         * available.  The assumption is that the buffers are
         * recycled all at once, so are available for re-use
         * until all buffers have been made available for re-use.
         * NOTE(review): the last sentence probably means "are NOT
         * available for re-use until ..." - confirm with the
         * recycling code.
         */
        volatile int available_buffers;

        /*
         * number of buffers freed */
        volatile int n_buffs_freed;

        /* mutex to ensure atomic recycling of resources */
        opal_mutex_t mutex;

        /* number of buffers being managed */
        int number_of_buffers;

        /* shared memory control structures */
        int index_shared_mem_ctl_structs;


    };
    typedef struct mem_bank_management_t mem_bank_management_t;

/* data structure for shared buffers */
struct sm_buffer_mgmt {
    /* number of buffers per process */
    int number_of_buffs;

    /* size of group */
    int size_of_group;

    /* number of memory banks */
    int num_mem_banks;

    /* number of buffers per memory bank */
    int num_buffs_per_mem_bank;

    /* log base 2 of num_buffs_per_mem_bank */
    int log2_num_buffs_per_mem_bank;

    /* log base 2 total number of buffers */
    int log2_number_of_buffs;

    /* mask - masks off the bits corresponding to buffer index */
    int mask;

    /* control buffers - these point to regions in shared memory */
    /* leading dimension is the group size - all pointers for a given
     * set of buffers appear consecutively in this array
     * (see SM_ARRAY_INDEX for the index computation)
     */
    volatile void **ctl_buffs;

    /* management data for the control structures -
     * one per bank of control structures - Will be used for
     * the payload buffers as well.
     */
    mem_bank_management_t *ctl_buffs_mgmt;

    /* data buffers - these point to regions in shared memory */
    /* leading dimension is the group size - all pointers for a given
     * set of buffers appear consecutively in this array
     */

    volatile mca_bcol_basesmuma_payload_t *data_buffs;


};
typedef struct sm_buffer_mgmt sm_buffer_mgmt;


/*
 * Per-buffer descriptor for non-blocking collectives.  An array of these,
 * one per ml buffer, is referenced from
 * mca_bcol_basesmuma_local_mlmem_desc_t::nb_coll_desc.
 */
struct mca_bcol_basesmuma_nb_coll_buff_desc_t {
    void     *data_addr;
    uint64_t     bank_index;
    uint64_t     buffer_index;
    int       active_requests;
    ompi_request_t **requests;
    int          data_src;
    int          radix_mask;
    int          radix_mask_pow;
    int          iteration;
    int          status;
   	/* this is for testing */
	int 		tag;

	/* pointers to control structures: the full array, this process's own
	 * entry, the parent's entry and the extra partner's entry */
	volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    volatile mca_bcol_basesmuma_ctl_struct_t  *my_ctl_pointer;
	volatile mca_bcol_basesmuma_ctl_struct_t  *parent_ctl_pointer;
	volatile mca_bcol_basesmuma_ctl_struct_t  *extra_partner_ctl_pointer;
};

typedef struct mca_bcol_basesmuma_nb_coll_buff_desc_t mca_bcol_basesmuma_nb_coll_buff_desc_t;

/*
 * Local description of the ml-level memory block; embedded in the module
 * structure as "ml_mem".
 */
struct mca_bcol_basesmuma_local_mlmem_desc_t {

	uint32_t bank_index_for_release;
    /* ml-level memory block descriptor */
    struct ml_memory_block_desc_t *ml_mem_desc;
    uint32_t     num_banks;
    uint32_t     num_buffers_per_bank;
    uint32_t     size_buffer;
    /* NOTE(review): presumably one counter per bank - confirm at the
     * allocation site */
    uint32_t     *bank_release_counter;

	/*
	 * Number of descriptors allocated is equivalent to number of ml buffers
	 * (number of banks * number of buffers per bank)
	 */
	mca_bcol_basesmuma_nb_coll_buff_desc_t *nb_coll_desc;
};

typedef struct mca_bcol_basesmuma_local_mlmem_desc_t mca_bcol_basesmuma_local_mlmem_desc_t;

#ifdef __PORTALS_AVAIL__
/* Capacity of src_list[] in the state structure below. */
#define MAX_SM_GROUP_SIZE 32


/*
 * State saved for the portals scatter/allgather non-blocking bcast.
 * Only compiled when __PORTALS_AVAIL__ is defined; the module structure
 * embeds one instance as "sg_state".
 */
struct portals_scatter_allgather_nb_bcast_state_t
{
    /* local variables */
    uint64_t length;
    int my_rank, src, matched;
    int src_list[MAX_SM_GROUP_SIZE];
    int group_size;
	int64_t ready_flag;
    int pow_2, pow_2_levels;
    int src_list_index;
    uint64_t fragment_size;  /* user buffer size */

	/* Input argument variables */
	void *my_userbuf;
	int64_t sequence_number;

	/* Extra source variables */
	bool secondary_root;
	int partner , extra_partner;

	/* Scatter Allgather offsets */
	uint64_t local_sg_offset , global_sg_offset , partner_offset ;

	/* Portals messaging relevant variables */
	/*
	 * ptl_handle_eq_t allgather_eq_h;
	 */
	ptl_handle_eq_t read_eq;
	ptl_event_t  allgather_event;
	bool msg_posted;

	/* OMPI module and component variables */
    mca_bcol_basesmuma_component_t *cs;
    struct mca_bcol_basesmuma_module_t *bcol_module;

	/* Control structure and payload variables */
	volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    volatile mca_bcol_basesmuma_ctl_struct_t  *my_ctl_pointer;
	volatile mca_bcol_basesmuma_ctl_struct_t  *parent_ctl_pointer; /* scatter source */
	volatile mca_bcol_basesmuma_ctl_struct_t  *extra_partner_ctl_pointer; /* scatter source */

	/* NOTE(review): presumably one of the INIT ... FINISHED constants
	 * declared later in this header - confirm */
	int phase;
};


typedef struct portals_scatter_allgather_nb_bcast_state_t sg_state_t;
#endif

/*
 * Flatten a (buffer, process) pair into an offset in an array whose
 * leading dimension is LEAD_DIM:  LEAD_DIM * BUF_INDEX + PROC_INDEX.
 */
#define SM_ARRAY_INDEX(LEAD_DIM, BUF_INDEX, PROC_INDEX) \
    ( (PROC_INDEX) + ((LEAD_DIM) * (BUF_INDEX)) )

/* debug */
#define BARRIER_BANK_LIST_SIZE 32
/* end debug */

/*
 * Per-subgroup module state for the basesmuma bcol component.
 * OBJ_CLASS_DECLARATION after the structure declares the matching OPAL
 * class.
 */
struct mca_bcol_basesmuma_module_t {
    /* base structure */
    mca_bcol_base_module_t super;

    /* free list item with the control structures used for
     * the no user data collective operations
     */
    list_data_t *no_userdata_ctl;

    /* free list item with the control structures used for
     * the with user data collective operations
     */
    list_data_t *userdata_ctl;

    /*
     * information on sm control backing files for the subgroup
     * associated with this module.
     */
    bcol_basesmuma_smcm_proc_item_t **ctl_backing_files_info;

    /*
     * information on sm payload backing files for the subgroup
     * associated with this module.
     */
    bcol_basesmuma_smcm_proc_item_t **payload_backing_files_info;

    /*
     * buffers for the collective that do not involve user data -
     *   barrier, fanin, fanout.
     */
    sm_buffer_mgmt colls_no_user_data;

    /*
     * buffers for the collective with user data.
     */
    sm_buffer_mgmt colls_with_user_data;

    /* recursive-doubling tree node */
    netpatterns_pair_exchange_node_t recursive_doubling_tree;

    /* k-nomial gather/allgather tree */
    netpatterns_k_exchange_node_t knomial_allgather_tree;

    /* fanin tree node - root is rank 0 */
    netpatterns_tree_node_t fanin_node;

    /* fanout tree node - root is rank 0 */
    netpatterns_tree_node_t fanout_node;

    /* index of blocking barrier memory region to use */
    int index_blocking_barrier_memory_bank;

    /* comm to shared memory map */
    int *comm_to_sm_map;

    /* reduction fanout tree */
    netpatterns_tree_node_t* reduction_tree;

    /* broadcast fanout tree */
    netpatterns_tree_node_t* fanout_read_tree;

    /* scatter - k-ary tree */
    int scatter_kary_radix;
    netpatterns_tree_node_t *scatter_kary_tree;

	/* Knomial exchange tree */
	/* Currently used for only large message reduce */
	netpatterns_k_exchange_node_t knomial_exchange_tree;

    /* sequence number offset - want to make sure that we start
     *   id'ing collectives with id 0, so we can have simple
     *   resource management.
     * (the field name is spelled "squence"; it is part of the interface)
     */
    int64_t squence_number_offset;

    /* basesmuma specific header size into ml buffer
     * was calculated at ml level - it is the sum of
     * all headers from all bcols and then aligned to
     * whatever alignment was requested
     */
    uint32_t total_header_size;

    /* list of possible sources */
    int *src_list;

    /* Number of possible sources */
    int src_size;

    /* smallest power of k that is smaller
     * than or equal in size to the uma group
     * NOTE(review): "smallest" is probably meant as "largest", and the
     * "_levels" suffix suggests this holds the exponent - confirm.
     */
    int pow_k_levels;

    /* size of power-of-k group */
    int pow_k;

    /* smallest power of 2 that is smaller
     * than or equal to the smuma group size
     * NOTE(review): same wording question as for pow_k_levels - confirm.
     */
    int pow_2_levels;

    /* size of power-of-2 group */
    int pow_2;

    /* pointer to the shared memory scratch array of each
     * process in the group.
     */
    void **shared_memory_scratch_space;

    /*
	 * Caching information for re-entrant collectives
	 */
	mca_bcol_basesmuma_local_mlmem_desc_t ml_mem;

	/*
	 * Cached offsets for lmsg reduce
	 */
	int **reduce_offsets;

    /*XXX:
     * Starting to explore the beauty of zero-copy for large message
     */
    struct mca_hdl_base_module_t **hdl_module;

#ifdef __PORTALS_AVAIL__
	/*
	 * Store state for NB blocking functions
	 * (only present when built with __PORTALS_AVAIL__ defined)
	 */
	sg_state_t sg_state;

#endif
};

typedef struct mca_bcol_basesmuma_module_t mca_bcol_basesmuma_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_basesmuma_module_t);

/* shared memory specific arguments for the bcol registration function */
typedef struct bcol_basesmuma_registration_data_t {
    char *file_name; /* filename for payload */
    void *base_addr; /* base address to be mapped */
    size_t size;     /* size of memory block to be "registered" */
    size_t size_ctl_structure; /* size of the control structure */
    size_t data_seg_alignment; /* alignment of the data segment */
    bcol_basesmuma_smcm_mmap_t *sm_mmap; /* shared memory map struct */
    mca_coll_ml_release_buff_fn_t buff_release_cb; /* buffer release
                                                      call back */
} bcol_basesmuma_registration_data_t;


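/* Enum for the signaling flag bank.  When adding to this list, please
 * keep it alphabetical.  The list that follows is commented out; the enum
 * that is actually compiled (ending in NUM_SIGNAL_FLAGS) is defined
 * earlier in this file.
 */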
/*
enum {
    ALLGATHER_FLAG,
    ALLREDUCE_FLAG,
    BARRIER_FANIN_FLAG,
    BARRIER_FANOUT_FLAG,
    BARRIER_RKING_FLAG,
    BCAST_FLAG,
    GATHER_FLAG,
    SCATTER_FLAG,
    NUM_SIGNAL_FLAGS
};

*/

/* Status values.
 * NOTE(review): presumably stored in the "status" fields of the control
 * structures / nb descriptors - confirm against the users. */
enum {
    BUFFER_AVAILABLE,
    STARTED,
    FANIN,
    FANOUT
};

/* enum used for non-blocking large
 * message bcast
 * (note: START here is a different constant from STARTED in the enum
 * above)
 */

enum {
   INIT,
   START,
   NOT_STARTED,
   SCATTER,
   ALLGATHER,
   EXTRA_RANK,
   PROBE,
   SCATTER_ROOT_WAIT,
   SCATTER_EXTRA_ROOT_WAIT,
   SCATTER_PARENT_WAIT,
   FINISHED
};

/**
 * Global component instance (declaration only; BASESMUMA_VERBOSE reads its
 * "verbose" field).
 */
OMPI_MODULE_DECLSPEC extern mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component;


/*
 * coll module functions
 */

/* query to see if the component is available for use, and can
 * satisfy the thread and progress requirements
 */
int mca_bcol_basesmuma_init_query(bool enable_progress_threads,
        bool enable_mpi_threads);

/* query to see if the module is available for use on the given
 * communicator, and if so, what its priority is.
 */
mca_bcol_base_module_t **
mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules);


/* shared memory specific memory registration function - this will be passed into the mpool */
int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size,
	    void **reg);

/* shared memory specific memory deregistration function - also needed by the mpool */
int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg);

/* setup the new k_nomial tree for collectives */
int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super);

/* allocate the memory pool for the shared memory control structures */
int mca_bcol_basesmuma_allocate_pool_memory(mca_bcol_basesmuma_component_t
		*component);

/* initialize the internal scratch buffers and control structs that will be
   used by the module */
int base_bcol_basesmuma_setup_library_buffers(
		 		mca_bcol_basesmuma_module_t *sm_module,
		 		mca_bcol_basesmuma_component_t *cs);


/* shared memory recursive doubling initialization */
int bcol_basesmuma_rd_barrier_init(mca_bcol_base_module_t *module);

/* shared memory recursive doubling barrier */
int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);
/* shared memory fanin */
int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super);

/* shared memory fanout */
int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super);

/* shared memory recursive k-ing non-blocking barrier */
int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super);

/* Shared memory broadcast */
int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super);

int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
    coll_ml_function_t *c_input_args);

/* Shared memory non-blocking broadcast */
int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);

int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);

/* Shared memory non-blocking broadcast - Large message anyroot */
int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args,
            coll_ml_function_t *c_input_args);

#if 0
/* compiled out: experimental zero-copy bcast prototype */
/*FIXME: having fun here*/
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
                                      coll_ml_function_t   *c_input_args);
#endif

int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
    coll_ml_function_t *c_input_args);

/* NOTE(review): the three *_portals_* prototypes below are declared
 * unconditionally, although the portals types in this header are guarded
 * by __PORTALS_AVAIL__ - confirm the definitions exist in all builds. */
int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args,
    coll_ml_function_t *c_input_args);

int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args,
    coll_ml_function_t *c_input_args);

int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args,
    coll_ml_function_t *c_input_args);

/*
 *  shared memory scatter
 */
int bcol_basesmuma_scatter_init(mca_bcol_base_module_t *super);

/* shared memory nonblocking scatter - known root */
int bcol_basesmuma_nb_scatter_k_array_knownroot(
        bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);

/* shared memory non-blocking k-nomial barrier init */
int bcol_basesmuma_k_nomial_barrier_init(bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args);

/* shared memory non-blocking k-nomial barrier progress */
int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args,
                                struct coll_ml_function_t *const_args);

/* shared memory non-blocking k-nomial allgather init */
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
                        struct coll_ml_function_t *const_args);

/* shared memory non-blocking k-nomial allgather progress */
int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args,
                        struct coll_ml_function_t *const_args);

/* shared memory allgather -- selection logic api */
int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super);

/* shared memory blocking k-nomial gather */
int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);

/* shared memory non-blocking k-nomial gather init */
int bcol_basesmuma_k_nomial_gather_init(bcol_function_args_t *input_args,
                coll_ml_function_t *c_input_args);

/* shared memory non-blocking k-nomial gather progress */
int bcol_basesmuma_k_nomial_gather_progress(bcol_function_args_t *input_args,
                coll_ml_function_t *c_input_args);

/* shared memory gather init */
int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super);

/* allocate shared memory control memory */
int mca_bcol_basesmuma_allocate_sm_ctl_memory(
        mca_bcol_basesmuma_component_t *cs);

/* Shared memory basesmuma reduce */
int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super);
int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);
int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);

/* NOTE: unlike the other collectives declared here, this one takes a
 * coll-style argument list (buffers, count, datatype, op, root,
 * communicator, coll module). */
int bcol_basesmuma_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf,
        int count, struct ompi_datatype_t *dtype,
        struct ompi_op_t *op,
        int root,
        struct ompi_communicator_t *comm,
        mca_coll_base_module_t *module);

/* Shared memory basesmuma allreduce */
int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super);

int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args,
        coll_ml_function_t *c_input_args);

int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args,
													  coll_ml_function_t *c_input_args);

/* initialize non-blocking barrier for recycling the memory buffers.
 *  This is not a general purpose nb_barrier, and relies on the
 *  fact that we will have only one outstanding nb-barrier per bank
 *  at a time.
 */
int bcol_basesmuma_rd_nb_barrier_init_admin(sm_nbbar_desc_t *sm_desc);

/* admin nonblocking barrier - progress function */
int bcol_basesmuma_rd_nb_barrier_progress_admin(sm_nbbar_desc_t *sm_desc);

/* Memory syncronization registration function */
int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super);

/* smcm allgather function used to exchange file offsets. */
int bcol_basesmuma_smcm_allgather_connection(
        mca_bcol_basesmuma_module_t *sm_bcol_module,
        mca_sbgp_base_module_t *module,
        opal_list_t *peer_list,
        bcol_basesmuma_smcm_proc_item_t ***backing_files,
        ompi_communicator_t *comm,
        bcol_basesmuma_smcm_file_t input, char *base_fname,
        bool map_all);

/*
 * this function initializes the internal scratch buffers and control
 * structures that will be used by the module
 */
int base_bcol_masesmuma_setup_library_buffers(
        mca_bcol_basesmuma_module_t *sm_bcol_module,
        mca_bcol_basesmuma_component_t *sm_bcol_component);

/* get the index of the shared memory buffer to be used */
int bcol_basesmuma_get_buff_index( sm_buffer_mgmt * buff_block,
    uint64_t buff_id );

int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block,
    uint64_t buff_id );

/* This function does bcol_basesmuma specific memory registration and
 * issues the callback for ml-level bank recycling
 */
int bcol_basesmuma_bank_init(struct mca_coll_ml_module_t *ml_module,
		mca_bcol_base_module_t *bcol_module,
		void *reg_data);

/* bank init which is used for shared memory optimization, fall back to
 * the bank init above if this causes problems
 */
int bcol_basesmuma_bank_init_opti(struct mca_coll_ml_module_t *ml_module,
		mca_bcol_base_module_t *bcol_module,
		void *reg_data);

/* used for shared memory offset exchange */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim);


/* the progress function to be called from the opal progress function
 */
int bcol_basesmuma_progress(void);

/*
 * Macro for initializing my shared memory control structure.
 *
 *   my_ctl_pointer - pointer to the caller's control structure; must have
 *                    the members sequence_number, starting_flag_value[]
 *                    and flags[][] used below
 *   ready_flag     - lvalue that receives starting_flag_value[bcol_id] + 1
 *   seq_num        - sequence number of the operation being started
 *   bcol_id        - index into starting_flag_value[] for this bcol
 *
 * When the stored sequence number is lower than seq_num, every
 * starting_flag_value[] entry is reset to 0 and every flags[][] entry to
 * -1 before ready_flag is computed.  MB() (presumably a memory barrier -
 * confirm against its definition) is issued before the new sequence
 * number is stored.
 *
 * The sequence-number parameter is deliberately NOT called
 * "sequence_number": a parameter with that name would also replace the
 * member name in "->sequence_number".  The locals carry a hdr_init_
 * prefix so they cannot shadow identifiers used in the arguments.
 * Arguments are evaluated more than once; do not pass expressions with
 * side effects.
 */
#define BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, seq_num, bcol_id)        \
do {                                                                               \
    int hdr_init_flag_, hdr_init_bcol_;                                            \
    int8_t hdr_init_offset_ = 0;                                                   \
    /* setup resource recycling */                                                 \
    if ((my_ctl_pointer)->sequence_number < (seq_num)) {                           \
        /* first touch for this sequence number: clear all per-bcol state */       \
        for (hdr_init_bcol_ = 0; hdr_init_bcol_ < SM_BCOLS_MAX;                    \
             hdr_init_bcol_++) {                                                   \
            (my_ctl_pointer)->starting_flag_value[hdr_init_bcol_] = 0;             \
            for (hdr_init_flag_ = 0; hdr_init_flag_ < NUM_SIGNAL_FLAGS;            \
                 hdr_init_flag_++) {                                               \
                (my_ctl_pointer)->flags[hdr_init_flag_][hdr_init_bcol_] = -1;      \
            }                                                                      \
        }                                                                          \
    }                                                                              \
    /* increment the starting flag by one and return */                            \
    hdr_init_offset_ = (my_ctl_pointer)->starting_flag_value[bcol_id];             \
    (ready_flag) = hdr_init_offset_ + 1;                                           \
    MB();                                                                          \
    (my_ctl_pointer)->sequence_number = (seq_num);                                 \
} while (0)

/*
 * Single readiness test shared by all collectives.
 *
 * Non-zero when the peer's control structure carries my_sequence_number
 * and its flags[flag_index][bcol_id] entry has reached at least my_flag.
 * The sequence number is read first; the flag is only read when the
 * sequence numbers match.
 */
#define IS_PEER_READY(peer, my_flag, my_sequence_number, flag_index, bcol_id) \
    ( ((peer)->sequence_number == (my_sequence_number)) &&                   \
      ((peer)->flags[flag_index][bcol_id] >= (my_flag)) )

#if 0
/*
 * Disabled (never compiled) per-collective variants of the readiness
 * test.  Notes for anyone tempted to re-enable them:
 *   - every macro below uses bcol_id, but none of them takes it as a
 *     parameter, so it would have to exist in the caller's scope;
 *   - the IS_PEER_READY here takes (peer, my_flag, flag_index,
 *     my_sequence_number), a different parameter list from the active
 *     IS_PEER_READY, and also accepts a peer whose sequence number is
 *     already greater than my_sequence_number;
 *   - IS_GDATA_READY and this IS_PEER_READY compare the flag with ==,
 *     IS_AR_DATA_READY with >=, and IS_ALLREDUCE_PEER_READY accepts
 *     my_flag or my_flag + 1.
 */
#define IS_AR_DATA_READY(peer, my_flag, my_sequence_number)\
		(((peer)->sequence_number == (my_sequence_number) && \
		   (peer)->flags[ALLREDUCE_FLAG][bcol_id] >= (my_flag) \
	   	 )? true : false )

#define IS_GDATA_READY(peer, my_flag, my_sequence_number)\
		(((peer)->sequence_number == (my_sequence_number) && \
		   (peer)->flags[GATHER_FLAG][bcol_id] == (my_flag) \
	   	 )? true : false )

#define IS_PEER_READY(peer, my_flag, flag_index, my_sequence_number)\
		((((volatile int64_t)(peer)->sequence_number > (my_sequence_number)) || \
		  (((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \
		  ((peer)->flags[flag_index][bcol_id] == (my_flag))) \
	   	 )? true : false )

#define IS_ALLREDUCE_PEER_READY(peer, my_flag, my_sequence_number)\
		 ((((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \
		   (((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag))||((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag) + 1)) \
	   	 )? true : false )
#endif

/*
 * Non-zero when index_of_this_type_in_collective + 1 equals
 * n_of_this_type_in_collective in the structure ml_args points to.
 */
#define IS_LAST_BCOL_FUNC(ml_args)                                  \
    ( (ml_args)->index_of_this_type_in_collective + 1 ==            \
      (ml_args)->n_of_this_type_in_collective )

/*
 * Return the module's super.header_size rounded up to the next multiple
 * of BCOL_HEAD_ALIGN.  The rounding is done in 32-bit arithmetic and the
 * result is widened to size_t on return.
 */
static inline __opal_attribute_always_inline__
            size_t bcol_basesmuma_data_offset_calc(
                      mca_bcol_basesmuma_module_t *basesmuma_module)
{
    const uint32_t header_bytes = basesmuma_module->super.header_size;

    /* ceil(header_bytes / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN */
    const uint32_t padded_bytes =
        ((header_bytes + BCOL_HEAD_ALIGN - 1) / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN;

    return (size_t) padded_bytes;
}


END_C_DECLS

#endif /* MCA_BCOL_basesmuma_EXPORT_H */