/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_basesmuma_EXPORT_H
#define MCA_BCOL_basesmuma_EXPORT_H
#include "ompi_config.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_allocation.h"
#include "ompi/request/request.h"
#include "ompi/proc/proc.h"
#include "ompi/patterns/net/netpatterns.h"
#include "opal/mca/mca.h"
#include "opal/util/arch.h"
#include "opal/util/argv.h"
#include "opal/datatype/opal_datatype.h"
#include "opal/util/output.h"
#include "bcol_basesmuma_smcm.h"
BEGIN_C_DECLS
struct list_data_t {
opal_list_item_t super;
void *data;
};
typedef struct list_data_t list_data_t;
OBJ_CLASS_DECLARATION(list_data_t);
/*
* Macros for manipulating the 64 bit shared memory control bits.
* The 64 bit field is divided into four 16 bit fields:
*
* | 48-63: src | 32-47: index | 16-31: flag | 0-15: sequence number |
*
* Only the low 16 bits of the sequence number are put in the header
* space. We rely on the fact that use of the shared buffers is
* synchronous, and get the upper 48 bits from the local process space.
*/
#define CACHE_LINE_SIZE 128
#define SHIFT_UP <<
#define SHIFT_DOWN >>
#define SEQ_WIDTH 16
#define SEQ_BASE 0
#define FIELD_SEQ_MASK ( ( 1 SHIFT_UP SEQ_WIDTH ) - 1 )
#define INPLACE_SEQ_MASK ( (int64_t)FIELD_SEQ_MASK SHIFT_UP SEQ_BASE)
#define FLAG_WIDTH 16
#define FLAG_BASE 16
#define FIELD_FLAG_MASK ( ( 1 SHIFT_UP FLAG_WIDTH ) - 1 )
#define INPLACE_FLAG_MASK ( (int64_t)FIELD_FLAG_MASK SHIFT_UP FLAG_BASE)
#define INDX_WIDTH 16
#define INDX_BASE 32
#define FIELD_INDX_MASK ( ( 1 SHIFT_UP INDX_WIDTH ) - 1 )
#define INPLACE_INDX_MASK ( (int64_t)FIELD_INDX_MASK SHIFT_UP INDX_BASE)
#define SRC_WIDTH 16
#define SRC_BASE 48
#define FIELD_SRC_MASK ( ( 1 SHIFT_UP SRC_WIDTH ) - 1 )
#define INPLACE_SRC_MASK ( (int64_t)FIELD_SRC_MASK SHIFT_UP SRC_BASE)
#define EXTRACT_FLAG(INPUT, OUTPUT, OUTPUT_TYPE, FIELD_BASE, FIELD_MASK) \
OUTPUT = (OUTPUT_TYPE) ( (INPUT SHIFT_DOWN FIELD_BASE ) & FIELD_MASK )
#define STORE_FLAG(INPUT, OUTPUT, INPUT_TYPE, OUTPUT_TYPE, FIELD_BASE, INPLACE_FIELD_MASK ) \
OUTPUT = \
( \
/* 3 */ \
( \
/* 2 */ \
( \
/* 1 - shift the input field to the proper location */ \
(OUTPUT_TYPE)( \
((OUTPUT_TYPE)((INPUT_TYPE) (INPUT))) \
SHIFT_UP FIELD_BASE ) \
/* mask off the extra bits */ \
& ((OUTPUT_TYPE)INPLACE_FIELD_MASK) \
) \
/* store back to the OUTPUT field, w/o destroying other fields */ \
) | OUTPUT \
)
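/* A minimal sketch (not part of the API) of packing and unpacking the
* 16-bit flag field with the macros above; the variable names are for
* illustration only.
*/
#if 0
int64_t ctl_word = 0;
int16_t flag_in = 3, flag_out;
/* place flag_in into bits 16-31 without touching the other fields */
STORE_FLAG(flag_in, ctl_word, int16_t, int64_t, FLAG_BASE, INPLACE_FLAG_MASK);
/* read bits 16-31 back out; flag_out == 3 */
EXTRACT_FLAG(ctl_word, flag_out, int16_t, FLAG_BASE, FIELD_FLAG_MASK);
#endif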
/**
* Structure to hold the basic shared memory bcol component.
*/
struct mca_bcol_basesmuma_component_t {
/** Base coll component */
mca_bcol_base_component_2_0_0_t super;
/* management data for collectives with no user data */
/** MCA parameter: control region size (bytes), per proc */
size_t basesmuma_ctl_size_per_proc;
/** MCA parameter: control region alignment */
size_t basesmuma_ctl_alignment;
/** MCA parameter: number of memory banks */
size_t basesmuma_num_mem_banks;
/** MCA parameter: number of regions per memory bank */
int basesmuma_num_regions_per_bank;
/** MCA parameter: Number of simultaneous groups supported */
int n_groups_supported;
/* management data for collectives with user data (ud) - the memory
* is actually obtained at the ML level
*/
/** MCA parameter: number of polling loops to run while waiting
* for children or parent to complete their work
*/
int n_poll_loops;
/* mpool size */
size_t mpool_size;
/* mpool initialized - used to test whether or not the
* shared memory has been initialized
*/
bool mpool_inited;
/* shared memory control buffer - the control structures reside
* in shared memory */
bcol_basesmuma_smcm_mmap_t *sm_ctl_structs;
/* shared memory payload buffer
*/
bcol_basesmuma_smcm_mmap_t *sm_payload_structs;
/*
* list of shared memory control structures
*/
opal_list_t ctl_structures;
/** opal list storing the peers this process is "connected" to
*/
opal_list_t sm_connections_list;
/* opal list storing the payload peers this process is "connected" to
*/
opal_list_t sm_payload_connections_list;
/*
* list of non-blocking admin barriers to progress */
opal_mutex_t nb_admin_barriers_mutex;
opal_list_t nb_admin_barriers;
/*
* order of fan-in tree
*/
int radix_fanin;
/*
* order of fan-out tree
*/
int radix_fanout;
/*
* Order of read tree
*/
int radix_read_tree;
/*
* order of reduction fan-out tree
*/
int order_reduction_tree;
/*
* K-nomial tree radix
*/
int k_nomial_radix;
/*
* K-ary scatter tree radix
*/
int scatter_kary_radix;
/*
* number of polling loops
*/
int num_to_probe;
/*
* Portals addressing info.
* Stored as void* to keep Portals library dependencies
* as local as possible.
*/
void *portals_info;
bool portals_init;
/*
* verbosity level
*/
int verbose;
/*
* control file name base string
*/
char *clt_base_fname;
/*
* data file name base string
*/
char *payload_base_fname;
/*
* shared memory scratch space. This is mapped at the end of the
* segment of memory holding the control structures.
*/
char *my_scratch_shared_memory;
/*
* size of scratch memory
*/
size_t my_scratch_shared_memory_size;
/* the offset will be the same for all ranks */
size_t scratch_offset_from_base_ctl_file;
};
static inline int mca_bcol_basesmuma_err(const char* fmt, ...)
{
va_list list;
int ret;
va_start(list, fmt);
ret = vfprintf(stderr, fmt, list);
va_end(list);
return ret;
}
#if OPAL_ENABLE_DEBUG
#define BASESMUMA_VERBOSE(level, args) \
do { \
if(mca_bcol_basesmuma_component.verbose >= level) { \
mca_bcol_basesmuma_err("[%s]%s[%s:%d:%s] BCOL-BASESMUMA ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_bcol_basesmuma_err args; \
mca_bcol_basesmuma_err("\n"); \
} \
} while(0)
#else
#define BASESMUMA_VERBOSE(level, args)
#endif
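/* Example use (the inner parentheses wrap the printf-style argument
* list consumed by the macro; the message below is illustrative):
*
*   BASESMUMA_VERBOSE(10, ("bank %d recycled", bank_index));
*/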
/**
* Convenience typedef */
typedef struct mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component_t;
#if 0
/*
* Implemented function index list
*/
/* barrier */
enum{
FANIN_FAN_OUT_BARRIER_FN,
RECURSIVE_DOUBLING_BARRIER_FN,
N_BARRIER_FNS
};
/* reduce */
enum{
FANIN_REDUCE_FN,
REDUCE_SCATTER_GATHER_FN,
N_REDUCE_FNS
};
enum{
SHORT_DATA_FN_REDUCE,
LONG_DATA_FN_REDUCE,
N_REDUCE_FNS_USED
};
/* all-reduce */
enum{
FANIN_FANOUT_ALLREDUCE_FN,
REDUCE_SCATTER_ALLGATHER_FN,
N_ALLREDUCE_FNS
};
enum{
SHORT_DATA_FN_ALLREDUCE,
LONG_DATA_FN_ALLREDUCE,
N_ALLREDUCE_FNS_USED
};
/* enum for node type */
enum{
ROOT_NODE,
LEAF_NODE,
INTERIOR_NODE
};
/*
* N-order tree node description
*/
struct tree_node_t {
/* my rank within the group */
int my_rank;
/* my node type - root, leaf, or interior */
int my_node_type;
/* number of nodes in the tree */
int tree_size;
/* number of parents (0/1) */
int n_parents;
/* number of children */
int n_children;
/* parent rank within the group */
int parent_rank;
/* children ranks within the group */
int *children_ranks;
};
typedef struct tree_node_t tree_node_t;
/*
* Pair-wise data exchange
*/
/* enum for node type */
enum{
EXCHANGE_NODE,
EXTRA_NODE
};
struct pair_exchange_node_t {
/* my rank within the group */
int my_rank;
/* number of nodes this node will exchange data with */
int n_exchanges;
/* ranks of nodes involved in data exchange */
int *rank_exchanges;
/* number of extra sources of data - outside largest power of 2 in
* this group */
int n_extra_sources;
/* rank of the extra source */
int rank_extra_source;
/* number of tags needed per stripe */
int n_tags;
/* log 2 of largest full power of 2 for this node set */
int log_2;
/* largest power of 2 that fits in this group */
int n_largest_pow_2;
/* node type */
int node_type;
};
typedef struct pair_exchange_node_t pair_exchange_node_t;
#endif
/*
* descriptor for managing the admin nonblocking barrier routine.
* This is an sm internal routine, and assumes only 1 outstanding
* nb-barrier collective call per block.
*/
/* forward declarations */
struct mca_bcol_basesmuma_module_t;
struct sm_buffer_mgmt;
struct sm_nbbar_desc_t {
/* make sure we can put this on a list */
opal_list_item_t super;
/* phase of the collective operation - needed to know how to continue
* progressing the nb-barrier */
int collective_phase;
/* iteration to continue at */
int recursive_dbl_iteration;
/* pointer to the collective module this is associated with */
struct mca_bcol_basesmuma_module_t *sm_module;
/* pointer to payload/control structs buffers */
struct sm_buffer_mgmt *coll_buff;
/* pool index */
int pool_index;
/* pointer to the ml_memory_block_desc_t structure
* that is actually managing this registration.
* This is meaningful when these control structures
* are used in conjunction with the user payload
* data that is allocated at the ml level.
*/
void *ml_memory_block_descriptor;
};
typedef struct sm_nbbar_desc_t sm_nbbar_desc_t;
/*
* Barrier request objects
*/
/* shared memory data structures */
struct mca_bcol_basesmuma_nb_request_process_shared_mem_t {
volatile uint64_t coll_index;
/* flag used to indicate the status of this memory region */
volatile uint64_t flag;
volatile uint64_t index;
/* padding */
/* Note: need to change this so it takes less memory */
char padding[CACHE_LINE_SIZE-3*sizeof(uint64_t)];
};
typedef struct mca_bcol_basesmuma_nb_request_process_shared_mem_t
mca_bcol_basesmuma_nb_request_process_shared_mem_t;
/* enum for the phase the nb barrier is in */
enum{
NB_BARRIER_INACTIVE,
/* fan-in/fan-out */
NB_BARRIER_FAN_IN,
NB_BARRIER_FAN_OUT,
/* recursive doubling */
NB_PRE_PHASE,
NB_RECURSIVE_DOUBLING,
NB_POST_PHASE,
/* done and not started are the same for all practical
* purposes, as the init function always sets this flag
*/
NB_BARRIER_DONE
};
/* forward declaration */
struct mca_bcol_basesmuma_module_t;
/* control segment for shared memory */
struct mca_bcol_basesmuma_ctl_struct_t {
/* collective identifier */
volatile int64_t sequence_number;
volatile int64_t flag;
volatile int64_t index;
volatile int64_t offset;
volatile int64_t offset_zip;
/* used for non-blocking algorithms */
int status;
int active_requests;
int iteration;
int *src_ptr;
int start;
/* process private data */
int starting_flag_value;
/* experiment for large data colls */
int n_sends;
int length;
#ifdef __PORTALS_AVAIL__
struct mca_bcol_basesmuma_portal_buf_addr_t portals_buf_addr;
#endif
/* padding */
/* ok, no room to pad anymore */
/*char padding[CACHE_LINE_SIZE-5*sizeof(int64_t)-8*sizeof(int)];*/
};
typedef struct mca_bcol_basesmuma_ctl_struct_t mca_bcol_basesmuma_ctl_struct_t;
#define SM_BCOLS_MAX 2
enum {
ALLGATHER_FLAG,
ALLREDUCE_FLAG,
BARRIER_FANIN_FLAG,
BARRIER_FANOUT_FLAG,
BARRIER_RKING_FLAG,
BCAST_FLAG,
GATHER_FLAG,
REDUCE_FLAG,
NUM_SIGNAL_FLAGS
};
/* control region for colls with user data - shared memory */
struct mca_bcol_basesmuma_header_t {
/* collective identifier */
volatile int64_t sequence_number;
volatile int8_t flags[NUM_SIGNAL_FLAGS][SM_BCOLS_MAX];
volatile int32_t src; /* src of bcast data for unknown root,
bcol id for known root
*/
/* starting flag - hierarchies */
int8_t starting_flag_value[SM_BCOLS_MAX];
};
typedef struct mca_bcol_basesmuma_header_t mca_bcol_basesmuma_header_t;
/* data needed for large messages */
struct mca_bcol_basesmuma_large_msg_t {
/* scatter allgather data */
uint64_t offset;
uint64_t n_sends;
uint64_t length;
/* portals data */
};
typedef struct mca_bcol_basesmuma_large_msg_t mca_bcol_basesmuma_large_msg_t;
/* payload struct */
struct mca_bcol_basesmuma_payload_t {
/* base pointer to shared memory control structure */
mca_bcol_basesmuma_header_t *ctl_struct;
void *payload;
};
typedef struct mca_bcol_basesmuma_payload_t mca_bcol_basesmuma_payload_t;
/* memory bank memory management structure */
struct mem_bank_management_t {
/* generation counter */
uint64_t bank_gen_counter;
/* descriptor for the non-blocking barrier. This is
* used to manage this bank of memory.
*/
sm_nbbar_desc_t nb_barrier_desc;
/* the number of buffers that are not in use and are
* available. Buffers are recycled all at once, so no
* buffer is re-used until every buffer in the bank has
* been freed.
*/
volatile int available_buffers;
/*
* number of buffers freed */
volatile int n_buffs_freed;
/* mutex to ensure atomic recycling of resources */
opal_mutex_t mutex;
/* number of buffers being managed */
int number_of_buffers;
/* shared memory control structures */
int index_shared_mem_ctl_structs;
};
typedef struct mem_bank_management_t mem_bank_management_t;
/* data structure for shared buffers */
struct sm_buffer_mgmt {
/* number of buffers per process */
int number_of_buffs;
/* size of group */
int size_of_group;
/* number of memory banks */
int num_mem_banks;
/* number of buffers per memory bank */
int num_buffs_per_mem_bank;
/* log base 2 of num_buffs_per_mem_bank */
int log2_num_buffs_per_mem_bank;
/* log base 2 total number of buffers */
int log2_number_of_buffs;
/* mask - masks off the bits corresponding to buffer index */
int mask;
/* control buffers - these point to regions in shared memory */
/* leading dimension is the group size - all pointers for a given
* set of buffers appear consecutively in this array
*/
volatile void **ctl_buffs;
/* management data for the control structures -
* one per bank of control structures - Will be used for
* the payload buffers as well.
*/
mem_bank_management_t *ctl_buffs_mgmt;
/* data buffers - these point to regions in shared memory */
/* leading dimension is the group size - all pointers for a given
* set of buffers appear consecutively in this array
*/
volatile mca_bcol_basesmuma_payload_t *data_buffs;
};
typedef struct sm_buffer_mgmt sm_buffer_mgmt;
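/* A minimal sketch (assuming buffer ids are assigned consecutively and
* num_buffs_per_mem_bank is a power of two) of how the mask and log2
* fields recover the bank and buffer indices from a buffer id; the
* variable names buffs and buff_id are for illustration only.
*/
#if 0
int buff_index = buff_id & buffs->mask;  /* index within all buffers */
int bank_index = buff_index >> buffs->log2_num_buffs_per_mem_bank;
#endif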
struct mca_bcol_basesmuma_nb_coll_buff_desc_t {
void *data_addr;
uint64_t bank_index;
uint64_t buffer_index;
int active_requests;
ompi_request_t **requests;
int data_src;
int radix_mask;
int radix_mask_pow;
int iteration;
int status;
/* this is for testing */
int tag;
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer;
volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer;
};
typedef struct mca_bcol_basesmuma_nb_coll_buff_desc_t mca_bcol_basesmuma_nb_coll_buff_desc_t;
struct mca_bcol_basesmuma_local_mlmem_desc_t {
uint32_t bank_index_for_release;
struct ml_memory_block_desc_t *ml_mem_desc;
uint32_t num_banks;
uint32_t num_buffers_per_bank;
uint32_t size_buffer;
uint32_t *bank_release_counter;
/*
* Number of descriptors allocated is equivalent to number of ml buffers
* (number of banks * number of buffers per bank)
*/
mca_bcol_basesmuma_nb_coll_buff_desc_t *nb_coll_desc;
};
typedef struct mca_bcol_basesmuma_local_mlmem_desc_t mca_bcol_basesmuma_local_mlmem_desc_t;
#ifdef __PORTALS_AVAIL__
#define MAX_SM_GROUP_SIZE 32
struct portals_scatter_allgather_nb_bcast_state_t
{
/* local variables */
uint64_t length;
int my_rank, src, matched;
int src_list[MAX_SM_GROUP_SIZE];
int group_size;
int64_t ready_flag;
int pow_2, pow_2_levels;
int src_list_index;
uint64_t fragment_size; /* user buffer size */
/* Input argument variables */
void *my_userbuf;
int64_t sequence_number;
/* Extra source variables */
bool secondary_root;
int partner, extra_partner;
/* Scatter Allgather offsets */
uint64_t local_sg_offset, global_sg_offset, partner_offset;
/* Portals messaging relevant variables */
/*
* ptl_handle_eq_t allgather_eq_h;
*/
ptl_handle_eq_t read_eq;
ptl_event_t allgather_event;
bool msg_posted;
/* OMPI module and component variables */
mca_bcol_basesmuma_component_t *cs;
struct mca_bcol_basesmuma_module_t *bcol_module;
/* Control structure and payload variables */
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */
volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */
int phase;
};
typedef struct portals_scatter_allgather_nb_bcast_state_t sg_state_t;
#endif
#define SM_ARRAY_INDEX(LEAD_DIM,BUF_INDEX,PROC_INDEX) \
((LEAD_DIM)*(BUF_INDEX)+(PROC_INDEX))
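/* A minimal sketch of how SM_ARRAY_INDEX addresses the flat pointer
* arrays in sm_buffer_mgmt: the group size is the leading dimension,
* so all pointers for one buffer set are consecutive. Names other
* than the struct fields are for illustration only.
*/
#if 0
int idx = SM_ARRAY_INDEX(buffs->size_of_group, buf_index, peer_rank);
volatile void *peer_ctl = buffs->ctl_buffs[idx];
#endif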
/* debug */
#define BARRIER_BANK_LIST_SIZE 32
/* end debug */
struct mca_bcol_basesmuma_module_t {
/* base structure */
mca_bcol_base_module_t super;
/* free list item with the control structures used for
* the no user data collective operations
*/
list_data_t *no_userdata_ctl;
/* free list item with the control structures used for
* the with user data collective operations
*/
list_data_t *userdata_ctl;
/*
* information on sm control backing files for the subgroup
* associated with this module.
*/
bcol_basesmuma_smcm_proc_item_t **ctl_backing_files_info;
/*
* information on sm payload backing files for the subgroup
* associated with this module.
*/
bcol_basesmuma_smcm_proc_item_t **payload_backing_files_info;
/*
* buffers for the collective that do not involve user data -
* barrier, fanin, fanout.
*/
sm_buffer_mgmt colls_no_user_data;
/*
* buffers for the collective with user data.
*/
sm_buffer_mgmt colls_with_user_data;
/* recursive-doubling tree node */
netpatterns_pair_exchange_node_t recursive_doubling_tree;
/* k-nomial gather/allgather tree */
netpatterns_k_exchange_node_t knomial_allgather_tree;
/* fanin tree node - root is rank 0 */
netpatterns_tree_node_t fanin_node;
/* fanout tree node - root is rank 0 */
netpatterns_tree_node_t fanout_node;
/* index of blocking barrier memory region to use */
int index_blocking_barrier_memory_bank;
/* comm to shared memory map */
int *comm_to_sm_map;
/* reduction fanout tree */
netpatterns_tree_node_t* reduction_tree;
/* broadcast fanout tree */
netpatterns_tree_node_t* fanout_read_tree;
/* scatter - k-ary tree */
int scatter_kary_radix;
netpatterns_tree_node_t *scatter_kary_tree;
/* Knomial exchange tree */
/* Currently used only for large message reduce */
netpatterns_k_exchange_node_t knomial_exchange_tree;
/* sequence number offset - ensures that we start
* id'ing collectives with id 0, so we can have simple
* resource management.
*/
int64_t squence_number_offset;
/* basesmuma-specific header size within the ml buffer,
* calculated at the ml level: the sum of the headers of
* all bcols, aligned to whatever alignment was requested
*/
uint32_t total_header_size;
/* list of possible sources */
int *src_list;
/* Number of possible sources */
int src_size;
/* log base k of the largest power of k that is less
* than or equal in size to the uma group
*/
int pow_k_levels;
/* size of power-of-k group */
int pow_k;
/* log base 2 of the largest power of 2 that is less
* than or equal to the smuma group size
*/
int pow_2_levels;
/* size of power-of-2 group */
int pow_2;
/* pointer to the shared memory scratch array of each
* process in the group.
*/
void **shared_memory_scratch_space;
/*
* Caching information for re-entrant collectives
*/
mca_bcol_basesmuma_local_mlmem_desc_t ml_mem;
/*
* Cached offsets for lmsg reduce
*/
int **reduce_offsets;
/*XXX:
* Starting to explore the beauty of zero-copy for large messages
*/
struct mca_hdl_base_module_t **hdl_module;
#ifdef __PORTALS_AVAIL__
/*
* Store state for NB blocking functions
*/
sg_state_t sg_state;
#endif
};
typedef struct mca_bcol_basesmuma_module_t mca_bcol_basesmuma_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_basesmuma_module_t);
/* shared memory specific arguments for the bcol registration function */
typedef struct bcol_basesmuma_registration_data_t {
char *file_name; /* filename for payload */
void *base_addr; /* base address to be mapped */
size_t size; /* size of memory block to be "registered" */
size_t size_ctl_structure;
size_t data_seg_alignment;
bcol_basesmuma_smcm_mmap_t *sm_mmap; /* shared memory map struct */
mca_coll_ml_release_buff_fn_t buff_release_cb; /* buffer release
callback */
} bcol_basesmuma_registration_data_t;
/* enum for the signaling flag bank; when
* adding to this list, please keep
* it alphabetical
*/
/*
enum {
ALLGATHER_FLAG,
ALLREDUCE_FLAG,
BARRIER_FANIN_FLAG,
BARRIER_FANOUT_FLAG,
BARRIER_RKING_FLAG,
BCAST_FLAG,
GATHER_FLAG,
SCATTER_FLAG,
NUM_SIGNAL_FLAGS
};
*/
enum {
BUFFER_AVAILABLE,
STARTED,
FANIN,
FANOUT
};
/* enum used for non-blocking large
* message bcast
*/
enum {
INIT,
START,
NOT_STARTED,
SCATTER,
ALLGATHER,
EXTRA_RANK,
PROBE,
SCATTER_ROOT_WAIT,
SCATTER_EXTRA_ROOT_WAIT,
SCATTER_PARENT_WAIT,
FINISHED
};
/**
* Global component instance
*/
OMPI_MODULE_DECLSPEC extern mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component;
/*
* coll module functions
*/
/* query to see if the component is available for use, and can
* satisfy the thread and progress requirements
*/
int mca_bcol_basesmuma_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
/* query to see if the module is available for use on the given
* communicator, and if so, what its priority is.
*/
mca_bcol_base_module_t **
mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules);
/* shared memory specific memory registration function - this will be passed into the mpool */
int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size,
void **reg);
/* shared memory specific memory deregistration function - also needed by the mpool */
int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg);
/* setup the new k_nomial tree for collectives */
int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super);
/* allocate the memory pool for the shared memory control structures */
int mca_bcol_basesmuma_allocate_pool_memory(mca_bcol_basesmuma_component_t
*component);
/* initialize the internal scratch buffers and control structs that will be
used by the module */
int base_bcol_basesmuma_setup_library_buffers(
mca_bcol_basesmuma_module_t *sm_module,
mca_bcol_basesmuma_component_t *cs);
/* shared memory recursive doubling initialization */
int bcol_basesmuma_rd_barrier_init(mca_bcol_base_module_t *module);
/* shared memory recursive doubling barrier */
int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* shared memory fanin */
int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super);
/* shared memory fanout */
int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super);
/* shared memory recursive k-ing non-blocking barrier */
int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super);
/* Shared memory broadcast */
int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super);
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* Shared memory non-blocking broadcast */
int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* Shared memory non-blocking broadcast - Large message anyroot */
int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
#if 0
/*FIXME: having fun here*/
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
#endif
int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/*
* shared memory scatter
*/
int bcol_basesmuma_scatter_init(mca_bcol_base_module_t *super);
/* shared memory nonblocking scatter - known root */
int bcol_basesmuma_nb_scatter_k_array_knownroot(
bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* shared memory non-blocking k-nomial barrier init */
int bcol_basesmuma_k_nomial_barrier_init(bcol_function_args_t *input_args,
struct coll_ml_function_t *const_args);
/* shared memory non-blocking k-nomial barrier progress */
int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args,
struct coll_ml_function_t *const_args);
/*shared memory non-blocking k-nomial allgather init */
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
struct coll_ml_function_t *const_args);
/* shared memory non-blocking k-nomial allgather progress */
int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args,
struct coll_ml_function_t *const_args);
/* shared memory allgather -- selection logic api */
int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super);
/* shared memory blocking k-nomial gather */
int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* shared memory non blocking k-nomial gather */
int bcol_basesmuma_k_nomial_gather_init(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* shared memory non blocking k-nomial gather progress*/
int bcol_basesmuma_k_nomial_gather_progress(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* shared memory gather initialization */
int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super);
/* allocate shared memory control memory */
int mca_bcol_basesmuma_allocate_sm_ctl_memory(
mca_bcol_basesmuma_component_t *cs);
/* Shared memory basesmuma reduce */
int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super);
int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
int bcol_basesmuma_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf,
int count, struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
/* Shared memory basesmuma allreduce */
int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super);
int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args,
coll_ml_function_t *c_input_args);
/* initialize non-blocking barrier for recycling the memory buffers.
* This is not a general purpose nb_barrier, and relies on the
* fact that we will have only one outstanding nb-barrier per bank
* at a time.
*/
int bcol_basesmuma_rd_nb_barrier_init_admin(sm_nbbar_desc_t *sm_desc);
/* admin nonblocking barrier - progress function */
int bcol_basesmuma_rd_nb_barrier_progress_admin(sm_nbbar_desc_t *sm_desc);
/* Memory synchronization registration function */
int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super);
/* smcm allgather function used to exchange file offsets. */
int bcol_basesmuma_smcm_allgather_connection(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_sbgp_base_module_t *module,
opal_list_t *peer_list,
bcol_basesmuma_smcm_proc_item_t ***backing_files,
ompi_communicator_t *comm,
bcol_basesmuma_smcm_file_t input, char *base_fname,
bool map_all);
/*
* this function initializes the internal scratch buffers and control
* structures that will be used by the module
*/
int base_bcol_basesmuma_setup_library_buffers(
mca_bcol_basesmuma_module_t *sm_bcol_module,
mca_bcol_basesmuma_component_t *sm_bcol_component);
/* get the index of the shared memory buffer to be used */
int bcol_basesmuma_get_buff_index( sm_buffer_mgmt * buff_block,
uint64_t buff_id );
int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block,
uint64_t buff_id );
/* This function does bcol_basesmuma specific memory registration and
issues a callback for ml-level bank recycling
*/
int bcol_basesmuma_bank_init(struct mca_coll_ml_module_t *ml_module,
mca_bcol_base_module_t *bcol_module,
void *reg_data);
/* bank init which is used for shared memory optimization, fall back to
* the bank init above if this causes problems
*/
int bcol_basesmuma_bank_init_opti(struct mca_coll_ml_module_t *ml_module,
mca_bcol_base_module_t *bcol_module,
void *reg_data);
/* used for shared memory offset exchange */
int base_bcol_basesmuma_exchange_offsets(
mca_bcol_basesmuma_module_t *sm_bcol_module,
void **result_array, uint64_t mem_offset, int loop_limit,
int leading_dim);
/* the progress function to be called from the opal progress function
*/
int bcol_basesmuma_progress(void);
/* Macro for initializing my shared memory control structure */
#define BASESMUMA_HEADER_INIT(my_ctl_pointer,ready_flag, sequence_number, bcol_id) \
do{ \
int i,j; \
int8_t flag_offset = 0; \
/* setup resource recycling */ \
if( my_ctl_pointer->sequence_number < sequence_number ) { \
/* Signal arrival */ \
for( j = 0; j < SM_BCOLS_MAX; j++){ \
my_ctl_pointer->starting_flag_value[j]=0; \
for( i = 0; i < NUM_SIGNAL_FLAGS; i++){ \
my_ctl_pointer->flags[i][j] = -1; \
} \
} \
} \
/* increment the starting flag by one and return */ \
flag_offset = my_ctl_pointer->starting_flag_value[bcol_id]; \
ready_flag = flag_offset + 1; \
MB(); \
my_ctl_pointer->sequence_number = sequence_number; \
}while(0)
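/* A minimal sketch of typical use at the start of a collective;
* my_ctl_pointer, sequence_number and bcol_id are assumed to come
* from the calling function, and the names are illustrative only.
*/
#if 0
int8_t ready_flag;
BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);
/* ready_flag now holds starting_flag_value[bcol_id] + 1 and the
* header carries this collective's sequence number */
#endif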
/* these are all the same; a single macro is used for all collectives */
#define IS_PEER_READY(peer, my_flag, my_sequence_number,flag_index, bcol_id)\
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[flag_index][bcol_id] >= (my_flag))? true : false )
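/* A minimal sketch of the bounded busy-wait this macro is used in;
* peer_ctl_pointer, ready_flag, sequence_number and bcol_id are set up
* as in the header-init sketch above, and BCAST_FLAG stands in for
* whichever signal flag the collective polls.
*/
#if 0
bool ready = false;
int i;
for (i = 0; i < mca_bcol_basesmuma_component.num_to_probe && !ready; i++) {
    ready = IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number,
                          BCAST_FLAG, bcol_id);
}
#endif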
#if 0
#define IS_AR_DATA_READY(peer, my_flag, my_sequence_number)\
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[ALLREDUCE_FLAG][bcol_id] >= (my_flag) \
)? true : false )
#define IS_GDATA_READY(peer, my_flag, my_sequence_number)\
(((peer)->sequence_number == (my_sequence_number) && \
(peer)->flags[GATHER_FLAG][bcol_id] == (my_flag) \
)? true : false )
#define IS_PEER_READY(peer, my_flag, flag_index, my_sequence_number)\
((((volatile int64_t)(peer)->sequence_number > (my_sequence_number)) || \
(((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \
((peer)->flags[flag_index][bcol_id] == (my_flag))) \
)? true : false )
#define IS_ALLREDUCE_PEER_READY(peer, my_flag, my_sequence_number)\
((((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \
(((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag))||((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag) + 1)) \
)? true : false )
#endif
#define IS_LAST_BCOL_FUNC(ml_args) \
((((ml_args)->n_of_this_type_in_collective == \
(ml_args)->index_of_this_type_in_collective + 1 ) )? true : false)
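/* Round the bcol header size up to the next multiple of BCOL_HEAD_ALIGN;
* e.g. a 9-byte header with an 8-byte alignment yields an offset of 16. */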
static inline __opal_attribute_always_inline__
size_t bcol_basesmuma_data_offset_calc(
mca_bcol_basesmuma_module_t *basesmuma_module)
{
uint32_t offset = basesmuma_module->super.header_size;
offset = ((offset + BCOL_HEAD_ALIGN - 1) / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN;
return (size_t) offset;
}
END_C_DECLS
#endif /* MCA_BCOL_basesmuma_EXPORT_H */