53a3defde9
name clash on some BSDs. cmr=v1.7.4:reviewer=pasha This commit was SVN r30230.
1225 строки
36 KiB
C
1225 строки
36 KiB
C
/*
 * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
 * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef MCA_BCOL_basesmuma_EXPORT_H
#define MCA_BCOL_basesmuma_EXPORT_H

#include "ompi_config.h"

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>

#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/coll/ml/coll_ml.h"
#include "ompi/mca/coll/ml/coll_ml_allocation.h"
#include "ompi/request/request.h"
#include "ompi/proc/proc.h"
#include "ompi/patterns/net/netpatterns.h"

#include "opal/mca/mca.h"
#include "opal/util/arch.h"
#include "opal/util/argv.h"
#include "opal/datatype/opal_datatype.h"
#include "opal/util/output.h"

#include "bcol_basesmuma_smcm.h"

BEGIN_C_DECLS

struct list_data_t {
|
|
opal_list_item_t super;
|
|
void *data;
|
|
};
|
|
typedef struct list_data_t list_data_t;
|
|
OBJ_CLASS_DECLARATION(list_data_t);
|
|
|
|
/*
 * Macros for manipulating the 64 bit shared memory control bits.
 * The 64 bit field is divided into four 16 bit fields
 *
 * | 48-63: src | 32-47: index | 16-31: flag | 0-15: sequence number |
 *
 * Only the low 16 bits of the sequence number will be put in the header
 * space. We will use the fact that the use of the shared buffers is
 * synchronous, and get the upper 48 bits from the local process space.
 */

/* Cache line size, in bytes, used when padding shared-memory structures. */
#define BASESMUMA_CACHE_LINE_SIZE 128

/* Spelled-out shift operators used by the field macros. */
#define SHIFT_UP <<
#define SHIFT_DOWN >>

/*
 * For each 16-bit field of the 64-bit control word:
 *   xxx_WIDTH        - number of bits in the field
 *   xxx_BASE         - bit position of the field's least significant bit
 *   FIELD_xxx_MASK   - mask for the field value once shifted down to bit 0
 *   INPLACE_xxx_MASK - mask for the field at its position in the word
 *
 * The in-place masks keep their int64_t type, but the shift itself is done
 * on a uint64_t: shifting the signed value 0xFFFF up by 48 bits would
 * overflow int64_t, which is undefined behavior in C.
 */

/* sequence number: bits 0-15 */
#define SEQ_WIDTH 16
#define SEQ_BASE 0
#define FIELD_SEQ_MASK ( ( 1 SHIFT_UP SEQ_WIDTH ) - 1 )
#define INPLACE_SEQ_MASK ( (int64_t)( (uint64_t)FIELD_SEQ_MASK SHIFT_UP SEQ_BASE ) )

/* flag: bits 16-31 */
#define FLAG_WIDTH 16
#define FLAG_BASE 16
#define FIELD_FLAG_MASK ( ( 1 SHIFT_UP FLAG_WIDTH ) - 1 )
#define INPLACE_FLAG_MASK ( (int64_t)( (uint64_t)FIELD_FLAG_MASK SHIFT_UP FLAG_BASE ) )

/* index: bits 32-47 */
#define INDX_WIDTH 16
#define INDX_BASE 32
#define FIELD_INDX_MASK ( ( 1 SHIFT_UP INDX_WIDTH ) - 1 )
#define INPLACE_INDX_MASK ( (int64_t)( (uint64_t)FIELD_INDX_MASK SHIFT_UP INDX_BASE ) )

/* src: bits 48-63 */
#define SRC_WIDTH 16
#define SRC_BASE 48
#define FIELD_SRC_MASK ( ( 1 SHIFT_UP SRC_WIDTH ) - 1 )
#define INPLACE_SRC_MASK ( (int64_t)( (uint64_t)FIELD_SRC_MASK SHIFT_UP SRC_BASE ) )

/*
 * Read one bit field out of a packed control word.
 *
 *   INPUT       - packed word to read from
 *   OUTPUT      - lvalue that receives the field value
 *   OUTPUT_TYPE - type the extracted value is cast to
 *   FIELD_BASE  - bit position of the field's least significant bit
 *   FIELD_MASK  - mask of the field once it has been shifted down to bit 0
 *
 * Every argument is parenthesized so that compound expressions (for
 * example "a | b" as INPUT) are shifted and masked as a whole.
 */
#define EXTRACT_FLAG(INPUT, OUTPUT, OUTPUT_TYPE, FIELD_BASE, FIELD_MASK) \
    (OUTPUT) = (OUTPUT_TYPE) ( ( (INPUT) >> (FIELD_BASE) ) & (FIELD_MASK) )

/*
 * Merge one field value into a packed control word.
 *
 *   INPUT              - value to store
 *   OUTPUT             - lvalue holding the packed word; updated in place
 *   INPUT_TYPE         - type INPUT is first cast to
 *   OUTPUT_TYPE        - type of OUTPUT
 *   FIELD_BASE         - bit position of the field's least significant bit
 *   INPLACE_FIELD_MASK - mask of the field at its position in the word
 *
 * The value is shifted to the field position, bits outside the field are
 * masked off, and the result is OR-ed into OUTPUT so the other fields are
 * left untouched.  The destination field is NOT cleared first: storing
 * over a non-zero field yields the bitwise OR of the old and new values.
 *
 * The shift is done on a uint64_t so that a value whose top bit lands in
 * bit 63 (the src field) does not overflow a signed type.
 */
#define STORE_FLAG(INPUT, OUTPUT, INPUT_TYPE, OUTPUT_TYPE, FIELD_BASE, INPLACE_FIELD_MASK ) \
    (OUTPUT) = (OUTPUT_TYPE)                                                   \
        ( (uint64_t)(OUTPUT)                                                   \
          | ( ( (uint64_t)((INPUT_TYPE)(INPUT)) << (FIELD_BASE) )              \
              & (uint64_t)(INPLACE_FIELD_MASK) ) )

/**
|
|
* Structure to hold the basic shared memory bcoll component.
|
|
*/
|
|
struct mca_bcol_basesmuma_component_t {
|
|
/** Base coll component */
|
|
mca_bcol_base_component_2_0_0_t super;
|
|
|
|
/* management data for collectives with no user data */
|
|
|
|
/** MCA parameter: number of memory banks */
|
|
int basesmuma_num_mem_banks;
|
|
|
|
/** MCA parameter: number of regions per memory bank */
|
|
int basesmuma_num_regions_per_bank;
|
|
|
|
/** MCA parameter: Number of simultaneous groups supported */
|
|
int n_groups_supported;
|
|
|
|
/* management data for collectives with user data (ud) - the memory
|
|
* is actually obtained at the ML level
|
|
*/
|
|
|
|
/** MCA paramenter: number of polling loops to run while waiting
|
|
* for children or parent to complete their work
|
|
*/
|
|
int n_poll_loops;
|
|
|
|
/* mpool size */
|
|
size_t mpool_size;
|
|
|
|
|
|
/* mpool inited - will use this to test whether or not the
|
|
* shared memory has been inited
|
|
*/
|
|
bool mpool_inited;
|
|
|
|
/* shared memory control buffer - the control structures reside
|
|
* in shared memory */
|
|
bcol_basesmuma_smcm_mmap_t *sm_ctl_structs;
|
|
|
|
/* shared memory payload buffer
|
|
*/
|
|
bcol_basesmuma_smcm_mmap_t *sm_payload_structs;
|
|
|
|
/*
|
|
* list of shared memory control structures
|
|
*/
|
|
opal_list_t ctl_structures;
|
|
|
|
|
|
/** opal list in which the list of peers that I am "connected" to is stored
|
|
*/
|
|
opal_list_t sm_connections_list;
|
|
|
|
/* opal list in which the list of payload peers that I am "connected" to
|
|
* is stored
|
|
*/
|
|
opal_list_t sm_payload_connections_list;
|
|
|
|
/*
|
|
* list of non-blocking admin barriers to progress */
|
|
opal_mutex_t nb_admin_barriers_mutex;
|
|
opal_list_t nb_admin_barriers;
|
|
|
|
/*
|
|
* order of fan-in tree
|
|
*/
|
|
int radix_fanin;
|
|
|
|
/*
|
|
* order of fan-out tree
|
|
*/
|
|
int radix_fanout;
|
|
|
|
/*
|
|
* Order of read tree
|
|
*/
|
|
int radix_read_tree;
|
|
|
|
/*
|
|
* order of reduction fan-out tree
|
|
*/
|
|
int order_reduction_tree;
|
|
|
|
/*
|
|
* K-nomial tree radix
|
|
*/
|
|
int k_nomial_radix;
|
|
|
|
/*
|
|
* K-ary scatter tree radix
|
|
*/
|
|
int scatter_kary_radix;
|
|
|
|
/*
|
|
* number of polling loops
|
|
*/
|
|
int num_to_probe;
|
|
|
|
/*
|
|
* Portals addressing info
|
|
* void*: because wanted to keep portal library dependencies
|
|
* as local as possible
|
|
*/
|
|
void *portals_info;
|
|
bool portals_init;
|
|
|
|
/*
|
|
* verbosity level
|
|
*/
|
|
int verbose;
|
|
|
|
/*
|
|
* control file name base string
|
|
*/
|
|
char *clt_base_fname;
|
|
|
|
/*
|
|
* data file name base string
|
|
*/
|
|
char *payload_base_fname;
|
|
|
|
/*
|
|
* shared memory scratch space. This is mapped at the end of the
|
|
* segement of memory holding the control structures.
|
|
*/
|
|
char *my_scratch_shared_memory;
|
|
|
|
/*
|
|
* size of scratch memory
|
|
*/
|
|
size_t my_scratch_shared_memory_size;
|
|
|
|
/* the offset will be the same for all ranks */
|
|
size_t scratch_offset_from_base_ctl_file;
|
|
};
|
|
|
|
/*
 * printf-style helper that writes a formatted message to stderr.
 *
 * fmt  printf format string, followed by the arguments it consumes.
 *
 * Returns whatever vfprintf() returns: the number of characters written,
 * or a negative value on output error.
 */
static inline int mca_bcol_basesmuma_err(const char* fmt, ...)
{
    va_list ap;
    int n_written;

    va_start(ap, fmt);
    n_written = vfprintf(stderr, fmt, ap);
    va_end(ap);

    return n_written;
}

/*
 * Debug trace macro.
 *
 *   level - message is printed when the component's verbose setting is
 *           greater than or equal to this value
 *   args  - parenthesized printf-style argument list, e.g.
 *           BASESMUMA_VERBOSE(10, ("value is %d", v));
 *
 * Each message is prefixed with the node name, process name, file, line
 * and function, and is terminated with a newline.  When OPAL_ENABLE_DEBUG
 * is off the macro expands to nothing.
 */
#if OPAL_ENABLE_DEBUG
#define BASESMUMA_VERBOSE(level, args)                                  \
    do {                                                                \
        if(mca_bcol_basesmuma_component.verbose >= (level)) {           \
            mca_bcol_basesmuma_err("[%s]%s[%s:%d:%s] BCOL-BASESMUMA ",  \
                    ompi_process_info.nodename,                         \
                    OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),                 \
                    __FILE__, __LINE__, __func__);                      \
            mca_bcol_basesmuma_err args;                                \
            mca_bcol_basesmuma_err("\n");                               \
        }                                                               \
    } while(0)
#else
#define BASESMUMA_VERBOSE(level, args)
#endif

/**
 * Convenience typedef so the component structure can be named without
 * the "struct" keyword.
 */
typedef struct mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component_t;

#if 0
|
|
/*
|
|
* Implemented function index list
|
|
*/
|
|
|
|
/* barrier */
|
|
enum{
|
|
FANIN_FAN_OUT_BARRIER_FN,
|
|
RECURSIVE_DOUBLING_BARRIER_FN,
|
|
N_BARRIER_FNS
|
|
};
|
|
|
|
/* reduce */
|
|
enum{
|
|
FANIN_REDUCE_FN,
|
|
REDUCE_SCATTER_GATHER_FN,
|
|
N_REDUCE_FNS
|
|
};
|
|
enum{
|
|
SHORT_DATA_FN_REDUCE,
|
|
LONG_DATA_FN_REDUCE,
|
|
N_REDUCE_FNS_USED
|
|
};
|
|
|
|
/* all-reduce */
|
|
enum{
|
|
FANIN_FANOUT_ALLREDUCE_FN,
|
|
REDUCE_SCATTER_ALLGATHER_FN,
|
|
N_ALLREDUCE_FNS
|
|
};
|
|
enum{
|
|
SHORT_DATA_FN_ALLREDUCE,
|
|
LONG_DATA_FN_ALLREDUCE,
|
|
N_ALLREDUCE_FNS_USED
|
|
};
|
|
|
|
|
|
/* enum for node type */
|
|
enum{
|
|
ROOT_NODE,
|
|
LEAF_NODE,
|
|
INTERIOR_NODE
|
|
};
|
|
|
|
|
|
/*
|
|
* N-order tree node description
|
|
*/
|
|
struct tree_node_t {
|
|
/* my rank within the group */
|
|
int my_rank;
|
|
/* my node type - root, leaf, or interior */
|
|
int my_node_type;
|
|
/* number of nodes in the tree */
|
|
int tree_size;
|
|
/* number of parents (0/1) */
|
|
int n_parents;
|
|
/* number of children */
|
|
int n_children;
|
|
/* parent rank within the group */
|
|
int parent_rank;
|
|
/* children ranks within the group */
|
|
int *children_ranks;
|
|
};
|
|
typedef struct tree_node_t tree_node_t;
|
|
|
|
/*
|
|
* Pair-wise data exchange
|
|
*/
|
|
/* enum for node type */
|
|
enum{
|
|
EXCHANGE_NODE,
|
|
EXTRA_NODE
|
|
};
|
|
|
|
struct pair_exchange_node_t {
|
|
|
|
/* my rank within the group */
|
|
int my_rank;
|
|
|
|
/* number of nodes this node will exchange data with */
|
|
int n_exchanges;
|
|
|
|
/* ranks of nodes involved in data exchange */
|
|
int *rank_exchanges;
|
|
|
|
/* number of extra sources of data - outside largest power of 2 in
|
|
* this group */
|
|
int n_extra_sources;
|
|
|
|
/* rank of the extra source */
|
|
int rank_extra_source;
|
|
|
|
/* number of tags needed per stripe */
|
|
int n_tags;
|
|
|
|
/* log 2 of largest full power of 2 for this node set */
|
|
int log_2;
|
|
|
|
/* largest power of 2 that fits in this group */
|
|
int n_largest_pow_2;
|
|
|
|
/* node type */
|
|
int node_type;
|
|
|
|
};
|
|
typedef struct pair_exchange_node_t pair_exchange_node_t;
|
|
#endif
|
|
/*
|
|
* descriptor for managing the admin nonblocking barrier routine.
|
|
* This is an sm internal routine, and assumes only 1 outstanding
|
|
* nb-barrier collective call per block.
|
|
*/
|
|
/* forward declarations */
|
|
struct mca_bcol_basesmuma_module_t;
|
|
struct sm_buffer_mgmt;
|
|
|
|
struct sm_nbbar_desc_t {
|
|
/* make sure we can put this on a list */
|
|
opal_list_item_t super;
|
|
|
|
/* phase of the collective operation - needed to know how to continue
|
|
* progressing the nb-barrier */
|
|
int collective_phase;
|
|
|
|
/* iteration to continue at */
|
|
int recursive_dbl_iteration;
|
|
|
|
/* pointer to the collective module this is associated with */
|
|
struct mca_bcol_basesmuma_module_t *sm_module;
|
|
|
|
/* pointer to payload/control structs buffers */
|
|
struct sm_buffer_mgmt *coll_buff;
|
|
|
|
/* pool index */
|
|
int pool_index;
|
|
|
|
/* pointer to the ml_memory_block_desc_t structure
|
|
* that is actually managing this registration.
|
|
* This is meaningful when these control structures
|
|
* are used in conjunction with the user payload
|
|
* data that is allocated at the ml level.
|
|
*/
|
|
void *ml_memory_block_descriptor;
|
|
|
|
};
|
|
typedef struct sm_nbbar_desc_t sm_nbbar_desc_t;
|
|
|
|
/*
|
|
* Barrier request objects
|
|
*/
|
|
|
|
/* shared memory data strucutures */
|
|
struct mca_bcol_basesmuma_nb_request_process_shared_mem_t {
|
|
volatile uint64_t coll_index;
|
|
/* flag used to indicate the status of this memory region */
|
|
volatile uint64_t flag;
|
|
volatile uint64_t index;
|
|
|
|
/* pading */
|
|
/* Note: need to change this so it takes less memory */
|
|
char padding[BASESMUMA_CACHE_LINE_SIZE-3*sizeof(uint64_t)];
|
|
};
|
|
|
|
typedef struct mca_bcol_basesmuma_nb_request_process_shared_mem_t
|
|
mca_bcol_basesmuma_nb_request_process_shared_mem_t;
|
|
|
|
/* Phase the nb barrier is in.  Values are consecutive, starting at 0. */
enum{
    NB_BARRIER_INACTIVE,

    /* fan-in/fan-out */
    NB_BARRIER_FAN_IN,
    NB_BARRIER_FAN_OUT,

    /* recursive doubling */
    NB_PRE_PHASE,
    NB_RECURSIVE_DOUBLING,
    NB_POST_PHASE,

    /* done and not started are the same for all practical
     * purposes, as the init function always sets this flag
     */
    NB_BARRIER_DONE
};

/* forward declaration */
struct mca_bcol_basesmuma_module_t;

/* control segment for shared memory */
struct mca_bcol_basesmuma_ctl_struct_t {
    /* collective identifier */
    volatile int64_t sequence_number;
    volatile int64_t flag;
    volatile int64_t index;
    volatile int64_t offset;
    volatile int64_t offset_zip;

    /* used for non-blocking algorithms */
    int status;
    int active_requests;
    int iteration;

    int *src_ptr;

    int start;

    /* process private data */
    int starting_flag_value;

    /* experiment for large data colls */
    int n_sends;
    int length;

#ifdef __PORTALS_AVAIL__
    struct mca_bcol_basesmuma_portal_buf_addr_t portals_buf_addr;
#endif
    /* no explicit padding: the structure is not padded out to
     * BASESMUMA_CACHE_LINE_SIZE ("no room to pad anymore") */
};
typedef struct mca_bcol_basesmuma_ctl_struct_t mca_bcol_basesmuma_ctl_struct_t;

/* Second dimension of the per-header flag arrays (one slot per bcol). */
#define SM_BCOLS_MAX 2

/* Index of each collective's signalling flag within a header's flag
 * bank; NUM_SIGNAL_FLAGS is the number of flags. */
enum {
    ALLGATHER_FLAG,
    ALLREDUCE_FLAG,
    BARRIER_FANIN_FLAG,
    BARRIER_FANOUT_FLAG,
    BARRIER_RKING_FLAG,
    BCAST_FLAG,
    GATHER_FLAG,
    REDUCE_FLAG,
    NUM_SIGNAL_FLAGS
};

/* control region for colls with user data - shared memory */
|
|
struct mca_bcol_basesmuma_header_t {
|
|
/* collective identifier */
|
|
volatile int64_t sequence_number;
|
|
volatile int8_t flags[NUM_SIGNAL_FLAGS][SM_BCOLS_MAX];
|
|
volatile int32_t src; /* src of bcast data for unknown root,
|
|
bcol id for known root
|
|
*/
|
|
/* starting flag - hierarchies */
|
|
int8_t starting_flag_value[SM_BCOLS_MAX];
|
|
|
|
};
|
|
typedef struct mca_bcol_basesmuma_header_t mca_bcol_basesmuma_header_t;
|
|
|
|
/* data needed for large messages */
struct mca_bcol_basesmuma_large_msg_t {
    /* scatter allgather data */
    uint64_t offset;
    uint64_t n_sends;
    uint64_t length;

    /* portals data: no fields are declared for it here */
};
typedef struct mca_bcol_basesmuma_large_msg_t mca_bcol_basesmuma_large_msg_t;

/* payload struct */
|
|
struct mca_bcol_basesmuma_payload_t {
|
|
|
|
/* base pointer to shared memory control structure */
|
|
mca_bcol_basesmuma_header_t *ctl_struct;
|
|
void *payload;
|
|
|
|
};
|
|
|
|
typedef struct mca_bcol_basesmuma_payload_t mca_bcol_basesmuma_payload_t;
|
|
|
|
|
|
|
|
|
|
/* memory bank memory management structure */
|
|
struct mem_bank_management_t {
|
|
|
|
/* generation counter */
|
|
uint64_t bank_gen_counter;
|
|
|
|
/* descriptor for the non-blocking barrier. This is
|
|
* used to manage this bank of memory.
|
|
*/
|
|
sm_nbbar_desc_t nb_barrier_desc;
|
|
|
|
/* the number of buffers that are not in use, and are
|
|
* available. The assumption is that the buffers are
|
|
* recycled all at once, so are available for re-use
|
|
* until all buffers have been made available for re-use.
|
|
*/
|
|
volatile int available_buffers;
|
|
|
|
/*
|
|
* number of buffers freed */
|
|
volatile int n_buffs_freed;
|
|
|
|
/* mutex to ensure atomic recycling of resrouces */
|
|
opal_mutex_t mutex;
|
|
|
|
/* number of buffers being managed */
|
|
int number_of_buffers;
|
|
|
|
/* shared memory control structures */
|
|
int index_shared_mem_ctl_structs;
|
|
|
|
|
|
};
|
|
typedef struct mem_bank_management_t mem_bank_management_t;
|
|
|
|
/* data structure for shared buffers */
|
|
struct sm_buffer_mgmt {
|
|
/* number of buffers per process */
|
|
int number_of_buffs;
|
|
|
|
/* size of group */
|
|
int size_of_group;
|
|
|
|
/* number of memory banks */
|
|
int num_mem_banks;
|
|
|
|
/* number of buffers per memory bank */
|
|
int num_buffs_per_mem_bank;
|
|
|
|
/* log base 2 of num_buffs_per_mem_bank */
|
|
int log2_num_buffs_per_mem_bank;
|
|
|
|
/* log base 2 total number of buffers */
|
|
int log2_number_of_buffs;
|
|
|
|
/* mask - masks off the bits corresponding to buffer index */
|
|
int mask;
|
|
|
|
/* control buffers - these point to regions in shared memory */
|
|
/* leading dimension is the group size - all pointers for a given
|
|
* set of buffers appear consecutively in this array
|
|
*/
|
|
volatile void **ctl_buffs;
|
|
|
|
/* management data for the control structures -
|
|
* one per bank of control structures - Will be used for
|
|
* the payload buffers as well.
|
|
*/
|
|
mem_bank_management_t *ctl_buffs_mgmt;
|
|
|
|
/* data buffers - these point to regions in shared memory */
|
|
/* leading dimension is the group size - all pointers for a given
|
|
* set of buffers appear consecutively in this array
|
|
*/
|
|
|
|
volatile mca_bcol_basesmuma_payload_t *data_buffs;
|
|
|
|
|
|
|
|
};
|
|
typedef struct sm_buffer_mgmt sm_buffer_mgmt;
|
|
|
|
|
|
struct mca_bcol_basesmuma_nb_coll_buff_desc_t {
|
|
void *data_addr;
|
|
uint64_t bank_index;
|
|
uint64_t buffer_index;
|
|
int active_requests;
|
|
ompi_request_t **requests;
|
|
int data_src;
|
|
int radix_mask;
|
|
int radix_mask_pow;
|
|
int iteration;
|
|
int status;
|
|
/* this is for testing */
|
|
int tag;
|
|
|
|
volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
|
|
volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
|
|
volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer;
|
|
volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer;
|
|
};
|
|
|
|
typedef struct mca_bcol_basesmuma_nb_coll_buff_desc_t mca_bcol_basesmuma_nb_coll_buff_desc_t;
|
|
|
|
struct mca_bcol_basesmuma_local_mlmem_desc_t {
|
|
|
|
uint32_t bank_index_for_release;
|
|
struct ml_memory_block_desc_t *ml_mem_desc;
|
|
uint32_t num_banks;
|
|
uint32_t num_buffers_per_bank;
|
|
uint32_t size_buffer;
|
|
uint32_t *bank_release_counter;
|
|
|
|
/*
|
|
* Number of descriptors allocated is equivalent to number of ml buffers
|
|
* (number of banks * number of buffers per bank)
|
|
*/
|
|
mca_bcol_basesmuma_nb_coll_buff_desc_t *nb_coll_desc;
|
|
};
|
|
|
|
typedef struct mca_bcol_basesmuma_local_mlmem_desc_t mca_bcol_basesmuma_local_mlmem_desc_t;
|
|
|
|
#ifdef __PORTALS_AVAIL__
/* Size of the src_list array in the state structure below. */
#define MAX_SM_GROUP_SIZE 32

/* State of the portals scatter/allgather non-blocking broadcast. */
struct portals_scatter_allgather_nb_bcast_state_t
{
    /* local variables */
    uint64_t length;
    int my_rank;
    int src;
    int matched;
    int src_list[MAX_SM_GROUP_SIZE];
    int group_size;
    int64_t ready_flag;
    int pow_2;
    int pow_2_levels;
    int src_list_index;
    uint64_t fragment_size;  /* user buffer size */

    /* Input argument variables */
    void *my_userbuf;
    int64_t sequence_number;

    /* Extra source variables */
    bool secondary_root;
    int partner;
    int extra_partner;

    /* Scatter Allgather offsets */
    uint64_t local_sg_offset;
    uint64_t global_sg_offset;
    uint64_t partner_offset;

    /* Portals messaging relevant variables */
    ptl_handle_eq_t read_eq;
    ptl_event_t allgather_event;
    bool msg_posted;

    /* OMPI module and component variables */
    mca_bcol_basesmuma_component_t *cs;
    struct mca_bcol_basesmuma_module_t *bcol_module;

    /* Control structure and payload variables */
    volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    volatile mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer; /* scatter source */
    volatile mca_bcol_basesmuma_ctl_struct_t *extra_partner_ctl_pointer; /* scatter source */

    int phase;
};

typedef struct portals_scatter_allgather_nb_bcast_state_t sg_state_t;
#endif

/*
 * Flatten a (buffer index, process index) pair into an index into an
 * array whose leading dimension is LEAD_DIM:
 *     LEAD_DIM * BUF_INDEX + PROC_INDEX
 */
#define SM_ARRAY_INDEX(LEAD_DIM,BUF_INDEX,PROC_INDEX) \
    ((LEAD_DIM)*(BUF_INDEX)+(PROC_INDEX))
/* debug */
#define BARRIER_BANK_LIST_SIZE 32
/* end debug */

struct mca_bcol_basesmuma_module_t {
|
|
/* base structure */
|
|
mca_bcol_base_module_t super;
|
|
|
|
/* free list item with the control structures used for
|
|
* the no user data collective operations
|
|
*/
|
|
list_data_t *no_userdata_ctl;
|
|
|
|
/* free list item with the control structures used for
|
|
* the with user data collective operations
|
|
*/
|
|
list_data_t *userdata_ctl;
|
|
|
|
/*
|
|
* information on sm control backing files for the subgroup
|
|
* associated with this module.
|
|
*/
|
|
bcol_basesmuma_smcm_proc_item_t **ctl_backing_files_info;
|
|
|
|
/*
|
|
* information on sm payload backing files for the subgroup
|
|
* associated with this module.
|
|
*/
|
|
bcol_basesmuma_smcm_proc_item_t **payload_backing_files_info;
|
|
|
|
/*
|
|
* buffers for the collective that do not involve user data -
|
|
* barrier, fanin, fanout.
|
|
*/
|
|
sm_buffer_mgmt colls_no_user_data;
|
|
|
|
/*
|
|
* buffers for the collective with user data.
|
|
*/
|
|
sm_buffer_mgmt colls_with_user_data;
|
|
|
|
/* recursive-doubling tree node */
|
|
netpatterns_pair_exchange_node_t recursive_doubling_tree;
|
|
|
|
/* k-nomial gather/allgather tree */
|
|
netpatterns_k_exchange_node_t knomial_allgather_tree;
|
|
|
|
/* fanin tree node - root is rank 0 */
|
|
netpatterns_tree_node_t fanin_node;
|
|
|
|
/* fanout tree node - root is rank 0 */
|
|
netpatterns_tree_node_t fanout_node;
|
|
|
|
/* index of blocking barrier memory region to use */
|
|
int index_blocking_barrier_memory_bank;
|
|
|
|
/* comm to shared memory map */
|
|
int *comm_to_sm_map;
|
|
|
|
/* reduction fanout tree */
|
|
netpatterns_tree_node_t* reduction_tree;
|
|
|
|
/* broadcast fanout tree */
|
|
netpatterns_tree_node_t* fanout_read_tree;
|
|
|
|
/* scatter - k-ary tree */
|
|
int scatter_kary_radix;
|
|
netpatterns_tree_node_t *scatter_kary_tree;
|
|
|
|
/* Knomial exchange tree */
|
|
/* Currently used for only large message reduce */
|
|
netpatterns_k_exchange_node_t knomial_exchange_tree;
|
|
|
|
/* sequence number offset - want to make sure that we start
|
|
* id'ing collectives with id 0, so we can have simple
|
|
* resource management.
|
|
*/
|
|
int64_t squence_number_offset;
|
|
|
|
/* basesmuma specific header size into ml buffer
|
|
* was calculated at ml level - it is the sum of
|
|
* all headers from all bcols and then aligned to
|
|
* whatever alignment was requested
|
|
*/
|
|
uint32_t total_header_size;
|
|
|
|
/* list of possible sources */
|
|
int *src_list;
|
|
|
|
/* Number of possible sources */
|
|
int src_size;
|
|
|
|
/* smallest power of k that is smaller
|
|
* than or equal in size to the uma group
|
|
*/
|
|
int pow_k_levels;
|
|
|
|
/* size of power-of-k group */
|
|
int pow_k;
|
|
|
|
/* smallest power of 2 that is smaller
|
|
* than or equal to the smuma group size
|
|
*/
|
|
int pow_2_levels;
|
|
|
|
/* size of power-of-2 group */
|
|
int pow_2;
|
|
|
|
/* pointer to the shared memory scratch array of each
|
|
* process in the group.
|
|
*/
|
|
void **shared_memory_scratch_space;
|
|
|
|
/*
|
|
* Caching information for re-entrant collectives
|
|
*/
|
|
mca_bcol_basesmuma_local_mlmem_desc_t ml_mem;
|
|
|
|
/*
|
|
* Cached offsets for lmsg reduce
|
|
*/
|
|
int **reduce_offsets;
|
|
|
|
/*XXX:
|
|
* Starting to explore the beauty of zero-copy for large message
|
|
*/
|
|
struct mca_hdl_base_module_t **hdl_module;
|
|
|
|
#ifdef __PORTALS_AVAIL__
|
|
/*
|
|
* Store state for NB blocking functions
|
|
*/
|
|
sg_state_t sg_state;
|
|
|
|
#endif
|
|
};
|
|
|
|
typedef struct mca_bcol_basesmuma_module_t mca_bcol_basesmuma_module_t;
|
|
OBJ_CLASS_DECLARATION(mca_bcol_basesmuma_module_t);
|
|
|
|
/* shared memory specific arguments for the bcol registration function */
|
|
typedef struct bcol_basesmuma_registration_data_t {
|
|
char *file_name; /* filename for payload */
|
|
void *base_addr; /* base address to be mapped */
|
|
size_t size; /* size of memory block to be "registered" */
|
|
size_t size_ctl_structure;
|
|
size_t data_seg_alignment;
|
|
bcol_basesmuma_smcm_mmap_t *sm_mmap; /* shared memory map struct */
|
|
mca_coll_ml_release_buff_fn_t buff_release_cb; /* buffer release
|
|
call back */
|
|
} bcol_basesmuma_registration_data_t;
|
|
|
|
|
|
/* enum for signaling flag bank, when
 * adding to this list, please keep
 * it alphabetical
 */
/*
  enum {
  ALLGATHER_FLAG,
  ALLREDUCE_FLAG,
  BARRIER_FANIN_FLAG,
  BARRIER_FANOUT_FLAG,
  BARRIER_RKING_FLAG,
  BCAST_FLAG,
  GATHER_FLAG,
  SCATTER_FLAG,
  NUM_SIGNAL_FLAGS
  };
*/

/* Buffer / collective state values; consecutive, starting at 0. */
enum {
    BUFFER_AVAILABLE,
    STARTED,
    FANIN,
    FANOUT
};

/* enum used for non-blocking large message bcast;
 * values are consecutive, starting at 0
 */
enum {
    INIT,
    START,
    NOT_STARTED,
    SCATTER,
    ALLGATHER,
    EXTRA_RANK,
    PROBE,
    SCATTER_ROOT_WAIT,
    SCATTER_EXTRA_ROOT_WAIT,
    SCATTER_PARENT_WAIT,
    FINISHED
};

/**
|
|
* Global component instance
|
|
*/
|
|
OMPI_MODULE_DECLSPEC extern mca_bcol_basesmuma_component_t mca_bcol_basesmuma_component;
|
|
|
|
|
|
/*
|
|
* coll module functions
|
|
*/
|
|
|
|
/* query to see if the component is available for use, and can
|
|
* satisfy the thread and progress requirements
|
|
*/
|
|
int mca_bcol_basesmuma_init_query(bool enable_progress_threads,
|
|
bool enable_mpi_threads);
|
|
|
|
/* query to see if the module is available for use on the given
|
|
* communicator, and if so, what it's priority is.
|
|
*/
|
|
mca_bcol_base_module_t **
|
|
mca_bcol_basesmuma_comm_query(mca_sbgp_base_module_t *module, int *num_modules);
|
|
|
|
|
|
|
|
/* shared memory specific memory registration function - this will be passed into the mpool */
|
|
int mca_bcol_basesmuma_register_sm(void *context_data, void *base, size_t size,
|
|
void **reg);
|
|
|
|
/* shared memory specific memory deregistration function - also needed by the mpool */
|
|
int mca_bcol_basesmuma_deregister_sm(void *context_data, void *reg);
|
|
|
|
/* setup the new k_nomial tree for collectives */
|
|
int bcol_basesmuma_setup_knomial_tree(mca_bcol_base_module_t *super);
|
|
|
|
/* allocate the memory pool for the shared memory control structures */
|
|
int mca_bcol_basesmuma_allocate_pool_memory(mca_bcol_basesmuma_component_t
|
|
*component);
|
|
|
|
/* initialize the internal scratch buffers and control structs that will be
|
|
used by the module */
|
|
int base_bcol_basesmuma_setup_library_buffers(
|
|
mca_bcol_basesmuma_module_t *sm_module,
|
|
mca_bcol_basesmuma_component_t *cs);
|
|
|
|
|
|
/* shared memory recursive doubling initialization */
|
|
int bcol_basesmuma_rd_barrier_init(mca_bcol_base_module_t *module);
|
|
|
|
/* shared memory recusive double barrier */
|
|
int bcol_basesmuma_recursive_double_barrier(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
/* shared memory fanin */
|
|
int bcol_basesmuma_fanin_init(mca_bcol_base_module_t *super);
|
|
|
|
/* shared memory fanout */
|
|
int bcol_basesmuma_fanout_init(mca_bcol_base_module_t *super);
|
|
|
|
/* shared memory recursive k-ing non-blocking barrier */
|
|
int bcol_basesmuma_barrier_init(mca_bcol_base_module_t *super);
|
|
|
|
/* Shared memory broadcast */
|
|
int bcol_basesmuma_bcast_init(mca_bcol_base_module_t *super);
|
|
|
|
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/* Shared memory non-blocking broadcast */
|
|
int bcol_basesmuma_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
int bcol_basesmuma_bcast_k_nomial_knownroot(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/* Shared memory non-blocking broadcast - Large message anyroot */
|
|
int bcol_basesmuma_binary_scatter_allgather_segment(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
#if 0
|
|
/*FIXME: having fun here*/
|
|
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
#endif
|
|
|
|
int bcol_basesmuma_lmsg_bcast_k_nomial_anyroot(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
int bcol_basesmuma_lmsg_scatter_allgather_portals_bcast(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_bcast(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
int bcol_basesmuma_lmsg_scatter_allgather_portals_nb_knownroot_bcast(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/*
|
|
* shared memory scatter
|
|
*/
|
|
int bcol_basesmuma_scatter_init(mca_bcol_base_module_t *super);
|
|
|
|
/* shared memory nonblocking scatter - known root */
|
|
int bcol_basesmuma_nb_scatter_k_array_knownroot(
|
|
bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/* shared memory non-blocking k-nomial barrier init */
|
|
int bcol_basesmuma_k_nomial_barrier_init(bcol_function_args_t *input_args,
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
/* shared memory non-blocking k-nomial barrier progress */
|
|
int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args,
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
/*shared memory non-blocking k-nomial allgather init */
|
|
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
/* shared memory non-blocking k-nomial allgather progress */
|
|
int bcol_basesmuma_k_nomial_allgather_progress(bcol_function_args_t *input_args,
|
|
struct coll_ml_function_t *const_args);
|
|
|
|
/* shared memory allgather -- selection logic api */
|
|
int bcol_basesmuma_allgather_init(mca_bcol_base_module_t *super);
|
|
|
|
/* shared memory blocking k-nomial gather */
|
|
int bcol_basesmuma_k_nomial_gather(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/* shared memory non blocking k-nomial gather */
|
|
int bcol_basesmuma_k_nomial_gather_init(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/* shared memory non blocking k-nomial gather progress*/
|
|
int bcol_basesmuma_k_nomial_gather_progress(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/* shared memory init */
|
|
int bcol_basesmuma_gather_init(mca_bcol_base_module_t *super);
|
|
|
|
/* allocate shared memory control memory */
|
|
int mca_bcol_basesmuma_allocate_sm_ctl_memory(
|
|
mca_bcol_basesmuma_component_t *cs);
|
|
|
|
/* Shared memory basesmuma reduce */
|
|
int bcol_basesmuma_reduce_init(mca_bcol_base_module_t *super);
|
|
int bcol_basesmuma_reduce_intra_fanin(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
int bcol_basesmuma_reduce_intra_fanin_old(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
int bcol_basesmuma_reduce_intra_reducescatter_gather(void *sbuf, void *rbuf,
|
|
int count, struct ompi_datatype_t *dtype,
|
|
struct ompi_op_t *op,
|
|
int root,
|
|
struct ompi_communicator_t *comm,
|
|
mca_coll_base_module_t *module);
|
|
|
|
/* Shared memory basesmuma allreduce */
|
|
int bcol_basesmuma_allreduce_init(mca_bcol_base_module_t *super);
|
|
|
|
int bcol_basesmuma_allreduce_intra_fanin_fanout(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
int bcol_basesmuma_allreduce_intra_recursive_doubling(bcol_function_args_t *input_args,
|
|
coll_ml_function_t *c_input_args);
|
|
|
|
/* initialize non-blocking barrier for recycling the memory buffers.
|
|
* This is not a general purpose nb_barrier, and relies on the
|
|
* fact that we will have only one outstanding nb-barrier per bank
|
|
* at a time.
|
|
*/
|
|
int bcol_basesmuma_rd_nb_barrier_init_admin(sm_nbbar_desc_t *sm_desc);
|
|
|
|
/* admin nonblocking barrier - progress function */
|
|
int bcol_basesmuma_rd_nb_barrier_progress_admin(sm_nbbar_desc_t *sm_desc);
|
|
|
|
/* Memory syncronization registration function */
|
|
int bcol_basesmuma_memsync_init(mca_bcol_base_module_t *super);
|
|
|
|
/* smcm allgather function used to exchange file offsets. */
|
|
int bcol_basesmuma_smcm_allgather_connection(
|
|
mca_bcol_basesmuma_module_t *sm_bcol_module,
|
|
mca_sbgp_base_module_t *module,
|
|
opal_list_t *peer_list,
|
|
bcol_basesmuma_smcm_proc_item_t ***backing_files,
|
|
ompi_communicator_t *comm,
|
|
bcol_basesmuma_smcm_file_t input, char *base_fname,
|
|
bool map_all);
|
|
|
|
/*
|
|
* this function initializes the internal scratch buffers and control
|
|
* structures that will be used by the module
|
|
*/
|
|
int base_bcol_masesmuma_setup_library_buffers(
|
|
mca_bcol_basesmuma_module_t *sm_bcol_module,
|
|
mca_bcol_basesmuma_component_t *sm_bcol_component);
|
|
|
|
/* get the index of the shared memory buffer to be used */
|
|
int bcol_basesmuma_get_buff_index( sm_buffer_mgmt * buff_block,
|
|
uint64_t buff_id );
|
|
|
|
int bcol_basesmuma_free_buff( sm_buffer_mgmt * buff_block,
|
|
uint64_t buff_id );
|
|
|
|
/* This function does bcol_basesmuma specific memory registration and
|
|
issues call back for ml level bank recycling
|
|
*/
|
|
int bcol_basesmuma_bank_init(struct mca_coll_ml_module_t *ml_module,
|
|
mca_bcol_base_module_t *bcol_module,
|
|
void *reg_data);
|
|
|
|
/* bank init which is used for shared memory optimization, fall back to
|
|
* the bank init above if this causes problems
|
|
*/
|
|
int bcol_basesmuma_bank_init_opti(struct mca_coll_ml_module_t *ml_module,
|
|
mca_bcol_base_module_t *bcol_module,
|
|
void *reg_data);
|
|
|
|
/* used for shared memory offset exchange */
|
|
int base_bcol_basesmuma_exchange_offsets(
|
|
mca_bcol_basesmuma_module_t *sm_bcol_module,
|
|
void **result_array, uint64_t mem_offset, int loop_limit,
|
|
int leading_dim);
|
|
|
|
|
|
/* the progress function to be called from the opal progress function
|
|
*/
|
|
int bcol_basesmuma_progress(void);
|
|
|
|
/*
 * Macro for initializing my shared memory control structure.
 *
 *   my_ctl_pointer - pointer to the caller's control header
 *   ready_flag     - lvalue that receives the flag value to signal with
 *   seq_num        - sequence number of the current collective
 *   bcol_id        - index into starting_flag_value[]
 *
 * If the header still carries an older sequence number, it is recycled:
 * every starting flag value is reset to 0 and every signal flag to -1.
 * ready_flag is then set to starting_flag_value[bcol_id] + 1 and, after a
 * memory barrier, the header's sequence number is set to seq_num.
 *
 * The sequence-number parameter is deliberately NOT named
 * "sequence_number": a macro parameter with that name would also be
 * substituted into the "->sequence_number" member accesses, so the macro
 * would only compile when the caller's argument had that exact spelling.
 * Local variables carry a prefix/suffix so they cannot capture
 * identifiers used in the arguments.
 */
#define BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, seq_num, bcol_id)          \
    do {                                                                             \
        int hdr_init_flag_idx_, hdr_init_bcol_idx_;                                  \
        int8_t hdr_init_flag_offset_ = 0;                                            \
        /* setup resource recycling */                                               \
        if( (my_ctl_pointer)->sequence_number < (seq_num) ) {                        \
            /* Signal arrival */                                                     \
            for( hdr_init_bcol_idx_ = 0; hdr_init_bcol_idx_ < SM_BCOLS_MAX;          \
                 hdr_init_bcol_idx_++ ) {                                            \
                (my_ctl_pointer)->starting_flag_value[hdr_init_bcol_idx_] = 0;       \
                for( hdr_init_flag_idx_ = 0; hdr_init_flag_idx_ < NUM_SIGNAL_FLAGS;  \
                     hdr_init_flag_idx_++ ) {                                        \
                    (my_ctl_pointer)->flags[hdr_init_flag_idx_][hdr_init_bcol_idx_]  \
                        = -1;                                                        \
                }                                                                    \
            }                                                                        \
        }                                                                            \
        /* increment the starting flag by one and return */                          \
        hdr_init_flag_offset_ = (my_ctl_pointer)->starting_flag_value[(bcol_id)];    \
        (ready_flag) = hdr_init_flag_offset_ + 1;                                    \
        MB();                                                                        \
        (my_ctl_pointer)->sequence_number = (seq_num);                               \
    } while(0)

/*
 * A single readiness test shared by all collectives: evaluates to non-zero
 * when the peer's header carries my sequence number and its
 * [flag_index][bcol_id] flag has reached at least my_flag.
 */
#define IS_PEER_READY(peer, my_flag, my_sequence_number,flag_index, bcol_id)\
    ( (peer)->sequence_number == (my_sequence_number) &&                    \
      (peer)->flags[(flag_index)][(bcol_id)] >= (my_flag) )

#if 0
|
|
#define IS_AR_DATA_READY(peer, my_flag, my_sequence_number)\
|
|
(((peer)->sequence_number == (my_sequence_number) && \
|
|
(peer)->flags[ALLREDUCE_FLAG][bcol_id] >= (my_flag) \
|
|
)? true : false )
|
|
|
|
#define IS_GDATA_READY(peer, my_flag, my_sequence_number)\
|
|
(((peer)->sequence_number == (my_sequence_number) && \
|
|
(peer)->flags[GATHER_FLAG][bcol_id] == (my_flag) \
|
|
)? true : false )
|
|
|
|
#define IS_PEER_READY(peer, my_flag, flag_index, my_sequence_number)\
|
|
((((volatile int64_t)(peer)->sequence_number > (my_sequence_number)) || \
|
|
(((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \
|
|
((peer)->flags[flag_index][bcol_id] == (my_flag))) \
|
|
)? true : false )
|
|
|
|
#define IS_ALLREDUCE_PEER_READY(peer, my_flag, my_sequence_number)\
|
|
((((volatile int64_t)(peer)->sequence_number == (my_sequence_number)) && \
|
|
(((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag))||((peer)->flags[ALLREDUCE_FLAG][bcol_id] == (my_flag) + 1)) \
|
|
)? true : false )
|
|
#endif
|
|
|
|
/*
 * Non-zero when this bcol function is the last one of its type in the
 * collective, i.e. its index is one less than the count of that type.
 */
#define IS_LAST_BCOL_FUNC(ml_args)                              \
    ( (ml_args)->n_of_this_type_in_collective ==                \
      (ml_args)->index_of_this_type_in_collective + 1 )

static inline __opal_attribute_always_inline__
|
|
size_t bcol_basesmuma_data_offset_calc(
|
|
mca_bcol_basesmuma_module_t *basesmuma_module)
|
|
{
|
|
uint32_t offset = basesmuma_module->super.header_size;
|
|
offset = ((offset + BCOL_HEAD_ALIGN - 1) / BCOL_HEAD_ALIGN) * BCOL_HEAD_ALIGN;
|
|
|
|
return (size_t) offset;
|
|
}
|
|
|
|
|
|
END_C_DECLS
|
|
|
|
#endif /* MCA_BCOL_basesmuma_EXPORT_H */
|