1
1
openmpi/ompi/mca/bcol/ptpcoll/bcol_ptpcoll.h
2015-06-23 20:59:57 -07:00

475 строки
13 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_PTPCOLL_EXPORT_H
#define MCA_BCOL_PTPCOLL_EXPORT_H
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/mca/mca.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/patterns/net/netpatterns.h"
BEGIN_C_DECLS
/*
 * SPIN expands to a sched_yield() call when HAVE_SCHED_YIELD is defined,
 * and to nothing otherwise.
 * NOTE(review): <sched.h> is included after BEGIN_C_DECLS; confirm that
 * including a system header inside that region is intended.
 */
#ifdef HAVE_SCHED_YIELD
# include <sched.h>
# define SPIN sched_yield()
#else /* no sched_yield() available: SPIN is a no-op */
# define SPIN
#endif
/**
* Structure to hold the ptpcoll bcol component. First it holds the
* base bcol component, and then the ptpcoll-component-specific
* settings (e.g., current MCA parameter values).
*/
struct mca_bcol_ptpcoll_component_t {
/** Base bcol component */
mca_bcol_base_component_2_0_0_t super;
/** Verbosity level, used only in debug enabled builds */
int verbose;
/** The radix of the K-nomial tree, initialized by MCA parameter */
int k_nomial_radix;
/** The radix of the N-array tree, initialized by MCA parameter */
int narray_radix;
/** The radix used for N-array scatter and K-nomial gather in
large message bcast */
int narray_knomial_radix;
/** Number of times to poll for specific tag/src */
int num_to_probe;
/*
* bcast small messages algorithm
* 1 - Knomial bcast
* 2 - Narray bcast
*/
int bcast_small_messages_known_root_alg;
/*
* bcast large messages algorithm
* 1 - binomial scatter-gather
* 2 - Narray scatter, knomial gather
*/
int bcast_large_messages_known_root_alg;
/*
* barrier algorithm
* 1 - recursive doubling
* 2 - recursive K-ing
*/
int barrier_alg;
/* NOTE(review): presumably a flag selecting the Bruck small-message
* alltoall RDMA variant; its use is not visible in this header - confirm */
int use_brucks_smsg_alltoall_rdma;
};
/*
 * Descriptor of one outstanding collective operation. The first member is an
 * opal_free_list_item_t, and the type is declared as an OPAL class below.
 * NOTE(review): instances presumably live on the collreqs_free list of
 * mca_bcol_ptpcoll_module_t - confirm against the module code.
 * The per-field notes are assumptions drawn from the field names; the code
 * that reads and writes them is not in this header.
 */
struct mca_bcol_ptpcoll_collreq_t {
opal_free_list_item_t super; /* free list item base */
int tag; /* presumably the message tag of this operation - confirm */
int num_reqs; /* presumably the number of entries in 'requests' - confirm */
int exchange; /* presumably the current exchange step - confirm */
int need_toserv_extra; /* presumably set when an extra rank must be served - confirm */
int extra_partner_rank; /* presumably the rank of that extra partner - confirm */
ompi_request_t **requests; /* array of request pointers */
};
typedef struct mca_bcol_ptpcoll_collreq_t mca_bcol_ptpcoll_collreq_t;
OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_collreq_t);
/**
* Convenience typedef: lets the component structure be named without
* the 'struct' keyword.
*/
typedef struct mca_bcol_ptpcoll_component_t mca_bcol_ptpcoll_component_t;
/* Bcast small messages, known root algorithm.
 * The values match the numbering documented for
 * bcast_small_messages_known_root_alg (1 - Knomial, 2 - Narray). */
enum {
PTPCOLL_KNOMIAL = 1,
PTPCOLL_NARRAY
};
/* Bcast large messages, known root algorithm.
 * The values match the numbering documented for
 * bcast_large_messages_known_root_alg (1 - binomial, 2 - Narray/knomial). */
enum {
PTPCOLL_BINOMIAL_SG = 1, /* Binomial scatter-gather */
PTPCOLL_NARRAY_KNOMIAL_SG /* Narray-Knomial scatter-gather */
};
/*
* Implemented function index list.
* In each list the indices start at 0 and the final N_* enumerator
* therefore equals the number of indices declared before it.
*/
/* barrier */
enum{
FANIN_FAN_OUT_BARRIER_FN,
RECURSIVE_DOUBLING_BARRIER_FN,
N_BARRIER_FNS /* number of barrier function indices */
};
/* reduce: indices of the implemented algorithms */
enum{
FANIN_REDUCE_FN,
REDUCE_SCATTER_GATHER_FN,
N_REDUCE_FNS /* number of reduce algorithm indices */
};
/* reduce: indices by message-size class (short / long data) */
enum{
SHORT_DATA_FN_REDUCE,
LONG_DATA_FN_REDUCE,
N_REDUCE_FNS_USED /* number of message-size classes */
};
/* all-reduce: indices of the implemented algorithms */
enum{
FANIN_FANOUT_ALLREDUCE_FN,
REDUCE_SCATTER_ALLGATHER_FN,
N_ALLREDUCE_FNS /* number of all-reduce algorithm indices */
};
/* all-reduce: indices by message-size class (short / long data) */
enum{
SHORT_DATA_FN_ALLREDUCE,
LONG_DATA_FN_ALLREDUCE,
N_ALLREDUCE_FNS_USED /* number of message-size classes */
};
/*
* N-order tree node description: one rank's position in the tree,
* with its parent and its children.
*/
struct tree_node_t {
/* my rank within the group */
int my_rank;
/* my node type - root, leaf, or interior */
int my_node_type;
/* number of nodes in the tree */
int tree_size;
/* number of parents (0/1) */
int n_parents;
/* number of children */
int n_children;
/* parent rank within the group */
int parent_rank;
/* children ranks within the group
* (NOTE(review): presumably n_children entries - confirm) */
int *children_ranks;
};
typedef struct tree_node_t tree_node_t;
/*
 * Description of one rank's role in a pairwise-exchange pattern built
 * around the largest power of 2 that fits in the group.
 */
struct pair_exchange_node_t {
/* number of nodes this node will exchange data with */
int n_exchanges;
/* ranks of nodes involved in data exchange */
int *rank_exchanges;
/* number of extra sources of data - outside largest power of 2 in
* this group */
int n_extra_sources;
/* rank of the extra source */
int rank_extra_source;
/* number of tags needed per stripe */
int n_tags;
/* log 2 of largest full power of 2 for this node set */
int log_2;
/* largest power of 2 that fits in this group */
int n_largest_pow_2;
/* node type */
int node_type;
};
typedef struct pair_exchange_node_t pair_exchange_node_t;
/*
* Barrier request objects
*/
/* enum for the phase the non-blocking barrier is in */
enum{
NB_BARRIER_INACTIVE,
NB_BARRIER_FAN_IN,
NB_BARRIER_FAN_OUT,
/* done and not started are the same for all practical
* purposes, as the init function always sets this flag
*/
NB_BARRIER_DONE
};
/*
 * Operation status values. Every enumerator occupies its own bit, so
 * several values can be OR-ed into one integer.
 * NOTE(review): presumably stored in the 'status' field of
 * mca_bcol_ptpcoll_ml_buffer_desc_t - confirm against the users.
 */
typedef enum {
PTPCOLL_NOT_STARTED = 1,
PTPCOLL_WAITING_FOR_DATA = 1 << 1,
PTPCOLL_SCATTER_STARTED = 1 << 2,
PTPCOLL_GATHER_STARTED = 1 << 3,
PTPCOLL_EXTRA_SEND_STARTED = 1 << 4,
PTPCOLL_ROOT_SEND_STARTED = 1 << 5
} ptpcoll_op_status;
/*
 * Descriptor of a single ML payload buffer, together with the state that
 * the collective algorithms cache in it between calls.
 */
struct mca_bcol_ptpcoll_ml_buffer_desc_t {
void *data_addr; /* buffer address */
uint64_t bank_index; /* my bank */
uint64_t buffer_index; /* my buff index */
int active_requests; /* keep number of active requests */
ompi_request_t **requests; /* caching pointers to requests */
int data_src; /* used for bcast to cache internal data */
int radix_mask; /* used for bcast to cache internal data */
int radix_mask_pow; /* used for bcast to cache internal data */
int iteration; /* buffer iteration in knomial, binomial, etc. algorithms */
int tag; /* tag number that is attached to this operation */
int status; /* operation status */
/* Fixme: Probably we can get rid of these fields by redesigning
* the reduce implementation
*/
int reduction_status; /* used for reduction to cache internal
reduction status */
bool reduce_init_called; /* NOTE(review): presumably set once the reduce
init step has run for this buffer - confirm */
};
typedef struct mca_bcol_ptpcoll_ml_buffer_desc_t mca_bcol_ptpcoll_ml_buffer_desc_t;
/*
* Information that we need to keep in order to access and
* track local ML memory that is used as source and destination
* for collective operations
*/
struct mca_bcol_ptpcoll_local_mlmem_desc_t {
/* Bank index to release */
uint32_t bank_index_for_release;
/* number of memory banks */
uint32_t num_banks;
/* number of buffers per bank */
uint32_t num_buffers_per_bank;
/* size of a payload buffer */
uint32_t size_buffer;
/* pointer to the initialized buffer descriptors
* (NOTE(review): presumably num_banks * num_buffers_per_bank entries - confirm) */
mca_bcol_ptpcoll_ml_buffer_desc_t *ml_buf_desc;
};
typedef struct mca_bcol_ptpcoll_local_mlmem_desc_t mca_bcol_ptpcoll_local_mlmem_desc_t;
/*
 * Node roles; every enumerator occupies its own bit. The first three have
 * no prefix and the last three carry a KN prefix.
 * NOTE(review): presumably the unprefixed values are stored in pow_2type
 * and the KN values in pow_ktype of mca_bcol_ptpcoll_module_t - confirm.
 */
typedef enum {
PTPCOLL_PROXY = 1,
PTPCOLL_IN_GROUP = 1 << 1,
PTPCOLL_EXTRA = 1 << 2,
PTPCOLL_KN_PROXY = 1 << 3,
PTPCOLL_KN_IN_GROUP = 1 << 4,
PTPCOLL_KN_EXTRA = 1 << 5
} node_type_pow2;
/*
 * ptpcoll bcol module: the base bcol module followed by the cached
 * group geometry, tree descriptions, tag state and local ML memory
 * information. Declared as an OPAL class below.
 */
struct mca_bcol_ptpcoll_module_t {
/* base structure */
mca_bcol_base_module_t super;
/* size */
int group_size;
/* size of each memory segment */
size_t segment_size;
/* k_nomial radix */
int k_nomial_radix;
/* caching power of K, for K-nomial operations */
int pow_k;
/* caching the power-of-K number that is smaller than or equal to the size of the group */
int pow_knum;
/* caching power of 2, it is a special case for some algorithms */
int pow_2;
/* caching the power-of-2 number that is closest to the size of the group */
int pow_2num;
/* type of this node in the power-of-2 group */
int pow_2type;
/* type of this node in the K-nomial tree group */
int pow_ktype;
/* type of this node in the narray tree group */
int narray_type;
/* size of full narray tree */
int full_narray_tree_size;
/* number of leaves on the last level */
int full_narray_tree_num_leafs;
/* Nary tree info */
netpatterns_tree_node_t *narray_node;
/* if the rank is in the group, this keeps the extra peer;
if the rank is extra, this keeps the proxy peer.
*/
int proxy_extra_index; /* pow2 algorithm */
int *kn_proxy_extra_index; /* K-nomial algorithm */
int kn_proxy_extra_num; /* number of extra peers, maximum k - 1 */
/* collective tag */
long long collective_tag;
/* tag mask - the pml has a limit on tag size, so need
* to wrap around
*/
uint64_t tag_mask;
/* Caching information about local ml memory.
* Since ptpcoll does not support RDMA operations over pml,
* we don't need to keep any information about remote buffers
*/
mca_bcol_ptpcoll_local_mlmem_desc_t ml_mem;
/* Narray-Knomial scatter gather */
/* list of extra indexes */
int *narray_knomial_proxy_extra_index;
/* number of extra peers, maximum k - 1 */
int narray_knomial_proxy_num;
/* Narray-Knomial node information array */
netpatterns_narray_knomial_tree_node_t *narray_knomial_node;
/* Knomial exchange tree */
netpatterns_k_exchange_node_t knomial_exchange_tree;
/* knomial allgather tree --- Do not disable, we need both:
different algorithms define recursive k-ing differently
*/
netpatterns_k_exchange_node_t knomial_allgather_tree;
/* Knomial allgather offsets */
int **allgather_offsets;
/* Free lists of outstanding collective operations */
opal_free_list_t collreqs_free;
/* NOTE(review): presumably log2 of group_size (see lognum()) - confirm */
int log_group_size;
/* NOTE(review): presumably scratch iovec array for alltoall; struct iovec
* is not declared by any header included directly here - confirm */
struct iovec *alltoall_iovec;
};
typedef struct mca_bcol_ptpcoll_module_t mca_bcol_ptpcoll_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_ptpcoll_module_t);
/**
* Global component instance. Only declared here; the definition lives in
* another translation unit. The inline helpers in this header read
* num_to_probe from it.
*/
OMPI_MODULE_DECLSPEC extern mca_bcol_ptpcoll_component_t
mca_bcol_ptpcoll_component;
/*
* coll module functions
*/
/* query to see if the component is available for use, and can
* satisfy the thread and progress requirements
*/
int mca_bcol_ptpcoll_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
/* query to see if the module is available for use on the given
* communicator, and if so, what its priority is.
*/
mca_bcol_base_module_t **
mca_bcol_ptpcoll_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules);
/* interface function to setup recursive k-ing tree */
int mca_bcol_ptpcoll_setup_knomial_tree(mca_bcol_base_module_t *super);
/* barrier routines */
int bcol_ptpcoll_barrier_recurs_dbl(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int bcol_ptpcoll_barrier_recurs_knomial(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* barrier register */
int bcol_ptpcoll_barrier_init(mca_bcol_base_module_t *super);
/* memsync register (NOTE(review): inferred from the name - confirm) */
int mca_bcol_ptpcoll_memsync_init(mca_bcol_base_module_t *super);
/* memory management hooks: allocate / register / deregister / free */
void * bcol_ptpcoll_allocate_memory(size_t length, size_t alignment,
struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_register_memory(void * in_ptr, size_t length, size_t alignment,
struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_deregister_memory( void * in_ptr,
struct mca_bcol_base_module_t *bcol_module);
int bcol_ptpcoll_free_memory(void *ptr,
struct mca_bcol_base_module_t *bcol_module);
/* fan-in / fan-out routines.
* NOTE(review): fanin takes a mca_bcol_base_module_t pointer while fanout and
* the other collective routines take a mca_bcol_base_function_t pointer;
* confirm this matches the definitions. */
int bcol_ptpcoll_fanin( bcol_function_args_t *input_args,
struct mca_bcol_base_module_t *module);
int bcol_ptpcoll_fanout( bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* allgather routine */
int bcol_ptpcoll_k_nomial_allgather_init(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* allgather progress */
int bcol_ptpcoll_k_nomial_allgather_progress(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
/* allgather register */
int bcol_ptpcoll_allgather_init(mca_bcol_base_module_t *super);
/**
 * Poll a single request until it completes, an error is reported, or the
 * component's num_to_probe limit of test calls has been reached.
 *
 * @param request  request handed to ompi_request_test()
 * @param rc       out: OMPI_SUCCESS, or the first non-success code returned
 *                 by ompi_request_test()
 * @return the completion flag written by the last ompi_request_test() call;
 *         0 when no call was made (num_to_probe <= 0)
 */
static inline __opal_attribute_always_inline__
int mca_bcol_ptpcoll_test_for_match(ompi_request_t **request , int *rc)
{
    mca_bcol_ptpcoll_component_t *component = &mca_bcol_ptpcoll_component;
    int completed = 0;
    int attempt = 0;

    *rc = OMPI_SUCCESS;

    while (attempt < component->num_to_probe) {
        *rc = ompi_request_test(request, &completed, MPI_STATUS_IGNORE);
        /* stop on completion or on the first error */
        if (0 != completed || OMPI_SUCCESS != *rc) {
            break;
        }
        attempt++;
    }

    return completed;
}
/**
 * Poll a set of requests until all of them complete, an error is reported,
 * or the component's num_to_probe limit of test calls has been reached.
 *
 * @param n_requests  in: number of requests (asserted non-negative);
 *                    out: reset to 0 once all requests have completed
 * @param requests    request array handed to ompi_request_test_all()
 * @param rc          out: OMPI_SUCCESS, or the first non-success code
 *                    returned by ompi_request_test_all()
 * @return 1 when *n_requests is 0 on entry; otherwise the completion flag
 *         written by the last ompi_request_test_all() call
 *
 * NOTE(review): MPI defines MPI_STATUSES_IGNORE for status arrays; confirm
 * that passing MPI_STATUS_IGNORE to the test_all call is intentional.
 */
static inline __opal_attribute_always_inline__
int mca_bcol_ptpcoll_test_all_for_match(int *n_requests, ompi_request_t **requests , int *rc)
{
    mca_bcol_ptpcoll_component_t *component = &mca_bcol_ptpcoll_component;
    int all_done = 0;
    int attempt;

    *rc = OMPI_SUCCESS;
    assert(*n_requests >= 0);

    /* nothing outstanding: trivially matched */
    if (0 == *n_requests) {
        return 1;
    }

    for (attempt = 0; attempt < component->num_to_probe; attempt++) {
        *rc = ompi_request_test_all(*n_requests, requests, &all_done, MPI_STATUS_IGNORE);
        /* stop on completion or on the first error */
        if (0 != all_done || OMPI_SUCCESS != *rc) {
            break;
        }
    }

    if (all_done) {
        *n_requests = 0;
    }

    return all_done;
}
/* Some negative tags are already used by OMPI; the offset makes sure that
 * the tags used here stay clear of them.
 * NOTE(review): how the offset and factor are combined into a tag is not
 * visible in this header - see the users of these macros. */
#define PTPCOLL_TAG_OFFSET 100
#define PTPCOLL_TAG_FACTOR 2
/**
 * Return the smallest exponent e such that 2^e >= n, i.e. ceil(log2(n)).
 *
 * @param n  value to take the logarithm of
 * @return 0 for n <= 1, otherwise ceil(log2(n))
 *
 * The running power of two is kept in an unsigned variable: with a signed
 * counter, any n above 2^30 would shift 1 << 30 once more, which overflows
 * int (undefined behaviour, in practice an endless loop).
 */
static inline int lognum(int n)
{
    unsigned int limit;
    unsigned int power = 1;
    int exponent = 0;

    /* also covers zero and negative input, for which the result is 0 */
    if (n <= 1) {
        return 0;
    }

    limit = (unsigned int) n;
    while (power < limit) {
        power <<= 1;
        exponent++;
    }
    return exponent;
}
END_C_DECLS
#endif /* MCA_BCOL_PTPCOLL_EXPORT_H */