/*
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BCOL_IBOFFLOAD_H
#define MCA_BCOL_IBOFFLOAD_H
#include "ompi_config.h"
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <infiniband/mqe.h>
#include <infiniband/verbs.h>
#include <infiniband/mverbs.h>
#include "ompi/mca/mca.h"
#include "ompi/op/op.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/datatype/ompi_datatype_internal.h"
#include "ompi/mca/bcol/bcol.h"
#include "ompi/mca/bcol/base/base.h"
#include "ompi/mca/sbgp/ibnet/sbgp_ibnet.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/request/request.h"
#include "ompi/mca/common/ofacm/connect.h"
#include "bcol_iboffload_qp_info.h"
BEGIN_C_DECLS
#define IMM_RDMA 1
#define INLINE 1
#define NO_INLINE 0
#define MCA_IBOFFLOAD_CALC_SIZE_EXT 8
#define MCA_IBOFFLOAD_IB_DRIVER_OPERAND_SIZE 8
#define MCA_IBOFFLOAD_CACHE_LINE_SIZE 128
#if OPAL_HAVE_IBOFFLOAD_CALC_RDMA
#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC_SEND
#else
#define MCA_BCOL_IBOFFLOAD_SEND_CALC IBV_M_WR_CALC
#endif
/* 0 - barrier rdma info
1 - ML rdma info */
#define MAX_REMOTE_RDMA_INFO 2
/* forward declarations */
struct mca_bcol_iboffload_module_t;
struct mca_bcol_iboffload_collreq_t;
struct mca_bcol_iboffload_endpoint_t;
struct mca_bcol_iboffload_frag_t;
struct mca_bcol_iboffload_task_t;
struct mca_bcol_iboffload_qp_info_t;
struct mca_bcol_iboffload_collfrag_t;
struct mca_bcol_iboffload_algth_lst_t;
struct mca_bcol_iboffload_device_t;
typedef int (*mca_bcol_iboffload_coll_algth_fn_t) (
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
struct mca_bcol_iboffload_rdma_info_t {
uint64_t addr;
uint32_t rkey;
uint32_t lkey;
};
typedef struct mca_bcol_iboffload_rdma_info_t mca_bcol_iboffload_rdma_info_t;
struct mca_bcol_iboffload_rdma_buffer_desc_t {
void *data_addr; /* buffer address */
uint64_t generation_number; /* my generation */
uint64_t bank_index; /* my bank */
uint64_t buffer_index; /* my buff index */
};
typedef struct mca_bcol_iboffload_rdma_buffer_desc_t mca_bcol_iboffload_rdma_buffer_desc_t;
struct mca_bcol_iboffload_rdma_block_desc_t {
/* number of memory banks */
uint32_t num_banks;
/* number of buffers per bank */
uint32_t num_buffers_per_bank;
/* size of a payload buffer */
uint32_t size_buffer;
/* data offset from ML */
uint32_t data_offset;
/* pointer to the array of initialized buffer descriptors */
mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
};
typedef struct mca_bcol_iboffload_rdma_block_desc_t mca_bcol_iboffload_rdma_block_desc_t;
/* Information that we need to keep in order to access remote
memory. For each remote peer (endpoint) we will keep this
structure */
struct mca_bcol_iboffload_rem_rdma_block_t {
/* IB related information first */
mca_bcol_iboffload_rdma_info_t ib_info;
mca_bcol_iboffload_rdma_buffer_desc_t *rdma_desc;
};
typedef struct mca_bcol_iboffload_rem_rdma_block_t mca_bcol_iboffload_rem_rdma_block_t;
enum {
MCA_BCOL_IBOFFLOAD_BK_COUNTER_INDEX = 0,
MCA_BCOL_IBOFFLOAD_BK_SYNC_INDEX,
MCA_BCOL_IBOFFLOAD_BK_LAST
};
/* Information that we need to keep in order to access and
track local memory that is used as source and destination
for RDMA operations */
struct mca_bcol_iboffload_local_rdma_block_t {
/* sync counter keeps the id of the next bank to start */
int sync_counter;
/* Counter for released ml buffers */
int *bank_buffer_counter[MCA_BCOL_IBOFFLOAD_BK_LAST];
/* IB related information first */
struct mca_bcol_iboffload_rdma_info_t ib_info;
/* back pointer to original ML memory descriptor */
struct mca_bcol_base_memory_block_desc_t *ml_mem_desc;
/* Pasha: do we really need this one? */
/* caching ml memory descriptor configurations locally */
mca_bcol_iboffload_rdma_block_desc_t bdesc;
};
typedef struct mca_bcol_iboffload_local_rdma_block_t mca_bcol_iboffload_local_rdma_block_t;
struct mca_bcol_iboffload_recv_wr_manager {
opal_mutex_t lock;
/** Array of ready-to-use receive work requests.
* It is a 2-dimensional array, since for each
* qp size we want to keep a separate recv wr */
struct ibv_recv_wr **recv_work_requests;
};
typedef struct mca_bcol_iboffload_recv_wr_manager mca_bcol_iboffload_recv_wr_manager;
/**
* Structure to hold the basic iboffload coll component. First it holds the
* base coll component, and then holds a bunch of
* iboffload-component-specific stuff (e.g., current MCA param
* values).
*/
struct mca_bcol_iboffload_component_t {
/** Base coll component */
mca_bcol_base_component_2_0_0_t super;
/** Enable/disable verbose mode */
int verbose;
int num_qps;
/** Whether we want a warning if a non-default GID prefix is not configured
on a multiport setup */
bool warn_default_gid_prefix;
/** Whether we want a warning if the user specifies a non-existent
device and/or port via bcol_iboffload_if_[in|ex]clude MCA params */
bool warn_nonexistent_if;
/** initial size of free lists */
int free_list_num;
/** maximum size of free lists */
int free_list_max;
/** number of elements to alloc when growing free lists */
int free_list_inc;
/** name of ib memory pool */
char* mpool_name;
/** max outstanding CQE on the CQ */
int cq_size;
/** Max size of inline data */
unsigned int max_inline_data;
/** IB partition definition */
uint32_t pkey_val;
/** Outstanding atomic reads */
unsigned int qp_ous_rd_atom;
/** IB MTU */
int mtu;
/** Recv not ready timer */
int min_rnr_timer;
/** IB timeout */
int timeout;
/** IB retry count */
int retry_count;
/** Recv not ready retry count */
int rnr_retry;
/** IB maximum pending RDMA */
int max_rdma_dst_ops;
/** IB Service level (QOS) */
int service_level;
/** Preferred communication buffer alignment in bytes (must be a power of two) */
int buffer_alignment;
/** Max number of tasks for MQ */
int max_mqe_tasks;
/** Max MQ size */
int max_mq_size;
/** HCA/port include/exclude lists */
char *if_include;
char **if_include_list;
char *if_exclude;
char **if_exclude_list;
/** Dummy argv-style list; a copy of names from the
if_[in|ex]clude list that we use for error checking (to ensure
that they all exist) */
char **if_list;
/** Array of ibv devices */
struct ibv_device **ib_devs;
/** devices count */
int num_devs;
/** MCA param bcol_iboffload_receive_queues */
char *receive_queues;
/** Common info about all kinds of QPs on each iboffload module */
struct mca_bcol_iboffload_qp_info_t qp_infos[MCA_BCOL_IBOFFLOAD_QP_LAST];
/** Array of iboffload devices */
opal_pointer_array_t devices;
/** Free lists of collfrag descriptors */
ompi_free_list_t collfrags_free;
/** Free lists of outstanding collective operations */
ompi_free_list_t collreqs_free;
/** Free lists for free task operations */
ompi_free_list_t tasks_free;
/** Free lists for free calc task operations */
ompi_free_list_t calc_tasks_free;
/** Free list of empty frags, that do not keep any
registration information */
ompi_free_list_t ml_frags_free;
/** Recv work request manager */
mca_bcol_iboffload_recv_wr_manager recv_wrs;
/** We allocate some resources on the component
* when creating the first iboffload module,
* and set this flag to true */
bool init_done;
/** Maximal number of fragments of the same collective request that can be sent in parallel */
unsigned int max_pipeline_depth;
/** array mapping Open MPI reduction operators to MVerbs reduction operators */
enum ibv_m_wr_calc_op map_ompi_to_ib_calcs[OMPI_OP_NUM_OF_TYPES];
/** array mapping Open MPI data types to MVerbs data types */
enum ibv_m_wr_data_type map_ompi_to_ib_dt[OMPI_DATATYPE_MPI_MAX_PREDEFINED];
/** The order of the exchange tree */
int exchange_tree_order;
/** Knomial tree order */
int knomial_tree_order;
/** K-nomial radix */
int k_nomial_radix;
/** Maximum number of pulls for completion check */
int max_progress_pull;
/** Barrier function selector */
int barrier_mode;
/** MCA params for selecting Bruck's alltoall algorithms */
int use_brucks_smsg_alltoall_rdma;
int use_brucks_smsg_alltoall_sr;
/** radix of small-data alltoall Bruck-like algorithm */
int k_alltoall_bruck_radix;
/** alltoall small data buffer alignment */
int tmp_buf_alignment;
};
/**
* Convenience typedef
*/
typedef struct mca_bcol_iboffload_component_t mca_bcol_iboffload_component_t;
/* List of all algorithms that we use */
enum {
FANIN_ALG,
FANOUT_ALG,
RECURSIVE_DOUBLING_BARRIER_ALG,
RECURSIVE_KNOMIAL_BARRIER_ALG,
RECURSIVE_DOUBLING_ALLREDUCE_ALG,
RECURSIVE_DOUBLING_REDUCE_ALG,
RECURSIVE_DOUBLING_TREE_BCAST,
ALL_ENDPOINTS, /* connected to all peers */
ALLGATHER_KNOMIAL_ALG,
ALLGATHER_NEIGHBOR_ALG,
REMOTE_EXCHANGE_ALG,
LAST_ALG
};
struct mca_bcol_iboffload_port_t {
int id; /**< Port number on device: 1 or 2 */
int stat; /**< Port status - Active, Init, etc. */
enum ibv_mtu mtu; /**< MTU on this port */
uint64_t subnet_id; /**< Subnet id for the port */
uint16_t lid;
uint16_t lmc;
};
typedef struct mca_bcol_iboffload_port_t mca_bcol_iboffload_port_t;
enum {
COLL_MQ = 0,
SERVICE_MQ,
BCOL_IBOFFLOAD_MQ_NUM
};
struct mca_bcol_iboffload_module_t {
/* base structure */
mca_bcol_base_module_t super;
/* size */
int group_size;
int log_group_size;
/* size of each memory segment */
size_t segment_size;
/* collective tag */
long long collective_tag;
/* pointer to device */
struct mca_bcol_iboffload_device_t *device;
/* caching port number */
uint32_t port;
/* Connecting iboffload with ibnet module information */
/* pointer to sbgp ibnet */
mca_sbgp_ibnet_module_t *ibnet;
/* connection group index for the ibnet */
int cgroup_index;
/* array of endpoints */
struct mca_bcol_iboffload_endpoint_t **endpoints;
/* Size of the endpoints array */
int num_endpoints;
/* caching port subnet id and lid;
* the same information we have on the device */
uint64_t subnet_id;
uint16_t lid;
/* Pointer to management queue */
struct mqe_context *mq[BCOL_IBOFFLOAD_MQ_NUM];
int mq_credit[BCOL_IBOFFLOAD_MQ_NUM];
/* pending list of collfrags */
opal_list_t collfrag_pending;
/* recursive-doubling tree node */
netpatterns_pair_exchange_node_t recursive_doubling_tree;
/* N exchange tree */
netpatterns_pair_exchange_node_t n_exchange_tree;
/* Knomial exchange tree */
netpatterns_k_exchange_node_t knomial_exchange_tree;
/* Knomial exchange tree */
netpatterns_k_exchange_node_t knomial_allgather_tree;
/* The array will keep pre-calculated task consumption per
* algorithm
*/
uint32_t alg_task_consump[LAST_ALG];
/* Pointer to the function implementing the barrier algorithm */
mca_bcol_iboffload_coll_algth_fn_t barrier_algth;
/* Pointer to the function implementing the fanin algorithm */
mca_bcol_iboffload_coll_algth_fn_t fanin_algth;
/* Pointer to the function implementing the fanout algorithm */
mca_bcol_iboffload_coll_algth_fn_t fanout_algth;
/* Pointer to the function implementing the allreduce algorithm */
mca_bcol_iboffload_coll_algth_fn_t allreduce_algth;
/* Pointer to the function implementing the non-blocking memory synchronization algorithm */
mca_bcol_iboffload_coll_algth_fn_t memsync_algth;
/* rdma block memory information */
mca_bcol_iboffload_local_rdma_block_t rdma_block;
/* The largest exponent such that 1 << power_of_2
is not larger than the group size */
int power_of_2;
/* The largest power of two that is not larger than the group size */
int power_of_2_ranks;
/* Connection status array */
bool connection_status[LAST_ALG];
/* map from communicator ranks to ibsubnet */
int *comm_to_ibnet_map;
/* order preserving value */
int64_t prev_sequence_num;
/* Temp iovec to send the data fragments -- alltoall Brucks */
struct iovec *alltoall_iovec;
struct iovec *alltoall_recv_iovec;
/* tree radix for the knomial bruck small data alltoall */
int k_alltoall_bruck_radix;
/* Temp buffer alignment for knomial bruck small data alltoall */
int tmp_buf_alignment;
/* Free task list with sge's array */
ompi_free_list_t iovec_tasks_free;
};
typedef struct mca_bcol_iboffload_module_t mca_bcol_iboffload_module_t;
OBJ_CLASS_DECLARATION(mca_bcol_iboffload_module_t);
/**
* Global component instance
*/
OMPI_MODULE_DECLSPEC
extern mca_bcol_iboffload_component_t mca_bcol_iboffload_component;
static inline int mca_bcol_iboffload_err(const char* fmt, ...)
{
va_list list;
int ret;
va_start(list, fmt);
ret = vfprintf(stderr, fmt, list);
va_end(list);
return ret;
}
#define MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(ompi_op, c_type, l_operand, r_operand, result) \
do { \
switch (ompi_op) { \
case OMPI_OP_MAX: \
*((c_type *)&result) = ((*(c_type *)&(l_operand) > *(c_type *)&(r_operand)) ? \
*(c_type *)&(l_operand) : *(c_type *)&(r_operand)); \
break; \
case OMPI_OP_MIN: \
*((c_type *)&result) = ((*(c_type *)&(l_operand) < *(c_type *)&(r_operand)) ? \
*(c_type *)&(l_operand) : *(c_type *)&(r_operand)); \
break; \
case OMPI_OP_SUM: \
*((c_type *)&result) = (*((c_type *)&(l_operand)) + *((c_type *)&(r_operand))); \
break; \
default: \
break; \
} \
} while (0)
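/*
 * Usage sketch (illustrative only, not part of the original header): the
 * macro reinterprets both operands and the result through 'c_type', so the
 * result variable must be at least as large as 'c_type'. A host-side
 * OMPI_OP_SUM reduction over two doubles would look like:
 *
 *   double lhs = 1.5, rhs = 2.5, res = 0.0;
 *   MCA_BCOL_IBOFFLOAD_ALLREDUCE_DO_CALC(OMPI_OP_SUM, double, lhs, rhs, res);
 *   assert(4.0 == res);
 */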
#define MCA_BCOL_IBOFFLOAD_PKEY_MASK 0x7fff
#define MCA_BCOL_IBOFFLOAD_DEFAULT_GID_PREFIX 0xfe80000000000000ll
#define IBOFFLOAD_ERROR(args) \
do { \
mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_bcol_iboffload_err args; \
mca_bcol_iboffload_err("\n"); \
} while(0)
#if OPAL_ENABLE_DEBUG
#define IBOFFLOAD_VERBOSE(level, args) \
do { \
if (mca_bcol_iboffload_component.verbose >= level) { \
mca_bcol_iboffload_err("[%s]%s[%s:%d:%s] IBOFFLOAD ", \
ompi_process_info.nodename, \
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
__FILE__, __LINE__, __func__); \
mca_bcol_iboffload_err args; \
mca_bcol_iboffload_err("\n"); \
} \
} while(0)
#else
#define IBOFFLOAD_VERBOSE(level, args)
#endif
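/*
 * Caller note (illustrative example, not from the original header): 'args'
 * must itself be a fully parenthesized printf-style argument list, since the
 * macro pastes it directly after mca_bcol_iboffload_err; 'num_tasks' and
 * 'qp_num' below are hypothetical locals:
 *
 *   IBOFFLOAD_VERBOSE(10, ("posted %d tasks on QP 0x%x\n", num_tasks, qp_num));
 */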
#define MCA_BCOL_IBOFFLOAD_SET_COLL_REQ_LINKS(coll_req, coll_work_req) \
do { \
opal_list_append(&(coll_req)->work_requests, \
(opal_list_item_t*) (coll_work_req)); \
(coll_work_req)->coll_full_req = (coll_req); \
} while(0)
/* Vasily: will be removed soon */
#define APPEND_TO_TASKLIST(task_ptr_to_set, event, last_event_type) \
do { \
*task_ptr_to_set = &(event)->element; \
last_event_type = &(event)->element; \
task_ptr_to_set = &((event)->element.next); \
} while(0)
#define MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(task_ptr_to_set, task) \
do { \
*task_ptr_to_set = (task); \
task_ptr_to_set = &((task)->next_task); \
} while(0)
#define MCA_BCOL_IBOFFLOAD_APPEND_MQ_TASK_TO_LIST(task_ptr_to_set, event) \
do { \
*task_ptr_to_set = &(event)->element; \
task_ptr_to_set = &((event)->element.next); \
} while(0)
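/*
 * Usage sketch (hypothetical locals, for illustration): the append macros
 * use the pointer-to-pointer idiom, so the caller keeps a cursor to the
 * tail's 'next' slot instead of re-walking the list on every append:
 *
 *   struct mca_bcol_iboffload_task_t *head = NULL, **last = &head;
 *   MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(last, task_a);
 *   MCA_BCOL_IBOFFLOAD_APPEND_TASK_TO_LIST(last, task_b);
 *   -- head now points at task_a, and task_a->next_task at task_b
 */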
#define BCOL_IS_COMPLETED(req) (((req)->n_frag_mpi_complete == (req)->n_fragments) && \
((req)->n_fragments > 0))
#define BCOL_AND_NET_ARE_COMPLETED(req) (BCOL_IS_COMPLETED(req) && \
((req)->n_frag_net_complete == (req)->n_fragments))
/* Pasha: Need to add locks here */
#define BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(module, mq_index, num_of_credits) \
(((module)->mq_credit[mq_index] -= (num_of_credits)) < 0 ? false : true)
/* Pasha: Need to add locks here */
#define BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(module, mq_index, num_of_credits) \
((module)->mq_credit[mq_index] += (num_of_credits))
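/*
 * Caller-side sketch (illustrative, not from the original header): note that
 * BCOL_IBOFFLOAD_MQ_HAVE_CREDITS decrements the credit counter even when it
 * answers false, so a failed reservation must be paired with a RETURN:
 *
 *   if (BCOL_IBOFFLOAD_MQ_HAVE_CREDITS(iboffload, COLL_MQ, 2)) {
 *       -- post two MQ tasks
 *   } else {
 *       BCOL_IBOFFLOAD_MQ_RETURN_CREDITS(iboffload, COLL_MQ, 2);
 *       -- defer the fragment, e.g. onto the collfrag_pending list
 *   }
 */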
#define BCOL_IBOFFLOAD_IS_FIRST_CALL(args) (0 == (args)->index_in_consecutive_same_bcol_calls)
#define BCOL_IBOFFLOAD_IS_LAST_CALL(args) (((args)->n_of_this_type_in_collective - 1) == \
(args)->index_of_this_type_in_collective)
#define BCOL_IBOFFLOAD_READY_TO_POST(args) (((args)->n_of_this_type_in_a_row - 1) == \
(args)->index_in_consecutive_same_bcol_calls)
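/*
 * Illustrative reading of these predicates (not from the original header):
 * with n_of_this_type_in_a_row == 3 consecutive iboffload calls, index
 * values 0, 1, 2 make BCOL_IBOFFLOAD_IS_FIRST_CALL true only for index 0,
 * while BCOL_IBOFFLOAD_READY_TO_POST becomes true only for index 2, i.e.
 * the offloaded chain is posted once the last call in the row is reached.
 */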
/*
* bcol module functions
*/
int mca_bcol_iboffload_rec_doubling_start_connections(struct mca_bcol_iboffload_module_t *iboffload);
/* RDMA addr exchange with rem proc */
int mca_bcol_iboffload_exchange_rem_addr(struct mca_bcol_iboffload_endpoint_t *ep);
/* Progress function */
int mca_bcol_iboffload_component_progress(void);
/* Register memory */
int mca_bcol_iboffload_register_mr(void *reg_data, void * base, size_t size,
mca_mpool_base_registration_t *reg);
/* Deregister memory */
int mca_bcol_iboffload_deregister_mr(void *reg_data, mca_mpool_base_registration_t *reg);
/*
 * This function is used to create the CQ for this module.
 */
int mca_bcol_iboffload_adjust_cq(struct mca_bcol_iboffload_device_t *device,
struct ibv_cq **ib_cq);
/*
* Query to see if the component is available for use,
* and can satisfy the thread and progress requirements
*/
int mca_bcol_iboffload_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
/* Interface to setup the allgather tree */
int mca_bcol_iboffload_setup_knomial_tree(mca_bcol_base_module_t *super);
/*
 * Query to see if the module is available for use on
 * the given communicator, and if so, what its priority is.
 */
mca_bcol_base_module_t **
mca_bcol_iboffload_comm_query(mca_sbgp_base_module_t *sbgp, int *num_modules);
int
mca_bcol_iboffload_free_tasks_frags_resources(
struct mca_bcol_iboffload_collfrag_t *collfrag,
ompi_free_list_t *frags_free);
/**
 * Small message broadcast
 */
int mca_bcol_iboffload_small_msg_bcast_intra(bcol_function_args_t *fn_arguments,
struct mca_bcol_base_function_t
*const_args);
int mca_bcol_iboffload_barrier_intra_recursive_doubling_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_barrier_intra_recursive_knomial_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_barrier_intra_recursive_doubling(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_nb_memory_service_barrier_start(
struct mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_fanin_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_fanout_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_barrier_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_memsync_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_allreduce_register(mca_bcol_base_module_t *super);
int mca_bcol_iboffload_new_style_fanin_first_call(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_new_style_fanout_first_call(
mca_bcol_iboffload_module_t *iboffload,
struct mca_bcol_iboffload_collreq_t *coll_request);
int mca_bcol_iboffload_nb_memory_service_barrier_intra(bcol_function_args_t *input_args,
struct mca_bcol_base_function_t *const_args);
int mca_bcol_iboffload_coll_support_all_types(bcol_coll coll_name);
int mca_bcol_iboffload_coll_supported(int op, int dtype, bcol_elem_type elem_type);
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_fls(int num)
{
int i = 1;
int j = 0;
if (0 == num) {
return 0;
}
while (i < num) {
i <<= 1;
j++;
}
if (i > num) {
j--;
}
return j;
}
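/*
 * Worked example (illustrative): for positive input the function returns
 * floor(log2(num)), i.e. the index of the highest set bit:
 *
 *   mca_bcol_iboffload_fls(5) == 2   -- 1 << 2 = 4 <= 5 < 8
 *   mca_bcol_iboffload_fls(8) == 3   -- exact powers of two included
 */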
#define BCOL_IBOFFLOAD_IS_EVEN(num) (!((num) & 1))
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_ffs(int num)
{
int j = 0;
if (0 == num) {
return 0;
}
while (BCOL_IBOFFLOAD_IS_EVEN(num)) {
num >>= 1;
j++;
}
return j;
}
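/*
 * Worked example (illustrative): the function returns the 0-based index of
 * the lowest set bit, unlike POSIX ffs() which is 1-based:
 *
 *   mca_bcol_iboffload_ffs(12) == 2  -- 12 = 0b1100
 *   mca_bcol_iboffload_ffs(7)  == 0  -- 7  = 0b0111
 */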
#if OPAL_ENABLE_DEBUG
/* Debug helpers for posting the MQ task list */
#define IS_IMM(a) (a & MQE_WR_FLAG_IMM_EXE)
#define IS_SIG(a) (a & MQE_WR_FLAG_SIGNAL)
#define IS_BLK(a) (a & MQE_WR_FLAG_BLOCK)
int task_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task);
int wait_to_rank(mca_bcol_iboffload_module_t *iboffload, struct mqe_task *task);
#endif
/* MQ posting function */
static inline __opal_attribute_always_inline__ int
mca_bcol_iboffload_post_mqe_tasks(
mca_bcol_iboffload_module_t *iboffload,
struct mqe_task *head_mqe)
{
int rc;
struct mqe_task *bad_mqe = NULL;
#if OPAL_ENABLE_DEBUG /* debug code */
struct mqe_task *curr_mqe_task = NULL;
int send_count = 0, recv_count = 0, wait_count = 0;
curr_mqe_task = head_mqe;
IBOFFLOAD_VERBOSE(10, ("Processing MQE Head with addr %p <START>\n",
(uintptr_t) (void*) curr_mqe_task));
while (NULL != curr_mqe_task) {
switch(curr_mqe_task->opcode) {
case MQE_WR_SEND:
IBOFFLOAD_VERBOSE(10, ("Posting task %p id 0x%x: send on QP 0x%x\n"
"rank %d, sg_entry: addr %p LEN %d lkey %u, flag[%d-%d-%d]\n",
(void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
curr_mqe_task->post.qp->qp_num,
task_to_rank(iboffload, curr_mqe_task),
curr_mqe_task->post.send_wr->sg_list->addr,
curr_mqe_task->post.send_wr->sg_list->length,
curr_mqe_task->post.send_wr->sg_list->lkey,
IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));
++send_count;
break;
case MQE_WR_RECV:
IBOFFLOAD_VERBOSE(10, ("Posting task %p id 0x%x: recv on QP 0x%x rank %d flag[%d-%d-%d]\n",
(void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
curr_mqe_task->post.qp->qp_num, task_to_rank(iboffload, curr_mqe_task),
IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));
++recv_count;
break;
case MQE_WR_CQE_WAIT:
IBOFFLOAD_VERBOSE(10, ("Posting task %p id %x: wait on CQ %p for rank %d num of waits %d flag[%d-%d-%d]\n",
(void*) curr_mqe_task, (uintptr_t) curr_mqe_task->wr_id,
(void*) curr_mqe_task->wait.cq, wait_to_rank(iboffload, curr_mqe_task),
curr_mqe_task->wait.count,
IS_IMM(curr_mqe_task->flags), IS_SIG(curr_mqe_task->flags), IS_BLK(curr_mqe_task->flags)));
wait_count += curr_mqe_task->wait.count;
break;
default:
IBOFFLOAD_ERROR(("Fatal error, unknow packet type %d\n",
curr_mqe_task->opcode));
return OMPI_ERROR;
}
/* pointer to next task */
curr_mqe_task = curr_mqe_task->next;
}
IBOFFLOAD_VERBOSE(10, ("wait[%d] send[%d] recv[%d]\n",
wait_count, send_count, recv_count));
#endif
IBOFFLOAD_VERBOSE(10, ("Posting MQ %p <DONE>\n", (uintptr_t) head_mqe->wr_id));
rc = mqe_post_task(iboffload->mq[0], head_mqe, &bad_mqe);
if (OPAL_UNLIKELY(0 != rc)) {
IBOFFLOAD_ERROR(("ibv_post_mqe failed, errno says: %s,"
" the return code is [%d]\n",
strerror(errno), rc));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
static inline __opal_attribute_always_inline__
int lognum(int n) {
int count = 1, lognum = 0;
while (count < n) {
count = count << 1;
lognum++;
}
return lognum;
}
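/*
 * Worked example (illustrative): lognum() returns the ceiling of log2 for
 * n >= 1, in contrast to mca_bcol_iboffload_fls() which returns the floor:
 *
 *   lognum(5) == 3   -- count runs 1, 2, 4, stops at 8
 *   lognum(4) == 2   -- exact power of two
 */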
END_C_DECLS
#endif /* MCA_BCOL_IBOFFLOAD_H */