2018-06-01 23:53:53 +03:00
|
|
|
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2018 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*/
|
|
|
|
#ifndef MCA_BTL_OFI_H
|
|
|
|
#define MCA_BTL_OFI_H
|
|
|
|
|
|
|
|
#include "opal_config.h"
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
/* Open MPI includes */
|
|
|
|
#include "opal/mca/event/event.h"
|
|
|
|
#include "opal/mca/btl/btl.h"
|
|
|
|
#include "opal/mca/btl/base/base.h"
|
|
|
|
#include "opal/mca/mpool/mpool.h"
|
|
|
|
#include "opal/mca/btl/base/btl_base_error.h"
|
|
|
|
#include "opal/mca/rcache/base/base.h"
|
|
|
|
#include "opal/mca/pmix/pmix.h"
|
|
|
|
|
2018-08-03 22:30:03 +03:00
|
|
|
#include "opal/class/opal_hash_table.h"
|
|
|
|
|
2018-06-01 23:53:53 +03:00
|
|
|
#include <rdma/fabric.h>
|
|
|
|
#include <rdma/fi_domain.h>
|
|
|
|
#include <rdma/fi_errno.h>
|
|
|
|
#include <rdma/fi_cm.h>
|
|
|
|
#include <rdma/fi_endpoint.h>
|
|
|
|
#include <rdma/fi_rma.h>
|
|
|
|
|
|
|
|
BEGIN_C_DECLS
|
2018-06-07 19:33:12 +03:00
|
|
|
#define MCA_BTL_OFI_MAX_MODULES 16
|
2018-06-25 22:23:57 +03:00
|
|
|
#define MCA_BTL_OFI_NUM_CQE_READ 64
|
2018-08-03 22:30:03 +03:00
|
|
|
|
|
|
|
#define MCA_BTL_OFI_DEFAULT_RD_NUM 10
|
|
|
|
#define MCA_BTL_OFI_DEFAULT_MAX_CQE 128
|
|
|
|
#define MCA_BTL_OFI_DEFAULT_PROGRESS_THRESHOLD 64
|
2018-06-01 23:53:53 +03:00
|
|
|
|
|
|
|
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
|
|
|
|
|
2018-08-03 22:30:03 +03:00
|
|
|
#define TWO_SIDED_ENABLED mca_btl_ofi_component.two_sided_enabled
|
|
|
|
|
|
|
|
enum mca_btl_ofi_mode {
|
|
|
|
MCA_BTL_OFI_MODE_ONE_SIDED = 0,
|
|
|
|
MCA_BTL_OFI_MODE_TWO_SIDED,
|
|
|
|
MCA_BTL_OFI_MODE_FULL_SUPPORT,
|
|
|
|
MCA_BTL_OFI_MODE_TOTAL
|
|
|
|
};
|
|
|
|
|
|
|
|
enum mca_btl_ofi_hdr_type {
|
|
|
|
MCA_BTL_OFI_TYPE_PUT = 0,
|
2018-06-01 23:53:53 +03:00
|
|
|
MCA_BTL_OFI_TYPE_GET,
|
|
|
|
MCA_BTL_OFI_TYPE_AOP,
|
|
|
|
MCA_BTL_OFI_TYPE_AFOP,
|
|
|
|
MCA_BTL_OFI_TYPE_CSWAP,
|
2018-08-03 22:30:03 +03:00
|
|
|
MCA_BTL_OFI_TYPE_SEND,
|
|
|
|
MCA_BTL_OFI_TYPE_RECV,
|
2018-06-01 23:53:53 +03:00
|
|
|
MCA_BTL_OFI_TYPE_TOTAL
|
|
|
|
};
|
|
|
|
|
2018-06-07 19:33:12 +03:00
|
|
|
struct mca_btl_ofi_context_t {
|
|
|
|
int32_t context_id;
|
|
|
|
|
|
|
|
/* transmit context */
|
|
|
|
struct fid_ep *tx_ctx;
|
|
|
|
struct fid_ep *rx_ctx;
|
|
|
|
|
|
|
|
/* completion queue */
|
|
|
|
struct fid_cq *cq;
|
|
|
|
|
|
|
|
/* completion info freelist */
|
|
|
|
/* We have it per context to reduce the thread contention
|
|
|
|
* on the freelist. Things can get really slow. */
|
2018-08-03 22:30:03 +03:00
|
|
|
opal_free_list_t rdma_comp_list;
|
|
|
|
opal_free_list_t frag_comp_list;
|
|
|
|
opal_free_list_t frag_list;
|
2018-06-07 19:33:12 +03:00
|
|
|
|
|
|
|
/* for thread locking */
|
2018-09-15 22:37:32 +03:00
|
|
|
opal_atomic_int32_t lock;
|
2018-06-07 19:33:12 +03:00
|
|
|
};
|
|
|
|
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;
|
|
|
|
|
2018-06-01 23:53:53 +03:00
|
|
|
/**
|
|
|
|
* @brief OFI BTL module
|
|
|
|
*/
|
|
|
|
struct mca_btl_ofi_module_t {
|
|
|
|
/** base BTL interface */
|
|
|
|
mca_btl_base_module_t super;
|
|
|
|
|
|
|
|
/* libfabric components */
|
|
|
|
struct fi_info *fabric_info;
|
|
|
|
struct fid_fabric *fabric;
|
|
|
|
struct fid_domain *domain;
|
|
|
|
struct fid_ep *ofi_endpoint;
|
|
|
|
struct fid_av *av;
|
|
|
|
|
2018-06-07 19:33:12 +03:00
|
|
|
int num_contexts;
|
|
|
|
mca_btl_ofi_context_t *contexts;
|
|
|
|
|
2018-06-01 23:53:53 +03:00
|
|
|
char *linux_device_name;
|
|
|
|
|
|
|
|
/** whether the module has been fully initialized or not */
|
|
|
|
bool initialized;
|
|
|
|
bool use_virt_addr;
|
2018-06-07 19:33:12 +03:00
|
|
|
bool is_scalable_ep;
|
2018-06-01 23:53:53 +03:00
|
|
|
|
2018-07-11 22:34:03 +03:00
|
|
|
opal_atomic_int64_t outstanding_rdma;
|
|
|
|
opal_atomic_int64_t outstanding_send;
|
2018-06-01 23:53:53 +03:00
|
|
|
|
|
|
|
/** linked list of BTL endpoints. this list is never searched so
|
|
|
|
* there is no need for a complicated structure here at this time*/
|
|
|
|
opal_list_t endpoints;
|
|
|
|
|
2018-06-07 19:33:12 +03:00
|
|
|
opal_mutex_t module_lock;
|
2018-08-03 22:30:03 +03:00
|
|
|
opal_hash_table_t id_to_endpoint;
|
2018-06-01 23:53:53 +03:00
|
|
|
|
|
|
|
/** registration cache */
|
|
|
|
mca_rcache_base_module_t *rcache;
|
|
|
|
};
|
|
|
|
typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t;
|
|
|
|
|
|
|
|
extern mca_btl_ofi_module_t mca_btl_ofi_module_template;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @brief OFI BTL component
|
|
|
|
*/
|
|
|
|
struct mca_btl_ofi_component_t {
|
|
|
|
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
|
|
|
|
|
|
|
/** number of TL modules */
|
|
|
|
int module_count;
|
2018-06-07 19:33:12 +03:00
|
|
|
int num_contexts_per_module;
|
2018-06-01 23:53:53 +03:00
|
|
|
int num_cqe_read;
|
2018-06-25 22:23:57 +03:00
|
|
|
int progress_threshold;
|
2018-08-03 22:30:03 +03:00
|
|
|
int mode;
|
|
|
|
int rd_num;
|
|
|
|
bool two_sided_enabled;
|
2018-06-01 23:53:53 +03:00
|
|
|
|
|
|
|
size_t namelen;
|
|
|
|
|
|
|
|
/** All BTL OFI modules (1 per tl) */
|
|
|
|
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
|
|
|
|
|
|
|
|
};
|
|
|
|
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
|
|
|
|
|
|
|
|
OPAL_MODULE_DECLSPEC extern mca_btl_ofi_component_t mca_btl_ofi_component;
|
|
|
|
|
|
|
|
struct mca_btl_base_registration_handle_t {
|
|
|
|
uint64_t rkey;
|
|
|
|
void *desc;
|
|
|
|
void *base_addr;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct mca_btl_ofi_reg_t {
|
|
|
|
mca_rcache_base_registration_t base;
|
|
|
|
struct fid_mr *ur_mr;
|
|
|
|
|
|
|
|
/* remote handle */
|
|
|
|
mca_btl_base_registration_handle_t handle;
|
|
|
|
};
|
|
|
|
typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t;
|
|
|
|
|
|
|
|
OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t);
|
|
|
|
|
2018-08-03 22:30:03 +03:00
|
|
|
struct mca_btl_ofi_header_t {
|
|
|
|
mca_btl_base_tag_t tag;
|
|
|
|
size_t len;
|
|
|
|
};
|
|
|
|
typedef struct mca_btl_ofi_header_t mca_btl_ofi_header_t;
|
|
|
|
|
|
|
|
struct mca_btl_ofi_base_frag_t {
|
|
|
|
mca_btl_base_descriptor_t base;
|
|
|
|
mca_btl_base_segment_t segments[2];
|
|
|
|
|
|
|
|
int context_id;
|
|
|
|
struct mca_btl_ofi_module_t *btl;
|
|
|
|
struct mca_btl_base_endpoint_t *endpoint;
|
|
|
|
opal_free_list_t *free_list;
|
|
|
|
mca_btl_ofi_header_t hdr;
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct mca_btl_ofi_base_frag_t mca_btl_ofi_base_frag_t;
|
|
|
|
|
|
|
|
OBJ_CLASS_DECLARATION(mca_btl_ofi_base_frag_t);
|
|
|
|
|
|
|
|
|
|
|
|
struct mca_btl_ofi_completion_context_t {
|
|
|
|
struct fi_context ctx;
|
|
|
|
void *comp;
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct mca_btl_ofi_completion_context_t mca_btl_ofi_completion_context_t;
|
|
|
|
|
2018-06-01 23:53:53 +03:00
|
|
|
/* completion structure store information needed
|
|
|
|
* for RDMA callbacks */
|
2018-08-03 22:30:03 +03:00
|
|
|
struct mca_btl_ofi_base_completion_t {
|
2018-06-01 23:53:53 +03:00
|
|
|
opal_free_list_item_t comp_list;
|
2018-08-03 22:30:03 +03:00
|
|
|
|
2018-06-01 23:53:53 +03:00
|
|
|
opal_free_list_t *my_list;
|
|
|
|
|
|
|
|
struct mca_btl_base_module_t *btl;
|
|
|
|
struct mca_btl_base_endpoint_t *endpoint;
|
2018-06-07 19:33:12 +03:00
|
|
|
struct mca_btl_ofi_context_t *my_context;
|
2018-08-03 22:30:03 +03:00
|
|
|
int type;
|
|
|
|
};
|
|
|
|
typedef struct mca_btl_ofi_base_completion_t mca_btl_ofi_base_completion_t;
|
2018-06-01 23:53:53 +03:00
|
|
|
|
2018-08-03 22:30:03 +03:00
|
|
|
struct mca_btl_ofi_rdma_completion_t {
|
|
|
|
mca_btl_ofi_base_completion_t base;
|
|
|
|
mca_btl_ofi_completion_context_t comp_ctx;
|
2018-06-01 23:53:53 +03:00
|
|
|
void *local_address;
|
|
|
|
mca_btl_base_registration_handle_t *local_handle;
|
|
|
|
|
|
|
|
uint64_t operand;
|
|
|
|
uint64_t compare;
|
|
|
|
|
|
|
|
mca_btl_base_rdma_completion_fn_t cbfunc;
|
|
|
|
void *cbcontext;
|
|
|
|
void *cbdata;
|
2018-08-03 22:30:03 +03:00
|
|
|
};
|
|
|
|
typedef struct mca_btl_ofi_rdma_completion_t mca_btl_ofi_rdma_completion_t;
|
2018-06-01 23:53:53 +03:00
|
|
|
|
2018-08-03 22:30:03 +03:00
|
|
|
struct mca_btl_ofi_frag_completion_t {
|
|
|
|
mca_btl_ofi_base_completion_t base;
|
|
|
|
mca_btl_ofi_completion_context_t comp_ctx;
|
|
|
|
mca_btl_ofi_base_frag_t *frag;
|
2018-06-01 23:53:53 +03:00
|
|
|
};
|
2018-08-03 22:30:03 +03:00
|
|
|
typedef struct mca_btl_ofi_frag_completion_t mca_btl_ofi_frag_completion_t;
|
2018-06-01 23:53:53 +03:00
|
|
|
|
2018-08-03 22:30:03 +03:00
|
|
|
OBJ_CLASS_DECLARATION(mca_btl_ofi_rdma_completion_t);
|
|
|
|
OBJ_CLASS_DECLARATION(mca_btl_ofi_frag_completion_t);
|
2018-06-01 23:53:53 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Initiate an asynchronous put.
|
|
|
|
* Completion Semantics: if this function returns a 1 then the operation
|
|
|
|
* is complete. a return of OPAL_SUCCESS indicates
|
|
|
|
* the put operation has been queued with the
|
|
|
|
* network. the local_handle can not be deregistered
|
|
|
|
* until all outstanding operations on that handle
|
|
|
|
* have been completed.
|
|
|
|
*
|
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param endpoint (IN) BTL addressing information
|
|
|
|
* @param local_address (IN) Local address to put from (registered)
|
|
|
|
* @param remote_address (IN) Remote address to put to (registered remotely)
|
|
|
|
* @param local_handle (IN) Registration handle for region containing
|
|
|
|
* (local_address, local_address + size)
|
|
|
|
* @param remote_handle (IN) Remote registration handle for region containing
|
|
|
|
* (remote_address, remote_address + size)
|
|
|
|
* @param size (IN) Number of bytes to put
|
|
|
|
* @param flags (IN) Flags for this put operation
|
|
|
|
* @param order (IN) Ordering
|
|
|
|
* @param cbfunc (IN) Function to call on completion (if queued)
|
|
|
|
* @param cbcontext (IN) Context for the callback
|
|
|
|
* @param cbdata (IN) Data for callback
|
|
|
|
*
|
|
|
|
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
|
|
|
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
|
|
|
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
|
|
|
* operation. Try again later
|
|
|
|
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
|
|
|
* alignment restrictions.
|
|
|
|
*/
|
|
|
|
int mca_btl_ofi_put (struct mca_btl_base_module_t *btl,
|
|
|
|
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
|
|
|
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
|
|
|
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
|
|
|
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Initiate an asynchronous get.
|
|
|
|
* Completion Semantics: if this function returns a 1 then the operation
|
|
|
|
* is complete. a return of OPAL_SUCCESS indicates
|
|
|
|
* the get operation has been queued with the
|
|
|
|
* network. the local_handle can not be deregistered
|
|
|
|
* until all outstanding operations on that handle
|
|
|
|
* have been completed.
|
|
|
|
*
|
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param endpoint (IN) BTL addressing information
|
|
|
|
* @param local_address (IN) Local address to put from (registered)
|
|
|
|
* @param remote_address (IN) Remote address to put to (registered remotely)
|
|
|
|
* @param local_handle (IN) Registration handle for region containing
|
|
|
|
* (local_address, local_address + size)
|
|
|
|
* @param remote_handle (IN) Remote registration handle for region containing
|
|
|
|
* (remote_address, remote_address + size)
|
|
|
|
* @param size (IN) Number of bytes to put
|
|
|
|
* @param flags (IN) Flags for this put operation
|
|
|
|
* @param order (IN) Ordering
|
|
|
|
* @param cbfunc (IN) Function to call on completion (if queued)
|
|
|
|
* @param cbcontext (IN) Context for the callback
|
|
|
|
* @param cbdata (IN) Data for callback
|
|
|
|
*
|
|
|
|
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
|
|
|
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
|
|
|
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
|
|
|
* operation. Try again later
|
|
|
|
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
|
|
|
* alignment restrictions.
|
|
|
|
*/
|
|
|
|
int mca_btl_ofi_get (struct mca_btl_base_module_t *btl,
|
|
|
|
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
|
|
|
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
|
|
|
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
|
|
|
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
|
|
|
|
int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
|
|
|
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
|
|
|
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
|
|
|
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
|
|
|
|
int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
|
|
|
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
|
|
|
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
|
|
|
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
|
|
|
void *cbcontext, void *cbdata);
|
|
|
|
|
|
|
|
int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
|
|
|
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
|
|
|
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
|
|
|
|
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
|
|
|
|
|
|
|
|
|
|
|
int mca_btl_ofi_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
|
|
|
|
|
|
|
|
int mca_btl_ofi_finalize (mca_btl_base_module_t *btl);
|
|
|
|
|
|
|
|
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module);
|
|
|
|
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
|
|
|
|
mca_rcache_base_registration_t *reg);
|
|
|
|
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
|
|
|
|
|
2018-06-07 19:33:12 +03:00
|
|
|
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
|
2018-08-03 22:30:03 +03:00
|
|
|
|
|
|
|
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode);
|
|
|
|
|
|
|
|
int mca_btl_ofi_post_recvs(mca_btl_base_module_t* module, mca_btl_ofi_context_t *context, int count);
|
2018-06-01 23:53:53 +03:00
|
|
|
void mca_btl_ofi_exit(void);
|
|
|
|
|
2018-06-07 19:33:12 +03:00
|
|
|
/* thread atomics */
|
|
|
|
static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context)
|
|
|
|
{
|
|
|
|
return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
|
|
|
|
{
|
|
|
|
while (mca_btl_ofi_context_trylock(context));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
|
|
|
|
{
|
|
|
|
opal_atomic_mb();
|
|
|
|
context->lock = 0;
|
|
|
|
}
|
|
|
|
|
2018-06-01 23:53:53 +03:00
|
|
|
END_C_DECLS
|
|
|
|
#endif
|