diff --git a/opal/mca/btl/ofi/Makefile.am b/opal/mca/btl/ofi/Makefile.am new file mode 100644 index 0000000000..fdaeec865d --- /dev/null +++ b/opal/mca/btl/ofi/Makefile.am @@ -0,0 +1,62 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2013 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2018 Intel, inc. All rights reserved +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +#dist_opaldata_DATA = help-mpi-btl-ofi.txt + +AM_CPPFLAGS = $(opal_common_ofi_CPPFLAGS) +sources = \ + btl_ofi.h \ + btl_ofi_component.c \ + btl_ofi_endpoint.h \ + btl_ofi_endpoint.c \ + btl_ofi_module.c \ + btl_ofi_rdma.h \ + btl_ofi_rdma.c \ + btl_ofi_atomics.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). 
+ +if MCA_BUILD_opal_btl_ofi_DSO +lib = +lib_sources = +component = mca_btl_ofi.la +component_sources = $(sources) +else +lib = libmca_btl_ofi.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component) +mca_btl_ofi_la_SOURCES = $(component_sources) +mca_btl_ofi_la_LDFLAGS = -module -avoid-version \ + $(opal_common_ofi_LDFLAGS) +mca_btl_ofi_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \ + $(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la + +noinst_LTLIBRARIES = $(lib) +libmca_btl_ofi_la_SOURCES = $(lib_sources) +libmca_btl_ofi_la_LDFLAGS = -module -avoid-version $(opal_common_ofi_LDFLAGS) diff --git a/opal/mca/btl/ofi/README b/opal/mca/btl/ofi/README new file mode 100644 index 0000000000..97e3759830 --- /dev/null +++ b/opal/mca/btl/ofi/README @@ -0,0 +1,88 @@ +======================================== +Design notes on BTL/OFI +======================================== + +This is the RDMA only btl based on OFI Libfabric. The goal is to enable RDMA +with multiple vendor hardware through one interface. Most of the operations are +managed by upper layer (osc/rdma). This BTL is mostly doing the low level work. + +Tested providers: sockets,psm2,ugni + +======================================== + +Component + +This BTL is requesting libfabric version 1.5 API and will not support older versions. + +The required capabilities of this BTL is FI_ATOMIC and FI_RMA with the endpoint type +of FI_EP_RDM only. This BTL does NOT support libfabric provider that requires local +memory registration (FI_MR_LOCAL). + +BTL/OFI will initialize a module with ONLY the first compatible info returned from OFI. +This means it will rely on OFI provider to do load balancing. The support for multiple +device might be added later. + +The BTL creates only one endpoint and one CQ. 
+ +======================================== + +Memory Registration + +Open MPI has a system in place to exchange remote addresses and always uses the remote +virtual address to refer to a piece of memory. However, some libfabric providers might +not support the use of virtual addresses and will instead use zero-based offset addressing. + +FI_MR_VIRT_ADDR is the flag that determines this behavior. mca_btl_ofi_reg_mem() handles +this by storing the base address in the registration handle in case the provider does not +support FI_MR_VIRT_ADDR. This base address will be used to calculate the offset later in +RDMA/Atomic operations. + +The BTL will try to use the address of the registration handle as the key. However, if the +provider supports FI_MR_PROV_KEY, it will use the provider-provided key. Either case works. + +The BTL does not register the local operand or compare value. This is why this BTL does not support +FI_MR_LOCAL and will allocate every buffer before registering. This means FI_MR_ALLOCATED +is supported. To be explicit: + +Supported MR mode bits (will work with or without): + enum: + - FI_MR_BASIC + - FI_MR_SCALABLE + + mode bits: + - FI_MR_VIRT_ADDR + - FI_MR_ALLOCATED + - FI_MR_PROV_KEY + +The BTL does NOT support (will not work with): + - FI_MR_LOCAL + - FI_MR_MMU_NOTIFY + - FI_MR_RMA_EVENT + - FI_MR_ENDPOINT + +Just a reminder, in libfabric API 1.5... +FI_MR_BASIC == (FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_VIRT_ADDR) + +======================================== + +Completions + +Every operation in this BTL is asynchronous. The completion handling will occur in +mca_btl_ofi_component_progress() where we read the CQ with the completion context and +execute the callback functions. The completions are local. No remote completion event is +generated as local completion already guarantees global completion. + +The BTL keeps track of the number of outstanding operations and provides a flush interface. 
+ +======================================== + +Sockets Provider + +Sockets provider is the proof of concept provider for libfabric. It is supposed to support +all the OFI API with emulations. This provider is considered very slow and bound to raise +problems that we might not see from other faster providers. + +Known Problems: + - sockets provider uses a progress thread and can cause a segfault in finalize as we free + the resources while the progress thread is still using them. sleep(1) was put in + mca_btl_ofi_component_close() for this reason. diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h new file mode 100644 index 0000000000..02e44fd8b3 --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi.h @@ -0,0 +1,311 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2018 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * Copyright (c) 2018 Intel, Inc, All rights reserved + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_BTL_OFI_H +#define MCA_BTL_OFI_H + +#include "opal_config.h" +#include +#include + +/* Open MPI includes */ +#include "opal/mca/event/event.h" +#include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/mpool/mpool.h" +#include "opal/mca/btl/base/btl_base_error.h" +#include "opal/mca/rcache/base/base.h" +#include "opal/mca/pmix/pmix.h" + +#include +#include +#include +#include +#include +#include + +BEGIN_C_DECLS +#define MCA_BTL_OFI_MAX_MODULES 16 +#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128 +#define MCA_BTL_OFI_NUM_CQE_READ 64 +#define MCA_BTL_OFI_PROGRESS_THRESHOLD 64 + +#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args) + +enum mca_btl_ofi_type { + MCA_BTL_OFI_TYPE_PUT = 1, + MCA_BTL_OFI_TYPE_GET, + MCA_BTL_OFI_TYPE_AOP, + MCA_BTL_OFI_TYPE_AFOP, + MCA_BTL_OFI_TYPE_CSWAP, + MCA_BTL_OFI_TYPE_TOTAL +}; + +struct mca_btl_ofi_context_t { + int32_t context_id; + + /* transmit context */ + struct fid_ep *tx_ctx; + struct fid_ep *rx_ctx; + + /* completion queue */ + struct fid_cq *cq; + + /* completion info freelist */ + /* We have it per context to reduce the thread contention + * on the freelist. Things can get really slow. 
*/ + opal_free_list_t comp_list; + + /* for thread locking */ + volatile int32_t lock; +}; +typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t; + +/** + * @brief OFI BTL module + */ +struct mca_btl_ofi_module_t { + /** base BTL interface */ + mca_btl_base_module_t super; + + /* libfabric components */ + struct fi_info *fabric_info; + struct fid_fabric *fabric; + struct fid_domain *domain; + struct fid_ep *ofi_endpoint; + struct fid_av *av; + + int num_contexts; + mca_btl_ofi_context_t *contexts; + + char *linux_device_name; + + /** whether the module has been fully initialized or not */ + bool initialized; + bool use_virt_addr; + bool is_scalable_ep; + + int64_t outstanding_rdma; + + /** linked list of BTL endpoints. this list is never searched so + * there is no need for a complicated structure here at this time*/ + opal_list_t endpoints; + + opal_mutex_t module_lock; + + /** registration cache */ + mca_rcache_base_module_t *rcache; +}; +typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t; + +extern mca_btl_ofi_module_t mca_btl_ofi_module_template; + +/** + * @brief OFI BTL component + */ +struct mca_btl_ofi_component_t { + mca_btl_base_component_3_0_0_t super; /**< base BTL component */ + + /** number of TL modules */ + int module_count; + int num_contexts_per_module; + int num_cqe_read; + int progress_threshold; + + size_t namelen; + + /** All BTL OFI modules (1 per tl) */ + mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES]; + +}; +typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t; + +OPAL_MODULE_DECLSPEC extern mca_btl_ofi_component_t mca_btl_ofi_component; + +struct mca_btl_base_registration_handle_t { + uint64_t rkey; + void *desc; + void *base_addr; +}; + +struct mca_btl_ofi_reg_t { + mca_rcache_base_registration_t base; + struct fid_mr *ur_mr; + + /* remote handle */ + mca_btl_base_registration_handle_t handle; +}; +typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t; + +OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t); + +/* 
completion structure store information needed + * for RDMA callbacks */ +struct mca_btl_ofi_completion_t { + opal_free_list_item_t comp_list; + opal_free_list_t *my_list; + + struct mca_btl_base_module_t *btl; + struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_ofi_context_t *my_context; + uint32_t type; + + void *local_address; + mca_btl_base_registration_handle_t *local_handle; + + /* information for atomic op */ + uint64_t operand; + uint64_t compare; + + mca_btl_base_rdma_completion_fn_t cbfunc; + void *cbcontext; + void *cbdata; + +}; +typedef struct mca_btl_ofi_completion_t mca_btl_ofi_completion_t; + +OBJ_CLASS_DECLARATION(mca_btl_ofi_completion_t); + +/** + * Initiate an asynchronous put. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the put operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to put from (registered) + * @param remote_address (IN) Remote address to put to (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to put + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a put + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or + * alignment restrictions. + */ +int mca_btl_ofi_put (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +/** + * Initiate an asynchronous get. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the get operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. 
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to get into (registered) + * @param remote_address (IN) Remote address to get from (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to get + * @param flags (IN) Flags for this get operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a get + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a get + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the get + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Get can not be performed due to size or + * alignment restrictions. 
+ */ +int mca_btl_ofi_get (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); + +int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + + +int mca_btl_ofi_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint); + +int mca_btl_ofi_finalize (mca_btl_base_module_t *btl); + +void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module); +int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, + mca_rcache_base_registration_t *reg); +int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg); + +int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context); +void 
mca_btl_ofi_exit(void); + +/* thread atomics: lock() spins while trylock() returns true, i.e. a true result means the lock was NOT taken */ +static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context) +{ + return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1)); +} + +static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context) +{ + while (mca_btl_ofi_context_trylock(context)); +} + +static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context) +{ + opal_atomic_mb(); + context->lock = 0; +} + +END_C_DECLS +#endif diff --git a/opal/mca/btl/ofi/btl_ofi_atomics.c b/opal/mca/btl/ofi/btl_ofi_atomics.c new file mode 100644 index 0000000000..e5364ed648 --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi_atomics.c @@ -0,0 +1,202 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Intel, Inc, All rights reserved + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include <rdma/fi_atomic.h> +#include "btl_ofi_rdma.h" + +/* Map a BTL atomic op to its libfabric equivalent (ADD -> FI_SUM, SWAP -> FI_ATOMIC_WRITE); any other op aborts via MCA_BTL_OFI_ABORT(). */ +static inline int to_fi_op(mca_btl_base_atomic_op_t op) +{ + switch (op) { + case MCA_BTL_ATOMIC_ADD: + return FI_SUM; + case MCA_BTL_ATOMIC_SWAP: + return FI_ATOMIC_WRITE; + default: + BTL_ERROR(("Unknown or unsupported atomic op.")); + MCA_BTL_OFI_ABORT(); + + /* just to squash the warning */ + return OPAL_ERROR; + } +} + +/* Post a fetching atomic (op mapped by to_fi_op()) with fi_fetch_atomic(); local_address/local_handle are passed as the result buffer. The datatype is FI_UINT64, or FI_UINT32 when flags has MCA_BTL_ATOMIC_FLAG_32BIT. Returns OPAL_SUCCESS once posted and OPAL_ERR_OUT_OF_RESOURCE on -FI_EAGAIN; any other negative rc aborts. */ +int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + int rc; + int fi_datatype = FI_UINT64; + int fi_op; + + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_completion_t *comp = NULL; + 
mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); + + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + fi_datatype = FI_UINT32; + } + + fi_op = to_fi_op(op); + + comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, + local_address, + local_handle, + cbfunc, cbcontext, cbdata, + MCA_BTL_OFI_TYPE_AFOP); + + /* copy the operand because it might get freed from upper layer */ + comp->operand = (uint64_t) operand; + + remote_address = (remote_address - (uint64_t) remote_handle->base_addr); + + rc = fi_fetch_atomic(ofi_context->tx_ctx, + (void*) &comp->operand, 1, NULL, /* operand */ + local_address, local_handle->desc, /* results */ + btl_endpoint->peer_addr, /* remote addr */ + remote_address, remote_handle->rkey, /* remote buffer */ + fi_datatype, fi_op, comp); + + if (rc == -FI_EAGAIN) { + /* nothing was queued: give comp back to the free list it came from (my_list, presumably set by mca_btl_ofi_completion_alloc) instead of leaking it on every retry */ + opal_free_list_return(comp->my_list, &comp->comp_list); + return OPAL_ERR_OUT_OF_RESOURCE; + } else if (rc < 0) { + BTL_ERROR(("fi_fetch_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc))); + MCA_BTL_OFI_ABORT(); + } + + MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + + return OPAL_SUCCESS; +} + +/* Post a non-fetching atomic (op mapped by to_fi_op()) with fi_atomic(); no local buffer is involved. The datatype is FI_UINT64, or FI_UINT32 when flags has MCA_BTL_ATOMIC_FLAG_32BIT. Returns OPAL_SUCCESS once posted and OPAL_ERR_OUT_OF_RESOURCE on -FI_EAGAIN; any other negative rc aborts. */ +int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc; + int fi_datatype = FI_UINT64; + int fi_op; + + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); + + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + fi_datatype = FI_UINT32; + } + + fi_op = to_fi_op(op); + + comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, + NULL, + NULL, + cbfunc, cbcontext, cbdata, + MCA_BTL_OFI_TYPE_AOP); + + /* copy the operand because it might get freed 
from upper layer */ + comp->operand = (uint64_t) operand; + + remote_address = (remote_address - (uint64_t) remote_handle->base_addr); + + rc = fi_atomic(ofi_context->tx_ctx, + (void*) &comp->operand, 1, NULL, /* operand */ + btl_endpoint->peer_addr, /* remote addr */ + remote_address, remote_handle->rkey, /* remote buffer */ + fi_datatype, fi_op, comp); + + if (rc == -FI_EAGAIN) { + /* nothing was queued: give comp back to the free list it came from (my_list, presumably set by mca_btl_ofi_completion_alloc) instead of leaking it on every retry */ + opal_free_list_return(comp->my_list, &comp->comp_list); + return OPAL_ERR_OUT_OF_RESOURCE; + } else if (rc < 0) { + BTL_ERROR(("fi_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc))); + MCA_BTL_OFI_ABORT(); + } + + MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + + return OPAL_SUCCESS; +} + +/* Post an atomic compare-and-swap (FI_CSWAP) with fi_compare_atomic(); value and compare are copied into the completion, and local_address/local_handle are passed as the result buffer. The datatype is FI_UINT64, or FI_UINT32 when flags has MCA_BTL_ATOMIC_FLAG_32BIT. Returns OPAL_SUCCESS once posted and OPAL_ERR_OUT_OF_RESOURCE on -FI_EAGAIN; any other negative rc aborts. */ +int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc; + int fi_datatype = FI_UINT64; + + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_completion_t *comp = NULL; + mca_btl_ofi_context_t *ofi_context; + + ofi_context = get_ofi_context(ofi_btl); + + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + fi_datatype = FI_UINT32; + } + + comp = mca_btl_ofi_completion_alloc(btl, endpoint, + ofi_context, + local_address, + local_handle, + cbfunc, cbcontext, cbdata, + MCA_BTL_OFI_TYPE_CSWAP); + + /* copy the operand because it might get freed from upper layer */ + comp->operand = (uint64_t) value; + comp->compare = (uint64_t) compare; + + remote_address = (remote_address - (uint64_t) remote_handle->base_addr); + + /* perform atomic */ + rc = fi_compare_atomic(ofi_context->tx_ctx, + (void*) &comp->operand, 1, NULL, + (void*) &comp->compare, NULL, + local_address, local_handle->desc, + btl_endpoint->peer_addr, + remote_address, 
remote_handle->rkey, + fi_datatype, + FI_CSWAP, + comp); + + if (rc == -FI_EAGAIN) { + /* nothing was queued: give comp back to the free list it came from (my_list, presumably set by mca_btl_ofi_completion_alloc) instead of leaking it on every retry */ + opal_free_list_return(comp->my_list, &comp->comp_list); + return OPAL_ERR_OUT_OF_RESOURCE; + } else if (rc < 0) { + BTL_ERROR(("fi_compare_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc))); + MCA_BTL_OFI_ABORT(); + } + + MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c new file mode 100644 index 0000000000..1ee541afb3 --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -0,0 +1,681 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Intel, Inc, All rights reserved + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "opal_config.h" + +#include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/hwloc/base/base.h" + +#include <string.h> + +#include "btl_ofi.h" +#include "btl_ofi_endpoint.h" +#include "btl_ofi_rdma.h" + +#define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC) +#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) + +static char *prov_include; +static char *prov_exclude; +static char *ofi_progress_mode; +static bool disable_sep; +static int mca_btl_ofi_init_device(struct fi_info *info); + +/* validate information returned from fi_getinfo(). 
+ * return OPAL_ERROR if we don't have what we need: FI_RMA|FI_ATOMIC caps, an FI_EP_RDM endpoint, a supported MR mode and FI_DELIVERY_COMPLETE in the tx op_flags. */ +static int validate_info(struct fi_info *info) +{ + int mr_mode; + + BTL_VERBOSE(("validating device: %s", info->domain_attr->name)); + + /* we need exactly all the required bits */ + if ((info->caps & MCA_BTL_OFI_REQUIRED_CAPS) != MCA_BTL_OFI_REQUIRED_CAPS) { + BTL_VERBOSE(("unsupported caps")); + return OPAL_ERROR; + } + + /* we need FI_EP_RDM */ + if (info->ep_attr->type != FI_EP_RDM) { + BTL_VERBOSE(("unsupported EP type")); + return OPAL_ERROR; + } + + mr_mode = info->domain_attr->mr_mode; + + if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE || + (mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) { + BTL_VERBOSE(("unsupported MR mode")); + return OPAL_ERROR; + } + + if (!(info->tx_attr->op_flags & FI_DELIVERY_COMPLETE)) { /* test the bit with '&'; the former '|' was always non-zero, so this check could never fail */ + BTL_VERBOSE(("the endpoint tx_ctx does not support FI_DELIVERY_COMPLETE")); + return OPAL_ERROR; + } + + BTL_VERBOSE(("device: %s is good to go.", info->domain_attr->name)); + return OPAL_SUCCESS; +} + +/* Register the MCA parameters */ +static int mca_btl_ofi_component_register(void) +{ + mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template; + + /* fi_getinfo with prov_name == NULL means ALL provider. + * Since now we are using the first valid info returned, I'm not sure + * if we need to provide the support for comma limited provider list. */ + prov_include = NULL; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "provider_include", + "OFI provider that ofi btl will query for. This parameter only " + "accept ONE provider name. " + "(e.g., \"psm2\"; an empty value means that all providers will " + "be considered.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &prov_include); + + /* TODO: this param has not been implemented. Not sure if we need it. 
" */ + prov_exclude = NULL; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "provider_exclude", + "Comma-delimited list of OFI providers that are not considered for use " + "(default: \"sockets,mxm\"; empty value means that all providers will " + " be considered). " + "Mutually exclusive with btl_ofi_provider_include.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &prov_exclude); + + mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "num_cq_read", + "Number of completion entries to read from a single cq_read. ", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_ofi_component.num_cqe_read); + + ofi_progress_mode = "unspec"; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "progress_mode", + "requested provider progress mode. [unspec, auto, manual]" + "(default: unspec)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &ofi_progress_mode); + + mca_btl_ofi_component.num_contexts_per_module = 1; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "num_contexts_per_module", + "number of communication context per module to create. " + "This should increase multithreaded performance but it is " + "advised that this number should be lower than total cores.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_ofi_component.num_contexts_per_module); + + disable_sep = false; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "disable_sep", + "force btl/ofi to never use scalable endpoint. 
", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &disable_sep); + + mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_PROGRESS_THRESHOLD; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "progress_threshold", + "number of outstanding operation before btl will progress " + "automatically. Tuning this might improve performance on " + "certain type of application.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_ofi_component.progress_threshold); + + /* for now we want this component to lose to btl/ugni and btl/vader */ + module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; + + return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version, + &module->super); +} + +static int mca_btl_ofi_component_open(void) +{ + mca_btl_ofi_component.module_count = 0; + return OPAL_SUCCESS; +} + +/* + * component cleanup - sanity checking of queue lengths + */ +static int mca_btl_ofi_component_close(void) +{ + /* If we don't sleep, sockets provider freaks out. */ + sleep(1); + return OPAL_SUCCESS; +} + +void mca_btl_ofi_exit(void) +{ + BTL_ERROR(("BTL OFI will now abort.")); + exit(1); +} + +/* + * OFI component initialization: + * read interface list from kernel and compare against component parameters + * then create a BTL instance for selected interfaces + */ + +static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, bool enable_progress_threads, + bool enable_mpi_threads) +{ + /* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */ + int rc; + uint64_t progress_mode; + unsigned resource_count = 0; + struct mca_btl_base_module_t **base_modules; + + BTL_VERBOSE(("initializing ofi btl")); + + /* Set up libfabric hints. */ + uint32_t libfabric_api; + libfabric_api = fi_version(); + + /* bail if OFI version is less than 1.5. 
*/ + if (libfabric_api < FI_VERSION(1, 5)) { + BTL_VERBOSE(("ofi btl disqualified because OFI version < 1.5.")); + return NULL; + } + + struct fi_info *info, *info_list; + struct fi_info hints = {0}; + struct fi_ep_attr ep_attr = {0}; + struct fi_rx_attr rx_attr = {0}; + struct fi_tx_attr tx_attr = {0}; + struct fi_fabric_attr fabric_attr = {0}; + struct fi_domain_attr domain_attr = {0}; + + /* Select the provider */ + fabric_attr.prov_name = prov_include; + + domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE; + + /* message progression mode. */ + if (!strcmp(ofi_progress_mode, "auto")) { + progress_mode = FI_PROGRESS_AUTO; + } else if (!strcmp(ofi_progress_mode, "manual")) { + progress_mode = FI_PROGRESS_MANUAL; + } else { + progress_mode = FI_PROGRESS_UNSPEC; + } + + domain_attr.control_progress = progress_mode; + domain_attr.data_progress = progress_mode; + + /* select endpoint type */ + ep_attr.type = FI_EP_RDM; + + /* ask for capabilities */ + hints.caps = MCA_BTL_OFI_REQUIRED_CAPS; + + /* Ask for completion context */ + hints.mode = FI_CONTEXT; + + hints.fabric_attr = &fabric_attr; + hints.domain_attr = &domain_attr; + hints.ep_attr = &ep_attr; + hints.tx_attr = &tx_attr; + hints.rx_attr = &rx_attr; + + /* for now */ + tx_attr.iov_limit = 1; + rx_attr.iov_limit = 1; + + tx_attr.op_flags = FI_DELIVERY_COMPLETE; + + mca_btl_ofi_component.module_count = 0; + + /* do the query. */ + rc = fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, &hints, &info_list); + if (0 != rc) { + BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc))); + return NULL; + } + + /* count the number of resources/ */ + info = info_list; + while(info) { + resource_count++; + info = info->next; + } + BTL_VERBOSE(("ofi btl found %d possible resources.", resource_count)); + + info = info_list; + + while(info) { + rc = validate_info(info); + if (OPAL_SUCCESS == rc) { + /* Device passed sanity check, let's make a module. 
+ * We only pick the first device we found valid */ + rc = mca_btl_ofi_init_device(info); + if (OPAL_SUCCESS == rc) + break; + } + info = info->next; + } + + /* We are done with the returned info. */ + fi_freeinfo(info_list); + + /* pass module array back to caller */ + base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules)); + if (NULL == base_modules) { + return NULL; + } + + memcpy(base_modules, mca_btl_ofi_component.modules, + mca_btl_ofi_component.module_count *sizeof (mca_btl_ofi_component.modules[0])); + + BTL_VERBOSE(("ofi btl initialization complete. found %d suitable transports", + mca_btl_ofi_component.module_count)); + + *num_btl_modules = mca_btl_ofi_component.module_count; + + return base_modules; +} + +static int mca_btl_ofi_init_device(struct fi_info *info) +{ + int rc; + int *module_count = &mca_btl_ofi_component.module_count; + size_t namelen; + size_t num_contexts_to_create; + + char *linux_device_name; + char ep_name[FI_NAME_MAX]; + + struct fi_info *ofi_info; + struct fi_ep_attr *ep_attr; + struct fi_domain_attr *domain_attr; + struct fi_av_attr av_attr = {0}; + struct fid_fabric *fabric = NULL; + struct fid_domain *domain = NULL; + struct fid_ep *ep = NULL; + struct fid_av *av = NULL; + + mca_btl_ofi_module_t *module; + + /* allocate module */ + module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t)); + if (NULL == module) { + BTL_ERROR(("failed to allocate memory for OFI module")); + goto fail; + } + *module = mca_btl_ofi_module_template; + + /* make a copy of the given info to store on the module */ + ofi_info = fi_dupinfo(info); + ep_attr = ofi_info->ep_attr; + domain_attr = ofi_info->domain_attr; + + linux_device_name = info->domain_attr->name; + BTL_VERBOSE(("initializing dev:%s provider:%s", + linux_device_name, + info->fabric_attr->prov_name)); + + /* fabric */ + rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_fabric with err=%s", + 
linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + /* domain */ + rc = fi_domain(fabric, ofi_info, &domain, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_domain with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + /* AV */ + av_attr.type = FI_AV_MAP; + rc = fi_av_open(domain, &av_attr, &av, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_av_open with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + num_contexts_to_create = mca_btl_ofi_component.num_contexts_per_module; + + /* If the domain support scalable endpoint. */ + if (domain_attr->max_ep_tx_ctx > 1 && !disable_sep) { + + BTL_VERBOSE(("btl/ofi using scalable endpoint.")); + + if (num_contexts_to_create > domain_attr->max_ep_tx_ctx) { + BTL_VERBOSE(("cannot create requested %u contexts. (node max=%zu)", + module->num_contexts, + domain_attr->max_ep_tx_ctx)); + goto fail; + } + + /* modify the info to let the provider know we are creating x contexts */ + ep_attr->tx_ctx_cnt = num_contexts_to_create; + ep_attr->rx_ctx_cnt = num_contexts_to_create; + + /* create scalable endpoint */ + rc = fi_scalable_ep(domain, ofi_info, &ep, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + module->num_contexts = num_contexts_to_create; + module->is_scalable_ep = true; + + /* create contexts */ + module->contexts = mca_btl_ofi_context_alloc_scalable(ofi_info, + domain, ep, av, + num_contexts_to_create); + + } else { + /* warn the user if they want more than 1 context */ + if (num_contexts_to_create > 1) { + BTL_ERROR(("cannot create %zu contexts as the provider does not support " + "scalable endpoint. 
Falling back to single context endpoint.", + num_contexts_to_create)); + } + + BTL_VERBOSE(("btl/ofi using normal endpoint.")); + + rc = fi_endpoint(domain, ofi_info, &ep, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_endpoint with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + module->num_contexts = 1; + module->is_scalable_ep = false; + + /* create contexts */ + module->contexts = mca_btl_ofi_context_alloc_normal(ofi_info, + domain, ep, av); + } + + if (NULL == module->contexts) { + /* error message is already printed */ + goto fail; + } + + /* enable the endpoint for using */ + rc = fi_enable(ep); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_enable with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + /* Everything succeeded, lets create a module for this device. */ + /* store the information. */ + module->fabric_info = ofi_info; + module->fabric = fabric; + module->domain = domain; + module->av = av; + module->ofi_endpoint = ep; + module->linux_device_name = linux_device_name; + module->outstanding_rdma = 0; + module->use_virt_addr = false; + + if (ofi_info->domain_attr->mr_mode == FI_MR_BASIC || + ofi_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) { + module->use_virt_addr = true; + } + + /* initialize the rcache */ + mca_btl_ofi_rcache_init(module); + + /* create endpoint list */ + OBJ_CONSTRUCT(&module->endpoints, opal_list_t); + OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t); + + /* create and send the modex for this device */ + namelen = sizeof(ep_name); + rc = fi_getname((fid_t)ep, &ep_name[0], &namelen); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_getname with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto fail; + } + + /* post our endpoint name so peer can use it to connect to us */ + OPAL_MODEX_SEND(rc, + OPAL_PMIX_GLOBAL, + &mca_btl_ofi_component.super.btl_version, + &ep_name, + namelen); + mca_btl_ofi_component.namelen = namelen; + + /* add this module to the list */ + 
mca_btl_ofi_component.modules[(*module_count)++] = module; + + return OPAL_SUCCESS; + +fail: + /* clean up */ + + /* if the contexts have not been initiated, num_contexts should + * be zero and we skip this. */ + for (int i=0; i < module->num_contexts; i++) { + mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep); + } + free(module->contexts); + + if (NULL != av) { + fi_close(&av->fid); + } + + if (NULL != ep) { + fi_close(&ep->fid); + } + + if (NULL != domain) { + fi_close(&domain->fid); + } + + if (NULL != fabric) { + fi_close(&fabric->fid); + } + free(module); + + /* not really a failure. just skip this device. */ + return OPAL_ERR_OUT_OF_RESOURCE; +} + +/** + * @brief OFI BTL progress function + * + * This function explictly progresses all workers. + */ +static int mca_btl_ofi_component_progress (void) +{ + int events = 0; + mca_btl_ofi_context_t *context; + + for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) { + mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i]; + + /* progress context we own first. */ + context = get_ofi_context(module); + + if (mca_btl_ofi_context_trylock(context)) { + events += mca_btl_ofi_context_progress(context); + mca_btl_ofi_context_unlock(context); + } + + /* if there is nothing to do, try progress other's. */ + if (events == 0) { + for (int j = 0 ; j < module->num_contexts ; j++ ) { + + context = get_ofi_context_rr(module); + + if (mca_btl_ofi_context_trylock(context)) { + events += mca_btl_ofi_context_progress(context); + mca_btl_ofi_context_unlock(context); + } + + /* If we did something, good enough. return now. + * This is crucial for performance/latency. 
*/ + if (events > 0) { + break; + } + } + } + } + + return events; +} + +int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) { + + int ret = 0; + int events_read; + int events = 0; + struct fi_cq_entry cq_entry[MCA_BTL_OFI_MAX_CQ_READ_ENTRIES]; + struct fi_cq_err_entry cqerr = {0}; + + mca_btl_ofi_completion_t *comp; + + ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); + + if (0 < ret) { + events_read = ret; + for (int i = 0; i < events_read; i++) { + if (NULL != cq_entry[i].op_context) { + ++events; + comp = (mca_btl_ofi_completion_t*) cq_entry[i].op_context; + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl; + + switch (comp->type) { + case MCA_BTL_OFI_TYPE_GET: + case MCA_BTL_OFI_TYPE_PUT: + case MCA_BTL_OFI_TYPE_AOP: + case MCA_BTL_OFI_TYPE_AFOP: + case MCA_BTL_OFI_TYPE_CSWAP: + + /* call the callback */ + if (comp->cbfunc) { + comp->cbfunc (comp->btl, comp->endpoint, + comp->local_address, comp->local_handle, + comp->cbcontext, comp->cbdata, OPAL_SUCCESS); + } + + /* return the completion handler */ + opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp); + + MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl); + break; + + default: + /* catasthrophic */ + BTL_ERROR(("unknown completion type")); + MCA_BTL_OFI_ABORT(); + } + } + } + } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { + ret = fi_cq_readerr(context->cq, &cqerr, 0); + + /* cq readerr failed!? */ + if (0 > ret) { + BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", + __FILE__, __LINE__, fi_strerror(-ret), ret)); + } else { + BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", + cqerr.prov_errno)); + } + MCA_BTL_OFI_ABORT(); + } +#ifdef FI_EINTR + /* sometimes, sockets provider complain about interupt. We do nothing. */ + else if (OPAL_UNLIKELY(ret == -FI_EINTR)) { + + } +#endif + /* If the error is not FI_EAGAIN, report the error and abort. 
*/ + else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) { + BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret))); + MCA_BTL_OFI_ABORT(); + } + + return events; +} + +/** OFI btl component */ +mca_btl_ofi_component_t mca_btl_ofi_component = { + .super = { + .btl_version = { + MCA_BTL_DEFAULT_VERSION("ofi"), + .mca_open_component = mca_btl_ofi_component_open, + .mca_close_component = mca_btl_ofi_component_close, + .mca_register_component_params = mca_btl_ofi_component_register, + }, + .btl_data = { + /* The component is not checkpoint ready */ + .param_field = MCA_BASE_METADATA_PARAM_NONE + }, + + .btl_init = mca_btl_ofi_component_init, + .btl_progress = mca_btl_ofi_component_progress, + }, +}; diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.c b/opal/mca/btl/ofi/btl_ofi_endpoint.c new file mode 100644 index 0000000000..0ef91a9b6f --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi_endpoint.c @@ -0,0 +1,343 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * Copyright (c) 2018 Intel, Inc, All rights reserved + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ofi.h" +#include "btl_ofi_endpoint.h" +#include "opal/util/proc.h" + +#if OPAL_HAVE_THREAD_LOCAL +opal_thread_local mca_btl_ofi_context_t *my_context = NULL; +#endif /* OPAL_HAVE_THREAD_LOCAL */ + +static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint) +{ + endpoint->peer_addr = 0; + OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t); +} + +static void mca_btl_ofi_endpoint_destruct (mca_btl_ofi_endpoint_t *endpoint) +{ + endpoint->peer_addr = 0; + + /* set to null, we will free ofi endpoint in module */ + endpoint->ofi_endpoint = NULL; + + OBJ_DESTRUCT(&endpoint->ep_lock); +} + +OBJ_CLASS_INSTANCE(mca_btl_ofi_endpoint_t, opal_list_item_t, + mca_btl_ofi_endpoint_construct, + mca_btl_ofi_endpoint_destruct); + +mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep) +{ + mca_btl_ofi_endpoint_t *endpoint = OBJ_NEW(mca_btl_ofi_endpoint_t); + + if (OPAL_UNLIKELY(NULL == endpoint)) { + return NULL; + } + + endpoint->ep_proc = proc; + endpoint->ofi_endpoint = ep; + + return (mca_btl_base_endpoint_t *) endpoint; +} + +int ofi_comp_list_init(opal_free_list_t *comp_list) +{ + int rc; + OBJ_CONSTRUCT(comp_list, opal_free_list_t); + rc = opal_free_list_init(comp_list, + sizeof(mca_btl_ofi_completion_t), + opal_cache_line_size, + OBJ_CLASS(mca_btl_ofi_completion_t), + 0, + 0, + 128, + -1, + 128, + NULL, + 0, + NULL, + NULL, + NULL); + if (rc != OPAL_SUCCESS) { + BTL_VERBOSE(("cannot allocate completion freelist")); + } + return rc; +} + +/* mca_btl_ofi_context_alloc_normal() + * + * This function will allocate an ofi_context, map the endpoint to tx/rx context, + * bind CQ,AV to the endpoint and initialize all the structure. 
+ * USE WITH NORMAL ENDPOINT ONLY */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *ep, + struct fid_av *av) +{ + int rc; + uint32_t cq_flags = FI_TRANSMIT; + char *linux_device_name = info->domain_attr->name; + + struct fi_cq_attr cq_attr = {0}; + + mca_btl_ofi_context_t *context; + + context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context)); + if (NULL == context) { + BTL_VERBOSE(("cannot allocate context")); + return NULL; + } + + /* Don't really need to check, just avoiding compiler warning because + * BTL_VERBOSE is a no op in performance build and the compiler will + * complain about unused variable. */ + if (NULL == linux_device_name) { + BTL_VERBOSE(("linux device name is NULL. This shouldn't happen.")); + goto single_fail; + } + + cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_WAIT_NONE; + rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_cq_open with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = fi_ep_bind(ep, (fid_t)av, 0); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto single_fail; + } + + rc = ofi_comp_list_init(&context->comp_list); + if (rc != OPAL_SUCCESS) { + goto single_fail; + } + + context->tx_ctx = ep; + context->rx_ctx = ep; + context->context_id = 0; + + return context; + +single_fail: + mca_btl_ofi_context_finalize(context, false); + return NULL; +} + +/* mca_btl_ofi_context_alloc_scalable() + * + * This function allocate communication contexts and return the pointer + * to the first btl context. It also take care of all the bindings needed. 
+ * USE WITH SCALABLE ENDPOINT ONLY */ +mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info, + struct fid_domain *domain, + struct fid_ep *sep, + struct fid_av *av, + size_t num_contexts) +{ + BTL_VERBOSE(("creating %zu contexts", num_contexts)); + + int rc; + size_t i; + char *linux_device_name = info->domain_attr->name; + + struct fi_cq_attr cq_attr = {0}; + struct fi_tx_attr tx_attr = {0}; + struct fi_rx_attr rx_attr = {0}; + + mca_btl_ofi_context_t *contexts; + tx_attr.op_flags = FI_DELIVERY_COMPLETE; + + contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts)); + if (NULL == contexts) { + BTL_VERBOSE(("cannot allocate communication contexts.")); + return NULL; + } + + /* Don't really need to check, just avoiding compiler warning because + * BTL_VERBOSE is a no op in performance build and the compiler will + * complain about unused variable. */ + if (NULL == linux_device_name) { + BTL_VERBOSE(("linux device name is NULL. This shouldn't happen.")); + goto scalable_fail; + } + + /* bind AV to endpoint */ + rc = fi_scalable_ep_bind(sep, (fid_t)av, 0); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + for (i=0; i < num_contexts; i++) { + rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_tx_context with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* We don't actually need a receiving context as we only do one-sided. + * However, sockets provider will hang if we dont have one. It is + * also nice to have equal number of tx/rx context. 
*/ + rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_rx_context with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* create CQ */ + cq_attr.format = FI_CQ_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_WAIT_NONE; + rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_cq_open with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* bind cq to transmit context */ + uint32_t cq_flags = (FI_TRANSMIT); + rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, cq_flags); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* enable the context. */ + rc = fi_enable(contexts[i].tx_ctx); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_enable with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + rc = fi_enable(contexts[i].rx_ctx); + if (0 != rc) { + BTL_VERBOSE(("%s failed fi_enable with err=%s", + linux_device_name, + fi_strerror(-rc) + )); + goto scalable_fail; + } + + /* initialize completion freelist. */ + rc = ofi_comp_list_init(&contexts[i].comp_list); + if (rc != OPAL_SUCCESS) { + goto scalable_fail; + } + + /* assign the id */ + contexts[i].context_id = i; + } + + return contexts; + +scalable_fail: + /* close and free */ + for(i=0; i < num_contexts; i++) { + mca_btl_ofi_context_finalize(&contexts[i], true); + } + free(contexts); + + return NULL; +} + +void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep) { + + /* if it is a scalable ep, we have to close all contexts. 
*/ + if (scalable_ep) { + if (NULL != context->tx_ctx) { + fi_close(&context->tx_ctx->fid); + } + + if (NULL != context->rx_ctx) { + fi_close(&context->rx_ctx->fid); + } + } + + if( NULL != context->cq) { + fi_close(&context->cq->fid); + } + + /* Can we destruct the object that hasn't been constructed? */ + OBJ_DESTRUCT(&context->comp_list); +} + +/* Get a context to use for communication. + * If TLS is supported, it will use the cached endpoint. + * If not, it will invoke the normal round-robin assignment. */ +mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl) +{ +#if OPAL_HAVE_THREAD_LOCAL + /* With TLS, we cache the context we use. */ + static volatile int64_t cur_num = 0; + + if (OPAL_UNLIKELY(my_context == NULL)) { + OPAL_THREAD_LOCK(&btl->module_lock); + + my_context = &btl->contexts[cur_num]; + cur_num = (cur_num + 1) %btl->num_contexts; + + OPAL_THREAD_UNLOCK(&btl->module_lock); + } + + assert (my_context); + return my_context; +#else + return get_ofi_context_rr(btl); +#endif +} + +/* return the context in a round-robin. */ +/* There is no need for atomics here as it might hurt the performance. */ +mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl) +{ + static volatile uint64_t rr_num = 0; + return &btl->contexts[rr_num++%btl->num_contexts]; +} diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.h b/opal/mca/btl/ofi/btl_ofi_endpoint.h new file mode 100644 index 0000000000..aad758d8c8 --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi_endpoint.h @@ -0,0 +1,75 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. 
#ifndef MCA_BTL_OFI_ENDPOINT_H
#define MCA_BTL_OFI_ENDPOINT_H

#include "opal/class/opal_list.h"
#include "opal/mca/event/event.h"

#include "btl_ofi.h"

BEGIN_C_DECLS

#if OPAL_HAVE_THREAD_LOCAL
/** per-thread context pointer (thread-local storage) */
extern opal_thread_local mca_btl_ofi_context_t *my_context;
#endif /* OPAL_HAVE_THREAD_LOCAL */

/**
 * BTL endpoint: one per peer process known to a module.
 */
struct mca_btl_base_endpoint_t {
    /** list item base class */
    opal_list_item_t super;

    /** libfabric endpoint used to reach the peer */
    struct fid_ep *ofi_endpoint;
    /** libfabric address of the peer */
    fi_addr_t peer_addr;

    /** endpoint proc */
    opal_proc_t *ep_proc;

    /** mutex to protect this structure */
    opal_mutex_t ep_lock;
};

typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);

/* completion free list setup */
int ofi_comp_list_init(opal_free_list_t *comp_list);

/* endpoint creation for a peer process */
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);

/* contexts */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
                                                          struct fid_domain *domain,
                                                          struct fid_ep *sep,
                                                          struct fid_av *av,
                                                          size_t num_contexts);

mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
                                                        struct fid_domain *domain,
                                                        struct fid_ep *ep,
                                                        struct fid_av *av);
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep);

/* context selection */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl);
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl);

END_C_DECLS
#endif
Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Intel, Inc, All rights reserved + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" +#include +#include "opal/class/opal_bitmap.h" +#include "opal/mca/btl/btl.h" +#include "opal/datatype/opal_convertor.h" +#include "opal/mca/mpool/base/base.h" +#include "opal/mca/mpool/mpool.h" + +#include "btl_ofi.h" +#include "btl_ofi_endpoint.h" + +static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl, + size_t nprocs, opal_proc_t **opal_procs, + mca_btl_base_endpoint_t **peers, + opal_bitmap_t *reachable) +{ + int rc; + int count; + char *ep_name = NULL; + size_t namelen = mca_btl_ofi_component.namelen; + + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + + for (size_t i = 0 ; i < nprocs ; ++i) { + peers[i] = mca_btl_ofi_endpoint_create (opal_procs[i], ofi_btl->ofi_endpoint); + if (OPAL_UNLIKELY(NULL == peers[i])) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version, + &peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen); + if (OPAL_SUCCESS != rc) { + BTL_ERROR(("error receiving modex")); + MCA_BTL_OFI_ABORT(); + } + + /* get peer fi_addr */ + count = fi_av_insert(ofi_btl->av, /* Address vector to insert */ + ep_name, /* peer name */ + 1, /* amount to insert */ + 
&peers[i]->peer_addr, /* return peer address here */ + 0, /* flags */ + NULL); /* context */ + + /* if succeed, add this proc and mark reachable */ + if (count == 1) { /* we inserted 1 address. */ + opal_list_append (&ofi_btl->endpoints, &peers[i]->super); + opal_bitmap_set_bit(reachable, i); + } else { + BTL_VERBOSE(("fi_av_insert failed with rc = %d", count)); + MCA_BTL_OFI_ABORT(); + } + } + + return OPAL_SUCCESS; +} + +static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs, + opal_proc_t **procs, mca_btl_base_endpoint_t **peers) +{ + int ret; + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + + for (size_t i = 0 ; i < nprocs ; ++i) { + if (peers[i]) { + + /* remove the address from AV. */ + ret = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0); + if (ret < 0) { + /* remove failed. this should not happen. */ + /* Lets not crash because we failed to remove an address. */ + BTL_ERROR(("fi_av_remove failed with error %d:%s", + ret, fi_strerror(-ret))); + } + + /* remove and free MPI endpoint from the list. 
*/ + opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super); + OBJ_RELEASE(peers[i]); + } + } + + return OPAL_SUCCESS; +} + +void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module) +{ + if (!module->initialized) { + mca_rcache_base_resources_t rcache_resources; + char *tmp; + + (void) asprintf (&tmp, "ofi.%s", module->linux_device_name); + + rcache_resources.cache_name = tmp; + rcache_resources.reg_data = (void *) module; + rcache_resources.sizeof_reg = sizeof (mca_btl_ofi_reg_t); + rcache_resources.register_mem = mca_btl_ofi_reg_mem; + rcache_resources.deregister_mem = mca_btl_ofi_dereg_mem; + + module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources); + free (tmp); + + if (NULL == module->rcache) { + /* something when horribly wrong */ + BTL_ERROR(("cannot create rcache")); + MCA_BTL_OFI_ABORT(); + } + + module->initialized = true; + } +} + + +/** + * @brief Register a memory region for put/get/atomic operations. + * + * @param btl (IN) BTL module + * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) + * @param base (IN) Pointer to start of region + * @param size (IN) Size of region + * @param flags (IN) Flags indicating what operation will be performed. Valid + * values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET, + * and MCA_BTL_DES_FLAGS_ATOMIC + * + * @returns a memory registration handle valid for both local and remote operations + * @returns NULL if the region could not be registered + * + * This function registers the specified region with the hardware for use with + * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop + * functions. Care should be taken to not hold an excessive number of registrations + * as they may use limited system/NIC resources. 
+ */ +static struct mca_btl_base_registration_handle_t * +mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags) +{ + mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_reg_t *reg; + int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; + int rc; + + rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags, + (mca_rcache_base_registration_t **) ®); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + return NULL; + } + + return ®->handle; +} + +/** + * @brief Deregister a memory region + * + * @param btl (IN) BTL module region was registered with + * @param handle (IN) BTL registration handle to deregister + * + * This function deregisters the memory region associated with the specified handle. Care + * should be taken to not perform any RDMA or atomic operation on this memory region + * after it is deregistered. It is erroneous to specify a memory handle associated with + * a remote node. 
+ */ +static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) +{ + mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_reg_t *reg = + (mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle)); + + (void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, ®->base); + + return OPAL_SUCCESS; +} + +int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg) +{ + int rc; + static uint64_t access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE; + + mca_btl_ofi_module_t *btl = (mca_btl_ofi_module_t*) reg_data; + mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*) reg; + + rc = fi_mr_reg(btl->domain, base, size, access_flags, 0, + (uint64_t) reg, 0, &ur->ur_mr, NULL); + if (0 != rc) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + ur->handle.rkey = fi_mr_key(ur->ur_mr); + ur->handle.desc = fi_mr_desc(ur->ur_mr); + + /* In case the provider doesn't support FI_MR_VIRT_ADDR, + * we need to reference the remote address by the distance from base registered + * address. We keep this information to use in rdma/atomic operations. */ + if (btl->use_virt_addr) { + ur->handle.base_addr = 0; + } else { + ur->handle.base_addr = base; + } + + return OPAL_SUCCESS; +} + +int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg) +{ + mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*)reg; + + if (ur->ur_mr != NULL) { + if (0 != fi_close(&ur->ur_mr->fid)) { + BTL_ERROR(("%s: error unpinning memory mr=%p: %s", + __func__, (void*) ur->ur_mr, strerror(errno))); + return OPAL_ERROR; + } + } + + return OPAL_SUCCESS; +} + +/* + * Cleanup/release module resources. 
+ */ + +int mca_btl_ofi_finalize (mca_btl_base_module_t* btl) +{ + int i; + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_endpoint_t *endpoint, *next; + + assert(btl); + + /* loop over all the contexts */ + for (i=0; i < ofi_btl->num_contexts; i++) { + mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep); + } + free(ofi_btl->contexts); + + if (NULL != ofi_btl->av) { + fi_close(&ofi_btl->av->fid); + } + + if (NULL != ofi_btl->ofi_endpoint) { + fi_close(&ofi_btl->ofi_endpoint->fid); + } + + if (NULL != ofi_btl->domain) { + fi_close(&ofi_btl->domain->fid); + } + + if (NULL != ofi_btl->fabric) { + fi_close(&ofi_btl->fabric->fid); + } + + if (NULL != ofi_btl->fabric_info) { + fi_freeinfo(ofi_btl->fabric_info); + } + + /* clean up any leftover endpoints */ + OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) { + opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super); + OBJ_RELEASE(endpoint); + } + + OBJ_DESTRUCT(&ofi_btl->endpoints); + + if (ofi_btl->rcache) { + mca_rcache_base_module_destroy (ofi_btl->rcache); + } + + free (btl); + + return OPAL_SUCCESS; +} + +mca_btl_ofi_module_t mca_btl_ofi_module_template = { + .super = { + /* initialize functions. this btl only support RDMA and atomics + * for now so it does not provide prepare_src, alloc, free, or send */ + .btl_component = &mca_btl_ofi_component.super, + .btl_add_procs = mca_btl_ofi_add_procs, + .btl_del_procs = mca_btl_ofi_del_procs, + .btl_finalize = mca_btl_ofi_finalize, + .btl_put = mca_btl_ofi_put, + .btl_get = mca_btl_ofi_get, + .btl_register_mem = mca_btl_ofi_register_mem, + .btl_deregister_mem = mca_btl_ofi_deregister_mem, + .btl_atomic_op = mca_btl_ofi_aop, + .btl_atomic_fop = mca_btl_ofi_afop, + .btl_atomic_cswap = mca_btl_ofi_acswap, + .btl_flush = mca_btl_ofi_flush, + + /* set the default flags for this btl. 
ofi provides us with rdma and both + * fetching and non-fetching atomics (though limited to add and cswap) */ + .btl_flags = MCA_BTL_FLAGS_RDMA | + MCA_BTL_FLAGS_ATOMIC_FOPS | + MCA_BTL_FLAGS_ATOMIC_OPS, + + .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | + MCA_BTL_ATOMIC_SUPPORTS_SWAP | + MCA_BTL_ATOMIC_SUPPORTS_CSWAP | + MCA_BTL_ATOMIC_SUPPORTS_32BIT, + + /* set the default limits on put and get */ + .btl_registration_handle_size = sizeof(mca_btl_base_registration_handle_t), + .btl_put_limit = 1 << 23, + .btl_put_alignment = 0, + .btl_get_limit = 1 << 23, + .btl_get_alignment = 0, + } +}; diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.c b/opal/mca/btl/ofi/btl_ofi_rdma.c new file mode 100644 index 0000000000..9a545038a4 --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi_rdma.c @@ -0,0 +1,156 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Intel, Inc, All rights reserved + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ofi_rdma.h" + +OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t, + opal_free_list_item_t, + NULL, + NULL); + +mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc ( + mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + mca_btl_ofi_context_t *ofi_context, + void *local_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata, + int type) +{ + assert(btl); + assert(endpoint); + assert(ofi_context); + + mca_btl_ofi_completion_t *comp; + + comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_context->comp_list); + assert(comp); + + comp->btl = btl; + comp->endpoint = endpoint; + comp->my_context = ofi_context; + comp->local_address = local_address; + comp->local_handle = local_handle; + comp->cbfunc = cbfunc; + comp->cbcontext = cbcontext; + comp->cbdata = cbdata; + comp->my_list = 
&ofi_context->comp_list;
+    comp->type = type;
+
+    return comp;
+}
+
+/**
+ * Post an RDMA read (get) from a peer's memory into a local buffer.
+ *
+ * A completion object is obtained from mca_btl_ofi_completion_alloc() using
+ * the local buffer and the callback arguments, and is handed to fi_read() as
+ * the operation context.  The remote address is converted into an offset
+ * relative to remote_handle->base_addr before it is passed to fi_read().
+ *
+ * The flags and order arguments are not used by this implementation.
+ *
+ * @return OPAL_SUCCESS once the read has been posted.
+ * @return OPAL_ERR_OUT_OF_RESOURCE if fi_read() returned -FI_EAGAIN; the
+ *         completion object is released first, so the caller may simply
+ *         retry.  Any other fi_read() error is logged and passed to
+ *         MCA_BTL_OFI_ABORT().
+ */
+int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
+                     uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                     mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+    int rc;
+
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
+    mca_btl_ofi_completion_t *comp;
+    mca_btl_ofi_context_t *ofi_context;
+
+    ofi_context = get_ofi_context(ofi_btl);
+
+    /* create completion context */
+    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
+                                        ofi_context,
+                                        local_address,
+                                        local_handle,
+                                        cbfunc, cbcontext, cbdata,
+                                        MCA_BTL_OFI_TYPE_GET);
+
+    /* fi_read() is given an offset from the start of the remote region */
+    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
+
+    /* Remote read: pull the data across the wire into the local buffer */
+    rc = fi_read(ofi_context->tx_ctx,
+                 local_address, size,                       /* payload */
+                 local_handle->desc,
+                 btl_endpoint->peer_addr,
+                 remote_address, remote_handle->rkey,
+                 comp);                                     /* completion context */
+
+    if (-FI_EAGAIN == rc) {
+        /* Nothing was posted, so no completion event will ever hand comp
+         * back; return it to the context's list here instead of leaking it.
+         * NOTE(review): assumes comp was taken from ofi_context->comp_list
+         * by mca_btl_ofi_completion_alloc() - confirm against the allocator. */
+        opal_free_list_return(&ofi_context->comp_list, (opal_free_list_item_t *) comp);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (0 != rc) {
+        BTL_ERROR(("fi_read failed with %d:%s", rc, fi_strerror(-rc)));
+        MCA_BTL_OFI_ABORT();
+    }
+
+    /* count the posted operation (may also trigger progress) */
+    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * Post an RDMA write (put) from a local buffer into a peer's memory.
+ *
+ * Mirrors mca_btl_ofi_get(): a completion object is obtained from
+ * mca_btl_ofi_completion_alloc(), the remote address is turned into an
+ * offset relative to remote_handle->base_addr, and the transfer is posted
+ * with fi_write().
+ *
+ * The flags and order arguments are not used by this implementation.
+ *
+ * @return OPAL_SUCCESS once the write has been posted.
+ * @return OPAL_ERR_OUT_OF_RESOURCE if fi_write() returned -FI_EAGAIN; the
+ *         completion object is released first, so the caller may simply
+ *         retry.  Any other fi_write() error is logged and passed to
+ *         MCA_BTL_OFI_ABORT().
+ */
+int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
+                     uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                     mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+    int rc;
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
+    mca_btl_ofi_completion_t *comp;
+    mca_btl_ofi_context_t *ofi_context;
+
+    ofi_context = get_ofi_context(ofi_btl);
+
+    /* create completion context */
+    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
+                                        ofi_context,
+                                        local_address,
+                                        local_handle,
+                                        cbfunc, cbcontext, cbdata,
+                                        MCA_BTL_OFI_TYPE_PUT);
+
+    /* fi_write() is given an offset from the start of the remote region */
+    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
+
+    /* Remote write data across the wire */
+    rc = fi_write(ofi_context->tx_ctx,
+                  local_address, size,                      /* payload */
+                  local_handle->desc,
+                  btl_endpoint->peer_addr,
+                  remote_address, remote_handle->rkey,
+                  comp);                                    /* completion context */
+
+    if (-FI_EAGAIN == rc) {
+        /* Nothing was posted, so no completion event will ever hand comp
+         * back; return it to the context's list here instead of leaking it.
+         * NOTE(review): assumes comp was taken from ofi_context->comp_list
+         * by mca_btl_ofi_completion_alloc() - confirm against the allocator. */
+        opal_free_list_return(&ofi_context->comp_list, (opal_free_list_item_t *) comp);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (0 != rc) {
+        BTL_ERROR(("fi_write failed with %d:%s", rc, fi_strerror(-rc)));
+        MCA_BTL_OFI_ABORT();
+    }
+
+    /* count the posted operation (may also trigger progress) */
+    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
+
+    return OPAL_SUCCESS;
+
+}
+
+/**
+ * Wait until the module's outstanding_rdma counter has dropped to zero.
+ *
+ * The component progress function is called in a loop for as long as the
+ * counter is positive.  The endpoint argument is not used: the wait covers
+ * every operation counted on the module, not just those to one peer.
+ *
+ * @return OPAL_SUCCESS always.
+ */
+int mca_btl_ofi_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint)
+{
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+
+    while(ofi_btl->outstanding_rdma > 0) {
+        (void) mca_btl_ofi_component.super.btl_progress();
+    }
+
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.h b/opal/mca/btl/ofi/btl_ofi_rdma.h
new file mode 100644
index 0000000000..3de4245439
--- /dev/null
+++ b/opal/mca/btl/ofi/btl_ofi_rdma.h
@@ -0,0 +1,42 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2018      Intel, Inc, All rights reserved
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef BTL_OFI_RDMA_H
+#define BTL_OFI_RDMA_H
+
+#include "opal/threads/thread_usage.h"
+
+#include "btl_ofi.h"
+#include "btl_ofi_endpoint.h"
+
+/*
+ * Obtain a completion object for one operation of the given type (the
+ * callers in btl_ofi_rdma.c pass MCA_BTL_OFI_TYPE_GET / MCA_BTL_OFI_TYPE_PUT).
+ * The returned object is what those callers hand to libfabric as the
+ * per-operation context.
+ */
+mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
+                                         mca_btl_base_module_t *btl,
+                                         mca_btl_base_endpoint_t *endpoint,
+                                         mca_btl_ofi_context_t *ofi_context,
+                                         void *local_address,
+                                         mca_btl_base_registration_handle_t *local_handle,
+                                         mca_btl_base_rdma_completion_fn_t cbfunc,
+                                         void *cbcontext, void *cbdata,
+                                         int type);
+
+/*
+ * Count one newly posted operation on `module`: add 1 to its
+ * outstanding_rdma counter with OPAL_THREAD_ADD_FETCH64 and, if the counter
+ * is then above mca_btl_ofi_component.progress_threshold, call the component
+ * progress function.
+ *
+ * Wrapped in do { } while (0) so the macro behaves as a single statement
+ * (safe under an unbraced if/else); use it with a trailing semicolon.
+ * `module` is evaluated more than once, so pass an expression without
+ * side effects.
+ */
+#define MCA_BTL_OFI_NUM_RDMA_INC(module)                                            \
+    do {                                                                            \
+        OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1);                    \
+        if ((module)->outstanding_rdma >                                            \
+            mca_btl_ofi_component.progress_threshold) {                             \
+            mca_btl_ofi_component.super.btl_progress();                             \
+        }                                                                           \
+    } while (0)
+
+/*
+ * Count one finished operation on `module`: add -1 to its outstanding_rdma
+ * counter with OPAL_THREAD_ADD_FETCH64.  Note that the expansion already
+ * ends in a semicolon.
+ */
+#define MCA_BTL_OFI_NUM_RDMA_DEC(module)                                            \
+    OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1);
+
+#endif /* !defined(BTL_OFI_RDMA_H) */
+
diff --git a/opal/mca/btl/ofi/configure.m4 b/opal/mca/btl/ofi/configure.m4
new file mode 100644
index 0000000000..222a7b29e0
--- /dev/null
+++ b/opal/mca/btl/ofi/configure.m4
@@ -0,0 +1,51 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2006 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2006      QLogic Corp. All rights reserved.
+# Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2011-2018 Los Alamos National Security, LLC.
+#                         All rights reserved.
+# Copyright (c) 2018      Intel, inc.
All rights reserved
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_opal_btl_ofi_CONFIG([action-if-can-compile], [action-if-cant-compile])
+# --------------------------------------------------------
+# Requires the common/ofi configury; if it reports OFI support, also
+# checks that <rdma/fabric.h> declares FI_MR_VIRT_ADDR.  Runs the first
+# action when that check passes, otherwise the second.
+
+AC_DEFUN([MCA_opal_btl_ofi_CONFIG],[
+    OPAL_VAR_SCOPE_PUSH([opal_btl_ofi_happy CPPFLAGS_save])
+
+    AC_CONFIG_FILES([opal/mca/btl/ofi/Makefile])
+
+    AC_REQUIRE([MCA_opal_common_ofi_CONFIG])
+
+    opal_btl_ofi_happy=0
+    AS_IF([test "$opal_common_ofi_happy" = "yes"],
+          [CPPFLAGS_save=$CPPFLAGS
+           CPPFLAGS="$opal_common_ofi_CPPFLAGS $CPPFLAGS"
+           AC_CHECK_DECL([FI_MR_VIRT_ADDR], [opal_btl_ofi_happy=1], [],
+                         [#include <rdma/fabric.h>])
+           CPPFLAGS=$CPPFLAGS_save])
+    AS_IF([test $opal_btl_ofi_happy -eq 1],
+          [$1],
+          [$2])
+
+    OPAL_VAR_SCOPE_POP
+])dnl
diff --git a/opal/mca/btl/ofi/owner.txt b/opal/mca/btl/ofi/owner.txt
new file mode 100644
index 0000000000..f58f1cbab7
--- /dev/null
+++ b/opal/mca/btl/ofi/owner.txt
@@ -0,0 +1,7 @@
+#
+# owner/status file
+# owner: institution that is responsible for this package
+# status: e.g. active, maintenance, unmaintained
+#
+owner:Intel
+status:active