diff --git a/opal/mca/btl/ofi/Makefile.am b/opal/mca/btl/ofi/Makefile.am
new file mode 100644
index 0000000000..6f25d3cecd
--- /dev/null
+++ b/opal/mca/btl/ofi/Makefile.am
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2009-2014 Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2013      NVIDIA Corporation.  All rights reserved.
+# Copyright (c) 2017      IBM Corporation.  All rights reserved.
+# Copyright (c) 2018      Intel, inc. All rights reserved
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#dist_opaldata_DATA = help-mpi-btl-ofi.txt
+
+AM_CPPFLAGS = $(opal_common_ofi_CPPFLAGS)
+sources = \
+    btl_ofi.h \
+    btl_ofi_component.c \
+    btl_ofi_endpoint.h \
+    btl_ofi_endpoint.c \
+    btl_ofi_module.c \
+    btl_ofi_rdma.h \
+    btl_ofi_rdma.c \
+    btl_ofi_atomics.c
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).  Here <type>_<name> is btl_ofi.
+
+if MCA_BUILD_opal_btl_ofi_DSO
+lib =
+lib_sources =
+component = mca_btl_ofi.la
+component_sources = $(sources)
+else
+lib = libmca_btl_ofi.la
+lib_sources = $(sources)
+component =
+component_sources =
+endif
+
+mcacomponentdir = $(opallibdir)
+mcacomponent_LTLIBRARIES = $(component)
+mca_btl_ofi_la_SOURCES = $(component_sources)
+mca_btl_ofi_la_LDFLAGS = -module -avoid-version \
+    $(opal_btl_ofi_LDFLAGS)
+mca_btl_ofi_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
+    $(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la
+
+noinst_LTLIBRARIES = $(lib)
+libmca_btl_ofi_la_SOURCES = $(lib_sources)
+libmca_btl_ofi_la_LDFLAGS = -module -avoid-version $(opal_btl_ofi_LDFLAGS)
diff --git a/opal/mca/btl/ofi/README b/opal/mca/btl/ofi/README
new file mode 100644
index 0000000000..97e3759830
--- /dev/null
+++ b/opal/mca/btl/ofi/README
@@ -0,0 +1,88 @@
+========================================
+Design notes on BTL/OFI
+========================================
+
+This is the RDMA only btl based on OFI Libfabric. The goal is to enable RDMA
+with multiple vendor hardware through one interface. Most of the operations are
+managed by upper layer (osc/rdma). This BTL is mostly doing the low level work.
+
+Tested providers: sockets,psm2,ugni
+
+========================================
+
+Component
+
+This BTL is requesting libfabric version 1.5 API and will not support older versions.
+
+The required capabilities of this BTL is FI_ATOMIC and FI_RMA with the endpoint type
+of FI_EP_RDM only. This BTL does NOT support libfabric provider that requires local
+memory registration (FI_MR_LOCAL).
+
+BTL/OFI will initialize a module with ONLY the first compatible info returned from OFI.
+This means it will rely on OFI provider to do load balancing. The support for multiple
+device might be added later.
+
+The BTL creates only one endpoint and one CQ.
+
+========================================
+
+Memory Registration
+
+Open MPI has a system in place to exchange remote addresses and always uses the remote
+virtual address to refer to a piece of memory. However, some libfabric providers might
+not support the use of virtual addresses and instead use zero-based offset addressing.
+
+FI_MR_VIRT_ADDR is the flag that determines this behavior. mca_btl_ofi_reg_mem() handles
+this by storing the base address in the registration handle when the provider does not
+support FI_MR_VIRT_ADDR. This base address is used to calculate the offset later in
+RDMA/Atomic operations.
+
+The BTL will try to use the address of the registration handle as the key. However, if the
+provider supports FI_MR_PROV_KEY, it will use the provider-provided key. Either case works.
+
+The BTL does not register the local operand or compare buffers. This is why this BTL does not
+support FI_MR_LOCAL and will allocate every buffer before registering. This means
+FI_MR_ALLOCATED is supported. To be explicit:
+
+Supported MR mode bits (will work with or without):
+    enum:
+    - FI_MR_BASIC
+    - FI_MR_SCALABLE
+
+    mode bits:
+    - FI_MR_VIRT_ADDR
+    - FI_MR_ALLOCATED
+    - FI_MR_PROV_KEY
+
+The BTL does NOT support (will not work with):
+    - FI_MR_LOCAL
+    - FI_MR_MMU_NOTIFY
+    - FI_MR_RMA_EVENT
+    - FI_MR_ENDPOINT
+
+Just a reminder, in libfabric API 1.5...
+FI_MR_BASIC == (FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_VIRT_ADDR)
+
+========================================
+
+Completions
+
+Every operation in this BTL is asynchronous. The completion handling occurs in
+mca_btl_ofi_component_progress(), where we read the CQ with the completion context and
+execute the callback functions. The completions are local. No remote completion event is
+generated, as local completion already guarantees global completion.
+
+The BTL keeps track of the number of outstanding operations and provides a flush interface.
+
+========================================
+
+Sockets Provider
+
+The sockets provider is the proof-of-concept provider for libfabric. It is supposed to support
+the whole OFI API through emulation. This provider is considered very slow and is bound to
+expose problems that we might not see with other, faster providers.
+
+Known Problems:
+    - The sockets provider uses a progress thread and can cause a segfault in finalize, as we
+      free the resources while the progress thread is still using them. sleep(1) was put in
+      mca_btl_ofi_component_close() for this reason.
diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h
new file mode 100644
index 0000000000..ca96e415c7
--- /dev/null
+++ b/opal/mca/btl/ofi/btl_ofi.h
@@ -0,0 +1,275 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2018 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2018      Intel, Inc, All rights reserved
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/**
+ * @file
+ * Shared declarations for the OFI (libfabric) RDMA/atomics BTL: module and
+ * component state, registration handle, completion record and the public
+ * put/get/atomic entry points.
+ */
+#ifndef MCA_BTL_OFI_H
+#define MCA_BTL_OFI_H
+
+#include "opal_config.h"
+#include  /* NOTE(review): header name is missing in this copy of the patch (the <...> part appears stripped); restore before building */
+#include
+
+/* Open MPI includes */
+#include "opal/mca/event/event.h"
+#include "opal/mca/btl/btl.h"
+#include "opal/mca/btl/base/base.h"
+#include "opal/mca/mpool/mpool.h"
+#include "opal/mca/btl/base/btl_base_error.h"
+#include "opal/mca/rcache/base/base.h"
+#include "opal/mca/pmix/pmix.h"
+
+#include  /* NOTE(review): six header names missing here as well; presumably the libfabric headers -- confirm against the upstream file */
+#include
+#include
+#include
+#include
+#include
+
+BEGIN_C_DECLS
+
+#define MCA_BTL_OFI_MAX_MODULES 16   /* capacity of mca_btl_ofi_component_t::modules */
+#define MCA_BTL_OFI_MAX_WORKERS 1
+#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128   /* size of the CQ entry array used by the progress function */
+
+#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
+
+enum mca_btl_ofi_type {
+    MCA_BTL_OFI_TYPE_PUT = 1,
+    MCA_BTL_OFI_TYPE_GET,
+    MCA_BTL_OFI_TYPE_AOP,
+    MCA_BTL_OFI_TYPE_AFOP,
+    MCA_BTL_OFI_TYPE_CSWAP,
+    MCA_BTL_OFI_TYPE_TOTAL
+};
+
+/**
+ * @brief OFI BTL module
+ */
+struct mca_btl_ofi_module_t {
+    /** base BTL interface */
+    mca_btl_base_module_t super;
+
+    /* libfabric components */
+    struct fi_info *fabric_info;   /* private fi_dupinfo() copy of the selected info */
+    struct fid_fabric *fabric;
+    struct fid_domain *domain;
+    struct fid_ep *ofi_endpoint;
+    struct fid_cq *cq;
+    struct fid_av *av;
+
+    char *linux_device_name;
+
+    /** whether the module has been fully initialized or not */
+    bool initialized;
+    bool use_virt_addr;   /* set when mr_mode is FI_MR_BASIC or includes FI_MR_VIRT_ADDR */
+
+    /** spin-lock to protect the module */
+    volatile int32_t lock;
+
+    int64_t outstanding_rdma;   /* incremented when an operation is posted, decremented on its completion */
+
+    /** linked list of BTL endpoints. this list is never searched so
+     * there is no need for a complicated structure here at this time*/
+    opal_list_t endpoints;
+
+    /* free lists */
+    opal_free_list_t comp_list;   /* pool of mca_btl_ofi_completion_t items */
+
+    /** registration cache */
+    mca_rcache_base_module_t *rcache;
+};
+typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t;
+
+extern mca_btl_ofi_module_t mca_btl_ofi_module_template;
+
+/**
+ * @brief OFI BTL component
+ */
+struct mca_btl_ofi_component_t {
+    mca_btl_base_component_3_0_0_t super;  /**< base BTL component */
+
+    /** number of TL modules */
+    int module_count;
+    int num_cqe_read;   /* count passed to fi_cq_read(); MCA parameter "num_cq_read" */
+
+    size_t namelen;   /* length of the endpoint name returned by fi_getname() */
+
+    /** All BTL OFI modules (1 per tl) */
+    mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
+
+#if OPAL_C_HAVE__THREAD_LOCAL
+    /** bind threads to contexts */
+    bool bind_threads_to_contexts;
+#endif
+};
+typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
+
+OPAL_MODULE_DECLSPEC extern mca_btl_ofi_component_t mca_btl_ofi_component;
+
+struct mca_btl_base_registration_handle_t {
+    uint64_t rkey;
+    void *desc;
+    void *base_addr;   /* subtracted from the remote address before each atomic operation */
+};
+
+struct mca_btl_ofi_reg_t {
+    mca_rcache_base_registration_t base;
+    struct fid_mr *ur_mr;
+
+    /* remote handle */
+    mca_btl_base_registration_handle_t handle;
+};
+typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t;
+
+OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t);
+
+/* completion structure: stores the information needed
+ * to run the RDMA/atomic callback when the operation completes */
+struct mca_btl_ofi_completion_t {
+    opal_free_list_item_t comp_list;
+    opal_free_list_t *my_list;   /* free list this item is returned to after its callback ran */
+
+    struct mca_btl_base_module_t *btl;
+    struct mca_btl_base_endpoint_t *endpoint;
+    uint32_t type;   /* one of enum mca_btl_ofi_type */
+
+    void *local_address;
+    mca_btl_base_registration_handle_t *local_handle;
+
+    /* information for atomic op: private copies handed to libfabric */
+    uint64_t operand;
+    uint64_t compare;
+
+    mca_btl_base_rdma_completion_fn_t cbfunc;
+    void *cbcontext;
+    void *cbdata;
+
+};
+typedef struct mca_btl_ofi_completion_t mca_btl_ofi_completion_t;
+
+OBJ_CLASS_DECLARATION(mca_btl_ofi_completion_t);
+
+/**
+ * Initiate an asynchronous put.
+ * Completion Semantics: if this function returns a 1 then the operation
+ *                       is complete. a return of OPAL_SUCCESS indicates
+ *                       the put operation has been queued with the
+ *                       network. the local_handle can not be deregistered
+ *                       until all outstanding operations on that handle
+ *                       have been completed.
+ *
+ * @param btl (IN)            BTL module
+ * @param endpoint (IN)       BTL addressing information
+ * @param local_address (IN)  Local address to put from (registered)
+ * @param remote_address (IN) Remote address to put to (registered remotely)
+ * @param local_handle (IN)   Registration handle for region containing
+ *                            (local_address, local_address + size)
+ * @param remote_handle (IN)  Remote registration handle for region containing
+ *                            (remote_address, remote_address + size)
+ * @param size (IN)           Number of bytes to put
+ * @param flags (IN)          Flags for this put operation
+ * @param order (IN)          Ordering
+ * @param cbfunc (IN)         Function to call on completion (if queued)
+ * @param cbcontext (IN)      Context for the callback
+ * @param cbdata (IN)         Data for callback
+ *
+ * @retval OPAL_SUCCESS    The descriptor was successfully queued for a put
+ * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a put
+ * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the put
+ *                         operation. Try again later
+ * @retval OPAL_ERR_NOT_AVAILABLE  Put can not be performed due to size or
+ *                         alignment restrictions.
+ */
+int mca_btl_ofi_put (struct mca_btl_base_module_t *btl,
+    struct mca_btl_base_endpoint_t *endpoint, void *local_address,
+    uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
+    struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+    int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
+
+/**
+ * Initiate an asynchronous get.
+ * Completion Semantics: if this function returns a 1 then the operation
+ *                       is complete. a return of OPAL_SUCCESS indicates
+ *                       the get operation has been queued with the
+ *                       network. the local_handle can not be deregistered
+ *                       until all outstanding operations on that handle
+ *                       have been completed.
+ *
+ * @param btl (IN)            BTL module
+ * @param endpoint (IN)       BTL addressing information
+ * @param local_address (IN)  Local address to get into (registered)
+ * @param remote_address (IN) Remote address to get from (registered remotely)
+ * @param local_handle (IN)   Registration handle for region containing
+ *                            (local_address, local_address + size)
+ * @param remote_handle (IN)  Remote registration handle for region containing
+ *                            (remote_address, remote_address + size)
+ * @param size (IN)           Number of bytes to get
+ * @param flags (IN)          Flags for this get operation
+ * @param order (IN)          Ordering
+ * @param cbfunc (IN)         Function to call on completion (if queued)
+ * @param cbcontext (IN)      Context for the callback
+ * @param cbdata (IN)         Data for callback
+ *
+ * @retval OPAL_SUCCESS    The descriptor was successfully queued for a get
+ * @retval OPAL_ERROR      The descriptor was NOT successfully queued for a get
+ * @retval OPAL_ERR_OUT_OF_RESOURCE  Insufficient resources to queue the get
+ *                         operation. Try again later
+ * @retval OPAL_ERR_NOT_AVAILABLE  Get can not be performed due to size or
+ *                         alignment restrictions.
+ */
+int mca_btl_ofi_get (struct mca_btl_base_module_t *btl,
+    struct mca_btl_base_endpoint_t *endpoint, void *local_address,
+    uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
+    struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+    int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
+
+int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
+                     uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
+                     mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
+                     mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
+
+int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
+                      void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                      mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
+                      uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
+                      void *cbcontext, void *cbdata);
+
+int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
+                        void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                        mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
+                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
+
+
+int mca_btl_ofi_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
+
+int mca_btl_ofi_finalize (mca_btl_base_module_t *btl);
+
+void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module);
+int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
+                         mca_rcache_base_registration_t *reg);
+int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
+
+void mca_btl_ofi_exit(void);
+
+END_C_DECLS
+#endif
diff --git
a/opal/mca/btl/ofi/btl_ofi_atomics.c b/opal/mca/btl/ofi/btl_ofi_atomics.c new file mode 100644 index 0000000000..7d83d5c2b2 --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi_atomics.c @@ -0,0 +1,180 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Intel, Inc, All rights reserved + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include "btl_ofi_rdma.h" + +static inline int to_fi_op(mca_btl_base_atomic_op_t op) +{ + switch (op) { + case MCA_BTL_ATOMIC_ADD: + return FI_SUM; + case MCA_BTL_ATOMIC_SWAP: + return FI_ATOMIC_WRITE; + default: + BTL_ERROR(("Unknown or unsupported atomic op.")); + MCA_BTL_OFI_ABORT(); + + /* just to squash the warning */ + return OPAL_ERROR; + } +} + +int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + int rc; + int fi_datatype = FI_UINT64; + int fi_op; + + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_completion_t *comp = NULL; + + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + fi_datatype = FI_UINT32; + } + + fi_op = to_fi_op(op); + + comp = mca_btl_ofi_completion_alloc(btl, endpoint, + local_address, + local_handle, + cbfunc, cbcontext, cbdata, + MCA_BTL_OFI_TYPE_AFOP); + + /* copy the operand because it might get freed from upper layer */ + comp->operand = (uint64_t) operand; + + remote_address = (remote_address - (uint64_t) remote_handle->base_addr); + + rc = fi_fetch_atomic(ofi_btl->ofi_endpoint, + (void*) 
&comp->operand, 1, NULL, /* operand */ + local_address, local_handle->desc, /* results */ + btl_endpoint->peer_addr, /* remote addr */ + remote_address, remote_handle->rkey, /* remote buffer */ + fi_datatype, fi_op, comp); + + if (rc == -FI_EAGAIN) { + return OPAL_ERR_OUT_OF_RESOURCE; + } else if (rc < 0) { + BTL_ERROR(("fi_fetch_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc))); + MCA_BTL_OFI_ABORT(); + } + + MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + + return OPAL_SUCCESS; +} + +int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc; + int fi_datatype = FI_UINT64; + int fi_op; + + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_completion_t *comp = NULL; + + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + fi_datatype = FI_UINT32; + } + + fi_op = to_fi_op(op); + + comp = mca_btl_ofi_completion_alloc(btl, endpoint, + NULL, + NULL, + cbfunc, cbcontext, cbdata, + MCA_BTL_OFI_TYPE_AOP); + + /* copy the operand because it might get freed from upper layer */ + comp->operand = (uint64_t) operand; + + remote_address = (remote_address - (uint64_t) remote_handle->base_addr); + + rc = fi_atomic(ofi_btl->ofi_endpoint, + (void*) &comp->operand, 1, NULL, /* operand */ + btl_endpoint->peer_addr, /* remote addr */ + remote_address, remote_handle->rkey, /* remote buffer */ + fi_datatype, fi_op, comp); + + if (rc == -FI_EAGAIN) { + return OPAL_ERR_OUT_OF_RESOURCE; + } else if (rc < 0) { + BTL_ERROR(("fi_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc))); + MCA_BTL_OFI_ABORT(); + } + + MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + + return OPAL_SUCCESS; +} + +int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct 
mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc; + int fi_datatype = FI_UINT64; + + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint; + mca_btl_ofi_completion_t *comp = NULL; + + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + fi_datatype = FI_UINT32; + } + + comp = mca_btl_ofi_completion_alloc(btl, endpoint, + local_address, + local_handle, + cbfunc, cbcontext, cbdata, + MCA_BTL_OFI_TYPE_CSWAP); + + /* copy the operand because it might get freed from upper layer */ + comp->operand = (uint64_t) value; + comp->compare = (uint64_t) compare; + + remote_address = (remote_address - (uint64_t) remote_handle->base_addr); + + /* perform atomic */ + rc = fi_compare_atomic(ofi_btl->ofi_endpoint, + (void*) &comp->operand, 1, NULL, + (void*) &comp->compare, NULL, + local_address, local_handle->desc, + btl_endpoint->peer_addr, + remote_address, remote_handle->rkey, + fi_datatype, + FI_CSWAP, + comp); + + if (rc == -FI_EAGAIN) { + return OPAL_ERR_OUT_OF_RESOURCE; + } else if (rc < 0) { + BTL_ERROR(("fi_compare_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc))); + MCA_BTL_OFI_ABORT(); + } + + MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl); + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c new file mode 100644 index 0000000000..7c0cab9152 --- /dev/null +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -0,0 +1,583 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2018 Intel, Inc, All rights reserved + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "opal_config.h" + +#include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/hwloc/base/base.h" + +#include + +#include "btl_ofi.h" +#include "btl_ofi_rdma.h" + + +#define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC) +#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_UNSPEC) + +static char *prov_include; +static char *prov_exclude; +static char *ofi_progress_mode; +static int mca_btl_ofi_init_device(struct fi_info *info); + +/* validate information returned from fi_getinfo(). + * return OPAL_ERROR if we dont have what we need. */ +static int validate_info(struct fi_info *info) +{ + int mr_mode; + + /* we need exactly all the required bits */ + if ((info->caps & MCA_BTL_OFI_REQUIRED_CAPS) != MCA_BTL_OFI_REQUIRED_CAPS) { + return OPAL_ERROR; + } + + /* we need FI_EP_RDM */ + if (info->ep_attr->type != FI_EP_RDM) { + return OPAL_ERROR; + } + + mr_mode = info->domain_attr->mr_mode; + + if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE || + (mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) { + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} + +/* Register the MCA parameters */ +static int mca_btl_ofi_component_register(void) +{ + mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template; + + /* fi_getinfo with prov_name == NULL means ALL provider. 
+ * Since now we are using the first valid info returned, I'm not sure + * if we need to provide the support for comma limited provider list. */ + prov_include = NULL; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "provider_include", + "OFI provider that ofi btl will query for. This parameter only " + "accept ONE provider name. " + "(e.g., \"psm2\"; an empty value means that all providers will " + "be considered.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &prov_include); + + /* TODO: this param has not been implemented. Not sure if we need it. " */ + prov_exclude = NULL; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "provider_exclude", + "Comma-delimited list of OFI providers that are not considered for use " + "(default: \"sockets,mxm\"; empty value means that all providers will " + " be considered). " + "Mutually exclusive with btl_ofi_provider_include.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, + &prov_exclude); + + /* Note: better leave it at 1 for now. osc rdma module is designed for 1 completion + * at a time. Dealing with more than 1 completion in 1 read will confuse the osc rdma. + * source: 8 hours of debugging. :(*/ + mca_btl_ofi_component.num_cqe_read = 1; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "num_cq_read", + "Number of completion entries to read from a single cq_read. " + "(default: 1)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_ofi_component.num_cqe_read); + + ofi_progress_mode = "unspec"; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "progress_mode", + "requested provider progress mode. 
[unspec, auto, manual]" + "(default: unspec)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &ofi_progress_mode); + +#if OPAL_C_HAVE__THREAD_LOCAL + mca_btl_ofi_component.bind_threads_to_contexts = true; + (void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "bind_threads_to_contexts", "Bind threads to device contexts. " + "In general this should improve the multi-threaded performance " + "when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL, + NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_ALL, + &mca_btl_ofi_component.bind_threads_to_contexts); +#endif + + /* for now we want this component to lose to btl/ugni and btl/vader */ + module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; + + return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version, + &module->super); +} + +static int mca_btl_ofi_component_open(void) +{ + mca_btl_ofi_component.module_count = 0; + return OPAL_SUCCESS; +} + + +/* + * component cleanup - sanity checking of queue lengths + */ +static int mca_btl_ofi_component_close(void) +{ + /* If we don't sleep, sockets provider freaks out. */ + sleep(1); + return OPAL_SUCCESS; +} + +void mca_btl_ofi_exit(void) +{ + BTL_ERROR(("BTL OFI will now abort.")); + exit(1); +} + +/* + * OFI component initialization: + * read interface list from kernel and compare against component parameters + * then create a BTL instance for selected interfaces + */ + +static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, bool enable_progress_threads, + bool enable_mpi_threads) +{ + /* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */ + int rc; + uint64_t progress_mode; + unsigned resource_count = 0; + struct mca_btl_base_module_t **base_modules; + + BTL_VERBOSE(("initializing ofi btl")); + + /* Set up libfabric hints. 
*/ + uint32_t libfabric_api; + libfabric_api = FI_VERSION(1, 5); /* 1.5 because of the newer API */ + + struct fi_info *info, *info_list; + struct fi_info hints = {0}; + struct fi_ep_attr ep_attr = {0}; + struct fi_rx_attr rx_attr = {0}; + struct fi_tx_attr tx_attr = {0}; + struct fi_fabric_attr fabric_attr = {0}; + struct fi_domain_attr domain_attr = {0}; + + /* Select the provider */ + fabric_attr.prov_name = prov_include; + + domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE; + + /* message progression mode. */ + if (!strcmp(ofi_progress_mode, "auto")) { + progress_mode = FI_PROGRESS_AUTO; + } else if (!strcmp(ofi_progress_mode, "manual")) { + progress_mode = FI_PROGRESS_MANUAL; + } else { + progress_mode = FI_PROGRESS_UNSPEC; + } + + domain_attr.control_progress = progress_mode; + domain_attr.data_progress = progress_mode; + + /* select endpoint type */ + ep_attr.type = FI_EP_RDM; + + /* ask for capabilities */ + hints.caps = MCA_BTL_OFI_REQUIRED_CAPS; + + hints.fabric_attr = &fabric_attr; + hints.domain_attr = &domain_attr; + hints.ep_attr = &ep_attr; + hints.tx_attr = &tx_attr; + hints.rx_attr = &rx_attr; + + /* for now */ + tx_attr.iov_limit = 1; + rx_attr.iov_limit = 1; + + mca_btl_ofi_component.module_count = 0; + + /* do the query. */ + rc = fi_getinfo(libfabric_api, NULL, NULL, 0, &hints, &info_list); + if (0 != rc) { + BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc))); + return NULL; + } + + /* count the number of resources/ */ + info = info_list; + while(info) { + resource_count++; + info = info->next; + } + BTL_VERBOSE(("ofi btl found %d possible resources.", resource_count)); + + info = info_list; + + while(info) { + rc = validate_info(info); + if (OPAL_SUCCESS == rc) { + /* Device passed sanity check, let's make a module. + * We only pick the first device we found valid */ + rc = mca_btl_ofi_init_device(info); + if (OPAL_SUCCESS == rc) + break; + } + info = info->next; + } + + /* We are done with the returned info. 
*/
+    fi_freeinfo(info_list);
+
+    /* pass module array back to caller */
+    base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules));
+    if (NULL == base_modules) {
+        return NULL;
+    }
+
+    memcpy(base_modules, mca_btl_ofi_component.modules,
+           mca_btl_ofi_component.module_count *sizeof (mca_btl_ofi_component.modules[0]));
+
+    BTL_VERBOSE(("ofi btl initialization complete. found %d suitable transports",
+                 mca_btl_ofi_component.module_count));
+
+    *num_btl_modules = mca_btl_ofi_component.module_count;
+
+    return base_modules;
+}
+
+/**
+ * @brief Bring up one OFI device and wrap it in a BTL module.
+ *
+ * Opens fabric, domain, endpoint, CQ and AV for a private copy of @p info,
+ * binds and enables the endpoint, publishes the endpoint name through the
+ * modex and appends the new module to mca_btl_ofi_component.modules.
+ *
+ * @param info (IN) provider description to initialize; owned by the caller
+ *
+ * @returns OPAL_SUCCESS if a module was created and registered
+ * @returns OPAL_ERR_OUT_OF_RESOURCE if the device had to be skipped. Every
+ *          object opened by this call has been released again in that case.
+ */
+static int mca_btl_ofi_init_device(struct fi_info *info)
+{
+    int rc;
+    int *module_count = &mca_btl_ofi_component.module_count;
+    size_t namelen;
+    mca_btl_ofi_module_t *module = NULL;
+
+    char *linux_device_name;
+    char ep_name[FI_NAME_MAX];
+    struct fi_info *ofi_info;
+    struct fi_cq_attr cq_attr = {0};
+    struct fi_av_attr av_attr = {0};
+    struct fid_fabric *fabric = NULL;
+    struct fid_domain *domain = NULL;
+    struct fid_ep *endpoint = NULL;
+    struct fid_cq *cq = NULL;
+    struct fid_av *av = NULL;
+
+    /* make a copy of the given info to store on the module */
+    ofi_info = fi_dupinfo(info);
+    if (NULL == ofi_info) {
+        BTL_VERBOSE(("failed to duplicate fi_info for provider:%s",
+                     info->fabric_attr->prov_name));
+        goto fail;
+    }
+
+    /* The module keeps this pointer, so take the name from our private copy
+     * rather than from memory that belongs to the caller. */
+    linux_device_name = ofi_info->domain_attr->name;
+    BTL_VERBOSE(("initializing dev:%s provider:%s",
+                 linux_device_name,
+                 info->fabric_attr->prov_name));
+
+    /* fabric */
+    rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_fabric with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    /* domain */
+    rc = fi_domain(fabric, ofi_info, &domain, NULL);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_domain with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    /* endpoint */
+    rc = fi_endpoint(domain, ofi_info, &endpoint, NULL);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    /* CQ */
+    cq_attr.format = FI_CQ_FORMAT_CONTEXT;
+    cq_attr.wait_obj = FI_WAIT_NONE;
+    rc = fi_cq_open(domain, &cq_attr, &cq, NULL);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    /* AV */
+    av_attr.type = FI_AV_MAP;
+    rc = fi_av_open(domain, &av_attr, &av, NULL);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_av_open with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+
+    /* bind CQ and AV to endpoint. only transmit completions are needed
+     * because this BTL never posts receives. */
+    rc = fi_ep_bind(endpoint, (fid_t)cq, FI_TRANSMIT);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    rc = fi_ep_bind(endpoint, (fid_t)av, 0);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    /* enable the endpoint for using */
+    rc = fi_enable(endpoint);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_enable with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    /* Query our endpoint name before the module exists, so a failure here
+     * needs no module teardown. */
+    namelen = sizeof(ep_name);
+    rc = fi_getname((fid_t)endpoint, &ep_name[0], &namelen);
+    if (0 != rc) {
+        BTL_VERBOSE(("%s failed fi_getname with err=%s",
+                     linux_device_name,
+                     fi_strerror(-rc)
+                     ));
+        goto fail;
+    }
+
+    /* Everything succeeded, lets create a module for this device. */
+    module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
+    if (NULL == module) {
+        goto fail;
+    }
+    *module = mca_btl_ofi_module_template;
+
+    /* post our endpoint name so peer can use it to connect to us. The module
+     * owns nothing yet, so the failure path only has to free() it. */
+    OPAL_MODEX_SEND(rc,
+                    OPAL_PMIX_GLOBAL,
+                    &mca_btl_ofi_component.super.btl_version,
+                    &ep_name,
+                    namelen);
+    if (OPAL_SUCCESS != rc) {
+        BTL_VERBOSE(("%s failed to publish endpoint name, rc=%d",
+                     linux_device_name, rc));
+        goto fail;
+    }
+    mca_btl_ofi_component.namelen = namelen;
+
+    /* store the information. */
+    module->fabric_info = ofi_info;
+    module->fabric = fabric;
+    module->domain = domain;
+    module->cq = cq;
+    module->av = av;
+    module->ofi_endpoint = endpoint;
+    module->linux_device_name = linux_device_name;
+    module->outstanding_rdma = 0;
+    module->use_virt_addr = false;
+
+    if (ofi_info->domain_attr->mr_mode == FI_MR_BASIC ||
+        ofi_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) {
+        module->use_virt_addr = true;
+    }
+
+    /* initialize the rcache */
+    mca_btl_ofi_rcache_init(module);
+
+    OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
+
+    /* init free lists */
+    OBJ_CONSTRUCT(&module->comp_list, opal_free_list_t);
+    rc = opal_free_list_init(&module->comp_list,
+                             sizeof(mca_btl_ofi_completion_t),
+                             opal_cache_line_size,
+                             OBJ_CLASS(mca_btl_ofi_completion_t),
+                             0,
+                             0,
+                             128,
+                             -1,
+                             128,
+                             NULL,
+                             0,
+                             NULL,
+                             NULL,
+                             NULL);
+    if (OPAL_SUCCESS != rc) {
+        /* an assert() is compiled out of optimized builds; without
+         * completion descriptors no operation could ever be issued. */
+        BTL_ERROR(("cannot initialize completion free list"));
+        MCA_BTL_OFI_ABORT();
+    }
+
+    /* add this module to the list */
+    mca_btl_ofi_component.modules[(*module_count)++] = module;
+
+    return OPAL_SUCCESS;
+
+fail:
+    /* clean up. The endpoint goes first: a CQ or AV cannot be closed while
+     * an endpoint is still bound to it. */
+    if (NULL != endpoint) {
+        fi_close(&endpoint->fid);
+    }
+
+    if (NULL != cq) {
+        fi_close(&cq->fid);
+    }
+
+    if (NULL != av) {
+        fi_close(&av->fid);
+    }
+
+    if (NULL != domain) {
+        fi_close(&domain->fid);
+    }
+
+    if (NULL != fabric) {
+        fi_close(&fabric->fid);
+    }
+
+    /* release our private copy of the info (also holds the device name) */
+    if (NULL != ofi_info) {
+        fi_freeinfo(ofi_info);
+    }
+
+    free(module);
+
+    /* not really a failure. just skip this device. */
+    return OPAL_ERR_OUT_OF_RESOURCE;
+}
+
+
+/**
+ * @brief OFI BTL progress function
+ *
+ * This function explicitly progresses all workers.
+ */
+static int mca_btl_ofi_component_progress (void)
+{
+
+    int ret = 0;
+    int events_read;
+    int events = 0;
+    struct fi_cq_entry cq_entry[MCA_BTL_OFI_MAX_CQ_READ_ENTRIES];
+    struct fi_cq_err_entry cqerr = {0};
+    size_t max_read = (size_t) mca_btl_ofi_component.num_cqe_read;
+
+    mca_btl_ofi_completion_t *comp;
+
+    /* never ask the provider for more entries than cq_entry can hold,
+     * whatever value the component parameter was set to */
+    if (max_read > (size_t) MCA_BTL_OFI_MAX_CQ_READ_ENTRIES) {
+        max_read = (size_t) MCA_BTL_OFI_MAX_CQ_READ_ENTRIES;
+    }
+
+    for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
+        mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
+
+        ret = fi_cq_read(module->cq, cq_entry, max_read);
+
+        if (0 < ret) {
+            /* ret is the number of completions that were read */
+            events_read = ret;
+            for (int j = 0; j < events_read; j++) {
+                if (NULL != cq_entry[j].op_context) {
+                    ++events;
+                    comp = (mca_btl_ofi_completion_t*) cq_entry[j].op_context;
+                    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl;
+
+                    switch (comp->type) {
+                    case MCA_BTL_OFI_TYPE_GET:
+                    case MCA_BTL_OFI_TYPE_PUT:
+                    case MCA_BTL_OFI_TYPE_AOP:
+                    case MCA_BTL_OFI_TYPE_AFOP:
+                    case MCA_BTL_OFI_TYPE_CSWAP:
+
+                        /* call the callback */
+                        if (comp->cbfunc) {
+                            comp->cbfunc (comp->btl, comp->endpoint,
+                                             comp->local_address, comp->local_handle,
+                                             comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
+                        }
+
+                        /* return the completion handler */
+                        opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
+
+                        MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
+                        break;
+
+                    default:
+                        /* catastrophic */
+                        BTL_ERROR(("unknown completion type"));
+                        MCA_BTL_OFI_ABORT();
+                    }
+                }
+            }
+        } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
+            /* an error completion is pending on this CQ: fetch it, report, abort */
+            ret = fi_cq_readerr(module->cq, &cqerr, 0);
+
+            /* cq readerr failed!? */
+            if (0 > ret) {
+                BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
+                           __FILE__, __LINE__, fi_strerror(-ret), ret));
+            } else {
+                BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
+                           cqerr.prov_errno));
+            }
+
+            MCA_BTL_OFI_ABORT();
+
+        } else if (OPAL_UNLIKELY(ret != -FI_EAGAIN && ret != -FI_EINTR)) {
+            /* -FI_EAGAIN / -FI_EINTR are not treated as errors; anything else is fatal */
+            BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
+            MCA_BTL_OFI_ABORT();
+        }
+    }
+
+    /* number of completions with a context that were handled in this call */
+    return events;
+}
+
+/** OFI btl component */
+mca_btl_ofi_component_t mca_btl_ofi_component = {
+    .super = {
+        .btl_version = {
+            MCA_BTL_DEFAULT_VERSION("ofi"),
+            .mca_open_component = mca_btl_ofi_component_open,
+            .mca_close_component = mca_btl_ofi_component_close,
+            .mca_register_component_params = mca_btl_ofi_component_register,
+        },
+        .btl_data = {
+            /* The component is not checkpoint ready */
+            .param_field = MCA_BASE_METADATA_PARAM_NONE
+        },
+
+        .btl_init = mca_btl_ofi_component_init,
+        .btl_progress = mca_btl_ofi_component_progress,
+    }
+};
diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.c b/opal/mca/btl/ofi/btl_ofi_endpoint.c
new file mode 100644
index 0000000000..871d594ddf
--- /dev/null
+++ b/opal/mca/btl/ofi/btl_ofi_endpoint.c
@@ -0,0 +1,51 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * Copyright (c) 2018 Intel, Inc, All rights reserved
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "btl_ofi.h"
+#include "btl_ofi_endpoint.h"
+#include "opal/util/proc.h"
+
+/* Object constructor: put every field into a defined state, so an endpoint
+ * is safe to inspect even before mca_btl_ofi_endpoint_create fills it in. */
+static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint)
+{
+    endpoint->ofi_endpoint = NULL;
+    endpoint->ep_proc = NULL;
+    endpoint->peer_addr = 0;
+    OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t);
+}
+
+/* Object destructor: only the lock is owned by the endpoint object. */
+static void mca_btl_ofi_endpoint_destruct (mca_btl_ofi_endpoint_t *endpoint)
+{
+    endpoint->peer_addr = 0;
+
+    /* set to null, we will free ofi endpoint in module */
+    endpoint->ofi_endpoint = NULL;
+
+    OBJ_DESTRUCT(&endpoint->ep_lock);
+}
+
+OBJ_CLASS_INSTANCE(mca_btl_ofi_endpoint_t, opal_list_item_t,
+                   mca_btl_ofi_endpoint_construct,
+                   mca_btl_ofi_endpoint_destruct);
+
+/**
+ * @brief Allocate a BTL endpoint for a peer.
+ *
+ * @param proc (IN) peer process this endpoint refers to
+ * @param ep (IN) libfabric endpoint to store on the new object; the pointer
+ *           is only recorded here, not opened or closed
+ *
+ * @returns the new endpoint, or NULL if the object could not be allocated
+ */
+mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep)
+{
+    mca_btl_ofi_endpoint_t *endpoint = OBJ_NEW(mca_btl_ofi_endpoint_t);
+
+    if (OPAL_UNLIKELY(NULL == endpoint)) {
+        return NULL;
+    }
+
+    endpoint->ep_proc = proc;
+    endpoint->ofi_endpoint = ep;
+
+    return (mca_btl_base_endpoint_t *) endpoint;
+}
+
diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.h b/opal/mca/btl/ofi/btl_ofi_endpoint.h
new file mode 100644
index 0000000000..f131f72b1d
--- /dev/null
+++ b/opal/mca/btl/ofi/btl_ofi_endpoint.h
@@ -0,0 +1,53 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ * University Research and Technology
+ * Corporation. All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * Copyright (c) 2018 Intel, Inc, All rights reserved
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_BTL_OFI_ENDPOINT_H
+#define MCA_BTL_OFI_ENDPOINT_H
+
+#include "opal/class/opal_list.h"
+#include "opal/mca/event/event.h"
+
+#include "btl_ofi.h"
+
+BEGIN_C_DECLS
+
+/**
+ * Per-peer endpoint of the OFI BTL. Instances are list items so a module
+ * can keep them on its endpoint list.
+ */
+struct mca_btl_base_endpoint_t {
+    /** list item base class */
+    opal_list_item_t super;
+
+    /** libfabric endpoint handle passed to mca_btl_ofi_endpoint_create */
+    struct fid_ep *ofi_endpoint;
+    /** libfabric address of the peer */
+    fi_addr_t peer_addr;
+
+    /** endpoint proc */
+    opal_proc_t *ep_proc;
+
+    /** mutex to protect this structure */
+    opal_mutex_t ep_lock;
+};
+
+typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
+/** the OFI endpoint type is the base endpoint type itself */
+typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
+OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);
+
+/**
+ * Allocate an endpoint for @p proc that records @p ep as its libfabric
+ * endpoint.
+ */
+mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);
+
+END_C_DECLS
+#endif
diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c
new file mode 100644
index 0000000000..f8d6d6619c
--- /dev/null
+++ b/opal/mca/btl/ofi/btl_ofi_module.c
@@ -0,0 +1,327 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ * University Research and Technology
+ * Corporation. All rights reserved.
+ * Copyright (c) 2004-2013 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * Copyright (c) 2018 Intel, Inc, All rights reserved
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+#include <string.h>
+#include "opal/class/opal_bitmap.h"
+#include "opal/mca/btl/btl.h"
+#include "opal/datatype/opal_convertor.h"
+#include "opal/mca/mpool/base/base.h"
+#include "opal/mca/mpool/mpool.h"
+
+#include "btl_ofi.h"
+#include "btl_ofi_endpoint.h"
+
+/**
+ * @brief Create endpoints for a set of peers and insert them into the AV.
+ *
+ * @param btl (IN) BTL module
+ * @param nprocs (IN) number of entries in opal_procs and peers
+ * @param opal_procs (IN) peers to add
+ * @param peers (OUT) endpoint created for each peer
+ * @param reachable (OUT) bit i is set once peer i has been added
+ *
+ * @returns OPAL_SUCCESS on success
+ * @returns OPAL_ERR_OUT_OF_RESOURCE if an endpoint could not be allocated
+ *
+ * A failed modex lookup or address-vector insert aborts.
+ */
+static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
+                                  size_t nprocs, opal_proc_t **opal_procs,
+                                  mca_btl_base_endpoint_t **peers,
+                                  opal_bitmap_t *reachable)
+{
+    int rc;
+    int count;
+    char *ep_name = NULL;
+    size_t namelen = mca_btl_ofi_component.namelen;
+
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+
+    for (size_t i = 0 ; i < nprocs ; ++i) {
+        peers[i] = mca_btl_ofi_endpoint_create (opal_procs[i], ofi_btl->ofi_endpoint);
+        if (OPAL_UNLIKELY(NULL == peers[i])) {
+            return OPAL_ERR_OUT_OF_RESOURCE;
+        }
+
+        ep_name = NULL;
+        OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
+                        &peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen);
+        if (OPAL_SUCCESS != rc) {
+            BTL_ERROR(("error receiving modex"));
+            MCA_BTL_OFI_ABORT();
+        }
+
+        /* get peer fi_addr */
+        count = fi_av_insert(ofi_btl->av,           /* Address vector to insert */
+                             ep_name,               /* peer name */
+                             1,                     /* amount to insert */
+                             &peers[i]->peer_addr,  /* return peer address here */
+                             0,                     /* flags */
+                             NULL);                 /* context */
+
+        /* the buffer handed back by the modex belongs to us and is not
+         * needed once the address has been inserted */
+        free (ep_name);
+        ep_name = NULL;
+
+        /* if succeed, add this proc and mark reachable */
+        if (count == 1) { /* we inserted 1 address. */
+            opal_list_append (&ofi_btl->endpoints, &peers[i]->super);
+            opal_bitmap_set_bit(reachable, i);
+        } else {
+            /* fatal, so report it unconditionally rather than only when
+             * verbose output is enabled */
+            BTL_ERROR(("fi_av_insert failed with rc = %d", count));
+            MCA_BTL_OFI_ABORT();
+        }
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * @brief Remove peers from the address vector and release their endpoints.
+ *
+ * @param btl (IN) BTL module
+ * @param nprocs (IN) number of entries in procs and peers
+ * @param procs (IN) peers being removed (unused)
+ * @param peers (IN) endpoints to release; NULL entries are skipped
+ *
+ * @returns OPAL_SUCCESS always; a failed fi_av_remove is only logged
+ */
+static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
+                                  opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
+{
+    int ret;
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+
+    for (size_t i = 0 ; i < nprocs ; ++i) {
+        if (peers[i]) {
+
+            /* remove the address from AV. */
+            ret = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
+            if (ret < 0) {
+                /* remove failed. this should not happen. */
+                /* Lets not crash because we failed to remove an address. */
+                BTL_ERROR(("fi_av_remove failed with error %d:%s",
+                           ret, fi_strerror(-ret)));
+            }
+
+            /* remove and free MPI endpoint from the list. */
+            opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
+            OBJ_RELEASE(peers[i]);
+        }
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * @brief Create the registration cache of a module (once).
+ *
+ * @param module (IN) module to set up; nothing happens if it is already
+ *               marked initialized
+ *
+ * Creates a "grdma" rcache named "ofi.<device name>" that registers and
+ * deregisters memory through mca_btl_ofi_reg_mem / mca_btl_ofi_dereg_mem.
+ * Aborts if the cache cannot be created.
+ */
+void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module)
+{
+    if (!module->initialized) {
+        mca_rcache_base_resources_t rcache_resources;
+        char *tmp = NULL;
+
+        /* on failure the contents of tmp are undefined, so do not use it */
+        if (0 > asprintf (&tmp, "ofi.%s", module->linux_device_name)) {
+            BTL_ERROR(("cannot create rcache"));
+            MCA_BTL_OFI_ABORT();
+        }
+
+        rcache_resources.cache_name     = tmp;
+        rcache_resources.reg_data       = (void *) module;
+        rcache_resources.sizeof_reg     = sizeof (mca_btl_ofi_reg_t);
+        rcache_resources.register_mem   = mca_btl_ofi_reg_mem;
+        rcache_resources.deregister_mem = mca_btl_ofi_dereg_mem;
+
+        module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources);
+        free (tmp);
+
+        if (NULL == module->rcache) {
+            /* something went horribly wrong */
+            BTL_ERROR(("cannot create rcache"));
+            MCA_BTL_OFI_ABORT();
+        }
+
+        module->initialized = true;
+    }
+}
+
+
+/**
+ * @brief Register a memory region for put/get/atomic operations.
+ *
+ * @param btl (IN) BTL module
+ * @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
+ * @param base (IN) Pointer to start of region
+ * @param size (IN) Size of region
+ * @param flags (IN) Flags indicating what operation will be performed. Valid
+ *              values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
+ *              and MCA_BTL_DES_FLAGS_ATOMIC
+ *
+ * @returns a memory registration handle valid for both local and remote operations
+ * @returns NULL if the region could not be registered
+ *
+ * This function registers the specified region with the hardware for use with
+ * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
+ * functions. Care should be taken to not hold an excessive number of registrations
+ * as they may use limited system/NIC resources.
+ */
+static struct mca_btl_base_registration_handle_t *
+mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
+                          size_t size, uint32_t flags)
+{
+    mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
+    mca_btl_ofi_reg_t *reg = NULL;
+    int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
+    int rc;
+
+    /* the registration is obtained through the module's rcache; the
+     * endpoint argument is not used */
+    rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags,
+                                              (mca_rcache_base_registration_t **) &reg);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        return NULL;
+    }
+
+    return &reg->handle;
+}
+
+/**
+ * @brief Deregister a memory region
+ *
+ * @param btl (IN) BTL module region was registered with
+ * @param handle (IN) BTL registration handle to deregister
+ *
+ * This function deregisters the memory region associated with the specified handle. Care
+ * should be taken to not perform any RDMA or atomic operation on this memory region
+ * after it is deregistered. It is erroneous to specify a memory handle associated with
+ * a remote node.
+ */
+static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
+{
+    mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
+    /* the handle is embedded in the registration: step back to its start */
+    mca_btl_ofi_reg_t *reg =
+        (mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle));
+
+    (void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, &reg->base);
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * @brief rcache callback: register a region with the libfabric domain.
+ *
+ * @param reg_data (IN) the mca_btl_ofi_module_t the rcache was created for
+ * @param base (IN) start of the region
+ * @param size (IN) length of the region in bytes
+ * @param reg (IN/OUT) registration to fill in (a mca_btl_ofi_reg_t)
+ *
+ * @returns OPAL_SUCCESS on success
+ * @returns OPAL_ERR_OUT_OF_RESOURCE if fi_mr_reg fails
+ */
+int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg)
+{
+    int rc;
+    /* every region is registered for local and remote read and write */
+    const uint64_t access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE;
+
+    mca_btl_ofi_module_t *btl = (mca_btl_ofi_module_t*) reg_data;
+    mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*) reg;
+
+    /* the registration's own address is passed as the requested key */
+    rc = fi_mr_reg(btl->domain, base, size, access_flags, 0,
+                   (uint64_t) (uintptr_t) reg, 0, &ur->ur_mr, NULL);
+    if (0 != rc) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    ur->handle.rkey = fi_mr_key(ur->ur_mr);
+    ur->handle.desc = fi_mr_desc(ur->ur_mr);
+
+    /* In case the provider doesn't support FI_MR_VIRT_ADDR,
+     * we need to reference the remote address by the distance from base registered
+     * address. We keep this information to use in rdma/atomic operations. */
+    if (btl->use_virt_addr) {
+        ur->handle.base_addr = 0;
+    } else {
+        ur->handle.base_addr = base;
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * @brief rcache callback: release a libfabric memory registration.
+ *
+ * @param reg_data (IN) unused
+ * @param reg (IN) registration to release (a mca_btl_ofi_reg_t)
+ *
+ * @returns OPAL_SUCCESS if there was nothing to close or the close succeeded
+ * @returns OPAL_ERROR if fi_close failed
+ */
+int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
+{
+    mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*)reg;
+    int rc;
+
+    if (ur->ur_mr != NULL) {
+        rc = fi_close(&ur->ur_mr->fid);
+        if (0 != rc) {
+            /* fi_close reports failure through its return value, not errno */
+            BTL_ERROR(("%s: error unpinning memory mr=%p: %s",
+                       __func__, (void*) ur->ur_mr, fi_strerror(-rc)));
+            return OPAL_ERROR;
+        }
+
+        /* do not keep a handle to an object that has been closed */
+        ur->ur_mr = NULL;
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/*
+ * Cleanup/release module resources.
+ */
+
+int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
+{
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+    mca_btl_ofi_endpoint_t *endpoint, *next;
+
+    assert(btl);
+
+    /* libfabric objects are closed child-first: a CQ or AV cannot be
+     * closed while an endpoint is still bound to it, so the endpoint goes
+     * first. */
+    if (NULL != ofi_btl->ofi_endpoint) {
+        fi_close(&ofi_btl->ofi_endpoint->fid);
+    }
+
+    if (NULL != ofi_btl->cq) {
+        fi_close(&ofi_btl->cq->fid);
+    }
+
+    if (NULL != ofi_btl->av) {
+        fi_close(&ofi_btl->av->fid);
+    }
+
+    /* the rcache holds memory registrations that belong to the domain;
+     * drop them while the domain is still open */
+    if (ofi_btl->rcache) {
+        mca_rcache_base_module_destroy (ofi_btl->rcache);
+    }
+
+    if (NULL != ofi_btl->domain) {
+        fi_close(&ofi_btl->domain->fid);
+    }
+
+    if (NULL != ofi_btl->fabric) {
+        fi_close(&ofi_btl->fabric->fid);
+    }
+
+    if (NULL != ofi_btl->fabric_info) {
+        fi_freeinfo(ofi_btl->fabric_info);
+    }
+
+    /* clean up any leftover endpoints */
+    OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) {
+        opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super);
+        OBJ_RELEASE(endpoint);
+    }
+
+    OBJ_DESTRUCT(&ofi_btl->endpoints);
+    OBJ_DESTRUCT(&ofi_btl->comp_list);
+
+    free (btl);
+
+    return OPAL_SUCCESS;
+}
+
+mca_btl_ofi_module_t mca_btl_ofi_module_template = {
+    .super = {
+        /* initialize functions. this btl only support RDMA and atomics
+         * for now so it does not provide prepare_src, alloc, free, or send */
+        .btl_component = &mca_btl_ofi_component.super,
+        .btl_add_procs = mca_btl_ofi_add_procs,
+        .btl_del_procs = mca_btl_ofi_del_procs,
+        .btl_finalize = mca_btl_ofi_finalize,
+        .btl_put = mca_btl_ofi_put,
+        .btl_get = mca_btl_ofi_get,
+        .btl_register_mem = mca_btl_ofi_register_mem,
+        .btl_deregister_mem = mca_btl_ofi_deregister_mem,
+        .btl_atomic_op = mca_btl_ofi_aop,
+        .btl_atomic_fop = mca_btl_ofi_afop,
+        .btl_atomic_cswap = mca_btl_ofi_acswap,
+        .btl_flush = mca_btl_ofi_flush,
+
+        /* set the default flags for this btl. ofi provides us with rdma and both
+         * fetching and non-fetching atomics (though limited to add and cswap) */
+        .btl_flags = MCA_BTL_FLAGS_RDMA |
+                     MCA_BTL_FLAGS_ATOMIC_FOPS |
+                     MCA_BTL_FLAGS_ATOMIC_OPS,
+
+        .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
+                            MCA_BTL_ATOMIC_SUPPORTS_SWAP |
+                            MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
+                            MCA_BTL_ATOMIC_SUPPORTS_32BIT,
+
+        /* set the default limits on put and get */
+        .btl_registration_handle_size = sizeof(mca_btl_base_registration_handle_t),
+        .btl_put_limit = 1 << 23,
+        .btl_put_alignment = 0,
+        .btl_get_limit = 1 << 23,
+        .btl_get_alignment = 0,
+    }
+};
diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.c b/opal/mca/btl/ofi/btl_ofi_rdma.c
new file mode 100644
index 0000000000..854301006b
--- /dev/null
+++ b/opal/mca/btl/ofi/btl_ofi_rdma.c
@@ -0,0 +1,148 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * Copyright (c) 2018 Intel, Inc, All rights reserved
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "btl_ofi_rdma.h"
+
+OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t,
+                   opal_free_list_item_t,
+                   NULL,
+                   NULL);
+
+/**
+ * @brief Take a completion descriptor from the module's free list and fill it.
+ *
+ * @param btl (IN) BTL module the operation is issued on
+ * @param endpoint (IN) peer the operation targets
+ * @param local_address (IN) local buffer of the operation
+ * @param local_handle (IN) registration handle of the local buffer
+ * @param cbfunc (IN) completion callback (may be NULL)
+ * @param cbcontext (IN) callback context
+ * @param cbdata (IN) callback data
+ * @param type (IN) operation type (one of the MCA_BTL_OFI_TYPE_* values)
+ *
+ * @returns the initialized descriptor; never NULL. Running out of
+ *          descriptors aborts.
+ */
+mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
+                                         mca_btl_base_module_t *btl,
+                                         mca_btl_base_endpoint_t *endpoint,
+                                         void *local_address,
+                                         mca_btl_base_registration_handle_t *local_handle,
+                                         mca_btl_base_rdma_completion_fn_t cbfunc,
+                                         void *cbcontext, void *cbdata,
+                                         int type)
+{
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)btl;
+    mca_btl_ofi_completion_t *comp;
+
+    comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_btl->comp_list);
+    if (NULL == comp) {
+        /* an assert() vanishes from optimized builds and would leave a
+         * NULL dereference just below; fail loudly in every build. */
+        BTL_ERROR(("cannot allocate completion descriptor"));
+        MCA_BTL_OFI_ABORT();
+    }
+
+    comp->btl = btl;
+    comp->endpoint = endpoint;
+    comp->local_address = local_address;
+    comp->local_handle = local_handle;
+    comp->cbfunc = cbfunc;
+    comp->cbcontext = cbcontext;
+    comp->cbdata = cbdata;
+    /* remember the owning list so the descriptor can be returned later */
+    comp->my_list = &ofi_btl->comp_list;
+    comp->type = type;
+
+    return comp;
+}
+
+int
mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
+                     uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                     mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+
+    int rc;
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
+    mca_btl_ofi_completion_t *comp;
+
+    /* create completion context */
+    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
+                                        local_address,
+                                        local_handle,
+                                        cbfunc, cbcontext, cbdata,
+                                        MCA_BTL_OFI_TYPE_GET);
+
+    /* address the remote buffer relative to the base recorded in its handle */
+    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
+
+    /* Remote read data across the wire */
+    rc = fi_read(ofi_btl->ofi_endpoint,
+                 local_address, size,   /* payload */
+                 local_handle->desc,
+                 btl_endpoint->peer_addr,
+                 remote_address, remote_handle->rkey,
+                 comp);                 /* completion context */
+
+    if (-FI_EAGAIN == rc) {
+        /* nothing was posted, so no completion will ever hand this
+         * descriptor back: return it ourselves before reporting */
+        opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (0 != rc) {
+        BTL_ERROR(("fi_read failed with %d:%s", rc, fi_strerror(-rc)));
+        MCA_BTL_OFI_ABORT();
+    }
+
+    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
+
+    /* force a bit of progress */
+    mca_btl_ofi_component.super.btl_progress();
+
+    return OPAL_SUCCESS;
+}
+
+/**
+ * @brief Start an RDMA write of a local buffer to a peer.
+ *
+ * @param btl (IN) BTL module
+ * @param endpoint (IN) peer to write to
+ * @param local_address (IN) source buffer
+ * @param remote_address (IN) destination address on the peer
+ * @param local_handle (IN) registration handle of the source buffer
+ * @param remote_handle (IN) registration handle of the destination
+ * @param size (IN) number of bytes to write
+ * @param flags (IN) unused
+ * @param order (IN) unused
+ * @param cbfunc (IN) completion callback
+ * @param cbcontext (IN) callback context
+ * @param cbdata (IN) callback data
+ *
+ * @returns OPAL_SUCCESS if the write was posted
+ * @returns OPAL_ERR_OUT_OF_RESOURCE if the provider returned -FI_EAGAIN;
+ *          nothing was posted and the call may be repeated later
+ *
+ * Any other fi_write failure aborts.
+ */
+int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
+                     uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                     mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
+{
+    int rc;
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
+
+    /* create completion context */
+    mca_btl_ofi_completion_t *comp;
+    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
+                                        local_address,
+                                        local_handle,
+                                        cbfunc, cbcontext, cbdata,
+                                        MCA_BTL_OFI_TYPE_PUT);
+
+    /* address the remote buffer relative to the base recorded in its handle */
+    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
+
+    /* Remote write data across the wire */
+    rc = fi_write(ofi_btl->ofi_endpoint,
+                  local_address, size,  /* payload */
+                  local_handle->desc,
+                  btl_endpoint->peer_addr,
+                  remote_address, remote_handle->rkey,
+                  comp);                /* completion context */
+
+    if (-FI_EAGAIN == rc) {
+        /* nothing was posted, so no completion will ever hand this
+         * descriptor back: return it ourselves before reporting */
+        opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    if (0 != rc) {
+        BTL_ERROR(("fi_write failed with %d:%s", rc, fi_strerror(-rc)));
+        MCA_BTL_OFI_ABORT();
+    }
+
+    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
+
+    /* force a bit of progress */
+    mca_btl_ofi_component.super.btl_progress();
+
+    return OPAL_SUCCESS;
+
+}
+
+/**
+ * @brief Wait until the module has no outstanding RDMA operations.
+ *
+ * @param btl (IN) BTL module to drain
+ * @param endpoint (IN) unused; the whole module is drained
+ *
+ * @returns OPAL_SUCCESS once the outstanding counter has reached zero
+ */
+int mca_btl_ofi_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint)
+{
+    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
+
+    /* the counter only drops when completions are reaped, so keep
+     * driving progress until it does */
+    while(ofi_btl->outstanding_rdma > 0) {
+        (void) mca_btl_ofi_component.super.btl_progress();
+    }
+
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/btl/ofi/btl_ofi_rdma.h b/opal/mca/btl/ofi/btl_ofi_rdma.h
new file mode 100644
index 0000000000..19eab1e54c
--- /dev/null
+++ b/opal/mca/btl/ofi/btl_ofi_rdma.h
@@ -0,0 +1,38 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
+ * reserved.
+ * Copyright (c) 2018 Intel, Inc, All rights reserved
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef BTL_OFI_RDMA_H
+#define BTL_OFI_RDMA_H
+
+#include "opal/threads/thread_usage.h"
+
+#include "btl_ofi.h"
+#include "btl_ofi_endpoint.h"
+
+/* Get a completion descriptor for an operation on btl/endpoint and record
+ * the local buffer, its registration handle, the callback triple and the
+ * operation type on it. */
+mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
+                                         mca_btl_base_module_t *btl,
+                                         mca_btl_base_endpoint_t *endpoint,
+                                         void *local_address,
+                                         mca_btl_base_registration_handle_t *local_handle,
+                                         mca_btl_base_rdma_completion_fn_t cbfunc,
+                                         void *cbcontext, void *cbdata,
+                                         int type);
+
+/* Add one to the module's count of outstanding RDMA operations.
+ * NOTE(review): the expansion already ends in ';', so "MACRO(x);" leaves an
+ * extra empty statement and the macro cannot be the sole body of an unbraced
+ * if/else. Confirm all call sites before removing the trailing ';'. */
+#define MCA_BTL_OFI_NUM_RDMA_INC(module) \
+    OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1);
+
+/* Subtract one from the module's count of outstanding RDMA operations.
+ * The same trailing-';' caveat applies. */
+#define MCA_BTL_OFI_NUM_RDMA_DEC(module) \
+    OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1);
+
+#endif /* !defined(BTL_OFI_RDMA_H) */
+
diff --git a/opal/mca/btl/ofi/configure.m4 b/opal/mca/btl/ofi/configure.m4
new file mode 100644
index 0000000000..7e6925a0e3
--- /dev/null
+++ b/opal/mca/btl/ofi/configure.m4
@@ -0,0 +1,45 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+# University Research and Technology
+# Corporation. All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+# of Tennessee Research Foundation. All rights
+# reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+# University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2006 The Regents of the University of California.
+# All rights reserved.
+# Copyright (c) 2006 QLogic Corp. All rights reserved.
+# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
+# Copyright (c) 2011-2018 Los Alamos National Security, LLC.
+# All rights reserved.
+# Copyright (c) 2018 Intel, inc.
All rights reserved
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_opal_btl_ofi_CONFIG([action-if-can-compile], [action-if-cant-compile])
+# --------------------------------------------------------
+# decide whether the OFI BTL can be built. requires the common/ofi
+# configuration to have run; executes action-if-can-compile when
+# $opal_common_ofi_happy is "yes", otherwise executes
+# action-if-cant-compile
+
+AC_DEFUN([MCA_opal_btl_ofi_CONFIG],[
+    AC_CONFIG_FILES([opal/mca/btl/ofi/Makefile])
+
+    AC_REQUIRE([MCA_opal_common_ofi_CONFIG])
+
+    AS_IF([test "$opal_common_ofi_happy" = "yes"],
+          [$1],
+          [$2])
+
+    # substitute in the things needed to build ofi
+    # NOTE(review): nothing in this macro assigns these three variables -
+    # confirm whether they are set elsewhere or substitute as empty
+    AC_SUBST([btl_ofi_CPPFLAGS])
+    AC_SUBST([btl_ofi_LDFLAGS])
+    AC_SUBST([btl_ofi_LIBS])
+])dnl
diff --git a/opal/mca/btl/ofi/owner.txt b/opal/mca/btl/ofi/owner.txt
new file mode 100644
index 0000000000..f58f1cbab7
--- /dev/null
+++ b/opal/mca/btl/ofi/owner.txt
@@ -0,0 +1,7 @@
+#
+# owner/status file
+# owner: institution that is responsible for this package
+# status: e.g. active, maintenance, unmaintained
+#
+owner:Intel
+status:active