Merge pull request #5205 from thananon/btl_ofi
new btl/ofi: RDMA only btl using libfabric.
Этот коммит содержится в:
Коммит
623e36de8a
62
opal/mca/btl/ofi/Makefile.am
Обычный файл
62
opal/mca/btl/ofi/Makefile.am
Обычный файл
@ -0,0 +1,62 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# Copyright (c) 2018 Intel, inc. All rights reserved
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
#dist_opaldata_DATA = help-mpi-btl-ofi.txt
|
||||
|
||||
AM_CPPFLAGS = $(opal_common_ofi_CPPFLAGS)
|
||||
sources = \
|
||||
btl_ofi.h \
|
||||
btl_ofi_component.c \
|
||||
btl_ofi_endpoint.h \
|
||||
btl_ofi_endpoint.c \
|
||||
btl_ofi_module.c \
|
||||
btl_ofi_rdma.h \
|
||||
btl_ofi_rdma.c \
|
||||
btl_ofi_atomics.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_opal_btl_ofi_DSO
|
||||
lib =
|
||||
lib_sources =
|
||||
component = mca_btl_ofi.la
|
||||
component_sources = $(sources)
|
||||
else
|
||||
lib = libmca_btl_ofi.la
|
||||
lib_sources = $(sources)
|
||||
component =
|
||||
component_sources =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(opallibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component)
|
||||
mca_btl_ofi_la_SOURCES = $(component_sources)
|
||||
mca_btl_ofi_la_LDFLAGS = -module -avoid-version \
|
||||
$(opal_btl_ofi_LDFLAGS)
|
||||
mca_btl_ofi_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
|
||||
$(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la
|
||||
|
||||
noinst_LTLIBRARIES = $(lib)
|
||||
libmca_btl_ofi_la_SOURCES = $(lib_sources)
|
||||
libmca_btl_ofi_la_LDFLAGS = -module -avoid-version $(opal_btl_ofi_LDFLAGS)
|
88
opal/mca/btl/ofi/README
Обычный файл
88
opal/mca/btl/ofi/README
Обычный файл
@ -0,0 +1,88 @@
|
||||
========================================
|
||||
Design notes on BTL/OFI
|
||||
========================================
|
||||
|
||||
This is the RDMA only btl based on OFI Libfabric. The goal is to enable RDMA
|
||||
with multiple vendor hardware through one interface. Most of the operations are
|
||||
managed by upper layer (osc/rdma). This BTL is mostly doing the low level work.
|
||||
|
||||
Tested providers: sockets,psm2,ugni
|
||||
|
||||
========================================
|
||||
|
||||
Component
|
||||
|
||||
This BTL is requesting libfabric version 1.5 API and will not support older versions.
|
||||
|
||||
The required capabilities of this BTL are FI_ATOMIC and FI_RMA with the endpoint type
|
||||
of FI_EP_RDM only. This BTL does NOT support libfabric providers that require local
|
||||
memory registration (FI_MR_LOCAL).
|
||||
|
||||
BTL/OFI will initialize a module with ONLY the first compatible info returned from OFI.
|
||||
This means it will rely on OFI provider to do load balancing. The support for multiple
|
||||
devices might be added later.
|
||||
|
||||
The BTL creates only one endpoint and one CQ.
|
||||
|
||||
========================================
|
||||
|
||||
Memory Registration
|
||||
|
||||
Open MPI has a system in place to exchange remote address and always use the remote
|
||||
virtual address to refer to a piece of memory. However, some libfabric providers might
|
||||
not support the use of virtual address and instead will use zero-based offset addressing.
|
||||
|
||||
FI_MR_VIRT_ADDR is the flag that determines this behavior. mca_btl_ofi_reg_mem() handles
|
||||
this by storing the base address in the registration handle in case the provider does not
|
||||
support FI_MR_VIRT_ADDR. This base address will be used to calculate the offset later in
|
||||
RDMA/Atomic operations.
|
||||
|
||||
The BTL will try to use the address of registration handle as the key. However, if the
|
||||
provider supports FI_MR_PROV_KEY, it will use the provider-provided key. The BTL works either way.
|
||||
|
||||
The BTL does not register local operand or compare. This is why this BTL does not support
|
||||
FI_MR_LOCAL and will allocate every buffer before registering. This means FI_MR_ALLOCATED
|
||||
is supported. So to be explicit.
|
||||
|
||||
Supported MR mode bits (will work with or without):
|
||||
enum:
|
||||
- FI_MR_BASIC
|
||||
- FI_MR_SCALABLE
|
||||
|
||||
mode bits:
|
||||
- FI_MR_VIRT_ADDR
|
||||
- FI_MR_ALLOCATED
|
||||
- FI_MR_PROV_KEY
|
||||
|
||||
The BTL does NOT support (will not work with):
|
||||
- FI_MR_LOCAL
|
||||
- FI_MR_MMU_NOTIFY
|
||||
- FI_MR_RMA_EVENT
|
||||
- FI_MR_ENDPOINT
|
||||
|
||||
Just a reminder, in libfabric API 1.5...
|
||||
FI_MR_BASIC == (FI_MR_PROV_KEY | FI_MR_ALLOCATED | FI_MR_VIRT_ADDR)
|
||||
|
||||
========================================
|
||||
|
||||
Completions
|
||||
|
||||
Every operation in this BTL is asynchronous. The completion handling will occur in
|
||||
mca_btl_ofi_component_progress() where we read the CQ with the completion context and
|
||||
execute the callback functions. The completions are local. No remote completion event is
|
||||
generated as local completion already guarantees global completion.
|
||||
|
||||
The BTL keeps track of the number of outstanding operations and provides a flush interface.
|
||||
|
||||
========================================
|
||||
|
||||
Sockets Provider
|
||||
|
||||
Sockets provider is the proof of concept provider for libfabric. It is supposed to support
|
||||
all the OFI API with emulations. This provider is considered very slow and bound to raise
|
||||
problems that we might not see from other faster providers.
|
||||
|
||||
Known Problems:
|
||||
- sockets provider uses progress thread and can cause segfault in finalize as we free
|
||||
the resources while progress thread is still using it. sleep(1) was put in
|
||||
mca_btl_ofi_component_close() for this reason.
|
275
opal/mca/btl/ofi/btl_ofi.h
Обычный файл
275
opal/mca/btl/ofi/btl_ofi.h
Обычный файл
@ -0,0 +1,275 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2018 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_BTL_OFI_H
|
||||
#define MCA_BTL_OFI_H
|
||||
|
||||
#include "opal_config.h"
|
||||
#include <sys/types.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Open MPI includes */
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/mpool/mpool.h"
|
||||
#include "opal/mca/btl/base/btl_base_error.h"
|
||||
#include "opal/mca/rcache/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include <rdma/fabric.h>
|
||||
#include <rdma/fi_domain.h>
|
||||
#include <rdma/fi_errno.h>
|
||||
#include <rdma/fi_cm.h>
|
||||
#include <rdma/fi_endpoint.h>
|
||||
#include <rdma/fi_rma.h>
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
#define MCA_BTL_OFI_MAX_MODULES 16
|
||||
#define MCA_BTL_OFI_MAX_WORKERS 1
|
||||
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
|
||||
|
||||
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
|
||||
|
||||
/**
 * Operation kinds used to tag a completion descriptor.
 *
 * Numbering starts at 1. MCA_BTL_OFI_TYPE_TOTAL is one past the last real
 * operation kind and is not itself an operation.
 */
enum mca_btl_ofi_type {
    MCA_BTL_OFI_TYPE_PUT   = 1,
    MCA_BTL_OFI_TYPE_GET   = 2,
    MCA_BTL_OFI_TYPE_AOP   = 3,
    MCA_BTL_OFI_TYPE_AFOP  = 4,
    MCA_BTL_OFI_TYPE_CSWAP = 5,
    MCA_BTL_OFI_TYPE_TOTAL = 6
};
|
||||
|
||||
/**
|
||||
* @brief OFI BTL module
|
||||
*/
|
||||
struct mca_btl_ofi_module_t {
|
||||
/** base BTL interface */
|
||||
mca_btl_base_module_t super;
|
||||
|
||||
/* libfabric components */
|
||||
struct fi_info *fabric_info;
|
||||
struct fid_fabric *fabric;
|
||||
struct fid_domain *domain;
|
||||
struct fid_ep *ofi_endpoint;
|
||||
struct fid_cq *cq;
|
||||
struct fid_av *av;
|
||||
|
||||
char *linux_device_name;
|
||||
|
||||
/** whether the module has been fully initialized or not */
|
||||
bool initialized;
|
||||
bool use_virt_addr;
|
||||
|
||||
/** spin-lock to protect the module */
|
||||
volatile int32_t lock;
|
||||
|
||||
int64_t outstanding_rdma;
|
||||
|
||||
/** linked list of BTL endpoints. this list is never searched so
|
||||
* there is no need for a complicated structure here at this time*/
|
||||
opal_list_t endpoints;
|
||||
|
||||
/* free lists */
|
||||
opal_free_list_t comp_list;
|
||||
|
||||
/** registration cache */
|
||||
mca_rcache_base_module_t *rcache;
|
||||
};
|
||||
typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t;
|
||||
|
||||
extern mca_btl_ofi_module_t mca_btl_ofi_module_template;
|
||||
|
||||
/**
|
||||
* @brief OFI BTL component
|
||||
*/
|
||||
struct mca_btl_ofi_component_t {
|
||||
mca_btl_base_component_3_0_0_t super; /**< base BTL component */
|
||||
|
||||
/** number of TL modules */
|
||||
int module_count;
|
||||
int num_cqe_read;
|
||||
|
||||
size_t namelen;
|
||||
|
||||
/** All BTL OFI modules (1 per tl) */
|
||||
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
|
||||
|
||||
#if OPAL_C_HAVE__THREAD_LOCAL
|
||||
/** bind threads to contexts */
|
||||
bool bind_threads_to_contexts;
|
||||
#endif
|
||||
};
|
||||
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
|
||||
|
||||
OPAL_MODULE_DECLSPEC extern mca_btl_ofi_component_t mca_btl_ofi_component;
|
||||
|
||||
/**
 * Registration handle passed to the RDMA and atomic entry points.
 */
struct mca_btl_base_registration_handle_t {
    /* key handed to libfabric as the remote memory key */
    uint64_t rkey;
    /* local descriptor handed to libfabric for the result buffer */
    void *desc;
    /* base address of the registered region; the atomic entry points subtract
     * it from the remote virtual address before calling libfabric */
    void *base_addr;
};
|
||||
|
||||
struct mca_btl_ofi_reg_t {
|
||||
mca_rcache_base_registration_t base;
|
||||
struct fid_mr *ur_mr;
|
||||
|
||||
/* remote handle */
|
||||
mca_btl_base_registration_handle_t handle;
|
||||
};
|
||||
typedef struct mca_btl_ofi_reg_t mca_btl_ofi_reg_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_reg_t);
|
||||
|
||||
/* completion structure store information needed
|
||||
* for RDMA callbacks */
|
||||
struct mca_btl_ofi_completion_t {
|
||||
opal_free_list_item_t comp_list;
|
||||
opal_free_list_t *my_list;
|
||||
|
||||
struct mca_btl_base_module_t *btl;
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
uint32_t type;
|
||||
|
||||
void *local_address;
|
||||
mca_btl_base_registration_handle_t *local_handle;
|
||||
|
||||
/* information for atomic op */
|
||||
uint64_t operand;
|
||||
uint64_t compare;
|
||||
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc;
|
||||
void *cbcontext;
|
||||
void *cbdata;
|
||||
|
||||
};
|
||||
typedef struct mca_btl_ofi_completion_t mca_btl_ofi_completion_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_completion_t);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous put.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the put operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to put from (registered)
|
||||
* @param remote_address (IN) Remote address to put to (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to put
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
int mca_btl_ofi_put (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
/**
|
||||
* Initiate an asynchronous get.
|
||||
* Completion Semantics: if this function returns a 1 then the operation
|
||||
* is complete. a return of OPAL_SUCCESS indicates
|
||||
* the get operation has been queued with the
|
||||
* network. the local_handle can not be deregistered
|
||||
* until all outstanding operations on that handle
|
||||
* have been completed.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint (IN) BTL addressing information
|
||||
* @param local_address (IN) Local address to put from (registered)
|
||||
* @param remote_address (IN) Remote address to put to (registered remotely)
|
||||
* @param local_handle (IN) Registration handle for region containing
|
||||
* (local_address, local_address + size)
|
||||
* @param remote_handle (IN) Remote registration handle for region containing
|
||||
* (remote_address, remote_address + size)
|
||||
* @param size (IN) Number of bytes to put
|
||||
* @param flags (IN) Flags for this put operation
|
||||
* @param order (IN) Ordering
|
||||
* @param cbfunc (IN) Function to call on completion (if queued)
|
||||
* @param cbcontext (IN) Context for the callback
|
||||
* @param cbdata (IN) Data for callback
|
||||
*
|
||||
* @retval OPAL_SUCCESS The descriptor was successfully queued for a put
|
||||
* @retval OPAL_ERROR The descriptor was NOT successfully queued for a put
|
||||
* @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put
|
||||
* operation. Try again later
|
||||
* @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or
|
||||
* alignment restrictions.
|
||||
*/
|
||||
int mca_btl_ofi_get (struct mca_btl_base_module_t *btl,
|
||||
struct mca_btl_base_endpoint_t *endpoint, void *local_address,
|
||||
uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle,
|
||||
struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
|
||||
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata);
|
||||
|
||||
int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
|
||||
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
|
||||
|
||||
|
||||
int mca_btl_ofi_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
int mca_btl_ofi_finalize (mca_btl_base_module_t *btl);
|
||||
|
||||
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module);
|
||||
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
|
||||
mca_rcache_base_registration_t *reg);
|
||||
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
|
||||
|
||||
void mca_btl_ofi_exit(void);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
180
opal/mca/btl/ofi/btl_ofi_atomics.c
Обычный файл
180
opal/mca/btl/ofi/btl_ofi_atomics.c
Обычный файл
@ -0,0 +1,180 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <rdma/fi_atomic.h>
|
||||
#include "btl_ofi_rdma.h"
|
||||
|
||||
/**
 * Translate a BTL atomic operation into the matching libfabric operation.
 *
 * Only MCA_BTL_ATOMIC_ADD and MCA_BTL_ATOMIC_SWAP are translated. Any other
 * value logs an error and calls MCA_BTL_OFI_ABORT().
 */
static inline int to_fi_op(mca_btl_base_atomic_op_t op)
{
    if (MCA_BTL_ATOMIC_ADD == op) {
        return FI_SUM;
    }

    if (MCA_BTL_ATOMIC_SWAP == op) {
        return FI_ATOMIC_WRITE;
    }

    BTL_ERROR(("Unknown or unsupported atomic op."));
    MCA_BTL_OFI_ABORT();

    /* only here so the compiler sees a return on every path */
    return OPAL_ERROR;
}
|
||||
|
||||
/**
 * Initiate an asynchronous atomic fetch-and-op on remote memory.
 *
 * The previous remote value is delivered into local_address. The operand is
 * copied into the completion descriptor so the caller's value does not have
 * to outlive this call.
 *
 * @param btl (IN)            BTL module
 * @param endpoint (IN)       BTL endpoint of the peer
 * @param local_address (IN)  Local buffer receiving the fetched value
 * @param remote_address (IN) Remote virtual address to operate on
 * @param local_handle (IN)   Registration handle covering local_address
 * @param remote_handle (IN)  Remote registration handle covering remote_address
 * @param op (IN)             Atomic operation (see to_fi_op() for the supported set)
 * @param operand (IN)        Operand of the operation
 * @param flags (IN)          MCA_BTL_ATOMIC_FLAG_32BIT selects a 32-bit datatype
 * @param order (IN)          Ordering (unused here)
 * @param cbfunc (IN)         Function to call on completion
 * @param cbcontext (IN)      Context for the callback
 * @param cbdata (IN)         Data for the callback
 *
 * @retval OPAL_SUCCESS              The operation was queued
 * @retval OPAL_ERR_OUT_OF_RESOURCE  No completion descriptor was available or
 *                                   libfabric returned -FI_EAGAIN. Try again later
 *
 * Any other libfabric failure is logged and ends in MCA_BTL_OFI_ABORT().
 */
int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                      void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                      mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
                      uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
                      void *cbcontext, void *cbdata)
{
    int rc;
    int fi_datatype = FI_UINT64;
    int fi_op;

    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
    mca_btl_ofi_completion_t *comp = NULL;

    if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
        fi_datatype = FI_UINT32;
    }

    fi_op = to_fi_op(op);

    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
                                        local_address,
                                        local_handle,
                                        cbfunc, cbcontext, cbdata,
                                        MCA_BTL_OFI_TYPE_AFOP);
    if (NULL == comp) {
        /* defensive: never dereference a failed allocation */
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* copy the operand because it might get freed from upper layer */
    comp->operand = (uint64_t) operand;

    /* convert the remote virtual address into an offset from the base
     * address recorded in the remote handle */
    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);

    /* NOTE(review): with FI_UINT32 the provider presumably reads only the
     * first 4 bytes of the 64-bit copy; on a big-endian host that would be
     * the high half -- confirm. */
    rc = fi_fetch_atomic(ofi_btl->ofi_endpoint,
                         (void*) &comp->operand, 1, NULL,       /* operand */
                         local_address, local_handle->desc,     /* results */
                         btl_endpoint->peer_addr,               /* remote addr */
                         remote_address, remote_handle->rkey,   /* remote buffer */
                         fi_datatype, fi_op, comp);

    if (rc == -FI_EAGAIN) {
        /* nothing was posted, so no completion event will ever hand this
         * descriptor back: return it to its free list (my_list is presumably
         * set by mca_btl_ofi_completion_alloc() -- confirm) before asking
         * the caller to retry */
        opal_free_list_return(comp->my_list, &comp->comp_list);
        return OPAL_ERR_OUT_OF_RESOURCE;
    } else if (rc < 0) {
        BTL_ERROR(("fi_fetch_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
        MCA_BTL_OFI_ABORT();
    }

    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);

    return OPAL_SUCCESS;
}
|
||||
|
||||
int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
|
||||
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
|
||||
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
|
||||
{
|
||||
int rc;
|
||||
int fi_datatype = FI_UINT64;
|
||||
int fi_op;
|
||||
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
|
||||
mca_btl_ofi_completion_t *comp = NULL;
|
||||
|
||||
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
|
||||
fi_datatype = FI_UINT32;
|
||||
}
|
||||
|
||||
fi_op = to_fi_op(op);
|
||||
|
||||
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
|
||||
NULL,
|
||||
NULL,
|
||||
cbfunc, cbcontext, cbdata,
|
||||
MCA_BTL_OFI_TYPE_AOP);
|
||||
|
||||
/* copy the operand because it might get freed from upper layer */
|
||||
comp->operand = (uint64_t) operand;
|
||||
|
||||
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
|
||||
|
||||
rc = fi_atomic(ofi_btl->ofi_endpoint,
|
||||
(void*) &comp->operand, 1, NULL, /* operand */
|
||||
btl_endpoint->peer_addr, /* remote addr */
|
||||
remote_address, remote_handle->rkey, /* remote buffer */
|
||||
fi_datatype, fi_op, comp);
|
||||
|
||||
if (rc == -FI_EAGAIN) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
} else if (rc < 0) {
|
||||
BTL_ERROR(("fi_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Initiate an asynchronous atomic compare-and-swap on remote memory.
 *
 * The previous remote value is delivered into local_address. Both the compare
 * value and the new value are copied into the completion descriptor so the
 * caller's values do not have to outlive this call.
 *
 * @param btl (IN)            BTL module
 * @param endpoint (IN)       BTL endpoint of the peer
 * @param local_address (IN)  Local buffer receiving the fetched value
 * @param remote_address (IN) Remote virtual address to operate on
 * @param local_handle (IN)   Registration handle covering local_address
 * @param remote_handle (IN)  Remote registration handle covering remote_address
 * @param compare (IN)        Value the remote memory is compared against
 * @param value (IN)          Value passed to libfabric as the swap operand
 * @param flags (IN)          MCA_BTL_ATOMIC_FLAG_32BIT selects a 32-bit datatype
 * @param order (IN)          Ordering (unused here)
 * @param cbfunc (IN)         Function to call on completion
 * @param cbcontext (IN)      Context for the callback
 * @param cbdata (IN)         Data for the callback
 *
 * @retval OPAL_SUCCESS              The operation was queued
 * @retval OPAL_ERR_OUT_OF_RESOURCE  No completion descriptor was available or
 *                                   libfabric returned -FI_EAGAIN. Try again later
 *
 * Any other libfabric failure is logged and ends in MCA_BTL_OFI_ABORT().
 */
int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                        void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    int rc;
    int fi_datatype = FI_UINT64;

    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
    mca_btl_ofi_completion_t *comp = NULL;

    if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
        fi_datatype = FI_UINT32;
    }

    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
                                        local_address,
                                        local_handle,
                                        cbfunc, cbcontext, cbdata,
                                        MCA_BTL_OFI_TYPE_CSWAP);
    if (NULL == comp) {
        /* defensive: never dereference a failed allocation */
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* copy the operand because it might get freed from upper layer */
    comp->operand = (uint64_t) value;
    comp->compare = (uint64_t) compare;

    /* convert the remote virtual address into an offset from the base
     * address recorded in the remote handle */
    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);

    /* perform atomic */
    /* NOTE(review): with FI_UINT32 the provider presumably reads only the
     * first 4 bytes of each 64-bit copy; on a big-endian host that would be
     * the high half -- confirm. */
    rc = fi_compare_atomic(ofi_btl->ofi_endpoint,
                           (void*) &comp->operand, 1, NULL,
                           (void*) &comp->compare, NULL,
                           local_address, local_handle->desc,
                           btl_endpoint->peer_addr,
                           remote_address, remote_handle->rkey,
                           fi_datatype,
                           FI_CSWAP,
                           comp);

    if (rc == -FI_EAGAIN) {
        /* nothing was posted, so no completion event will ever hand this
         * descriptor back: return it to its free list (my_list is presumably
         * set by mca_btl_ofi_completion_alloc() -- confirm) before asking
         * the caller to retry */
        opal_free_list_return(comp->my_list, &comp->comp_list);
        return OPAL_ERR_OUT_OF_RESOURCE;
    } else if (rc < 0) {
        BTL_ERROR(("fi_compare_atomic failed with rc=%d (%s)", rc, fi_strerror(-rc)));
        MCA_BTL_OFI_ABORT();
    }

    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);

    return OPAL_SUCCESS;
}
|
583
opal/mca/btl/ofi/btl_ofi_component.c
Обычный файл
583
opal/mca/btl/ofi/btl_ofi_component.c
Обычный файл
@ -0,0 +1,583 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_rdma.h"
|
||||
|
||||
|
||||
#define MCA_BTL_OFI_REQUIRED_CAPS (FI_RMA | FI_ATOMIC)
|
||||
#define MCA_BTL_OFI_REQUESTED_MR_MODE (FI_MR_UNSPEC)
|
||||
|
||||
static char *prov_include;
|
||||
static char *prov_exclude;
|
||||
static char *ofi_progress_mode;
|
||||
static int mca_btl_ofi_init_device(struct fi_info *info);
|
||||
|
||||
/* validate information returned from fi_getinfo().
|
||||
* return OPAL_ERROR if we dont have what we need. */
|
||||
static int validate_info(struct fi_info *info)
|
||||
{
|
||||
int mr_mode;
|
||||
|
||||
/* we need exactly all the required bits */
|
||||
if ((info->caps & MCA_BTL_OFI_REQUIRED_CAPS) != MCA_BTL_OFI_REQUIRED_CAPS) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* we need FI_EP_RDM */
|
||||
if (info->ep_attr->type != FI_EP_RDM) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
mr_mode = info->domain_attr->mr_mode;
|
||||
|
||||
if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE ||
|
||||
(mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Register the MCA parameters */
|
||||
static int mca_btl_ofi_component_register(void)
|
||||
{
|
||||
mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template;
|
||||
|
||||
/* fi_getinfo with prov_name == NULL means ALL provider.
|
||||
* Since now we are using the first valid info returned, I'm not sure
|
||||
* if we need to provide the support for comma limited provider list. */
|
||||
prov_include = NULL;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"provider_include",
|
||||
"OFI provider that ofi btl will query for. This parameter only "
|
||||
"accept ONE provider name. "
|
||||
"(e.g., \"psm2\"; an empty value means that all providers will "
|
||||
"be considered.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&prov_include);
|
||||
|
||||
/* TODO: this param has not been implemented. Not sure if we need it. " */
|
||||
prov_exclude = NULL;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"provider_exclude",
|
||||
"Comma-delimited list of OFI providers that are not considered for use "
|
||||
"(default: \"sockets,mxm\"; empty value means that all providers will "
|
||||
" be considered). "
|
||||
"Mutually exclusive with btl_ofi_provider_include.",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_4,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&prov_exclude);
|
||||
|
||||
/* Note: better leave it at 1 for now. osc rdma module is designed for 1 completion
|
||||
* at a time. Dealing with more than 1 completion in 1 read will confuse the osc rdma.
|
||||
* source: 8 hours of debugging. :(*/
|
||||
mca_btl_ofi_component.num_cqe_read = 1;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"num_cq_read",
|
||||
"Number of completion entries to read from a single cq_read. "
|
||||
"(default: 1)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&mca_btl_ofi_component.num_cqe_read);
|
||||
|
||||
ofi_progress_mode = "unspec";
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"progress_mode",
|
||||
"requested provider progress mode. [unspec, auto, manual]"
|
||||
"(default: unspec)",
|
||||
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ofi_progress_mode);
|
||||
|
||||
#if OPAL_C_HAVE__THREAD_LOCAL
|
||||
mca_btl_ofi_component.bind_threads_to_contexts = true;
|
||||
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
|
||||
"bind_threads_to_contexts", "Bind threads to device contexts. "
|
||||
"In general this should improve the multi-threaded performance "
|
||||
"when threads are used. (default: true)", MCA_BASE_VAR_TYPE_BOOL,
|
||||
NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
|
||||
MCA_BASE_VAR_SCOPE_ALL,
|
||||
&mca_btl_ofi_component.bind_threads_to_contexts);
|
||||
#endif
|
||||
|
||||
/* for now we want this component to lose to btl/ugni and btl/vader */
|
||||
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
|
||||
|
||||
return mca_btl_base_param_register (&mca_btl_ofi_component.super.btl_version,
|
||||
&module->super);
|
||||
}
|
||||
|
||||
static int mca_btl_ofi_component_open(void)
|
||||
{
|
||||
mca_btl_ofi_component.module_count = 0;
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* component cleanup - sanity checking of queue lengths
|
||||
*/
|
||||
static int mca_btl_ofi_component_close(void)
|
||||
{
|
||||
/* If we don't sleep, sockets provider freaks out. */
|
||||
sleep(1);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Fatal-error path of the OFI BTL (target of MCA_BTL_OFI_ABORT()).
 *
 * Logs a final message and terminates the process with exit status 1.
 * This function does not return.
 */
void mca_btl_ofi_exit(void)
{
    BTL_ERROR(("BTL OFI will now abort."));

    exit(1);
}
|
||||
|
||||
/*
|
||||
* OFI component initialization:
|
||||
* read interface list from kernel and compare against component parameters
|
||||
* then create a BTL instance for selected interfaces
|
||||
*/
|
||||
|
||||
static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules, bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
/* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */
|
||||
int rc;
|
||||
uint64_t progress_mode;
|
||||
unsigned resource_count = 0;
|
||||
struct mca_btl_base_module_t **base_modules;
|
||||
|
||||
BTL_VERBOSE(("initializing ofi btl"));
|
||||
|
||||
/* Set up libfabric hints. */
|
||||
uint32_t libfabric_api;
|
||||
libfabric_api = FI_VERSION(1, 5); /* 1.5 because of the newer API */
|
||||
|
||||
struct fi_info *info, *info_list;
|
||||
struct fi_info hints = {0};
|
||||
struct fi_ep_attr ep_attr = {0};
|
||||
struct fi_rx_attr rx_attr = {0};
|
||||
struct fi_tx_attr tx_attr = {0};
|
||||
struct fi_fabric_attr fabric_attr = {0};
|
||||
struct fi_domain_attr domain_attr = {0};
|
||||
|
||||
/* Select the provider */
|
||||
fabric_attr.prov_name = prov_include;
|
||||
|
||||
domain_attr.mr_mode = MCA_BTL_OFI_REQUESTED_MR_MODE;
|
||||
|
||||
/* message progression mode. */
|
||||
if (!strcmp(ofi_progress_mode, "auto")) {
|
||||
progress_mode = FI_PROGRESS_AUTO;
|
||||
} else if (!strcmp(ofi_progress_mode, "manual")) {
|
||||
progress_mode = FI_PROGRESS_MANUAL;
|
||||
} else {
|
||||
progress_mode = FI_PROGRESS_UNSPEC;
|
||||
}
|
||||
|
||||
domain_attr.control_progress = progress_mode;
|
||||
domain_attr.data_progress = progress_mode;
|
||||
|
||||
/* select endpoint type */
|
||||
ep_attr.type = FI_EP_RDM;
|
||||
|
||||
/* ask for capabilities */
|
||||
hints.caps = MCA_BTL_OFI_REQUIRED_CAPS;
|
||||
|
||||
hints.fabric_attr = &fabric_attr;
|
||||
hints.domain_attr = &domain_attr;
|
||||
hints.ep_attr = &ep_attr;
|
||||
hints.tx_attr = &tx_attr;
|
||||
hints.rx_attr = &rx_attr;
|
||||
|
||||
/* for now */
|
||||
tx_attr.iov_limit = 1;
|
||||
rx_attr.iov_limit = 1;
|
||||
|
||||
mca_btl_ofi_component.module_count = 0;
|
||||
|
||||
/* do the query. */
|
||||
rc = fi_getinfo(libfabric_api, NULL, NULL, 0, &hints, &info_list);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("fi_getinfo failed with code %d: %s",rc, fi_strerror(-rc)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* count the number of resources/ */
|
||||
info = info_list;
|
||||
while(info) {
|
||||
resource_count++;
|
||||
info = info->next;
|
||||
}
|
||||
BTL_VERBOSE(("ofi btl found %d possible resources.", resource_count));
|
||||
|
||||
info = info_list;
|
||||
|
||||
while(info) {
|
||||
rc = validate_info(info);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
/* Device passed sanity check, let's make a module.
|
||||
* We only pick the first device we found valid */
|
||||
rc = mca_btl_ofi_init_device(info);
|
||||
if (OPAL_SUCCESS == rc)
|
||||
break;
|
||||
}
|
||||
info = info->next;
|
||||
}
|
||||
|
||||
/* We are done with the returned info. */
|
||||
fi_freeinfo(info_list);
|
||||
|
||||
/* pass module array back to caller */
|
||||
base_modules = calloc (mca_btl_ofi_component.module_count, sizeof (*base_modules));
|
||||
if (NULL == base_modules) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
memcpy(base_modules, mca_btl_ofi_component.modules,
|
||||
mca_btl_ofi_component.module_count *sizeof (mca_btl_ofi_component.modules[0]));
|
||||
|
||||
BTL_VERBOSE(("ofi btl initialization complete. found %d suitable transports",
|
||||
mca_btl_ofi_component.module_count));
|
||||
|
||||
*num_btl_modules = mca_btl_ofi_component.module_count;
|
||||
|
||||
return base_modules;
|
||||
}
|
||||
|
||||
static int mca_btl_ofi_init_device(struct fi_info *info)
|
||||
{
|
||||
int rc;
|
||||
int *module_count = &mca_btl_ofi_component.module_count;
|
||||
size_t namelen;
|
||||
mca_btl_ofi_module_t *module;
|
||||
|
||||
char *linux_device_name;
|
||||
char ep_name[FI_NAME_MAX];
|
||||
struct fi_info *ofi_info;
|
||||
struct fi_cq_attr cq_attr = {0};
|
||||
struct fi_av_attr av_attr = {0};
|
||||
struct fid_fabric *fabric = NULL;
|
||||
struct fid_domain *domain = NULL;
|
||||
struct fid_ep *endpoint = NULL;
|
||||
struct fid_cq *cq = NULL;
|
||||
struct fid_av *av = NULL;
|
||||
|
||||
/* make a copy of the given info to store on the module */
|
||||
ofi_info = fi_dupinfo(info);
|
||||
|
||||
linux_device_name = info->domain_attr->name;
|
||||
BTL_VERBOSE(("initializing dev:%s provider:%s",
|
||||
linux_device_name,
|
||||
info->fabric_attr->prov_name));
|
||||
|
||||
/* fabric */
|
||||
rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_fabric with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* domain */
|
||||
rc = fi_domain(fabric, ofi_info, &domain, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_domain with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* endpoint */
|
||||
rc = fi_endpoint(domain, ofi_info, &endpoint, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_endpoint with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* CQ */
|
||||
cq_attr.format = FI_CQ_FORMAT_CONTEXT;
|
||||
cq_attr.wait_obj = FI_WAIT_NONE;
|
||||
rc = fi_cq_open(domain, &cq_attr, &cq, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* AV */
|
||||
av_attr.type = FI_AV_MAP;
|
||||
rc = fi_av_open(domain, &av_attr, &av, NULL);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_av_open with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
|
||||
/* bind CQ and AV to endpoint */
|
||||
uint32_t cq_flags = (FI_TRANSMIT);
|
||||
rc = fi_ep_bind(endpoint, (fid_t)cq, cq_flags);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
rc = fi_ep_bind(endpoint, (fid_t)av, 0);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* enable the endpoint for using */
|
||||
rc = fi_enable(endpoint);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_enable with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* Everything succeeded, lets create a module for this device. */
|
||||
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
|
||||
if (NULL == module) {
|
||||
goto fail;
|
||||
}
|
||||
*module = mca_btl_ofi_module_template;
|
||||
|
||||
/* store the information. */
|
||||
module->fabric_info = ofi_info;
|
||||
module->fabric = fabric;
|
||||
module->domain = domain;
|
||||
module->cq = cq;
|
||||
module->av = av;
|
||||
module->ofi_endpoint = endpoint;
|
||||
module->linux_device_name = linux_device_name;
|
||||
module->outstanding_rdma = 0;
|
||||
module->use_virt_addr = false;
|
||||
|
||||
if (ofi_info->domain_attr->mr_mode == FI_MR_BASIC ||
|
||||
ofi_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) {
|
||||
module->use_virt_addr = true;
|
||||
}
|
||||
|
||||
/* initialize the rcache */
|
||||
mca_btl_ofi_rcache_init(module);
|
||||
|
||||
OBJ_CONSTRUCT(&module->endpoints, opal_list_t);
|
||||
|
||||
/* init free lists */
|
||||
OBJ_CONSTRUCT(&module->comp_list, opal_free_list_t);
|
||||
rc = opal_free_list_init(&module->comp_list,
|
||||
sizeof(mca_btl_ofi_completion_t),
|
||||
opal_cache_line_size,
|
||||
OBJ_CLASS(mca_btl_ofi_completion_t),
|
||||
0,
|
||||
0,
|
||||
128,
|
||||
-1,
|
||||
128,
|
||||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
assert(OPAL_SUCCESS == rc);
|
||||
|
||||
/* create and send the modex for this device */
|
||||
namelen = sizeof(ep_name);
|
||||
rc = fi_getname((fid_t)endpoint, &ep_name[0], &namelen);
|
||||
if (0 != rc) {
|
||||
BTL_VERBOSE(("%s failed fi_getname with err=%s",
|
||||
linux_device_name,
|
||||
fi_strerror(-rc)
|
||||
));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* post our endpoint name so peer can use it to connect to us */
|
||||
OPAL_MODEX_SEND(rc,
|
||||
OPAL_PMIX_GLOBAL,
|
||||
&mca_btl_ofi_component.super.btl_version,
|
||||
&ep_name,
|
||||
namelen);
|
||||
mca_btl_ofi_component.namelen = namelen;
|
||||
|
||||
/* add this module to the list */
|
||||
mca_btl_ofi_component.modules[(*module_count)++] = module;
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
|
||||
fail:
|
||||
/* clean up */
|
||||
if (NULL != av) {
|
||||
fi_close(&av->fid);
|
||||
}
|
||||
if (NULL != cq) {
|
||||
fi_close(&cq->fid);
|
||||
}
|
||||
|
||||
if (NULL != endpoint) {
|
||||
fi_close(&endpoint->fid);
|
||||
}
|
||||
|
||||
if (NULL != domain) {
|
||||
fi_close(&domain->fid);
|
||||
}
|
||||
|
||||
if (NULL != fabric) {
|
||||
fi_close(&fabric->fid);
|
||||
}
|
||||
|
||||
/* not really a failure. just skip this device. */
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief OFI BTL progress function
|
||||
*
|
||||
* This function explictly progresses all workers.
|
||||
*/
|
||||
static int mca_btl_ofi_component_progress (void)
|
||||
{
|
||||
|
||||
int ret = 0;
|
||||
int events_read;
|
||||
int events = 0;
|
||||
struct fi_cq_entry cq_entry[MCA_BTL_OFI_MAX_CQ_READ_ENTRIES];
|
||||
struct fi_cq_err_entry cqerr = {0};
|
||||
|
||||
mca_btl_ofi_completion_t *comp;
|
||||
|
||||
for (int i = 0 ; i < mca_btl_ofi_component.module_count ; ++i) {
|
||||
mca_btl_ofi_module_t *module = mca_btl_ofi_component.modules[i];
|
||||
|
||||
ret = fi_cq_read(module->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);
|
||||
|
||||
if (0 < ret) {
|
||||
events_read = ret;
|
||||
for (int j = 0; j < events_read; j++) {
|
||||
if (NULL != cq_entry[j].op_context) {
|
||||
++events;
|
||||
comp = (mca_btl_ofi_completion_t*) cq_entry[j].op_context;
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)comp->btl;
|
||||
|
||||
switch (comp->type) {
|
||||
case MCA_BTL_OFI_TYPE_GET:
|
||||
case MCA_BTL_OFI_TYPE_PUT:
|
||||
case MCA_BTL_OFI_TYPE_AOP:
|
||||
case MCA_BTL_OFI_TYPE_AFOP:
|
||||
case MCA_BTL_OFI_TYPE_CSWAP:
|
||||
|
||||
/* call the callback */
|
||||
if (comp->cbfunc) {
|
||||
comp->cbfunc (comp->btl, comp->endpoint,
|
||||
comp->local_address, comp->local_handle,
|
||||
comp->cbcontext, comp->cbdata, OPAL_SUCCESS);
|
||||
}
|
||||
|
||||
/* return the completion handler */
|
||||
opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
|
||||
|
||||
MCA_BTL_OFI_NUM_RDMA_DEC(ofi_btl);
|
||||
break;
|
||||
|
||||
default:
|
||||
/* catasthrophic */
|
||||
BTL_ERROR(("unknown completion type"));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
|
||||
ret = fi_cq_readerr(module->cq, &cqerr, 0);
|
||||
|
||||
/* cq readerr failed!? */
|
||||
if (0 > ret) {
|
||||
BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
|
||||
__FILE__, __LINE__, fi_strerror(-ret), ret));
|
||||
} else {
|
||||
BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
|
||||
cqerr.prov_errno));
|
||||
}
|
||||
|
||||
MCA_BTL_OFI_ABORT();
|
||||
|
||||
} else if (OPAL_UNLIKELY(ret != -FI_EAGAIN && ret != -FI_EINTR)) {
|
||||
BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
}
|
||||
|
||||
return events;
|
||||
}
|
||||
|
||||
/** OFI btl component */
|
||||
mca_btl_ofi_component_t mca_btl_ofi_component = {
|
||||
.super = {
|
||||
.btl_version = {
|
||||
MCA_BTL_DEFAULT_VERSION("ofi"),
|
||||
.mca_open_component = mca_btl_ofi_component_open,
|
||||
.mca_close_component = mca_btl_ofi_component_close,
|
||||
.mca_register_component_params = mca_btl_ofi_component_register,
|
||||
},
|
||||
.btl_data = {
|
||||
/* The component is not checkpoint ready */
|
||||
.param_field = MCA_BASE_METADATA_PARAM_NONE
|
||||
},
|
||||
|
||||
.btl_init = mca_btl_ofi_component_init,
|
||||
.btl_progress = mca_btl_ofi_component_progress,
|
||||
}
|
||||
};
|
51
opal/mca/btl/ofi/btl_ofi_endpoint.c
Обычный файл
51
opal/mca/btl/ofi/btl_ofi_endpoint.c
Обычный файл
@ -0,0 +1,51 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
/* Object constructor: set up the endpoint lock and clear the peer address. */
static void mca_btl_ofi_endpoint_construct (mca_btl_ofi_endpoint_t *endpoint)
{
    OBJ_CONSTRUCT(&endpoint->ep_lock, opal_mutex_t);
    endpoint->peer_addr = 0;
}
|
||||
|
||||
/* Object destructor: forget the peer address and the borrowed libfabric
 * endpoint pointer, then tear down the lock. */
static void mca_btl_ofi_endpoint_destruct (mca_btl_ofi_endpoint_t *endpoint)
{
    /* set to null, we will free ofi endpoint in module */
    endpoint->ofi_endpoint = NULL;
    endpoint->peer_addr = 0;

    OBJ_DESTRUCT(&endpoint->ep_lock);
}
|
||||
|
||||
/* Class instance for BTL endpoints: a list item with the constructor and
 * destructor defined above. */
OBJ_CLASS_INSTANCE(mca_btl_ofi_endpoint_t, opal_list_item_t,
                   mca_btl_ofi_endpoint_construct, mca_btl_ofi_endpoint_destruct);
|
||||
|
||||
/*
 * Allocate a BTL endpoint for `proc` that uses the libfabric endpoint `ep`.
 * Returns the new endpoint, or NULL if the object could not be allocated.
 */
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep)
{
    mca_btl_ofi_endpoint_t *new_ep = OBJ_NEW(mca_btl_ofi_endpoint_t);

    if (OPAL_LIKELY(NULL != new_ep)) {
        new_ep->ofi_endpoint = ep;
        new_ep->ep_proc = proc;
    }

    return (mca_btl_base_endpoint_t *) new_ep;
}
|
||||
|
53
opal/mca/btl/ofi/btl_ofi_endpoint.h
Обычный файл
53
opal/mca/btl/ofi/btl_ofi_endpoint.h
Обычный файл
@ -0,0 +1,53 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_OFI_ENDPOINT_H
|
||||
#define MCA_BTL_OFI_ENDPOINT_H
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "btl_ofi.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
struct mca_btl_base_endpoint_t {
|
||||
opal_list_item_t super;
|
||||
|
||||
struct fid_ep *ofi_endpoint;
|
||||
fi_addr_t peer_addr;
|
||||
|
||||
/** endpoint proc */
|
||||
opal_proc_t *ep_proc;
|
||||
|
||||
/** mutex to protect this structure */
|
||||
opal_mutex_t ep_lock;
|
||||
};
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||
typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t);
|
||||
|
||||
mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create (opal_proc_t *proc, struct fid_ep *ep);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
327
opal/mca/btl/ofi/btl_ofi_module.c
Обычный файл
327
opal/mca/btl/ofi/btl_ofi_module.c
Обычный файл
@ -0,0 +1,327 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "opal_config.h"
|
||||
#include <string.h>
|
||||
#include "opal/class/opal_bitmap.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "opal/mca/mpool/mpool.h"
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
|
||||
static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
|
||||
size_t nprocs, opal_proc_t **opal_procs,
|
||||
mca_btl_base_endpoint_t **peers,
|
||||
opal_bitmap_t *reachable)
|
||||
{
|
||||
int rc;
|
||||
int count;
|
||||
char *ep_name = NULL;
|
||||
size_t namelen = mca_btl_ofi_component.namelen;
|
||||
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
|
||||
for (size_t i = 0 ; i < nprocs ; ++i) {
|
||||
peers[i] = mca_btl_ofi_endpoint_create (opal_procs[i], ofi_btl->ofi_endpoint);
|
||||
if (OPAL_UNLIKELY(NULL == peers[i])) {
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
|
||||
&peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
BTL_ERROR(("error receiving modex"));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
|
||||
/* get peer fi_addr */
|
||||
count = fi_av_insert(ofi_btl->av, /* Address vector to insert */
|
||||
ep_name, /* peer name */
|
||||
1, /* amount to insert */
|
||||
&peers[i]->peer_addr, /* return peer address here */
|
||||
0, /* flags */
|
||||
NULL); /* context */
|
||||
|
||||
/* if succeed, add this proc and mark reachable */
|
||||
if (count == 1) { /* we inserted 1 address. */
|
||||
opal_list_append (&ofi_btl->endpoints, &peers[i]->super);
|
||||
opal_bitmap_set_bit(reachable, i);
|
||||
} else {
|
||||
BTL_VERBOSE(("fi_av_insert failed with rc = %d", count));
|
||||
MCA_BTL_OFI_ABORT();
|
||||
}
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
|
||||
opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
|
||||
{
|
||||
int ret;
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
|
||||
for (size_t i = 0 ; i < nprocs ; ++i) {
|
||||
if (peers[i]) {
|
||||
|
||||
/* remove the address from AV. */
|
||||
ret = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
|
||||
if (ret < 0) {
|
||||
/* remove failed. this should not happen. */
|
||||
/* Lets not crash because we failed to remove an address. */
|
||||
BTL_ERROR(("fi_av_remove failed with error %d:%s",
|
||||
ret, fi_strerror(-ret)));
|
||||
}
|
||||
|
||||
/* remove and free MPI endpoint from the list. */
|
||||
opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
|
||||
OBJ_RELEASE(peers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Create the module's registration cache ("grdma") on first use.
 *
 * The cache is named "ofi.<device name>" and uses mca_btl_ofi_reg_mem /
 * mca_btl_ofi_dereg_mem as its (de)registration hooks, with the module itself
 * as their reg_data.  Calling this again on an initialized module is a no-op.
 * Failure to build the name or the cache aborts.
 */
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module)
{
    mca_rcache_base_resources_t rcache_resources;
    char *tmp = NULL;

    if (module->initialized) {
        return;
    }

    /* on failure asprintf leaves tmp undefined, so it must not be used */
    if (0 > asprintf (&tmp, "ofi.%s", module->linux_device_name)) {
        BTL_ERROR(("cannot allocate rcache name"));
        MCA_BTL_OFI_ABORT();
    }

    rcache_resources.cache_name = tmp;
    rcache_resources.reg_data = (void *) module;
    rcache_resources.sizeof_reg = sizeof (mca_btl_ofi_reg_t);
    rcache_resources.register_mem = mca_btl_ofi_reg_mem;
    rcache_resources.deregister_mem = mca_btl_ofi_dereg_mem;

    module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources);
    free (tmp);

    if (NULL == module->rcache) {
        /* something went horribly wrong */
        BTL_ERROR(("cannot create rcache"));
        MCA_BTL_OFI_ABORT();
    }

    module->initialized = true;
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Register a memory region for put/get/atomic operations.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
|
||||
* @param base (IN) Pointer to start of region
|
||||
* @param size (IN) Size of region
|
||||
* @param flags (IN) Flags indicating what operation will be performed. Valid
|
||||
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
|
||||
* and MCA_BTL_DES_FLAGS_ATOMIC
|
||||
*
|
||||
* @returns a memory registration handle valid for both local and remote operations
|
||||
* @returns NULL if the region could not be registered
|
||||
*
|
||||
* This function registers the specified region with the hardware for use with
|
||||
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
|
||||
* functions. Care should be taken to not hold an excessive number of registrations
|
||||
* as they may use limited system/NIC resources.
|
||||
*/
|
||||
static struct mca_btl_base_registration_handle_t *
|
||||
mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
||||
size_t size, uint32_t flags)
|
||||
{
|
||||
mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_reg_t *reg;
|
||||
int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
|
||||
int rc;
|
||||
|
||||
rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags,
|
||||
(mca_rcache_base_registration_t **) ®);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ®->handle;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Deregister a memory region
|
||||
*
|
||||
* @param btl (IN) BTL module region was registered with
|
||||
* @param handle (IN) BTL registration handle to deregister
|
||||
*
|
||||
* This function deregisters the memory region associated with the specified handle. Care
|
||||
* should be taken to not perform any RDMA or atomic operation on this memory region
|
||||
* after it is deregistered. It is erroneous to specify a memory handle associated with
|
||||
* a remote node.
|
||||
*/
|
||||
static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
    mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
    /* step back from the embedded handle to the registration that holds it */
    mca_btl_ofi_reg_t *reg =
        (mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle));

    /* best effort: the rcache result is deliberately not propagated */
    (void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, &reg->base);

    return OPAL_SUCCESS;
}
|
||||
|
||||
/*
 * rcache registration hook: register [base, base + size) with the module's
 * libfabric domain for local and remote read/write, and fill in the BTL
 * handle (rkey, descriptor, base address) stored in the registration.
 *
 * reg_data is the OFI module; reg is the registration object being set up.
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE if fi_mr_reg fails.
 */
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg)
{
    const uint64_t mr_access = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE;
    mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t*) reg_data;
    mca_btl_ofi_reg_t *ofi_reg = (mca_btl_ofi_reg_t*) reg;

    if (0 != fi_mr_reg(ofi_module->domain, base, size, mr_access, 0,
                       (uint64_t) reg, 0, &ofi_reg->ur_mr, NULL)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    ofi_reg->handle.rkey = fi_mr_key(ofi_reg->ur_mr);
    ofi_reg->handle.desc = fi_mr_desc(ofi_reg->ur_mr);

    /* In case the provider doesn't support FI_MR_VIRT_ADDR,
     * we need to reference the remote address by the distance from base registered
     * address. We keep this information to use in rdma/atomic operations. */
    if (!ofi_module->use_virt_addr) {
        ofi_reg->handle.base_addr = base;
    } else {
        ofi_reg->handle.base_addr = 0;
    }

    return OPAL_SUCCESS;
}
|
||||
|
||||
/*
 * rcache deregistration hook: close the libfabric memory region attached to
 * the registration, if there is one.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERROR if fi_close() fails.
 */
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
{
    mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*)reg;
    int rc;

    if (ur->ur_mr != NULL) {
        rc = fi_close(&ur->ur_mr->fid);
        if (0 != rc) {
            /* fi_close reports failure through its return value, not errno */
            BTL_ERROR(("%s: error unpinning memory mr=%p: %s",
                       __func__, (void*) ur->ur_mr, fi_strerror(-rc)));
            return OPAL_ERROR;
        }

        /* the region is gone; do not keep a stale pointer around */
        ur->ur_mr = NULL;
    }

    return OPAL_SUCCESS;
}
|
||||
|
||||
/*
|
||||
* Cleanup/release module resources.
|
||||
*/
|
||||
|
||||
int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
|
||||
{
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
||||
mca_btl_ofi_endpoint_t *endpoint, *next;
|
||||
|
||||
assert(btl);
|
||||
|
||||
if (NULL != ofi_btl->cq) {
|
||||
fi_close(&ofi_btl->cq->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->av) {
|
||||
fi_close(&ofi_btl->av->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->ofi_endpoint) {
|
||||
fi_close(&ofi_btl->ofi_endpoint->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->domain) {
|
||||
fi_close(&ofi_btl->domain->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->fabric) {
|
||||
fi_close(&ofi_btl->fabric->fid);
|
||||
}
|
||||
|
||||
if (NULL != ofi_btl->fabric_info) {
|
||||
fi_freeinfo(ofi_btl->fabric_info);
|
||||
}
|
||||
|
||||
/* clean up any leftover endpoints */
|
||||
OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) {
|
||||
opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super);
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&ofi_btl->endpoints);
|
||||
OBJ_DESTRUCT(&ofi_btl->comp_list);
|
||||
|
||||
if (ofi_btl->rcache) {
|
||||
mca_rcache_base_module_destroy (ofi_btl->rcache);
|
||||
}
|
||||
|
||||
free (btl);
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
mca_btl_ofi_module_t mca_btl_ofi_module_template = {
|
||||
.super = {
|
||||
/* initialize functions. this btl only support RDMA and atomics
|
||||
* for now so it does not provide prepare_src, alloc, free, or send */
|
||||
.btl_component = &mca_btl_ofi_component.super,
|
||||
.btl_add_procs = mca_btl_ofi_add_procs,
|
||||
.btl_del_procs = mca_btl_ofi_del_procs,
|
||||
.btl_finalize = mca_btl_ofi_finalize,
|
||||
.btl_put = mca_btl_ofi_put,
|
||||
.btl_get = mca_btl_ofi_get,
|
||||
.btl_register_mem = mca_btl_ofi_register_mem,
|
||||
.btl_deregister_mem = mca_btl_ofi_deregister_mem,
|
||||
.btl_atomic_op = mca_btl_ofi_aop,
|
||||
.btl_atomic_fop = mca_btl_ofi_afop,
|
||||
.btl_atomic_cswap = mca_btl_ofi_acswap,
|
||||
.btl_flush = mca_btl_ofi_flush,
|
||||
|
||||
/* set the default flags for this btl. ofi provides us with rdma and both
|
||||
* fetching and non-fetching atomics (though limited to add and cswap) */
|
||||
.btl_flags = MCA_BTL_FLAGS_RDMA |
|
||||
MCA_BTL_FLAGS_ATOMIC_FOPS |
|
||||
MCA_BTL_FLAGS_ATOMIC_OPS,
|
||||
|
||||
.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_SWAP |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
|
||||
MCA_BTL_ATOMIC_SUPPORTS_32BIT,
|
||||
|
||||
/* set the default limits on put and get */
|
||||
.btl_registration_handle_size = sizeof(mca_btl_base_registration_handle_t),
|
||||
.btl_put_limit = 1 << 23,
|
||||
.btl_put_alignment = 0,
|
||||
.btl_get_limit = 1 << 23,
|
||||
.btl_get_alignment = 0,
|
||||
}
|
||||
};
|
148
opal/mca/btl/ofi/btl_ofi_rdma.c
Обычный файл
148
opal/mca/btl/ofi/btl_ofi_rdma.c
Обычный файл
@ -0,0 +1,148 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ofi_rdma.h"
|
||||
|
||||
/* Class instance for completion contexts: a free-list item with no
 * constructor or destructor. */
OBJ_CLASS_INSTANCE(mca_btl_ofi_completion_t, opal_free_list_item_t, NULL, NULL);
|
||||
|
||||
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata,
|
||||
int type)
|
||||
{
|
||||
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*)btl;
|
||||
mca_btl_ofi_completion_t *comp;
|
||||
|
||||
comp = (mca_btl_ofi_completion_t*) opal_free_list_get(&ofi_btl->comp_list);
|
||||
assert(comp);
|
||||
|
||||
comp->btl = btl;
|
||||
comp->endpoint = endpoint;
|
||||
comp->local_address = local_address;
|
||||
comp->local_handle = local_handle;
|
||||
comp->cbfunc = cbfunc;
|
||||
comp->cbcontext = cbcontext;
|
||||
comp->cbdata = cbdata;
|
||||
comp->my_list = &ofi_btl->comp_list;
|
||||
comp->type = type;
|
||||
|
||||
return comp;
|
||||
}
|
||||
|
||||
/*
 * Post an RDMA read of `size` bytes from the peer's registered memory into
 * local_address.  cbfunc (if set) is invoked by the progress function once
 * the read completes.  `flags` and `order` are not used.
 *
 * Returns OPAL_SUCCESS when the read was posted, or OPAL_ERR_OUT_OF_RESOURCE
 * when libfabric reports -FI_EAGAIN, in which case nothing was posted and
 * the callback will not be invoked.  Any other libfabric error aborts.
 */
int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                     uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                     mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{

    int rc;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
    mca_btl_ofi_completion_t *comp;

    /* create completion context */
    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
                                        local_address,
                                        local_handle,
                                        cbfunc, cbcontext, cbdata,
                                        MCA_BTL_OFI_TYPE_GET);

    /* address the target relative to the base stored in the remote handle */
    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);

    /* Remote read data across the wire */
    rc = fi_read(ofi_btl->ofi_endpoint,
                 local_address, size,   /* payload */
                 local_handle->desc,
                 btl_endpoint->peer_addr,
                 remote_address, remote_handle->rkey,
                 comp);                 /* completion context */

    if (-FI_EAGAIN == rc) {
        /* nothing was posted, so no completion will ever hand this
         * context back: return it to the free list ourselves */
        opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (0 != rc) {
        BTL_ERROR(("fi_read failed with %d:%s", rc, fi_strerror(-rc)));
        MCA_BTL_OFI_ABORT();
    }

    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);

    /* force a bit of progress */
    mca_btl_ofi_component.super.btl_progress();

    return OPAL_SUCCESS;
}
|
||||
|
||||
/*
 * Post an RDMA write of `size` bytes from local_address into the peer's
 * registered memory.  cbfunc (if set) is invoked by the progress function
 * once the write completes.  `flags` and `order` are not used.
 *
 * Returns OPAL_SUCCESS when the write was posted, or OPAL_ERR_OUT_OF_RESOURCE
 * when libfabric reports -FI_EAGAIN, in which case nothing was posted and
 * the callback will not be invoked.  Any other libfabric error aborts.
 */
int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                     uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                     mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                     int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    int rc;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
    mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;

    /* create completion context */
    mca_btl_ofi_completion_t *comp;
    comp = mca_btl_ofi_completion_alloc(btl, endpoint,
                                        local_address,
                                        local_handle,
                                        cbfunc, cbcontext, cbdata,
                                        MCA_BTL_OFI_TYPE_PUT);

    /* address the target relative to the base stored in the remote handle */
    remote_address = (remote_address - (uint64_t) remote_handle->base_addr);

    /* Remote write data across the wire */
    rc = fi_write(ofi_btl->ofi_endpoint,
                  local_address, size,  /* payload */
                  local_handle->desc,
                  btl_endpoint->peer_addr,
                  remote_address, remote_handle->rkey,
                  comp);                /* completion context */

    if (-FI_EAGAIN == rc) {
        /* nothing was posted, so no completion will ever hand this
         * context back: return it to the free list ourselves */
        opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (0 != rc) {
        BTL_ERROR(("fi_write failed with %d:%s", rc, fi_strerror(-rc)));
        MCA_BTL_OFI_ABORT();
    }

    MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);

    /* force a bit of progress */
    mca_btl_ofi_component.super.btl_progress();

    return OPAL_SUCCESS;

}
|
||||
|
||||
/*
 * Drive the component's progress function until the module's count of
 * outstanding RDMA operations is no longer positive.  The endpoint argument
 * is not consulted.  Always returns OPAL_SUCCESS.
 */
int mca_btl_ofi_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint)
{
    mca_btl_ofi_module_t *module = (mca_btl_ofi_module_t *) btl;

    for (;;) {
        if (module->outstanding_rdma <= 0) {
            break;
        }
        (void) mca_btl_ofi_component.super.btl_progress();
    }

    return OPAL_SUCCESS;
}
|
38
opal/mca/btl/ofi/btl_ofi_rdma.h
Обычный файл
38
opal/mca/btl/ofi/btl_ofi_rdma.h
Обычный файл
@ -0,0 +1,38 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef BTL_OFI_RDMA_H
|
||||
#define BTL_OFI_RDMA_H
|
||||
|
||||
#include "opal/threads/thread_usage.h"
|
||||
|
||||
#include "btl_ofi.h"
|
||||
#include "btl_ofi_endpoint.h"
|
||||
|
||||
mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
|
||||
mca_btl_base_module_t *btl,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address,
|
||||
mca_btl_base_registration_handle_t *local_handle,
|
||||
mca_btl_base_rdma_completion_fn_t cbfunc,
|
||||
void *cbcontext, void *cbdata,
|
||||
int type);
|
||||
|
||||
/*
 * Add one to the module's outstanding_rdma counter through
 * OPAL_THREAD_ADD_FETCH64.
 *
 * Expands to a single parenthesised expression with no trailing semicolon:
 * the caller writes the ';', so the macro is safe as the body of an
 * unbraced if/else.
 */
#define MCA_BTL_OFI_NUM_RDMA_INC(module)                                \
    (OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1))
|
||||
|
||||
/*
 * Subtract one from the module's outstanding_rdma counter through
 * OPAL_THREAD_ADD_FETCH64.
 *
 * Expands to a single parenthesised expression with no trailing semicolon:
 * the caller writes the ';', so the macro is safe as the body of an
 * unbraced if/else.
 */
#define MCA_BTL_OFI_NUM_RDMA_DEC(module)                                \
    (OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1))
|
||||
|
||||
#endif /* !defined(BTL_OFI_RDMA_H) */
|
||||
|
45
opal/mca/btl/ofi/configure.m4
Обычный файл
45
opal/mca/btl/ofi/configure.m4
Обычный файл
@ -0,0 +1,45 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 QLogic Corp. All rights reserved.
|
||||
# Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011-2018 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2018 Intel, inc. All rights reserved
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# MCA_opal_btl_ofi_CONFIG([action-if-can-compile], [action-if-cant-compile])
# --------------------------------------------------------
# Requires MCA_opal_common_ofi_CONFIG, then runs action-if-can-compile when
# $opal_common_ofi_happy is "yes" and action-if-cant-compile otherwise.
# Also marks btl_ofi_{CPPFLAGS, LDFLAGS, LIBS} for substitution.
|
||||
|
||||
AC_DEFUN([MCA_opal_btl_ofi_CONFIG],[
    # generate the Makefile for this component
    AC_CONFIG_FILES([opal/mca/btl/ofi/Makefile])

    # this component relies on the result of the common OFI configuration
    AC_REQUIRE([MCA_opal_common_ofi_CONFIG])

    # run the first action when the common OFI check was happy, else the second
    AS_IF([test "$opal_common_ofi_happy" = "yes"], [$1], [$2])

    # substitute in the things needed to build ofi
    # NOTE(review): nothing in this macro assigns these three variables;
    # confirm whether they are set elsewhere or are meant to stay empty.
    AC_SUBST([btl_ofi_CPPFLAGS])
    AC_SUBST([btl_ofi_LDFLAGS])
    AC_SUBST([btl_ofi_LIBS])
])dnl
|
7
opal/mca/btl/ofi/owner.txt
Обычный файл
7
opal/mca/btl/ofi/owner.txt
Обычный файл
@ -0,0 +1,7 @@
|
||||
#
|
||||
# owner/status file
|
||||
# owner: institution that is responsible for this package
|
||||
# status: e.g. active, maintenance, unmaintained
|
||||
#
|
||||
owner:Intel
|
||||
status:active
|
Загрузка…
x
Ссылка в новой задаче
Block a user