1bc3dab118
Per suggestion of @awlauria, added some comments about the need to free ep before resources it points to. Signed-off-by: Harumi Kuno <harumi.kuno@hpe.com>
450 строки
16 KiB
C
450 строки
16 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2013 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2018 Intel, Inc, All rights reserved
|
|
*
|
|
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "opal_config.h"
|
|
#include <string.h>
|
|
#include "opal/class/opal_bitmap.h"
|
|
#include "opal/util/printf.h"
|
|
#include "opal/mca/btl/btl.h"
|
|
#include "opal/datatype/opal_convertor.h"
|
|
#include "opal/mca/mpool/base/base.h"
|
|
#include "opal/mca/mpool/mpool.h"
|
|
|
|
#include "btl_ofi.h"
|
|
#include "btl_ofi_endpoint.h"
|
|
#include "btl_ofi_frag.h"
|
|
|
|
static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
|
|
size_t nprocs, opal_proc_t **opal_procs,
|
|
mca_btl_base_endpoint_t **peers,
|
|
opal_bitmap_t *reachable)
|
|
{
|
|
int rc;
|
|
int count;
|
|
char *ep_name = NULL;
|
|
size_t namelen = mca_btl_ofi_component.namelen;
|
|
|
|
opal_proc_t *proc;
|
|
mca_btl_base_endpoint_t *ep;
|
|
|
|
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
|
|
|
for (size_t i = 0 ; i < nprocs ; ++i) {
|
|
|
|
proc = opal_procs[i];
|
|
|
|
/* See if we already have an endpoint for this proc. */
|
|
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void **) &ep);
|
|
|
|
if (OPAL_SUCCESS == rc) {
|
|
BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name)));
|
|
peers[i] = ep;
|
|
|
|
} else {
|
|
/* We don't have this endpoint yet, create one */
|
|
peers[i] = mca_btl_ofi_endpoint_create (proc, ofi_btl->ofi_endpoint);
|
|
BTL_VERBOSE(("creating peer %p", (void*) peers[i]));
|
|
|
|
if (OPAL_UNLIKELY(NULL == peers[i])) {
|
|
return OPAL_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
/* Add this endpoint to the lookup table */
|
|
(void) opal_hash_table_set_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void**) &ep);
|
|
}
|
|
|
|
OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
|
|
&peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen);
|
|
if (OPAL_SUCCESS != rc) {
|
|
BTL_ERROR(("error receiving modex"));
|
|
MCA_BTL_OFI_ABORT();
|
|
}
|
|
|
|
/* get peer fi_addr */
|
|
count = fi_av_insert(ofi_btl->av, /* Address vector to insert */
|
|
ep_name, /* peer name */
|
|
1, /* amount to insert */
|
|
&peers[i]->peer_addr, /* return peer address here */
|
|
0, /* flags */
|
|
NULL); /* context */
|
|
|
|
/* if succeed, add this proc and mark reachable */
|
|
if (count == 1) { /* we inserted 1 address. */
|
|
opal_list_append (&ofi_btl->endpoints, &peers[i]->super);
|
|
opal_bitmap_set_bit(reachable, i);
|
|
} else {
|
|
BTL_VERBOSE(("fi_av_insert failed with rc = %d", count));
|
|
MCA_BTL_OFI_ABORT();
|
|
}
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
|
|
opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
|
|
{
|
|
int rc;
|
|
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
|
mca_btl_base_endpoint_t *ep;
|
|
|
|
for (size_t i = 0 ; i < nprocs ; ++i) {
|
|
if (peers[i]) {
|
|
rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i], (void **) &ep);
|
|
|
|
if (OPAL_SUCCESS == rc) {
|
|
/* remove the address from AV. */
|
|
rc = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
|
|
if (rc < 0) {
|
|
/* remove failed. this should not happen. */
|
|
/* Lets not crash because we failed to remove an address. */
|
|
BTL_ERROR(("fi_av_remove failed with error %d:%s",
|
|
rc, fi_strerror(-rc)));
|
|
}
|
|
|
|
/* remove and free MPI endpoint from the list. */
|
|
opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
|
|
(void) opal_hash_table_remove_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i]);
|
|
OBJ_RELEASE(peers[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/**
 * @brief Create the module's registration cache on first use.
 *
 * Does nothing when module->initialized is already set. Otherwise a "grdma"
 * rcache named "ofi.<linux_device_name>" is created with this BTL's
 * register/deregister callbacks, and the module is marked initialized.
 *
 * @param module (IN/OUT) OFI BTL module whose rcache is set up
 */
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module)
{
    mca_rcache_base_resources_t resources;
    char *cache_name;

    if (module->initialized) {
        return;
    }

    (void) opal_asprintf (&cache_name, "ofi.%s", module->linux_device_name);

    resources.cache_name     = cache_name;
    resources.reg_data       = (void *) module;
    resources.sizeof_reg     = sizeof (mca_btl_ofi_reg_t);
    resources.register_mem   = mca_btl_ofi_reg_mem;
    resources.deregister_mem = mca_btl_ofi_dereg_mem;

    module->rcache = mca_rcache_base_module_create ("grdma", module, &resources);

    /* The name string is only needed for the create call above. */
    free (cache_name);

    if (NULL == module->rcache) {
        /* something went horribly wrong */
        BTL_ERROR(("cannot create rcache"));
        MCA_BTL_OFI_ABORT();
    }

    module->initialized = true;
}
|
|
|
|
|
|
/**
|
|
* @brief Register a memory region for put/get/atomic operations.
|
|
*
|
|
* @param btl (IN) BTL module
|
|
* @param endpoint(IN) BTL addressing information (or NULL for all endpoints)
|
|
* @param base (IN) Pointer to start of region
|
|
* @param size (IN) Size of region
|
|
* @param flags (IN) Flags indicating what operation will be performed. Valid
|
|
* values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
|
|
* and MCA_BTL_DES_FLAGS_ATOMIC
|
|
*
|
|
* @returns a memory registration handle valid for both local and remote operations
|
|
* @returns NULL if the region could not be registered
|
|
*
|
|
* This function registers the specified region with the hardware for use with
|
|
* the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
|
|
* functions. Care should be taken to not hold an excessive number of registrations
|
|
* as they may use limited system/NIC resources.
|
|
*/
|
|
static struct mca_btl_base_registration_handle_t *
|
|
mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
|
|
size_t size, uint32_t flags)
|
|
{
|
|
mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
|
|
mca_btl_ofi_reg_t *reg;
|
|
int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
|
|
int rc;
|
|
|
|
rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags,
|
|
(mca_rcache_base_registration_t **) ®);
|
|
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
|
return NULL;
|
|
}
|
|
|
|
return ®->handle;
|
|
}
|
|
|
|
/**
|
|
* @brief Deregister a memory region
|
|
*
|
|
* @param btl (IN) BTL module region was registered with
|
|
* @param handle (IN) BTL registration handle to deregister
|
|
*
|
|
* This function deregisters the memory region associated with the specified handle. Care
|
|
* should be taken to not perform any RDMA or atomic operation on this memory region
|
|
* after it is deregistered. It is erroneous to specify a memory handle associated with
|
|
* a remote node.
|
|
*/
|
|
static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
|
|
{
|
|
mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
|
|
mca_btl_ofi_reg_t *reg =
|
|
(mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle));
|
|
|
|
(void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, ®->base);
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/**
 * @brief rcache callback: register a memory region with the OFI domain.
 *
 * @param reg_data (IN)     the mca_btl_ofi_module_t that owns the rcache
 * @param base     (IN)     start of the region
 * @param size     (IN)     size of the region
 * @param reg      (IN/OUT) registration to fill in (a mca_btl_ofi_reg_t)
 *
 * @returns OPAL_SUCCESS on success
 * @returns OPAL_ERR_OUT_OF_RESOURCE if fi_mr_reg() fails
 */
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg)
{
    static uint64_t access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE;

    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t*) reg_data;
    mca_btl_ofi_reg_t *ofi_reg = (mca_btl_ofi_reg_t*) reg;

    /* The registration pointer itself is passed as the requested key. */
    int rc = fi_mr_reg(ofi_btl->domain, base, size, access_flags, 0,
                       (uint64_t) reg, 0, &ofi_reg->ur_mr, NULL);
    if (0 != rc) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    ofi_reg->handle.rkey = fi_mr_key(ofi_reg->ur_mr);
    ofi_reg->handle.desc = fi_mr_desc(ofi_reg->ur_mr);

    /* In case the provider doesn't support FI_MR_VIRT_ADDR, the remote
     * address has to be expressed as the distance from the registered base
     * address, so the base is remembered for the rdma/atomic operations. */
    if (ofi_btl->use_virt_addr) {
        ofi_reg->handle.base_addr = 0;
    } else {
        ofi_reg->handle.base_addr = base;
    }

    return OPAL_SUCCESS;
}
|
|
|
|
/**
 * @brief rcache callback: release the OFI memory region of a registration.
 *
 * @param reg_data (IN)     unused
 * @param reg      (IN/OUT) registration to release (a mca_btl_ofi_reg_t)
 *
 * @returns OPAL_SUCCESS if there was nothing to close or the close succeeded
 * @returns OPAL_ERROR if fi_close() failed
 */
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
{
    mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*) reg;
    int rc;

    (void) reg_data;

    if (NULL == ur->ur_mr) {
        return OPAL_SUCCESS;
    }

    rc = fi_close(&ur->ur_mr->fid);
    if (0 != rc) {
        /* libfabric reports failures as a negative fabric errno in the return
         * value and does not set errno, so decode rc with fi_strerror(). */
        BTL_ERROR(("%s: error unpinning memory mr=%p: %s",
                   __func__, (void*) ur->ur_mr, fi_strerror(-rc)));
        return OPAL_ERROR;
    }

    /* The region is closed; clear the pointer so it is not closed again. */
    ur->ur_mr = NULL;

    return OPAL_SUCCESS;
}
|
|
|
|
/*
|
|
* Cleanup/release module resources.
|
|
*/
|
|
|
|
int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
|
|
{
|
|
int i;
|
|
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
|
|
mca_btl_ofi_endpoint_t *endpoint, *next;
|
|
|
|
assert(btl);
|
|
|
|
/* clear the rcache */
|
|
if (ofi_btl->rcache) {
|
|
mca_rcache_base_module_destroy (ofi_btl->rcache);
|
|
ofi_btl->rcache = NULL;
|
|
}
|
|
|
|
/* Close basic ep before closing its attached resources. */
|
|
if (NULL != ofi_btl->ofi_endpoint && !ofi_btl->is_scalable_ep) {
|
|
fi_close(&ofi_btl->ofi_endpoint->fid);
|
|
ofi_btl->ofi_endpoint = NULL;
|
|
}
|
|
|
|
/* loop over all the contexts */
|
|
for (i=0; i < ofi_btl->num_contexts; i++) {
|
|
mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
|
|
}
|
|
free(ofi_btl->contexts);
|
|
|
|
if (NULL != ofi_btl->ofi_endpoint) {
|
|
fi_close(&ofi_btl->ofi_endpoint->fid);
|
|
}
|
|
|
|
/* close ep before closing av */
|
|
if (NULL != ofi_btl->av) {
|
|
fi_close(&ofi_btl->av->fid);
|
|
}
|
|
|
|
if (NULL != ofi_btl->domain) {
|
|
fi_close(&ofi_btl->domain->fid);
|
|
}
|
|
|
|
if (NULL != ofi_btl->fabric) {
|
|
fi_close(&ofi_btl->fabric->fid);
|
|
}
|
|
|
|
if (NULL != ofi_btl->fabric_info) {
|
|
fi_freeinfo(ofi_btl->fabric_info);
|
|
}
|
|
|
|
/* clean up any leftover endpoints */
|
|
OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) {
|
|
opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super);
|
|
OBJ_RELEASE(endpoint);
|
|
}
|
|
|
|
OBJ_DESTRUCT(&ofi_btl->endpoints);
|
|
OBJ_DESTRUCT(&ofi_btl->id_to_endpoint);
|
|
OBJ_DESTRUCT(&ofi_btl->module_lock);
|
|
|
|
free (btl);
|
|
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/* Post wildcard recvs on the rx context. */
|
|
int mca_btl_ofi_post_recvs (mca_btl_base_module_t *module,
|
|
mca_btl_ofi_context_t *context,
|
|
int count)
|
|
{
|
|
int i;
|
|
int rc;
|
|
mca_btl_ofi_base_frag_t *frag;
|
|
mca_btl_ofi_frag_completion_t *comp;
|
|
|
|
for (i=0; i < count; i++) {
|
|
frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(module,
|
|
NULL,
|
|
0,
|
|
MCA_BTL_OFI_FRAG_SIZE,
|
|
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
if (NULL == frag) {
|
|
BTL_ERROR(("cannot allocate recv frag."));
|
|
return OPAL_ERROR;
|
|
}
|
|
|
|
comp = mca_btl_ofi_frag_completion_alloc (module,
|
|
context,
|
|
frag,
|
|
MCA_BTL_OFI_TYPE_RECV);
|
|
|
|
rc = fi_recv (context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE,
|
|
NULL, FI_ADDR_UNSPEC, &comp->comp_ctx);
|
|
|
|
if (FI_SUCCESS != rc) {
|
|
BTL_ERROR(("cannot post recvs"));
|
|
return OPAL_ERROR;
|
|
}
|
|
}
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/* Allocate and fill out the module capabilities according to operation mode. */
|
|
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode)
|
|
{
|
|
mca_btl_ofi_module_t *module;
|
|
|
|
/* allocate module */
|
|
module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
|
|
if (NULL == module) {
|
|
return NULL;
|
|
}
|
|
|
|
/* fill in the defaults */
|
|
*module = mca_btl_ofi_module_template;
|
|
|
|
if (mode == MCA_BTL_OFI_MODE_ONE_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
|
|
|
|
module->super.btl_put = mca_btl_ofi_put;
|
|
module->super.btl_get = mca_btl_ofi_get;
|
|
module->super.btl_atomic_op = mca_btl_ofi_aop;
|
|
module->super.btl_atomic_fop = mca_btl_ofi_afop;
|
|
module->super.btl_atomic_cswap = mca_btl_ofi_acswap;
|
|
module->super.btl_flush = mca_btl_ofi_flush;
|
|
|
|
module->super.btl_register_mem = mca_btl_ofi_register_mem;
|
|
module->super.btl_deregister_mem = mca_btl_ofi_deregister_mem;
|
|
|
|
module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS |
|
|
MCA_BTL_FLAGS_ATOMIC_OPS |
|
|
MCA_BTL_FLAGS_RDMA;
|
|
|
|
module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD |
|
|
MCA_BTL_ATOMIC_SUPPORTS_SWAP |
|
|
MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
|
|
MCA_BTL_ATOMIC_SUPPORTS_32BIT ;
|
|
|
|
module->super.btl_put_limit = 1 << 23;
|
|
module->super.btl_put_alignment = 0;
|
|
|
|
module->super.btl_get_limit = 1 << 23;
|
|
module->super.btl_get_alignment = 0;
|
|
|
|
module->super.btl_registration_handle_size =
|
|
sizeof(mca_btl_base_registration_handle_t);
|
|
}
|
|
|
|
if (mode == MCA_BTL_OFI_MODE_TWO_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
|
|
|
|
module->super.btl_alloc = mca_btl_ofi_alloc;
|
|
module->super.btl_free = mca_btl_ofi_free;
|
|
module->super.btl_prepare_src = mca_btl_ofi_prepare_src;
|
|
|
|
module->super.btl_send = mca_btl_ofi_send;
|
|
|
|
module->super.btl_flags |= MCA_BTL_FLAGS_SEND;
|
|
module->super.btl_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
|
|
module->super.btl_max_send_size = MCA_BTL_OFI_FRAG_SIZE;
|
|
module->super.btl_rndv_eager_limit = MCA_BTL_OFI_FRAG_SIZE;
|
|
|
|
/* If two sided is enabled, we expected that the user knows exactly what
|
|
* they want. We bump the priority to maximum, making this BTL the default. */
|
|
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH;
|
|
}
|
|
|
|
if (mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
|
|
module->super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
|
|
module->super.btl_rdma_pipeline_send_length = 8 * 1024;
|
|
}
|
|
|
|
return module;
|
|
}
|
|
|
|
mca_btl_ofi_module_t mca_btl_ofi_module_template = {
|
|
.super = {
|
|
.btl_component = &mca_btl_ofi_component.super,
|
|
.btl_add_procs = mca_btl_ofi_add_procs,
|
|
.btl_del_procs = mca_btl_ofi_del_procs,
|
|
.btl_finalize = mca_btl_ofi_finalize,
|
|
}
|
|
};
|