/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2013 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2018      Intel, Inc, All rights reserved
 *
 * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"
#include <string.h>
#include "opal/class/opal_bitmap.h"
#include "opal/util/printf.h"
#include "opal/mca/btl/btl.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/mpool.h"

#include "btl_ofi.h"
#include "btl_ofi_endpoint.h"
#include "btl_ofi_frag.h"

/**
 * @brief Create (or look up) an endpoint for each process and mark it reachable.
 *
 * @param btl        (IN)  BTL module
 * @param nprocs     (IN)  Number of entries in opal_procs / peers
 * @param opal_procs (IN)  Processes to add
 * @param peers      (OUT) Endpoint for each process
 * @param reachable  (OUT) Bit i is set when opal_procs[i] is reachable
 *
 * @returns OPAL_SUCCESS on success
 * @returns OPAL_ERR_OUT_OF_RESOURCE if an endpoint could not be created
 *
 * A modex or address-vector failure invokes MCA_BTL_OFI_ABORT().
 */
static int mca_btl_ofi_add_procs (mca_btl_base_module_t *btl,
                                  size_t nprocs,
                                  opal_proc_t **opal_procs,
                                  mca_btl_base_endpoint_t **peers,
                                  opal_bitmap_t *reachable)
{
    int rc;
    int count;
    char *ep_name = NULL;
    size_t namelen = mca_btl_ofi_component.namelen;

    opal_proc_t *proc;
    mca_btl_base_endpoint_t *ep;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;

    for (size_t i = 0 ; i < nprocs ; ++i) {
        proc = opal_procs[i];
        ep = NULL;

        /* See if we already have an endpoint for this proc. */
        rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, (void **) &ep);
        if (OPAL_SUCCESS == rc) {
            BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name)));
            peers[i] = ep;

            /* A known endpoint already has its address in the AV and is
             * already linked into the endpoint list; inserting/appending it
             * a second time would put the same list item on the list twice.
             * Just report it as reachable. */
            opal_bitmap_set_bit(reachable, i);
            continue;
        }

        /* We don't have this endpoint yet, create one */
        peers[i] = mca_btl_ofi_endpoint_create (proc, ofi_btl->ofi_endpoint);
        BTL_VERBOSE(("creating peer %p", (void*) peers[i]));

        if (OPAL_UNLIKELY(NULL == peers[i])) {
            return OPAL_ERR_OUT_OF_RESOURCE;
        }

        /* Add this endpoint to the lookup table. The stored value must be the
         * endpoint itself, not the address of a local variable, otherwise
         * later lookups hand back a dangling stack address. */
        (void) opal_hash_table_set_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) proc, peers[i]);

        /* NOTE(review): ep_name is filled in by OPAL_MODEX_RECV and is never
         * released here; confirm who owns that buffer. */
        OPAL_MODEX_RECV(rc, &mca_btl_ofi_component.super.btl_version,
                        &peers[i]->ep_proc->proc_name, (void **)&ep_name, &namelen);
        if (OPAL_SUCCESS != rc) {
            BTL_ERROR(("error receiving modex"));
            MCA_BTL_OFI_ABORT();
        }

        /* get peer fi_addr */
        count = fi_av_insert(ofi_btl->av,            /* Address vector to insert */
                             ep_name,                /* peer name */
                             1,                      /* amount to insert */
                             &peers[i]->peer_addr,   /* return peer address here */
                             0,                      /* flags */
                             NULL);                  /* context */

        /* if succeed, add this proc and mark reachable */
        if (count == 1) { /* we inserted 1 address. */
            opal_list_append (&ofi_btl->endpoints, &peers[i]->super);
            opal_bitmap_set_bit(reachable, i);
        } else {
            BTL_VERBOSE(("fi_av_insert failed with rc = %d", count));
            MCA_BTL_OFI_ABORT();
        }
    }

    return OPAL_SUCCESS;
}

/**
 * @brief Remove the endpoints of the given processes from this module.
 *
 * @param btl    (IN) BTL module
 * @param nprocs (IN) Number of entries in procs / peers
 * @param procs  (IN) Processes being removed
 * @param peers  (IN) Endpoints associated with procs (entries may be NULL)
 *
 * @returns OPAL_SUCCESS always; a failing fi_av_remove is logged, not fatal.
 */
static int mca_btl_ofi_del_procs (mca_btl_base_module_t *btl, size_t nprocs,
                                  opal_proc_t **procs, mca_btl_base_endpoint_t **peers)
{
    int rc;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
    mca_btl_base_endpoint_t *ep;

    for (size_t i = 0 ; i < nprocs ; ++i) {
        if (peers[i]) {
            /* Only endpoints still present in the lookup table are torn down;
             * the looked-up value itself is not needed. */
            rc = opal_hash_table_get_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i], (void **) &ep);

            if (OPAL_SUCCESS == rc) {
                /* remove the address from AV. */
                rc = fi_av_remove(ofi_btl->av, &peers[i]->peer_addr, 1, 0);
                if (rc < 0) {
                    /* remove failed. this should not happen. */
                    /* Let's not crash because we failed to remove an address. */
                    BTL_ERROR(("fi_av_remove failed with error %d:%s",
                                    rc, fi_strerror(-rc)));
                }

                /* remove and free MPI endpoint from the list. */
                opal_list_remove_item (&ofi_btl->endpoints, &peers[i]->super);
                (void) opal_hash_table_remove_value_uint64 (&ofi_btl->id_to_endpoint, (intptr_t) procs[i]);
                OBJ_RELEASE(peers[i]);
            }
        }
    }

    return OPAL_SUCCESS;
}

/**
 * @brief Create the registration cache of a module on first use.
 *
 * @param module (IN) OFI BTL module
 *
 * Does nothing when module->initialized is already set. Failure to create the
 * cache invokes MCA_BTL_OFI_ABORT().
 */
void mca_btl_ofi_rcache_init (mca_btl_ofi_module_t *module)
{
    mca_rcache_base_resources_t rcache_resources;
    char *tmp = NULL;

    if (module->initialized) {
        return;
    }

    /* The cache is named after the device this module is bound to. */
    (void) opal_asprintf (&tmp, "ofi.%s", module->linux_device_name);

    rcache_resources.cache_name     = tmp;
    rcache_resources.reg_data       = (void *) module;
    rcache_resources.sizeof_reg     = sizeof (mca_btl_ofi_reg_t);
    rcache_resources.register_mem   = mca_btl_ofi_reg_mem;
    rcache_resources.deregister_mem = mca_btl_ofi_dereg_mem;

    module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources);
    free (tmp);

    if (NULL == module->rcache) {
        /* something went horribly wrong */
        BTL_ERROR(("cannot create rcache"));
        MCA_BTL_OFI_ABORT();
    }

    module->initialized = true;
}


/**
 * @brief Register a memory region for put/get/atomic operations.
 *
 * @param btl (IN)      BTL module
 * @param endpoint(IN)  BTL addressing information (or NULL for all endpoints)
 * @param base (IN)     Pointer to start of region
 * @param size (IN)     Size of region
 * @param flags (IN)    Flags indicating what operation will be performed. Valid
 *                      values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET,
 *                      and MCA_BTL_DES_FLAGS_ATOMIC
 *
 * @returns a memory registration handle valid for both local and remote operations
 * @returns NULL if the region could not be registered
 *
 * This function registers the specified region with the hardware for use with
 * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop
 * functions. Care should be taken to not hold an excessive number of registrations
 * as they may use limited system/NIC resources.
 */
static struct mca_btl_base_registration_handle_t *
mca_btl_ofi_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base,
                          size_t size, uint32_t flags)
{
    mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
    mca_btl_ofi_reg_t *reg = NULL;
    int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
    int rc;

    rc = ofi_module->rcache->rcache_register (ofi_module->rcache, base, size, 0, access_flags,
                                              (mca_rcache_base_registration_t **) &reg);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return NULL;
    }

    return &reg->handle;
}

/**
 * @brief Deregister a memory region
 *
 * @param btl (IN)      BTL module region was registered with
 * @param handle (IN)   BTL registration handle to deregister
 *
 * This function deregisters the memory region associated with the specified handle. Care
 * should be taken to not perform any RDMA or atomic operation on this memory region
 * after it is deregistered. It is erroneous to specify a memory handle associated with
 * a remote node.
 */
static int mca_btl_ofi_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
    mca_btl_ofi_module_t *ofi_module = (mca_btl_ofi_module_t *) btl;
    /* The handle is embedded in the registration; step back to its container. */
    mca_btl_ofi_reg_t *reg =
        (mca_btl_ofi_reg_t *)((intptr_t) handle - offsetof (mca_btl_ofi_reg_t, handle));

    (void) ofi_module->rcache->rcache_deregister (ofi_module->rcache, &reg->base);

    return OPAL_SUCCESS;
}

/**
 * @brief Registration-cache callback: register a region with the fabric domain.
 *
 * @param reg_data (IN)     The OFI BTL module (as stored in the rcache resources)
 * @param base (IN)         Pointer to start of region
 * @param size (IN)         Size of region
 * @param reg (IN/OUT)      Registration to fill in (a mca_btl_ofi_reg_t)
 *
 * @returns OPAL_SUCCESS on success
 * @returns OPAL_ERR_OUT_OF_RESOURCE if fi_mr_reg fails
 */
int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg)
{
    int rc;
    static const uint64_t access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ | FI_READ | FI_WRITE;

    mca_btl_ofi_module_t *btl = (mca_btl_ofi_module_t*) reg_data;
    mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*) reg;

    /* The registration's own address is passed as the requested key. */
    rc = fi_mr_reg(btl->domain, base, size, access_flags, 0,
                   (uint64_t) (uintptr_t) reg, 0, &ur->ur_mr, NULL);
    if (0 != rc) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    ur->handle.rkey = fi_mr_key(ur->ur_mr);
    ur->handle.desc = fi_mr_desc(ur->ur_mr);

    /* In case the provider doesn't support FI_MR_VIRT_ADDR,
     * we need to reference the remote address by the distance from base registered
     * address. We keep this information to use in rdma/atomic operations. */
    if (btl->use_virt_addr) {
        ur->handle.base_addr = 0;
    } else {
        ur->handle.base_addr = base;
    }

    return OPAL_SUCCESS;
}

/**
 * @brief Registration-cache callback: release a fabric memory registration.
 *
 * @param reg_data (IN)     Unused
 * @param reg (IN/OUT)      Registration to release (a mca_btl_ofi_reg_t)
 *
 * @returns OPAL_SUCCESS if there was nothing to close or the close succeeded
 * @returns OPAL_ERROR if fi_close fails
 */
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg)
{
    mca_btl_ofi_reg_t *ur = (mca_btl_ofi_reg_t*)reg;
    int rc;

    if (ur->ur_mr != NULL) {
        /* libfabric reports failures through a negative return value rather
         * than errno, so the message must come from fi_strerror(). */
        rc = fi_close(&ur->ur_mr->fid);
        if (0 != rc) {
            BTL_ERROR(("%s: error unpinning memory mr=%p: %s",
                       __func__, (void*) ur->ur_mr, fi_strerror(-rc)));
            return OPAL_ERROR;
        }
        /* Do not leave a closed handle behind. */
        ur->ur_mr = NULL;
    }

    return OPAL_SUCCESS;
}

/*
 * Cleanup/release module resources.
 *
 * Destroys the rcache, closes the endpoint, contexts, AV, domain and fabric,
 * releases any endpoints still on the list and finally frees the module
 * itself. Always returns OPAL_SUCCESS.
 */
int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
{
    int i;
    mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
    mca_btl_ofi_endpoint_t *endpoint, *next;

    assert(btl);

    /* clear the rcache */
    if (ofi_btl->rcache) {
        mca_rcache_base_module_destroy (ofi_btl->rcache);
        ofi_btl->rcache = NULL;
    }

    /* Close basic ep before closing its attached resources. */
    if (NULL != ofi_btl->ofi_endpoint && !ofi_btl->is_scalable_ep) {
        fi_close(&ofi_btl->ofi_endpoint->fid);
        ofi_btl->ofi_endpoint = NULL;
    }

    /* loop over all the contexts */
    for (i=0; i < ofi_btl->num_contexts; i++) {
        mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
    }
    free(ofi_btl->contexts);

    /* Still non-NULL only when the basic-ep branch above did not run
     * (scalable ep): close it after its contexts. */
    if (NULL != ofi_btl->ofi_endpoint) {
        fi_close(&ofi_btl->ofi_endpoint->fid);
    }

    /* close ep before closing av */
    if (NULL != ofi_btl->av) {
        fi_close(&ofi_btl->av->fid);
    }

    if (NULL != ofi_btl->domain) {
        fi_close(&ofi_btl->domain->fid);
    }

    if (NULL != ofi_btl->fabric) {
        fi_close(&ofi_btl->fabric->fid);
    }

    if (NULL != ofi_btl->fabric_info) {
        fi_freeinfo(ofi_btl->fabric_info);
    }

    /* clean up any leftover endpoints */
    OPAL_LIST_FOREACH_SAFE(endpoint, next, &ofi_btl->endpoints, mca_btl_ofi_endpoint_t) {
        opal_list_remove_item (&ofi_btl->endpoints, &endpoint->super);
        OBJ_RELEASE(endpoint);
    }

    OBJ_DESTRUCT(&ofi_btl->endpoints);
    OBJ_DESTRUCT(&ofi_btl->id_to_endpoint);
    OBJ_DESTRUCT(&ofi_btl->module_lock);

    free (btl);

    return OPAL_SUCCESS;
}

/*
 * Post wildcard recvs on the rx context.
 *
 * Allocates `count` receive fragments with their completions and posts each
 * with fi_recv from FI_ADDR_UNSPEC. Returns OPAL_SUCCESS when all are posted,
 * OPAL_ERROR on the first allocation or posting failure.
 */
int mca_btl_ofi_post_recvs (mca_btl_base_module_t *module,
                            mca_btl_ofi_context_t *context,
                            int count)
{
    int i;
    int rc;
    mca_btl_ofi_base_frag_t *frag;
    mca_btl_ofi_frag_completion_t *comp;

    for (i=0; i < count; i++) {
        frag = (mca_btl_ofi_base_frag_t*) mca_btl_ofi_alloc(module,
                                                            NULL,
                                                            0,
                                                            MCA_BTL_OFI_FRAG_SIZE,
                                                            MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
        if (NULL == frag) {
            BTL_ERROR(("cannot allocate recv frag."));
            return OPAL_ERROR;
        }

        comp = mca_btl_ofi_frag_completion_alloc (module,
                                                  context,
                                                  frag,
                                                  MCA_BTL_OFI_TYPE_RECV);
        /* comp is dereferenced below; never hand fi_recv a context derived
         * from a NULL completion. */
        if (NULL == comp) {
            BTL_ERROR(("cannot allocate recv completion."));
            return OPAL_ERROR;
        }

        rc = fi_recv (context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE,
                      NULL, FI_ADDR_UNSPEC, &comp->comp_ctx);

        if (FI_SUCCESS != rc) {
            BTL_ERROR(("cannot post recvs"));
            return OPAL_ERROR;
        }
    }

    return OPAL_SUCCESS;
}

/*
 * Allocate and fill out the module capabilities according to operation mode.
 *
 * The module starts as a copy of mca_btl_ofi_module_template. One-sided
 * callbacks/limits are added for MCA_BTL_OFI_MODE_ONE_SIDED, two-sided ones
 * for MCA_BTL_OFI_MODE_TWO_SIDED, and both (plus the RDMA pipeline settings)
 * for MCA_BTL_OFI_MODE_FULL_SUPPORT. Returns NULL if the allocation fails.
 */
mca_btl_ofi_module_t * mca_btl_ofi_module_alloc (int mode)
{
    mca_btl_ofi_module_t *module;

    /* allocate module */
    module = (mca_btl_ofi_module_t*) calloc(1, sizeof(mca_btl_ofi_module_t));
    if (NULL == module) {
        return NULL;
    }

    /* fill in the defaults */
    *module = mca_btl_ofi_module_template;

    if (mode == MCA_BTL_OFI_MODE_ONE_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {

        module->super.btl_put            = mca_btl_ofi_put;
        module->super.btl_get            = mca_btl_ofi_get;
        module->super.btl_atomic_op      = mca_btl_ofi_aop;
        module->super.btl_atomic_fop     = mca_btl_ofi_afop;
        module->super.btl_atomic_cswap   = mca_btl_ofi_acswap;
        module->super.btl_flush          = mca_btl_ofi_flush;

        module->super.btl_register_mem   = mca_btl_ofi_register_mem;
        module->super.btl_deregister_mem = mca_btl_ofi_deregister_mem;

        module->super.btl_flags         |= MCA_BTL_FLAGS_ATOMIC_FOPS |
                                           MCA_BTL_FLAGS_ATOMIC_OPS  |
                                           MCA_BTL_FLAGS_RDMA;

        module->super.btl_atomic_flags   = MCA_BTL_ATOMIC_SUPPORTS_ADD   |
                                           MCA_BTL_ATOMIC_SUPPORTS_SWAP  |
                                           MCA_BTL_ATOMIC_SUPPORTS_CSWAP |
                                           MCA_BTL_ATOMIC_SUPPORTS_32BIT ;

        module->super.btl_put_limit      = 1 << 23;
        module->super.btl_put_alignment  = 0;

        module->super.btl_get_limit      = 1 << 23;
        module->super.btl_get_alignment  = 0;

        module->super.btl_registration_handle_size =
                                    sizeof(mca_btl_base_registration_handle_t);
    }

    if (mode == MCA_BTL_OFI_MODE_TWO_SIDED || mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {

        module->super.btl_alloc          = mca_btl_ofi_alloc;
        module->super.btl_free           = mca_btl_ofi_free;
        module->super.btl_prepare_src    = mca_btl_ofi_prepare_src;

        module->super.btl_send           = mca_btl_ofi_send;

        module->super.btl_flags         |= MCA_BTL_FLAGS_SEND;
        module->super.btl_eager_limit    = MCA_BTL_OFI_FRAG_SIZE;
        module->super.btl_max_send_size  = MCA_BTL_OFI_FRAG_SIZE;
        module->super.btl_rndv_eager_limit = MCA_BTL_OFI_FRAG_SIZE;

        /* If two sided is enabled, we expected that the user knows exactly what
         * they want. We bump the priority to maximum, making this BTL the default. */
        module->super.btl_exclusivity    = MCA_BTL_EXCLUSIVITY_HIGH;
    }

    if (mode == MCA_BTL_OFI_MODE_FULL_SUPPORT) {
        module->super.btl_rdma_pipeline_frag_size   = 4 * 1024 * 1024;
        module->super.btl_rdma_pipeline_send_length = 8 * 1024;
    }

    return module;
}

/* Defaults shared by every module; mca_btl_ofi_module_alloc() copies this and
 * then adds the mode-specific callbacks. */
mca_btl_ofi_module_t mca_btl_ofi_module_template = {
    .super = {
        .btl_component      = &mca_btl_ofi_component.super,
        .btl_add_procs      = mca_btl_ofi_add_procs,
        .btl_del_procs      = mca_btl_ofi_del_procs,
        .btl_finalize       = mca_btl_ofi_finalize,
    }
};