diff --git a/README b/README index 439674d468..7ad119fbbb 100644 --- a/README +++ b/README @@ -623,7 +623,6 @@ MPI Functionality and Features - portals4 (2) The ob1 PML and the following BTLs support MPI_THREAD_MULTIPLE: - - openib (see exception below) - self - sm - smcuda @@ -632,10 +631,6 @@ MPI Functionality and Features - usnic - vader (shared memory) - The openib BTL's RDMACM based connection setup mechanism is also not - thread safe. The default UDCM method should be used for - applications requiring MPI_THREAD_MULTIPLE support. - Currently, MPI File operations are not thread safe even if MPI is initialized for MPI_THREAD_MULTIPLE support. @@ -794,7 +789,8 @@ Network Support - In prior versions of Open MPI, InfiniBand and RoCE support was provided through the openib BTL and ob1 PML plugins. Starting with Open MPI 4.0.0, InfiniBand support through the openib plugin is both - deprecated and superseded by the ucx PML component. + deprecated and superseded by the ucx PML component. The openib BTL + was removed in Open MPI v5.0.0. While the openib BTL depended on libibverbs, the UCX PML depends on the UCX library. @@ -809,15 +805,6 @@ Network Support for OpenSHMEM support, and "--mca osc ucx" for MPI RMA (one-sided) operations. -- Although the ob1 PML+openib BTL is still the default for iWARP and - RoCE devices, it will reject InfiniBand defaults (by default) so - that they will use the ucx PML. If using the openib BTL is still - desired, set the following MCA parameters: - - # Note that "vader" is Open MPI's shared memory BTL - $ mpirun --mca pml ob1 --mca btl openib,vader,self \ - --mca btl_openib_allow_ib 1 ... - - The usnic BTL is support for Cisco's usNIC device ("userspace NIC") on Cisco UCS servers with the Virtualized Interface Card (VIC). 
Although the usNIC is accessed via the OpenFabrics Libfabric API @@ -850,8 +837,8 @@ Network Support http://lwn.net/Articles/343351/ -- The use of fork() with OpenFabrics-based networks (i.e., the openib - BTL) is only partially supported, and only on Linux kernels >= +- The use of fork() with OpenFabrics-based networks (i.e., the UCX + PML) is only partially supported, and only on Linux kernels >= v2.6.15 with libibverbs v1.1 or later (first released as part of OFED v1.2), per restrictions imposed by the OFED network stack. diff --git a/opal/mca/btl/openib/Makefile.am b/opal/mca/btl/openib/Makefile.am deleted file mode 100644 index c66d1619ae..0000000000 --- a/opal/mca/btl/openib/Makefile.am +++ /dev/null @@ -1,131 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2007-2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2011 NVIDIA Corporation. All rights reserved. -# Copyright (c) 2011 Mellanox Technologies. All rights reserved. -# Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved -# Copyright (c) 2013 Intel, Inc. All rights reserved. -# Copyright (c) 2016 Research Organization for Information Science -# and Technology (RIST). All rights reserved. -# Copyright (c) 2017 IBM Corporation. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(btl_openib_CPPFLAGS) -AM_LFLAGS = -Pbtl_openib_ini_yy -LEX_OUTPUT_ROOT = lex.btl_openib_ini_yy - -amca_paramdir = $(AMCA_PARAM_SETS_DIR) -dist_amca_param_DATA = btl-openib-benchmark - -dist_opaldata_DATA = \ - help-mpi-btl-openib.txt \ - connect/help-mpi-btl-openib-cpc-base.txt \ - mca-btl-openib-device-params.ini - -sources = \ - btl_openib.c \ - btl_openib.h \ - btl_openib_component.c \ - btl_openib_endpoint.c \ - btl_openib_endpoint.h \ - btl_openib_frag.c \ - btl_openib_frag.h \ - btl_openib_proc.c \ - btl_openib_proc.h \ - btl_openib_eager_rdma.h \ - btl_openib_lex.h \ - btl_openib_lex.l \ - btl_openib_mca.c \ - btl_openib_mca.h \ - btl_openib_ini.c \ - btl_openib_ini.h \ - btl_openib_async.c \ - btl_openib_async.h \ - btl_openib_xrc.c \ - btl_openib_xrc.h \ - btl_openib_ip.h \ - btl_openib_ip.c \ - btl_openib_put.c \ - btl_openib_get.c \ - btl_openib_atomic.c \ - connect/base.h \ - connect/btl_openib_connect_base.c \ - connect/btl_openib_connect_empty.c \ - connect/btl_openib_connect_empty.h \ - connect/connect.h - -# If we have rdmacm support, build that CPC -if MCA_btl_openib_have_rdmacm -sources += \ - connect/btl_openib_connect_rdmacm.c \ - connect/btl_openib_connect_rdmacm.h - -dist_opaldata_DATA += connect/help-mpi-btl-openib-cpc-rdmacm.txt -endif - -# If we have udcm support, build that CPC -if MCA_btl_openib_have_udcm -sources += \ - connect/btl_openib_connect_udcm.c \ - connect/btl_openib_connect_udcm.h - -# dist_opaldata_DATA += connect/help-mpi-btl-openib-cpc-ud.txt -endif - -# If we have dynamic SL support, build those files -if MCA_btl_openib_have_dynamic_sl -sources += \ - connect/btl_openib_connect_sl.c \ - connect/btl_openib_connect_sl.h -endif - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_opal_btl_openib_DSO -lib = -lib_sources = -component = mca_btl_openib.la -component_sources = $(sources) -else -lib = libmca_btl_openib.la -lib_sources = $(sources) -component = -component_sources = -endif - -mcacomponentdir = $(opallibdir) -mcacomponent_LTLIBRARIES = $(component) -mca_btl_openib_la_SOURCES = $(component_sources) -mca_btl_openib_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS) -mca_btl_openib_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \ - $(btl_openib_LIBS) \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/verbs/lib@OPAL_LIB_PREFIX@mca_common_verbs.la -if OPAL_cuda_support -mca_btl_openib_la_LIBADD += \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/lib@OPAL_LIB_PREFIX@mca_common_cuda.la -endif - -noinst_LTLIBRARIES = $(lib) -libmca_btl_openib_la_SOURCES = $(lib_sources) -libmca_btl_openib_la_LDFLAGS= -module -avoid-version $(btl_openib_LDFLAGS) -libmca_btl_openib_la_LIBADD = $(btl_openib_LIBS) - -maintainer-clean-local: - rm -f btl_openib_lex.c diff --git a/opal/mca/btl/openib/btl-openib-benchmark b/opal/mca/btl/openib/btl-openib-benchmark deleted file mode 100644 index 3798c333b5..0000000000 --- a/opal/mca/btl/openib/btl-openib-benchmark +++ /dev/null @@ -1,19 +0,0 @@ -# -# These values are suitable for benchmarking with the openib and sm -# btls with a small number of MPI processes. If you're only going to -# use one process per node, remove "sm". These values are *NOT* -# scalable to large numbers of processes! -# -btl=openib,self,sm -btl_openib_max_btls=20 -btl_openib_rd_num=128 -btl_openib_rd_low=75 -btl_openib_rd_win=50 -btl_openib_max_eager_rdma=32 -mpool_base_use_mem_hooks=1 -mpi_leave_pinned=1 -# -# Note that we are not limiting the max free list size, so for netpipe -# (for example), this is no problem. But we may want to explore the -# parameter space for other popular benchmarks. 
-# diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c deleted file mode 100644 index 9ec57c0e2e..0000000000 --- a/opal/mca/btl/openib/btl_openib.c +++ /dev/null @@ -1,2022 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2017 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2006-2015 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved - * Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014-2018 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" -#include -#include "opal_stdint.h" -#include "opal/class/opal_bitmap.h" -#include "opal/util/output.h" -#include "opal/util/arch.h" -#include "opal/util/proc.h" -#include "opal/util/printf.h" -#include "opal/include/opal_stdint.h" -#include "opal/util/show_help.h" -#include "opal/mca/btl/btl.h" -#include "opal/mca/btl/base/btl_base_error.h" - -#if OPAL_ENABLE_FT_CR == 1 -#include "opal/runtime/opal_cr.h" -#endif - -#include "btl_openib_ini.h" - -#include "btl_openib.h" -#include "btl_openib_frag.h" -#include "btl_openib_proc.h" -#include "btl_openib_endpoint.h" -#include "btl_openib_xrc.h" -#include "btl_openib_async.h" - -#include "opal/datatype/opal_convertor.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/mpool.h" -#include "opal/mca/rcache/rcache.h" - -#if OPAL_CUDA_SUPPORT -#include "opal/datatype/opal_datatype_cuda.h" -#include "opal/mca/common/cuda/common_cuda.h" -#endif /* OPAL_CUDA_SUPPORT */ - -#include "opal/util/sys_limits.h" - -#include -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_TYPES_H -#include -#endif -#ifdef HAVE_SYS_TIME_H -#include -#endif -#ifdef HAVE_SYS_RESOURCE_H -#include -#endif -#ifdef HAVE_UNISTD_H -#include -#endif -#include "opal/mca/hwloc/hwloc-internal.h" - -#ifndef MIN -#define MIN(a,b) ((a)<(b)?(a):(b)) -#endif - -static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - void *base, size_t size, uint32_t flags); -static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle); - -mca_btl_openib_module_t mca_btl_openib_module = { - .super = { - .btl_component = &mca_btl_openib_component.super, - .btl_add_procs = mca_btl_openib_add_procs, - .btl_del_procs = mca_btl_openib_del_procs, - .btl_finalize = mca_btl_openib_finalize, - /* we need alloc free, pack */ 
- .btl_alloc = mca_btl_openib_alloc, - .btl_free = mca_btl_openib_free, - .btl_prepare_src = mca_btl_openib_prepare_src, - .btl_send = mca_btl_openib_send, - .btl_sendi = mca_btl_openib_sendi, /* send immediate */ - .btl_put = mca_btl_openib_put, - .btl_get = mca_btl_openib_get, - .btl_dump = mca_btl_base_dump, - .btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */ - .btl_ft_event = mca_btl_openib_ft_event, - .btl_register_mem = mca_btl_openib_register_mem, - .btl_deregister_mem = mca_btl_openib_deregister_mem, -#if HAVE_DECL_IBV_ATOMIC_HCA - .btl_atomic_fop = mca_btl_openib_atomic_fop, - .btl_atomic_cswap = mca_btl_openib_atomic_cswap, -#endif - } -}; - -char* const mca_btl_openib_transport_name_strings[MCA_BTL_OPENIB_TRANSPORT_SIZE] = { - "MCA_BTL_OPENIB_TRANSPORT_IB", - "MCA_BTL_OPENIB_TRANSPORT_IWARP", - "MCA_BTL_OPENIB_TRANSPORT_RDMAOE", - "MCA_BTL_OPENIB_TRANSPORT_UNKNOWN" -}; - -static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl); - -void mca_btl_openib_show_init_error(const char *file, int line, - const char *func, const char *dev) -{ - if (ENOMEM == errno) { - int ret; - struct rlimit limit; - char *str_limit = NULL; - -#if HAVE_DECL_RLIMIT_MEMLOCK - ret = getrlimit(RLIMIT_MEMLOCK, &limit); -#else - ret = -1; -#endif - if (0 != ret) { - opal_asprintf(&str_limit, "Unknown"); - } else if (limit.rlim_cur == RLIM_INFINITY) { - opal_asprintf(&str_limit, "unlimited"); - } else { - opal_asprintf(&str_limit, "%ld", (long)limit.rlim_cur); - } - - opal_show_help("help-mpi-btl-openib.txt", "init-fail-no-mem", - true, opal_process_info.nodename, - file, line, func, dev, str_limit); - - if (NULL != str_limit) free(str_limit); - } else { - opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q", - true, opal_process_info.nodename, - file, line, func, strerror(errno), errno, dev); - } -} - -static inline struct ibv_cq *create_cq_compat(struct ibv_context *context, - int cqe, void *cq_context, struct 
ibv_comp_channel *channel, - int comp_vector) -{ -#if OPAL_IBV_CREATE_CQ_ARGS == 3 - return ibv_create_cq(context, cqe, channel); -#else - return ibv_create_cq(context, cqe, cq_context, channel, comp_vector); -#endif -} - -static int adjust_cq(mca_btl_openib_device_t *device, const int cq) -{ - uint32_t cq_size = device->cq_size[cq]; - - /* make sure we don't exceed the maximum CQ size and that we - * don't size the queue smaller than otherwise requested - */ - if(cq_size < mca_btl_openib_component.ib_cq_size[cq]) - cq_size = mca_btl_openib_component.ib_cq_size[cq]; - - if(cq_size > (uint32_t)device->ib_dev_attr.max_cqe) - cq_size = device->ib_dev_attr.max_cqe; - - if(NULL == device->ib_cq[cq]) { - device->ib_cq[cq] = create_cq_compat(device->ib_dev_context, cq_size, -#if OPAL_ENABLE_PROGRESS_THREADS == 1 - device, device->ib_channel, -#else - NULL, NULL, -#endif - 0); - - if (NULL == device->ib_cq[cq]) { - mca_btl_openib_show_init_error(__FILE__, __LINE__, "ibv_create_cq", - ibv_get_device_name(device->ib_dev)); - return OPAL_ERROR; - } - -#if OPAL_ENABLE_PROGRESS_THREADS == 1 - if(ibv_req_notify_cq(device->ib_cq[cq], 0)) { - mca_btl_openib_show_init_error(__FILE__, __LINE__, - "ibv_req_notify_cq", - ibv_get_device_name(device->ib_dev)); - return OPAL_ERROR; - } - - if (!device->progress) { - int rc; - device->progress = true; - if(OPAL_SUCCESS != (rc = opal_thread_start(&device->thread))) { - BTL_ERROR(("Unable to create progress thread, retval=%d", rc)); - return rc; - } - } -#endif - } -#ifdef HAVE_IBV_RESIZE_CQ - else if (cq_size > mca_btl_openib_component.ib_cq_size[cq]){ - int rc; - rc = ibv_resize_cq(device->ib_cq[cq], cq_size); - /* For ConnectX the resize CQ is not implemented and verbs returns -ENOSYS - * but should return ENOSYS. 
So it is reason for abs */ - if(rc && ENOSYS != abs(rc) && EOPNOTSUPP != abs(rc)) { - BTL_ERROR(("cannot resize completion queue, error: %d", rc)); - return OPAL_ERROR; - } - } -#endif - - return OPAL_SUCCESS; -} - - -/* In this function we check if the device supports srq limit - event. We create the temporary srq, post some receive buffers - in - order to prevent srq limit event immediately and call the - "ibv_modify_srq" function. If a return value of the function not - success => our decision that the device doesn't support this - capability. */ -static int check_if_device_support_modify_srq(mca_btl_openib_module_t *openib_btl) -{ - char buff; - int rc = OPAL_SUCCESS; - - struct ibv_srq* dummy_srq = NULL; - struct ibv_srq_attr modify_attr; - - struct ibv_sge sge_elem; - struct ibv_recv_wr wr1, wr2, *bad_wr; - - struct ibv_srq_init_attr init_attr; - memset(&init_attr, 0, sizeof(struct ibv_srq_init_attr)); - - init_attr.attr.max_wr = 3; - init_attr.attr.max_sge = 1; - - dummy_srq = ibv_create_srq(openib_btl->device->ib_pd, &init_attr); - if(NULL == dummy_srq) { - rc = OPAL_ERROR; - return rc; - } - - sge_elem.addr = (uint64_t)((uintptr_t) &buff); - sge_elem.length = sizeof(buff); - - wr1.num_sge = wr2.num_sge = 1; - wr1.sg_list = wr2.sg_list = &sge_elem; - - wr1.next = &wr2; - wr2.next = NULL; - - if(ibv_post_srq_recv(dummy_srq, &wr1, &bad_wr)) { - rc = OPAL_ERROR; - goto destroy_dummy_srq; - } - - modify_attr.max_wr = 2; - modify_attr.max_sge = 1; - modify_attr.srq_limit = 1; - - if(ibv_modify_srq(dummy_srq, &modify_attr, IBV_SRQ_LIMIT)) { - rc = OPAL_ERR_NOT_SUPPORTED; - goto destroy_dummy_srq; - } - -destroy_dummy_srq: - if(ibv_destroy_srq(dummy_srq)) { - rc = OPAL_ERROR; - } - - return rc; -} - -/* - * create both the high and low priority completion queues - * and the shared receive queue (if requested) - */ -static int create_srq(mca_btl_openib_module_t *openib_btl) -{ - int qp, rc = 0; - int32_t rd_num, rd_curr_num; - - bool device_support_modify_srq = 
true; - - /* Check if our device supports modify srq ability */ - rc = check_if_device_support_modify_srq(openib_btl); - if(OPAL_ERR_NOT_SUPPORTED == rc) { - device_support_modify_srq = false; - } else if(OPAL_SUCCESS != rc) { - mca_btl_openib_show_init_error(__FILE__, __LINE__, - "ibv_create_srq", - ibv_get_device_name(openib_btl->device->ib_dev)); - return rc; - } - - /* create the SRQ's */ - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - struct ibv_srq_init_attr attr; - memset(&attr, 0, sizeof(struct ibv_srq_init_attr)); - - if(!BTL_OPENIB_QP_TYPE_PP(qp)) { - attr.attr.max_wr = mca_btl_openib_component.qp_infos[qp].rd_num + - mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; - attr.attr.max_sge = 1; - openib_btl->qps[qp].u.srq_qp.rd_posted = 0; -#if HAVE_XRC - if(BTL_OPENIB_QP_TYPE_XRC(qp)) { -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - struct ibv_srq_init_attr_ex attr_ex; - memset(&attr_ex, 0, sizeof(struct ibv_srq_init_attr_ex)); - attr_ex.attr.max_wr = attr.attr.max_wr; - attr_ex.attr.max_sge = attr.attr.max_sge; - attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_XRCD | - IBV_SRQ_INIT_ATTR_CQ | IBV_SRQ_INIT_ATTR_PD; - attr_ex.srq_type = IBV_SRQT_XRC; - attr_ex.xrcd = openib_btl->device->xrcd; - attr_ex.cq = openib_btl->device->ib_cq[qp_cq_prio(qp)]; - attr_ex.pd = openib_btl->device->ib_pd; - - openib_btl->qps[qp].u.srq_qp.srq = - ibv_create_srq_ex(openib_btl->device->ib_dev_context, &attr_ex); -#else - openib_btl->qps[qp].u.srq_qp.srq = - ibv_create_xrc_srq(openib_btl->device->ib_pd, - openib_btl->device->xrc_domain, - openib_btl->device->ib_cq[qp_cq_prio(qp)], &attr); -#endif - } else -#endif - { - opal_mutex_lock(&openib_btl->device->device_lock); - openib_btl->qps[qp].u.srq_qp.srq = - ibv_create_srq(openib_btl->device->ib_pd, &attr); - opal_mutex_unlock(&openib_btl->device->device_lock); - } - if (NULL == openib_btl->qps[qp].u.srq_qp.srq) { - mca_btl_openib_show_init_error(__FILE__, __LINE__, - "ibv_create_srq", - 
ibv_get_device_name(openib_btl->device->ib_dev)); - return OPAL_ERROR; - } - - { - opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock; - opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table; - - opal_mutex_lock(lock); - if (OPAL_SUCCESS != opal_hash_table_set_value_ptr( - srq_addr_table, &openib_btl->qps[qp].u.srq_qp.srq, - sizeof(struct ibv_srq*), (void*) openib_btl)) { - BTL_ERROR(("SRQ Internal error." - " Failed to add element to mca_btl_openib_component.srq_manager.srq_addr_table\n")); - - opal_mutex_unlock(lock); - return OPAL_ERROR; - } - opal_mutex_unlock(lock); - } - rd_num = mca_btl_openib_component.qp_infos[qp].rd_num; - rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init; - - if(true == mca_btl_openib_component.enable_srq_resize && - true == device_support_modify_srq) { - if(0 == rd_curr_num) { - openib_btl->qps[qp].u.srq_qp.rd_curr_num = 1; - } - - openib_btl->qps[qp].u.srq_qp.rd_low_local = rd_curr_num - (rd_curr_num >> 2); - openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true; - } else { - openib_btl->qps[qp].u.srq_qp.rd_curr_num = rd_num; - openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low; - /* Not used in this case, but we don't need a garbage */ - mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = 0; - openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false; - } - } - } - - openib_btl->srqs_created = true; - - return OPAL_SUCCESS; -} - -static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl) -{ - int rc = OPAL_SUCCESS; - opal_mutex_lock(&openib_btl->ib_lock); - if (!openib_btl->srqs_created && - (mca_btl_openib_component.num_srq_qps > 0 || - mca_btl_openib_component.num_xrc_qps > 0)) { - rc = create_srq(openib_btl); - } - opal_mutex_unlock(&openib_btl->ib_lock); - return rc; -} - - -static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl) -{ - 
uint32_t send_cqes, recv_cqes; - int rc = OPAL_SUCCESS; - mca_btl_openib_device_t *device = openib_btl->device; - uint32_t requested[BTL_OPENIB_MAX_CQ]; - - opal_mutex_lock(&openib_btl->ib_lock); - - for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) { - requested[cq] = 0; - } - - /* figure out reasonable sizes for completion queues */ - for (int qp = 0 ; qp < mca_btl_openib_component.num_qps ; qp++) { - if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { - send_cqes = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; - recv_cqes = mca_btl_openib_component.qp_infos[qp].rd_num; - } else { - send_cqes = (mca_btl_openib_component.qp_infos[qp].rd_num + - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) * openib_btl->num_peers; - recv_cqes = send_cqes; - } - - requested[qp_cq_prio(qp)] += recv_cqes; - requested[BTL_OPENIB_LP_CQ] += send_cqes; - } - - opal_mutex_lock (&openib_btl->device->device_lock); - for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) { - if (requested[cq] < mca_btl_openib_component.ib_cq_size[cq]) { - requested[cq] = mca_btl_openib_component.ib_cq_size[cq]; - } else if (requested[cq] > (uint32_t) openib_btl->device->ib_dev_attr.max_cqe) { - requested[cq] = openib_btl->device->ib_dev_attr.max_cqe; - } - - if (openib_btl->device->cq_size[cq] < requested[cq]) { - openib_btl->device->cq_size[cq] = requested[cq]; - - rc = adjust_cq (device, cq); - if (OPAL_SUCCESS != rc) { - break; - } - } - } - opal_mutex_unlock (&openib_btl->device->device_lock); - opal_mutex_unlock(&openib_btl->ib_lock); - - return rc; -} - -mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl) -{ -/* If we have a driver with RDMAoE supporting as the device struct contains the same type (IB) for - IBV_LINK_LAYER_INFINIBAND and IBV_LINK_LAYER_ETHERNET link layers and the single way - to detect this fact is to check their link_layer fields in a port_attr struct. 
- If our driver doesn't support this feature => the checking of transport type in device struct will be enough. - If the driver doesn't support completely transport types => - our assumption that it is very old driver - that supports IB devices only */ - -#ifdef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE - switch(openib_btl->device->ib_dev->transport_type) { - case IBV_TRANSPORT_IB: -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET - switch(openib_btl->ib_port_attr.link_layer) { - case IBV_LINK_LAYER_ETHERNET: - return MCA_BTL_OPENIB_TRANSPORT_RDMAOE; - - case IBV_LINK_LAYER_INFINIBAND: - return MCA_BTL_OPENIB_TRANSPORT_IB; - /* It is not possible that a device struct contains - IB transport and port was configured to IBV_LINK_LAYER_UNSPECIFIED */ - case IBV_LINK_LAYER_UNSPECIFIED: - default: - return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN; - } -#endif - return MCA_BTL_OPENIB_TRANSPORT_IB; - - case IBV_TRANSPORT_IWARP: - return MCA_BTL_OPENIB_TRANSPORT_IWARP; - - case IBV_TRANSPORT_UNKNOWN: - default: - return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN; - } -#else - return MCA_BTL_OPENIB_TRANSPORT_IB; -#endif -} - -static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl, - mca_btl_base_endpoint_t* endpoint) -{ - opal_btl_openib_ini_values_t values; - char* recv_qps = NULL; - int ret; - - if(mca_btl_openib_get_transport_type(openib_btl) != endpoint->rem_info.rem_transport_type) { - opal_show_help("help-mpi-btl-openib.txt", - "conflicting transport types", true, - opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev), - (openib_btl->device->ib_dev_attr).vendor_id, - (openib_btl->device->ib_dev_attr).vendor_part_id, - mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)], - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), - endpoint->rem_info.rem_vendor_id, - endpoint->rem_info.rem_vendor_part_id, - mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]); - - return OPAL_ERROR; - } - - 
memset(&values, 0, sizeof(opal_btl_openib_ini_values_t)); - ret = opal_btl_openib_ini_query(endpoint->rem_info.rem_vendor_id, - endpoint->rem_info.rem_vendor_part_id, &values); - - if (OPAL_SUCCESS != ret && - OPAL_ERR_NOT_FOUND != ret) { - opal_show_help("help-mpi-btl-openib.txt", - "error in device init", true, - opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev)); - return ret; - } - - if(openib_btl->device->mtu < endpoint->rem_info.rem_mtu) { - endpoint->rem_info.rem_mtu = openib_btl->device->mtu; - } - - endpoint->use_eager_rdma = openib_btl->device->use_eager_rdma & - endpoint->use_eager_rdma; - - /* Receive queues checking */ - - /* In this check we assume that the command line or INI file parameters are the same - for all processes on all machines. The assumption is correct for 99.9999% of users, - if a user distributes different INI files or parameters for different node/procs, - it is on his own responsibility */ - switch(mca_btl_openib_component.receive_queues_source) { - case MCA_BASE_VAR_SOURCE_COMMAND_LINE: - case MCA_BASE_VAR_SOURCE_ENV: - case MCA_BASE_VAR_SOURCE_FILE: - case MCA_BASE_VAR_SOURCE_SET: - case MCA_BASE_VAR_SOURCE_OVERRIDE: - break; - - /* If the queues configuration was set from command line - (with --mca btl_openib_receive_queues parameter) => both sides have a same configuration */ - - /* In this case the local queues configuration was gotten from INI file => - not possible that remote side got its queues configuration from command line => - (by prio) the configuration was set from INI file or (if not configure) - by default queues configuration */ - case BTL_OPENIB_RQ_SOURCE_DEVICE_INI: - if(NULL != values.receive_queues) { - recv_qps = values.receive_queues; - } else { - recv_qps = mca_btl_openib_component.default_recv_qps; - } - - if(0 != strcmp(mca_btl_openib_component.receive_queues, - recv_qps)) { - opal_show_help("help-mpi-btl-openib.txt", - "unsupported queues configuration", true, - 
opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev), - (openib_btl->device->ib_dev_attr).vendor_id, - (openib_btl->device->ib_dev_attr).vendor_part_id, - mca_btl_openib_component.receive_queues, - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), - endpoint->rem_info.rem_vendor_id, - endpoint->rem_info.rem_vendor_part_id, - recv_qps); - - return OPAL_ERROR; - } - break; - - /* If the local queues configuration was set - by default queues => check all possible cases for remote side and compare */ - case MCA_BASE_VAR_SOURCE_DEFAULT: - if(NULL != values.receive_queues) { - if(0 != strcmp(mca_btl_openib_component.receive_queues, - values.receive_queues)) { - opal_show_help("help-mpi-btl-openib.txt", - "unsupported queues configuration", true, - opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev), - (openib_btl->device->ib_dev_attr).vendor_id, - (openib_btl->device->ib_dev_attr).vendor_part_id, - mca_btl_openib_component.receive_queues, - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), - endpoint->rem_info.rem_vendor_id, - endpoint->rem_info.rem_vendor_part_id, - values.receive_queues); - - return OPAL_ERROR; - } - } - break; - } - - return OPAL_SUCCESS; -} - -static int prepare_device_for_use (mca_btl_openib_device_t *device) -{ - mca_btl_openib_frag_init_data_t *init_data; - int rc = OPAL_SUCCESS, length; - - opal_mutex_lock(&device->device_lock); - - if (device->ready_for_use) { - goto exit; - } - - /* For each btl module that we made - find every - base device that doesn't have device->qps setup on it yet (remember - that some modules may share the same device, so when going through - to loop, we may hit a device that was already setup earlier in - the loop). 
- - We may to call for prepare_device_for_use() only after adding the btl - to mca_btl_openib_component.openib_btls, since the prepare_device_for_use - adds device to async thread that require access to - mca_btl_openib_component.openib_btls. - */ - - /* Setup the device qps info */ - device->qps = (mca_btl_openib_device_qp_t*) - calloc(mca_btl_openib_component.num_qps, - sizeof(mca_btl_openib_device_qp_t)); - if (NULL == device->qps) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto exit; - } - - for (int qp_index = 0 ; qp_index < mca_btl_openib_component.num_qps ; qp_index++) { - OBJ_CONSTRUCT(&device->qps[qp_index].send_free, opal_free_list_t); - OBJ_CONSTRUCT(&device->qps[qp_index].recv_free, opal_free_list_t); - } - - device->got_fatal_event = false; - device->got_port_event = false; - mca_btl_openib_async_add_device (device); - -#if OPAL_ENABLE_PROGRESS_THREADS == 1 - /* Prepare data for thread, but not starting it */ - OBJ_CONSTRUCT(&device->thread, opal_thread_t); - device->thread.t_run = mca_btl_openib_progress_thread; - device->thread.t_arg = device; - device->progress = false; -#endif - -#if HAVE_XRC - /* if user configured to run with XRC qp and the device doesn't - * support it - we should ignore this device. Maybe we have another - * one that has XRC support - */ - if (!(device->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) && - MCA_BTL_XRC_ENABLED) { - opal_show_help("help-mpi-btl-openib.txt", - "XRC on device without XRC support", true, - mca_btl_openib_component.num_xrc_qps, - ibv_get_device_name(device->ib_dev), - opal_process_info.nodename); - rc = OPAL_ERROR; - goto exit; - } - - if (MCA_BTL_XRC_ENABLED) { - if (OPAL_SUCCESS != mca_btl_openib_open_xrc_domain(device)) { - BTL_ERROR(("XRC Internal error. 
Failed to open xrc domain")); - rc = OPAL_ERROR; - goto exit; - } - } -#endif - - device->endpoints = OBJ_NEW(opal_pointer_array_t); - opal_pointer_array_init(device->endpoints, 10, INT_MAX, 10); - opal_pointer_array_add(&mca_btl_openib_component.devices, device); - if (mca_btl_openib_component.max_eager_rdma > 0 && - device->use_eager_rdma) { - device->eager_rdma_buffers = - (mca_btl_base_endpoint_t **) calloc((size_t) mca_btl_openib_component.max_eager_rdma * device->btls, - sizeof(mca_btl_openib_endpoint_t*)); - if(NULL == device->eager_rdma_buffers) { - BTL_ERROR(("Memory allocation fails")); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto exit; - } - } - - init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t)); - if (NULL == init_data) { - if (mca_btl_openib_component.max_eager_rdma > 0 && - device->use_eager_rdma) { - /* cleanup */ - free (device->eager_rdma_buffers); - device->eager_rdma_buffers = NULL; - } - BTL_ERROR(("Memory allocation fails")); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto exit; - } - - length = sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_footer_t) + - sizeof(mca_btl_openib_eager_rdma_header_t); - - init_data->order = MCA_BTL_NO_ORDER; - init_data->list = &device->send_free_control; - - rc = opal_free_list_init(&device->send_free_control, - sizeof(mca_btl_openib_send_control_frag_t), opal_cache_line_size, - OBJ_CLASS(mca_btl_openib_send_control_frag_t), length, - mca_btl_openib_component.buffer_alignment, - mca_btl_openib_component.ib_free_list_num, -1, - mca_btl_openib_component.ib_free_list_inc, - device->mpool, 0, device->rcache, mca_btl_openib_frag_init, - init_data); - if (OPAL_SUCCESS != rc) { - /* If we're "out of memory", this usually means that we ran - out of registered memory, so show that error message */ - if (OPAL_ERR_OUT_OF_RESOURCE == rc || - OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc) { - errno = ENOMEM; - mca_btl_openib_show_init_error(__FILE__, __LINE__, - "opal_free_list_init", - 
ibv_get_device_name(device->ib_dev)); - } - goto exit; - } - - /* setup all the qps */ - for (int qp = 0 ; qp < mca_btl_openib_component.num_qps ; qp++) { - init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t)); - if (NULL == init_data) { - BTL_ERROR(("Memory allocation fails")); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto exit; - } - - /* Initialize pool of send fragments */ - length = sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t) + - sizeof(mca_btl_openib_footer_t) + - mca_btl_openib_component.qp_infos[qp].size; - - init_data->order = qp; - init_data->list = &device->qps[qp].send_free; - - rc = opal_free_list_init (init_data->list, - sizeof(mca_btl_openib_send_frag_t), opal_cache_line_size, - OBJ_CLASS(mca_btl_openib_send_frag_t), length, - mca_btl_openib_component.buffer_alignment, - mca_btl_openib_component.ib_free_list_num, - mca_btl_openib_component.ib_free_list_max, - mca_btl_openib_component.ib_free_list_inc, - device->mpool, 0, device->rcache, mca_btl_openib_frag_init, - init_data); - if (OPAL_SUCCESS != rc) { - /* If we're "out of memory", this usually means that we - ran out of registered memory, so show that error - message */ - if (OPAL_ERR_OUT_OF_RESOURCE == rc || - OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc) { - errno = ENOMEM; - mca_btl_openib_show_init_error(__FILE__, __LINE__, - "opal_free_list_init", - ibv_get_device_name(device->ib_dev)); - } - goto exit; - } - - init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t)); - length = sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t) + - sizeof(mca_btl_openib_footer_t) + - mca_btl_openib_component.qp_infos[qp].size; - - init_data->order = qp; - init_data->list = &device->qps[qp].recv_free; - - if(OPAL_SUCCESS != opal_free_list_init (init_data->list, - 
sizeof(mca_btl_openib_recv_frag_t), opal_cache_line_size, - OBJ_CLASS(mca_btl_openib_recv_frag_t), - length, mca_btl_openib_component.buffer_alignment, - mca_btl_openib_component.ib_free_list_num, - mca_btl_openib_component.ib_free_list_max, - mca_btl_openib_component.ib_free_list_inc, - device->mpool, 0, device->rcache, mca_btl_openib_frag_init, - init_data)) { - rc = OPAL_ERROR; - goto exit; - } - } - - device->ready_for_use = true; - -exit: - opal_mutex_unlock(&device->device_lock); - return rc; -} - -static int init_ib_proc_nolock(mca_btl_openib_module_t* openib_btl, mca_btl_openib_proc_t* ib_proc, - volatile mca_btl_base_endpoint_t **endpoint_ptr, - int local_port_cnt, int btl_rank) -{ - int rem_port_cnt, matching_port = -1, j, rc; - mca_btl_base_endpoint_t *endpoint; - opal_btl_openib_connect_base_module_t *local_cpc; - opal_btl_openib_connect_base_module_data_t *remote_cpc_data; - - *endpoint_ptr = NULL; - - /* check if the remote proc has any ports that: - - on the same subnet as the local proc, and - - on that subnet, has a CPC in common with the local proc - */ - - rem_port_cnt = 0; - BTL_VERBOSE(("got %d port_infos ", ib_proc->proc_port_count)); - for (j = 0; j < (int) ib_proc->proc_port_count; j++){ - BTL_VERBOSE(("got a subnet %016" PRIx64, - ib_proc->proc_ports[j].pm_port_info.subnet_id)); - if (ib_proc->proc_ports[j].pm_port_info.subnet_id == - openib_btl->port_info.subnet_id) { - BTL_VERBOSE(("Got a matching subnet!")); - if (rem_port_cnt == btl_rank) { - matching_port = j; - } - rem_port_cnt++; - } else { - if (mca_btl_openib_component.allow_different_subnets) { - BTL_VERBOSE(("Using different subnets!")); - if (rem_port_cnt == btl_rank) { - matching_port = j; - } - rem_port_cnt++; - } - } - } - - if (0 == rem_port_cnt) { - /* no use trying to communicate with this endpoint */ - BTL_VERBOSE(("No matching subnet id/CPC was found, moving on.. 
")); - return OPAL_ERROR; - } - - /* If this process has multiple ports on a single subnet ID, - and the report proc also has multiple ports on this same - subnet ID, the default connection pattern is: - - LOCAL REMOTE PEER - 1st port on subnet X <--> 1st port on subnet X - 2nd port on subnet X <--> 2nd port on subnet X - 3nd port on subnet X <--> 3nd port on subnet X - ...etc. - - Note that the port numbers may not be contiguous, and they - may not be the same on either side. Hence the "1st", "2nd", - "3rd, etc. notation, above. - - Hence, if the local "rank" of this module's port on the - subnet ID is greater than the total number of ports on the - peer on this same subnet, then we have no match. So skip - this connection. */ - if (rem_port_cnt < local_port_cnt && btl_rank >= rem_port_cnt) { - BTL_VERBOSE(("Not enough remote ports on this subnet id, moving on.. ")); - return OPAL_ERROR; - } - - /* Now that we have verified that we're on the same subnet and - the remote peer has enough ports, see if that specific port - on the peer has a matching CPC. */ - assert(btl_rank <= ib_proc->proc_port_count); - assert(matching_port != -1); - if (OPAL_SUCCESS != - opal_btl_openib_connect_base_find_match(openib_btl, - &(ib_proc->proc_ports[matching_port]), - &local_cpc, - &remote_cpc_data)) { - return OPAL_ERROR; - } - - /* The btl_proc datastructure is shared by all IB BTL - * instances that are trying to reach this destination. - * Cache the peer instance on the btl_proc. 
- */ - endpoint = OBJ_NEW(mca_btl_openib_endpoint_t); - assert(((opal_object_t*)endpoint)->obj_reference_count == 1); - if(NULL == endpoint) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED) { - int rem_port_cnt = 0; - for(j = 0; j < (int) ib_proc->proc_port_count; j++) { - if(ib_proc->proc_ports[j].pm_port_info.subnet_id == - openib_btl->port_info.subnet_id) { - if (rem_port_cnt == btl_rank) - break; - else - rem_port_cnt ++; - } else { - if (mca_btl_openib_component.allow_different_subnets) { - if (rem_port_cnt == btl_rank) - break; - else - rem_port_cnt ++; - } - } - } - - assert(rem_port_cnt == btl_rank); - /* Push the subnet/lid/jobid to xrc hash */ - rc = mca_btl_openib_ib_address_add_new( - ib_proc->proc_ports[j].pm_port_info.lid, - ib_proc->proc_ports[j].pm_port_info.subnet_id, - ib_proc->proc_opal->proc_name.jobid, endpoint); - if (OPAL_SUCCESS != rc ) { - return OPAL_ERROR; - } - } -#endif - mca_btl_openib_endpoint_init(openib_btl, endpoint, - local_cpc, - &(ib_proc->proc_ports[matching_port]), - remote_cpc_data); - - rc = mca_btl_openib_proc_insert(ib_proc, endpoint); - if (OPAL_SUCCESS != rc) { - OBJ_RELEASE(endpoint); - return OPAL_ERROR; - } - - if(OPAL_SUCCESS != mca_btl_openib_tune_endpoint(openib_btl, endpoint)) { - OBJ_RELEASE(endpoint); - return OPAL_ERROR; - } - - /* protect device because several endpoints for different ib_proc's - * may be simultaneously initialized */ - opal_mutex_lock(&openib_btl->device->device_lock); - endpoint->index = opal_pointer_array_add(openib_btl->device->endpoints, (void*)endpoint); - opal_mutex_unlock(&openib_btl->device->device_lock); - - if( 0 > endpoint->index ) { - OBJ_RELEASE(endpoint); - return OPAL_ERROR; - } - - /* Tell the selected CPC that it won. NOTE: This call is - outside of / separate from mca_btl_openib_endpoint_init() - because this function likely needs the endpoint->index. 
*/ - if (NULL != local_cpc->cbm_endpoint_init) { - rc = local_cpc->cbm_endpoint_init(endpoint); - if (OPAL_SUCCESS != rc) { - OBJ_RELEASE(endpoint); - return OPAL_ERROR; - } - } - - *endpoint_ptr = endpoint; - return OPAL_SUCCESS; -} - -static int get_openib_btl_params(mca_btl_openib_module_t* openib_btl, int *port_cnt_ptr) -{ - int port_cnt = 0, rank = -1, j; - for(j=0; j < mca_btl_openib_component.ib_num_btls; j++){ - if(mca_btl_openib_component.openib_btls[j]->port_info.subnet_id - == openib_btl->port_info.subnet_id) { - if(openib_btl == mca_btl_openib_component.openib_btls[j]) { - rank = port_cnt; - } - port_cnt++; - } else { - if (mca_btl_openib_component.allow_different_subnets) { - if (openib_btl == mca_btl_openib_component.openib_btls[j]) { - rank = port_cnt; - } - port_cnt++; - } - } - } - *port_cnt_ptr = port_cnt; - return rank; -} - -/* - * add a proc to this btl module - * creates an endpoint that is setup on the - * first send to the endpoint - */ -int mca_btl_openib_add_procs( - struct mca_btl_base_module_t* btl, - size_t nprocs, - struct opal_proc_t **procs, - struct mca_btl_base_endpoint_t** peers, - opal_bitmap_t* reachable) -{ - mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl; - size_t nprocs_new_loc = 0, nprocs_new = 0; - int i,j, rc; - int lcl_subnet_id_port_cnt = 0; - int btl_rank = 0; - volatile mca_btl_base_endpoint_t* endpoint; - - - if (! 
openib_btl->allowed) { - opal_bitmap_clear_all_bits(reachable); - opal_show_help("help-mpi-btl-openib.txt", "ib port not selected", - true, opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num); - } - - btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt); - if( 0 > btl_rank ){ - return OPAL_ERR_NOT_FOUND; - } - -#if HAVE_XRC - if(MCA_BTL_XRC_ENABLED && - NULL == mca_btl_openib_component.ib_addr_table.ht_table) { - if(OPAL_SUCCESS != opal_hash_table_init( - &mca_btl_openib_component.ib_addr_table, nprocs)) { - BTL_ERROR(("XRC internal error. Failed to allocate ib_table")); - return OPAL_ERROR; - } - } -#endif - - rc = prepare_device_for_use (openib_btl->device); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("could not prepare openib device for use")); - return rc; - } - - if (0 == openib_btl->num_peers) { - /* ensure completion queues are created before attempting to - * make a loop-back queue pair */ - rc = openib_btl_size_queues(openib_btl); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("error creating cqs")); - return rc; - } - } - - /* prepare all proc's and account them properly */ - for (i = 0, nprocs_new_loc = 0 ; i < (int) nprocs; i++) { - struct opal_proc_t* proc = procs[i]; - mca_btl_openib_proc_t* ib_proc; - -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - /* Most current iWARP adapters (June 2008) cannot handle - talking to other processes on the same host (!) -- so mark - them as unreachable (need to use sm). So for the moment, - we'll just mark any local peer on an iWARP NIC as - unreachable. See trac ticket #1352. 
*/ - if (IBV_TRANSPORT_IWARP == openib_btl->device->ib_dev->transport_type && - OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) { - continue; - } -#endif - - if(NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc)) ) { - /* if we don't have connection info for this process, it's - * okay because some other method might be able to reach it, - * so just mark it as unreachable by us */ - continue; - } - - /* account this openib_btl in this proc */ - rc = mca_btl_openib_proc_reg_btl(ib_proc, openib_btl); - - opal_mutex_unlock( &ib_proc->proc_lock ); - - switch( rc ){ - case OPAL_SUCCESS: - /* this is a new process to this openib btl */ - nprocs_new++; - if (OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) { - nprocs_new_loc ++; - } - break; - case OPAL_ERR_RESOURCE_BUSY: - /* process was accounted earlier in this openib btl */ - break; - default: - /* unexpected error, e.g. out of mem */ - return rc; - } - } - - if (nprocs_new) { - opal_atomic_add_fetch_32 (&openib_btl->num_peers, nprocs_new); - - /* adjust cq sizes given the new procs */ - rc = openib_btl_size_queues (openib_btl); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("error creating cqs")); - return rc; - } - } - - rc = openib_btl_prepare (openib_btl); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("could not prepare openib btl module for use")); - return rc; - } - - opal_mutex_lock(&openib_btl->device->device_lock); - openib_btl->local_procs += nprocs_new_loc; - if( 0 < nprocs_new_loc ){ - openib_btl->device->mem_reg_max = openib_btl->device->mem_reg_max_total / openib_btl->local_procs; - } - opal_mutex_unlock(&openib_btl->device->device_lock); - - /* prepare endpoints */ - for (i = 0, nprocs_new_loc = 0 ; i < (int) nprocs; i++) { - struct opal_proc_t* proc = procs[i]; - mca_btl_openib_proc_t* ib_proc; - bool found_existing = false; - - opal_output(-1, "add procs: adding proc %d", i); - -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - /* Most current iWARP adapters (June 2008) cannot handle - talking to other processes on 
the same host (!) -- so mark - them as unreachable (need to use sm). So for the moment, - we'll just mark any local peer on an iWARP NIC as - unreachable. See trac ticket #1352. */ - if (IBV_TRANSPORT_IWARP == openib_btl->device->ib_dev->transport_type && - OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) { - continue; - } -#endif - - if(NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc)) ) { - /* if we don't have connection info for this process, it's - * okay because some other method might be able to reach it, - * so just mark it as unreachable by us */ - continue; - } - - found_existing = false; - - for (j = 0 ; j < (int) ib_proc->proc_endpoint_count ; ++j) { - endpoint = ib_proc->proc_endpoints[j]; - if (endpoint->endpoint_btl == openib_btl) { - found_existing = true; - break; - } - } - - if( !found_existing ) { - rc = init_ib_proc_nolock(openib_btl, ib_proc, &endpoint, - lcl_subnet_id_port_cnt, btl_rank); - if( OPAL_SUCCESS == rc ){ - found_existing = true; - } - } - opal_mutex_unlock( &ib_proc->proc_lock ); - - if (found_existing) { - if (reachable) { - opal_bitmap_set_bit(reachable, i); - } - peers[i] = (mca_btl_base_endpoint_t*)endpoint; - } - - } - - return OPAL_SUCCESS; -} - -struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, struct opal_proc_t *proc) -{ - mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl; - volatile mca_btl_base_endpoint_t *endpoint = NULL; - int local_port_cnt = 0, btl_rank, rc; - mca_btl_openib_proc_t *ib_proc; - - rc = prepare_device_for_use (openib_btl->device); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("could not prepare openib device for use")); - return NULL; - } - - if (NULL == (ib_proc = mca_btl_openib_proc_get_locked(proc))) { - /* if we don't have connection info for this process, it's - * okay because some other method might be able to reach it, - * so just mark it as unreachable by us */ - return NULL; - } - - rc = mca_btl_openib_proc_reg_btl(ib_proc, openib_btl); - - 
switch( rc ){ - case OPAL_SUCCESS: - /* unlock first to avoid possible deadlocks */ - opal_mutex_unlock(&ib_proc->proc_lock); - - /* this is a new process to this openib btl - * account this procs if need */ - opal_atomic_add_fetch_32 (&openib_btl->num_peers, 1); - rc = openib_btl_size_queues(openib_btl); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("error creating cqs")); - return NULL; - } - - if( OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags) ) { - opal_mutex_lock(&openib_btl->ib_lock); - openib_btl->local_procs += 1; - openib_btl->device->mem_reg_max = openib_btl->device->mem_reg_max_total / openib_btl->local_procs; - opal_mutex_unlock(&openib_btl->ib_lock); - } - - /* lock process back */ - opal_mutex_lock(&ib_proc->proc_lock); - break; - case OPAL_ERR_RESOURCE_BUSY: - /* process was accounted earlier in this openib btl */ - break; - default: - /* unexpected error, e.g. out of mem */ - BTL_ERROR(("Unexpected OPAL error %d", rc)); - return NULL; - } - - rc = openib_btl_prepare(openib_btl); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("could not prepare openib btl structure for use")); - goto exit; - } - - for (size_t j = 0 ; j < ib_proc->proc_endpoint_count ; ++j) { - endpoint = ib_proc->proc_endpoints[j]; - if (endpoint->endpoint_btl == openib_btl) { - goto exit; - } - } - - endpoint = NULL; - - btl_rank = get_openib_btl_params(openib_btl, &local_port_cnt); - if( 0 > btl_rank ){ - goto exit; - } - - (void)init_ib_proc_nolock(openib_btl, ib_proc, &endpoint, - local_port_cnt, btl_rank); - -exit: - opal_mutex_unlock(&ib_proc->proc_lock); - - return (struct mca_btl_base_endpoint_t *)endpoint; -} - -/* - * delete the proc as reachable from this btl module - */ -int mca_btl_openib_del_procs(struct mca_btl_base_module_t* btl, - size_t nprocs, - struct opal_proc_t **procs, - struct mca_btl_base_endpoint_t ** peers) -{ - int i, ep_index; - mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; - mca_btl_openib_endpoint_t* endpoint; - - for (i=0 ; i < (int) nprocs ; 
i++) { - mca_btl_base_endpoint_t* del_endpoint = peers[i]; - for(ep_index=0; - ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints); - ep_index++) { - endpoint = (mca_btl_openib_endpoint_t *) - opal_pointer_array_get_item(openib_btl->device->endpoints, - ep_index); - if(!endpoint || endpoint->endpoint_btl != openib_btl) { - continue; - } - if (endpoint == del_endpoint) { - int j; - BTL_VERBOSE(("in del_procs %d, setting another endpoint to null", - ep_index)); - /* remove the endpoint from eager_rdma_buffers */ - for (j=0; jdevice->eager_rdma_buffers_count; j++) { - if (openib_btl->device->eager_rdma_buffers[j] == endpoint) { - /* should it be obj_reference_count == 2 ? */ - assert(((opal_object_t*)endpoint)->obj_reference_count > 1); - OBJ_RELEASE(endpoint); - openib_btl->device->eager_rdma_buffers[j] = NULL; - /* can we simply break and leave the for loop ? */ - } - } - opal_pointer_array_set_item(openib_btl->device->endpoints, - ep_index, NULL); - assert(((opal_object_t*)endpoint)->obj_reference_count == 1); - mca_btl_openib_proc_remove(procs[i], endpoint); - OBJ_RELEASE(endpoint); - } - } - } - - return OPAL_SUCCESS; -} - -/* - *Register callback function for error handling.. 
- */ -int mca_btl_openib_register_error_cb( - struct mca_btl_base_module_t* btl, - mca_btl_base_module_error_cb_fn_t cbfunc) -{ - - mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; - openib_btl->error_cb = cbfunc; /* stash for later */ - return OPAL_SUCCESS; -} - -static inline mca_btl_base_descriptor_t * -ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order, - uint32_t flags) -{ - int qp; - opal_free_list_item_t* item = NULL; - - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - if(mca_btl_openib_component.qp_infos[qp].size >= size) { - item = opal_free_list_get (&btl->device->qps[qp].send_free); - if(item) - break; - } - } - if(NULL == item) - return NULL; - - /* not all upper layer users set this */ - to_base_frag(item)->segment.seg_len = size; - to_base_frag(item)->base.order = order; - to_base_frag(item)->base.des_flags = flags; - - assert(to_send_frag(item)->qp_idx <= order); - return &to_base_frag(item)->base; -} - -/* check if pending fragment has enough space for coalescing */ -static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list, - opal_mutex_t *lock, struct mca_btl_base_endpoint_t *ep, size_t size, - mca_btl_openib_coalesced_frag_t **cfrag) -{ - mca_btl_openib_send_frag_t *frag = NULL; - - if (opal_list_is_empty(frag_list)) - return NULL; - - OPAL_THREAD_LOCK(lock); - if (!opal_list_is_empty(frag_list)) { - int qp; - size_t total_length; - opal_list_item_t *i = opal_list_get_first(frag_list); - frag = to_send_frag(i); - if(to_com_frag(frag)->endpoint != ep || - MCA_BTL_OPENIB_FRAG_CONTROL == openib_frag_type(frag)) { - OPAL_THREAD_UNLOCK(lock); - return NULL; - } - - total_length = size + frag->coalesced_length + - to_base_frag(frag)->segment.seg_len + - sizeof(mca_btl_openib_header_coalesced_t); - - qp = to_base_frag(frag)->base.order; - - if(total_length <= mca_btl_openib_component.qp_infos[qp].size) { - /* make sure we can allocate a coalescing frag before returning success */ - 
*cfrag = alloc_coalesced_frag(); - if (OPAL_LIKELY(NULL != cfrag)) { - (*cfrag)->send_frag = frag; - (*cfrag)->sent = false; - - opal_list_remove_first(frag_list); - } else { - frag = NULL; - } - } else { - frag = NULL; - } - } - OPAL_THREAD_UNLOCK(lock); - - return frag; -} - -/** - * Allocate a segment. - * - * @param btl (IN) BTL module - * @param size (IN) Request segment size. - * @param size (IN) Size of segment to allocate - * - * When allocating a segment we pull a pre-alllocated segment - * from one of two free lists, an eager list and a max list - */ -mca_btl_base_descriptor_t* mca_btl_openib_alloc( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - uint8_t order, - size_t size, - uint32_t flags) -{ - mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; - int qp = frag_size_to_order(obtl, size); - mca_btl_openib_send_frag_t *sfrag = NULL; - mca_btl_openib_coalesced_frag_t *cfrag = NULL; - - assert(qp != MCA_BTL_NO_ORDER); - - if(mca_btl_openib_component.use_message_coalescing && - (flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { - int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY); - - sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio], - &ep->endpoint_lock, ep, size, &cfrag); - - if (NULL == sfrag) { - if(BTL_OPENIB_QP_TYPE_PP(qp)) { - sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio], - &ep->endpoint_lock, ep, size, &cfrag); - } else { - sfrag = check_coalescing( - &obtl->qps[qp].u.srq_qp.pending_frags[prio], - &obtl->ib_lock, ep, size, &cfrag); - } - } - } - - if (NULL == sfrag) { - return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags); - } - - /* begin coalescing message */ - - /* fix up new coalescing header if this is the first coalesced frag */ - if(sfrag->hdr != sfrag->chdr) { - mca_btl_openib_control_header_t *ctrl_hdr; - mca_btl_openib_header_coalesced_t *clsc_hdr; - uint8_t org_tag; - - org_tag = sfrag->hdr->tag; - sfrag->hdr = sfrag->chdr; - ctrl_hdr = 
(mca_btl_openib_control_header_t*)(sfrag->hdr + 1); - clsc_hdr = (mca_btl_openib_header_coalesced_t*)(ctrl_hdr + 1); - sfrag->hdr->tag = MCA_BTL_TAG_IB; - ctrl_hdr->type = MCA_BTL_OPENIB_CONTROL_COALESCED; - clsc_hdr->tag = org_tag; - clsc_hdr->size = to_base_frag(sfrag)->segment.seg_len; - clsc_hdr->alloc_size = to_base_frag(sfrag)->segment.seg_len; - if(ep->nbo) - BTL_OPENIB_HEADER_COALESCED_HTON(*clsc_hdr); - sfrag->coalesced_length = sizeof(mca_btl_openib_control_header_t) + - sizeof(mca_btl_openib_header_coalesced_t); - to_com_frag(sfrag)->sg_entry.addr = (uint64_t)(uintptr_t)sfrag->hdr; - } - - cfrag->hdr = (mca_btl_openib_header_coalesced_t*)((unsigned char*)(sfrag->hdr + 1) + - sfrag->coalesced_length + - to_base_frag(sfrag)->segment.seg_len); - cfrag->hdr = (mca_btl_openib_header_coalesced_t*)BTL_OPENIB_ALIGN_COALESCE_HDR(cfrag->hdr); - cfrag->hdr->alloc_size = size; - - /* point coalesced frag pointer into a data buffer */ - to_base_frag(cfrag)->segment.seg_addr.pval = cfrag->hdr + 1; - to_base_frag(cfrag)->segment.seg_len = size; - - /* NTH: there is no reason to append the coalesced fragment here. No more - * fragments will be added until either send or free has been called on - * the coalesced frag. 
*/ - - to_base_frag(cfrag)->base.des_flags = flags; - - return &to_base_frag(cfrag)->base; -} - -/** - * Return a segment - * - * Return the segment to the appropriate - * preallocated segment list - */ -int mca_btl_openib_free( - struct mca_btl_base_module_t* btl, - mca_btl_base_descriptor_t* des) -{ - /* reset those field on free so we will not have to do it on alloc */ - to_base_frag(des)->base.des_flags = 0; - switch(openib_frag_type(des)) { - case MCA_BTL_OPENIB_FRAG_SEND: - to_send_frag(des)->hdr = (mca_btl_openib_header_t*) - (((unsigned char*)to_send_frag(des)->chdr) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t)); - to_com_frag(des)->sg_entry.addr = - (uint64_t)(uintptr_t)to_send_frag(des)->hdr; - to_send_frag(des)->coalesced_length = 0; - to_base_frag(des)->segment.seg_addr.pval = - to_send_frag(des)->hdr + 1; - assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags)); - /* fall through */ - default: - break; - } - - if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED && !to_coalesced_frag(des)->sent) { - mca_btl_openib_send_frag_t *sfrag = to_coalesced_frag(des)->send_frag; - - /* the coalesced fragment would have sent the original fragment but that - * will not happen so send the fragment now */ - mca_btl_openib_endpoint_send(to_com_frag(sfrag)->endpoint, sfrag); - } - - MCA_BTL_IB_FRAG_RETURN(des); - - return OPAL_SUCCESS; -} - -/** - * register user buffer or pack - * data into pre-registered buffer and return a - * descriptor that can be - * used for send/put. - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - * - * prepare source's behavior depends on the following: - * Has a valid memory registration been passed to prepare_src? - * if so we attempt to use the pre-registered user-buffer, if the memory registration - * is too small (only a portion of the user buffer) then we must reregister the user buffer - * Has the user requested the memory to be left pinned? 
- * if so we insert the memory registration into a memory tree for later lookup, we - * may also remove a previous registration if a MRU (most recently used) list of - * registrations is full, this prevents resources from being exhausted. - * Is the requested size larger than the btl's max send size? - * if so and we aren't asked to leave the registration pinned, then we register the memory if - * the users buffer is contiguous - * Otherwise we choose from two free lists of pre-registered memory in which to pack the data into. - * - */ -mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - mca_btl_openib_com_frag_t *frag = NULL; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = *size; - void *ptr; - - assert(MCA_BTL_NO_ORDER == order); - - if (max_data + reserve > btl->btl_max_send_size) { - max_data = btl->btl_max_send_size - reserve; - } - - frag = (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc (btl, endpoint, order, - max_data + reserve, flags); - if (NULL == frag) { - return NULL; - } - - ptr = to_base_frag(frag)->segment.seg_addr.pval; - - iov.iov_len = max_data; - iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve ); - (void) opal_convertor_pack(convertor, &iov, &iov_count, &max_data); - -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ - /* If the convertor is copying the data asynchronously, then record an event - * that will trigger the callback when it completes. Mark descriptor as async. - * No need for this in the case we are not sending any GPU data. 
*/ - if ((convertor->flags & CONVERTOR_CUDA_ASYNC) && (0 != max_data)) { - mca_common_cuda_record_dtoh_event("btl_openib", (mca_btl_base_descriptor_t *)frag); - to_base_frag(frag)->base.des_flags = flags | MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC; - } -#endif /* OPAL_CUDA_SUPPORT */ - - *size = max_data; - - /* not all upper layer users set this */ - to_base_frag(frag)->segment.seg_len = max_data + reserve; - - return &to_base_frag(frag)->base; -} - -static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) { - mca_btl_openib_module_t* openib_btl; - mca_btl_openib_endpoint_t* endpoint; - int ep_index, i; - int qp, rc = OPAL_SUCCESS; - - openib_btl = (mca_btl_openib_module_t*) btl; - - /* Sanity check */ - if( mca_btl_openib_component.ib_num_btls <= 0 ) { - return OPAL_SUCCESS; - } - - if (openib_btl->allowed) { - /* Release all QPs */ - if (NULL != openib_btl->device->endpoints) { - for (ep_index=0; - ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints); - ep_index++) { - endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints, - ep_index); - if(!endpoint) { - BTL_VERBOSE(("In finalize, got another null endpoint")); - continue; - } - if(endpoint->endpoint_btl != openib_btl) { - continue; - } - for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) { - if(openib_btl->device->eager_rdma_buffers[i] == endpoint) { - openib_btl->device->eager_rdma_buffers[i] = NULL; - OBJ_RELEASE(endpoint); - } - } - opal_pointer_array_set_item(openib_btl->device->endpoints, - ep_index, NULL); - assert(((opal_object_t*)endpoint)->obj_reference_count == 1); - OBJ_RELEASE(endpoint); - } - } - - /* Release SRQ resources */ - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - if(!BTL_OPENIB_QP_TYPE_PP(qp)) { - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( - &openib_btl->qps[qp].u.srq_qp.pending_frags[0]); - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( - &openib_btl->qps[qp].u.srq_qp.pending_frags[1]); - if (NULL != 
openib_btl->qps[qp].u.srq_qp.srq) { - opal_mutex_t *lock = - &mca_btl_openib_component.srq_manager.lock; - - opal_hash_table_t *srq_addr_table = - &mca_btl_openib_component.srq_manager.srq_addr_table; - - opal_mutex_lock(lock); - if (OPAL_SUCCESS != - opal_hash_table_remove_value_ptr(srq_addr_table, - &openib_btl->qps[qp].u.srq_qp.srq, - sizeof(struct ibv_srq *))) { - BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp)); - rc = OPAL_ERROR; - } - opal_mutex_unlock(lock); - if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) { - BTL_VERBOSE(("Failed to close SRQ %d", qp)); - rc = OPAL_ERROR; - } - } - - OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]); - OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]); - } - } - - /* Finalize the CPC modules on this openib module */ - for (i = 0; i < openib_btl->num_cpcs; ++i) { - if (NULL != openib_btl->cpcs[i]->cbm_finalize) { - openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]); - } - free(openib_btl->cpcs[i]); - } - free(openib_btl->cpcs); - } - - /* Release device if there are no more users */ - if(!(--openib_btl->device->btls)) { - OBJ_RELEASE(openib_btl->device); - } - - if (NULL != openib_btl->qps) { - free(openib_btl->qps); - } - - return rc; -} - - -int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl) -{ - mca_btl_openib_module_t* openib_btl; - int i, rc = OPAL_SUCCESS; - - openib_btl = (mca_btl_openib_module_t*) btl; - - /* Sanity check */ - if( mca_btl_openib_component.ib_num_btls <= 0 ) { - return 0; - } - - /* Remove the btl from component list */ - if ( mca_btl_openib_component.ib_num_btls > 0 ) { - for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++){ - if (mca_btl_openib_component.openib_btls[i] == openib_btl){ - if( OPAL_SUCCESS != (rc = mca_btl_openib_finalize_resources(btl) ) ) { - BTL_VERBOSE(("Failed to finalize resources")); - } - mca_btl_openib_component.openib_btls[i] = - 
mca_btl_openib_component.openib_btls[mca_btl_openib_component.ib_num_btls-1]; - break; - } - } - } - - mca_btl_openib_component.ib_num_btls--; - - OBJ_DESTRUCT(&openib_btl->ib_lock); - free(openib_btl); - - BTL_VERBOSE(("Success in closing BTL resources")); - - return rc; -} - -/* - * Send immediate - Minimum function calls minimum checks, send the data ASAP. - * If BTL can't to send the messages imidiate, it creates messages descriptor - * returns it to PML. - */ -int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct opal_convertor_t* convertor, - void* header, - size_t header_size, - size_t payload_size, - uint8_t order, - uint32_t flags, - mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t** descriptor) -{ - mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; - size_t size = payload_size + header_size; - int qp = frag_size_to_order(obtl, size), - prio = flags & MCA_BTL_DES_FLAGS_PRIORITY, - ib_rc; - bool do_rdma = false; - opal_free_list_item_t* item = NULL; - mca_btl_openib_frag_t *frag; - mca_btl_openib_header_t *hdr; - int send_signaled; - int rc; - - OPAL_THREAD_LOCK(&ep->endpoint_lock); - - if (OPAL_UNLIKELY(MCA_BTL_IB_CONNECTED != ep->endpoint_state)) { - goto cant_send; - } - - /* If it is pending messages on the qp - we can not send */ - if(OPAL_UNLIKELY(!opal_list_is_empty(&ep->qps[qp].no_wqe_pending_frags[prio]))) { - goto cant_send; - } - -#if OPAL_CUDA_GDR_SUPPORT - /* We do not want to use this path when we have GDR support */ - if (convertor->flags & CONVERTOR_CUDA) { - goto cant_send; - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ - - /* Allocate WQE */ - if(OPAL_UNLIKELY(qp_get_wqe(ep, qp) < 0)) { - goto cant_send_wqe; - } - - /* Allocate fragment */ - item = opal_free_list_get (&obtl->device->qps[qp].send_free); - if(OPAL_UNLIKELY(NULL == item)) { - /* we don't return NULL because maybe later we will try to coalesce */ - goto cant_send_wqe; - } - frag = to_base_frag(item); - hdr = 
to_send_frag(item)->hdr; - - /* eager rdma or send ? Check eager rdma credits */ - /* Note: Maybe we want to implement isend only for eager rdma ?*/ - rc = mca_btl_openib_endpoint_credit_acquire (ep, qp, prio, size, &do_rdma, - to_send_frag(frag), false); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - goto cant_send_frag; - } - - frag->segment.seg_len = size; - frag->base.order = qp; - frag->base.des_flags = flags; - hdr->tag = tag; - to_com_frag(item)->endpoint = ep; - - /* put match header */ - memcpy(frag->segment.seg_addr.pval, header, header_size); - - /* Pack data */ - if(payload_size) { - size_t max_data; - struct iovec iov; - uint32_t iov_count; - /* pack the data into the supplied buffer */ - iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size); - iov.iov_len = max_data = payload_size; - iov_count = 1; - - (void)opal_convertor_pack( convertor, &iov, &iov_count, &max_data); - - assert(max_data == payload_size); - } - - send_signaled = qp_need_signal(ep, qp, payload_size + header_size, do_rdma); - ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled); - - if (!ib_rc) { - if (0 == send_signaled) { - MCA_BTL_IB_FRAG_RETURN(frag); - } - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - - return OPAL_SUCCESS; - } - - /* Failed to send, do clean up all allocated resources */ - if (ep->nbo) { - BTL_OPENIB_HEADER_NTOH(*hdr); - } - - mca_btl_openib_endpoint_credit_release (ep, qp, do_rdma, to_send_frag(frag)); - -cant_send_frag: - MCA_BTL_IB_FRAG_RETURN(frag); -cant_send_wqe: - qp_put_wqe (ep, qp); -cant_send: - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - /* We can not send the data directly, so we just return descriptor */ - if (NULL != descriptor) { - *descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags); - } - - return OPAL_ERR_RESOURCE_BUSY; -} -/* - * Initiate a send. 
- */ - -int mca_btl_openib_send( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - mca_btl_base_tag_t tag) - -{ - mca_btl_openib_send_frag_t *frag; - - assert(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND || - openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED); - - if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) { - frag = to_coalesced_frag(des)->send_frag; - - /* save coalesced fragment on a main fragment; we will need it after send - * completion to free it and to call upper layer callback */ - opal_list_append(&frag->coalesced_frags, (opal_list_item_t*) des); - frag->coalesced_length += to_coalesced_frag(des)->hdr->alloc_size + - sizeof(mca_btl_openib_header_coalesced_t); - - to_coalesced_frag(des)->sent = true; - to_coalesced_frag(des)->hdr->tag = tag; - to_coalesced_frag(des)->hdr->size = des->des_segments->seg_len; - if(ep->nbo) - BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr); - } else { - frag = to_send_frag(des); - to_com_frag(des)->endpoint = ep; - frag->hdr->tag = tag; - } - - des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - return mca_btl_openib_endpoint_send(ep, frag); -} - -static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - void *base, size_t size, uint32_t flags) -{ - mca_btl_openib_module_t *openib_module = (mca_btl_openib_module_t *) btl; - mca_btl_openib_reg_t *reg; - uint32_t mflags = 0; - int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; - int rc; - -#if OPAL_CUDA_GDR_SUPPORT - if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) { - mflags |= MCA_RCACHE_FLAGS_CUDA_GPU_MEM; - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ - - rc = openib_module->device->rcache->rcache_register (openib_module->device->rcache, base, size, mflags, - access_flags, (mca_rcache_base_registration_t **) ®); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) { - return NULL; - } - - 
return ®->btl_handle; -} - -static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) -{ - mca_btl_openib_module_t *openib_module = (mca_btl_openib_module_t *) btl; - mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle)); - - openib_module->device->rcache->rcache_deregister (openib_module->device->rcache, (mca_rcache_base_registration_t *) reg); - - return OPAL_SUCCESS; -} - -#if OPAL_ENABLE_FT_CR == 0 -int mca_btl_openib_ft_event(int state) { - return OPAL_SUCCESS; -} -#else -int mca_btl_openib_ft_event(int state) { - int i; - - if(OPAL_CRS_CHECKPOINT == state) { - /* Continue must reconstruct the routes (including modex), since we - * have to tear down the devices completely. */ - opal_cr_continue_like_restart = true; - - /* - * To keep the node from crashing we need to call ibv_close_device - * before the checkpoint is taken. To do this we need to tear - * everything down, and rebuild it all on continue/restart. :( - */ - - /* Shutdown all modules - * - Do this backwards since the openib_finalize function also loops - * over this variable. 
- */ - for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i ) { - mca_btl_openib_finalize_resources( &(mca_btl_openib_component.openib_btls[i])->super); - } - - mca_btl_openib_component.devices_count = 0; - mca_btl_openib_component.ib_num_btls = 0; - OBJ_DESTRUCT(&mca_btl_openib_component.ib_procs); - - opal_btl_openib_connect_base_finalize(); - } - else if(OPAL_CRS_CONTINUE == state) { - ; /* Cleared by forcing the modex, no work needed */ - } - else if(OPAL_CRS_RESTART == state) { - ; - } - else if(OPAL_CRS_TERM == state ) { - ; - } - else { - ; - } - - return OPAL_SUCCESS; -} - -#endif /* OPAL_ENABLE_FT_CR */ diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h deleted file mode 100644 index e8b0b29782..0000000000 --- a/opal/mca/btl/openib/btl_openib.h +++ /dev/null @@ -1,933 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2009 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2015-2018 Research Organization for Information Science - * and Technology (RIST). 
All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * @file - */ - -#ifndef MCA_BTL_IB_H -#define MCA_BTL_IB_H - -#include "opal_config.h" -#include -#include -#include - -/* Open MPI includes */ -#include "opal/class/opal_pointer_array.h" -#include "opal/class/opal_hash_table.h" -#include "opal/util/arch.h" -#include "opal/util/output.h" -#include "opal/mca/event/event.h" -#include "opal/threads/threads.h" -#include "opal/mca/btl/btl.h" -#include "opal/mca/rcache/rcache.h" -#include "opal/mca/mpool/mpool.h" -#include "opal/mca/btl/base/btl_base_error.h" -#include "opal/mca/btl/base/base.h" -#include "opal/runtime/opal_progress_threads.h" - -#include "connect/connect.h" - -BEGIN_C_DECLS - -#define HAVE_XRC (OPAL_HAVE_CONNECTX_XRC || OPAL_HAVE_CONNECTX_XRC_DOMAINS) -#define ENABLE_DYNAMIC_SL OPAL_ENABLE_DYNAMIC_SL - -#define MCA_BTL_IB_LEAVE_PINNED 1 -#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll -#define MCA_BTL_IB_PKEY_MASK 0x7fff -#define MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT (256) - - -/*--------------------------------------------------------------------*/ - -#if OPAL_ENABLE_DEBUG -#define ATTACH() do { \ - int i = 0; \ - opal_output(0, "WAITING TO DEBUG ATTACH"); \ - while (i == 0) sleep(5); \ - } while(0); -#else -#define ATTACH() -#endif - -/*--------------------------------------------------------------------*/ - -/** - * Infiniband (IB) BTL component. 
- */ - -enum { - BTL_OPENIB_HP_CQ, - BTL_OPENIB_LP_CQ, - BTL_OPENIB_MAX_CQ, -}; - -typedef enum { - MCA_BTL_OPENIB_TRANSPORT_IB, - MCA_BTL_OPENIB_TRANSPORT_IWARP, - MCA_BTL_OPENIB_TRANSPORT_RDMAOE, - MCA_BTL_OPENIB_TRANSPORT_UNKNOWN, - MCA_BTL_OPENIB_TRANSPORT_SIZE -} mca_btl_openib_transport_type_t; - -typedef enum { - MCA_BTL_OPENIB_PP_QP, - MCA_BTL_OPENIB_SRQ_QP, - MCA_BTL_OPENIB_XRC_QP -} mca_btl_openib_qp_type_t; - -struct mca_btl_openib_pp_qp_info_t { - int32_t rd_win; - int32_t rd_rsv; -}; typedef struct mca_btl_openib_pp_qp_info_t mca_btl_openib_pp_qp_info_t; - -struct mca_btl_openib_srq_qp_info_t { - int32_t sd_max; - /* The init value for rd_curr_num variables of all SRQs */ - int32_t rd_init; - /* The watermark, threshold - if the number of WQEs in SRQ is less then this value => - the SRQ limit event (IBV_EVENT_SRQ_LIMIT_REACHED) will be generated on corresponding SRQ. - As result the maximal number of pre-posted WQEs on the SRQ will be increased */ - int32_t srq_limit; -}; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t; - -struct mca_btl_openib_qp_info_t { - mca_btl_openib_qp_type_t type; - size_t size; - int32_t rd_num; - int32_t rd_low; - union { - mca_btl_openib_pp_qp_info_t pp_qp; - mca_btl_openib_srq_qp_info_t srq_qp; - } u; -}; typedef struct mca_btl_openib_qp_info_t mca_btl_openib_qp_info_t; - -#define BTL_OPENIB_QP_TYPE(Q) (mca_btl_openib_component.qp_infos[(Q)].type) -#define BTL_OPENIB_QP_TYPE_PP(Q) \ - (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_PP_QP) -#define BTL_OPENIB_QP_TYPE_SRQ(Q) \ - (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_SRQ_QP) -#define BTL_OPENIB_QP_TYPE_XRC(Q) \ - (BTL_OPENIB_QP_TYPE(Q) == MCA_BTL_OPENIB_XRC_QP) - -typedef enum { - BTL_OPENIB_RQ_SOURCE_DEVICE_INI = MCA_BASE_VAR_SOURCE_MAX, -} btl_openib_receive_queues_source_t; - -typedef enum { - BTL_OPENIB_DT_IB, - BTL_OPENIB_DT_IWARP, - BTL_OPENIB_DT_ALL -} btl_openib_device_type_t; - -/* The structer for manage all BTL SRQs */ -typedef struct 
mca_btl_openib_srq_manager_t { - opal_mutex_t lock; - /* The keys of this hash table are addresses of - SRQs structures, and the elements are BTL modules - pointers that associated with these SRQs */ - opal_hash_table_t srq_addr_table; -} mca_btl_openib_srq_manager_t; - -struct mca_btl_openib_component_t { - mca_btl_base_component_3_0_0_t super; /**< base BTL component */ - - int ib_max_btls; - /**< maximum number of devices available to openib component */ - - int ib_num_btls; - /**< number of devices available to the openib component */ - - int ib_allowed_btls; - /**< number of devices allowed to the openib component */ - - struct mca_btl_openib_module_t **openib_btls; - /**< array of available BTLs */ - - opal_pointer_array_t devices; /**< array of available devices */ - int devices_count; - - int ib_free_list_num; - /**< initial size of free lists */ - - int ib_free_list_max; - /**< maximum size of free lists */ - - int ib_free_list_inc; - /**< number of elements to alloc when growing free lists */ - - opal_list_t ib_procs; - /**< list of ib proc structures */ - - opal_event_t ib_send_event; - /**< event structure for sends */ - - opal_event_t ib_recv_event; - /**< event structure for recvs */ - - opal_mutex_t ib_lock; - /**< lock for accessing module state */ - - char* ib_mpool_hints; - /**< hints for selecting an mpool component */ - - char *ib_rcache_name; - /**< name of ib registration cache */ - - uint8_t num_pp_qps; /**< number of pp qp's */ - uint8_t num_srq_qps; /**< number of srq qp's */ - uint8_t num_xrc_qps; /**< number of xrc qp's */ - uint8_t num_qps; /**< total number of qp's */ - - opal_hash_table_t ib_addr_table; /**< used only for xrc.hash-table that - keeps table of all lids/subnets */ - mca_btl_openib_qp_info_t* qp_infos; - - size_t eager_limit; /**< Eager send limit of first fragment, in Bytes */ - size_t max_send_size; /**< Maximum send size, in Bytes */ - uint32_t max_hw_msg_size;/**< Maximum message size for RDMA protocols in Bytes */ - 
uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */ - uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */ - - uint32_t ib_cq_size[BTL_OPENIB_MAX_CQ]; /**< Max outstanding CQE on the CQ */ - - int ib_max_inline_data; /**< Max size of inline data */ - unsigned int ib_pkey_val; - unsigned int ib_psn; - unsigned int ib_qp_ous_rd_atom; - uint32_t ib_mtu; - unsigned int ib_min_rnr_timer; - unsigned int ib_timeout; - unsigned int ib_retry_count; - unsigned int ib_rnr_retry; - unsigned int ib_max_rdma_dst_ops; - unsigned int ib_service_level; -#if (ENABLE_DYNAMIC_SL) - unsigned int ib_path_record_service_level; -#endif - int use_eager_rdma; - int eager_rdma_threshold; /**< After this number of msg, use RDMA for short messages, always */ - int eager_rdma_num; - int32_t max_eager_rdma; - unsigned int btls_per_lid; - unsigned int max_lmc; - int apm_lmc; - int apm_ports; - unsigned int buffer_alignment; /**< Preferred communication buffer alignment in Bytes (must be power of two) */ - opal_atomic_int32_t error_counter; /**< Counts number on error events that we got on all devices */ - opal_event_base_t *async_evbase; /**< Async event base */ - bool use_async_event_thread; /**< Use the async event handler */ - mca_btl_openib_srq_manager_t srq_manager; /**< Hash table for all BTL SRQs */ - /* declare as an int instead of btl_openib_device_type_t since there is no - guarantee about the size of an enum. 
this value will be registered as an - integer with the MCA variable system */ - int device_type; - bool allow_ib; - char *if_include; - char **if_include_list; - char *if_exclude; - char **if_exclude_list; - char *ipaddr_include; - char *ipaddr_exclude; - - /* MCA param btl_openib_receive_queues */ - char *receive_queues; - /* Whether we got a non-default value of btl_openib_receive_queues */ - mca_base_var_source_t receive_queues_source; - - /** Colon-delimited list of filenames for device parameters */ - char *device_params_file_names; - - /** Whether we're in verbose mode or not */ - bool verbose; - - /** Whether we want a warning if no device-specific parameters are - found in INI files */ - bool warn_no_device_params_found; - /** Whether we want a warning if non default GID prefix is not configured - on multiport setup */ - bool warn_default_gid_prefix; - /** Whether we want a warning if the user specifies a non-existent - device and/or port via btl_openib_if_[in|ex]clude MCA params */ - bool warn_nonexistent_if; - /** Whether we want to abort if there's not enough registered - memory available */ - bool abort_not_enough_reg_mem; - - /** Dummy argv-style list; a copy of names from the - if_[in|ex]clude list that we use for error checking (to ensure - that they all exist) */ - char **if_list; - bool use_message_coalescing; - unsigned int cq_poll_ratio; - unsigned int cq_poll_progress; - unsigned int cq_poll_batch; - unsigned int eager_rdma_poll_ratio; - int rdma_qp; - int credits_qp; /* qp used for software flow control */ - bool cpc_explicitly_defined; - /**< free list of frags only; used for pining user memory */ - opal_free_list_t send_user_free; - /**< free list of frags only; used for pining user memory */ - opal_free_list_t recv_user_free; - /**< frags for coalesced massages */ - opal_free_list_t send_free_coalesced; - /** Default receive queues */ - char* default_recv_qps; - /** GID index to use */ - int gid_index; - /* Whether we want to allow 
connecting processes from different subnets. - * set to 'no' by default */ - bool allow_different_subnets; - /** Whether we want a dynamically resizing srq, enabled by default */ - bool enable_srq_resize; - bool allow_max_memory_registration; - int memory_registration_verbose_level; - int memory_registration_verbose; - int ignore_locality; -#if OPAL_CUDA_SUPPORT - bool cuda_async_send; - bool cuda_async_recv; - bool cuda_have_gdr; - bool driver_have_gdr; - bool cuda_want_gdr; -#endif /* OPAL_CUDA_SUPPORT */ -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET - bool rroce_enable; -#endif - unsigned int num_default_gid_btls; /* numbers of btl in the default subnet */ -}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t; - -OPAL_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component; - -typedef mca_btl_base_recv_reg_t mca_btl_openib_recv_reg_t; - -/** - * Common information for all ports that is sent in the modex message - */ -typedef struct mca_btl_openib_modex_message_t { - /** The subnet ID of this port */ - uint64_t subnet_id; - /** LID of this port */ - uint16_t lid; - /** APM LID for this port */ - uint16_t apm_lid; - /** The MTU used by this port */ - uint8_t mtu; - /** vendor id define device type and tuning */ - uint32_t vendor_id; - /** vendor part id define device type and tuning */ - uint32_t vendor_part_id; - /** Transport type of remote port */ - uint8_t transport_type; - /** Dummy field used to calculate the real length */ - uint8_t end; -} mca_btl_openib_modex_message_t; - -#define MCA_BTL_OPENIB_MODEX_MSG_NTOH(hdr) \ - do { \ - (hdr).subnet_id = ntoh64((hdr).subnet_id); \ - (hdr).lid = ntohs((hdr).lid); \ - } while (0) -#define MCA_BTL_OPENIB_MODEX_MSG_HTON(hdr) \ - do { \ - (hdr).subnet_id = hton64((hdr).subnet_id); \ - (hdr).lid = htons((hdr).lid); \ - } while (0) - -typedef struct mca_btl_openib_device_qp_t { - opal_free_list_t send_free; /**< free lists of send buffer descriptors */ - opal_free_list_t recv_free; /**< 
free lists of receive buffer descriptors */ -} mca_btl_openib_device_qp_t; - -struct mca_btl_base_endpoint_t; - -typedef struct mca_btl_openib_device_t { - opal_object_t super; - struct ibv_device *ib_dev; /* the ib device */ -#if OPAL_ENABLE_PROGRESS_THREADS == 1 - struct ibv_comp_channel *ib_channel; /* Channel event for the device */ - opal_thread_t thread; /* Progress thread */ - volatile bool progress; /* Progress status */ -#endif - opal_mutex_t device_lock; /* device level lock */ - struct ibv_context *ib_dev_context; -#if HAVE_DECL_IBV_EXP_QUERY_DEVICE - struct ibv_exp_device_attr ib_exp_dev_attr; -#endif - struct ibv_device_attr ib_dev_attr; - struct ibv_pd *ib_pd; - struct ibv_cq *ib_cq[BTL_OPENIB_MAX_CQ]; - uint32_t cq_size[BTL_OPENIB_MAX_CQ]; - mca_mpool_base_module_t *mpool; - mca_rcache_base_module_t *rcache; - /* MTU for this device */ - uint32_t mtu; - /* Whether this device supports eager RDMA */ - uint8_t use_eager_rdma; - uint8_t btls; /** < number of btls using this device */ - opal_pointer_array_t *endpoints; - opal_pointer_array_t *device_btls; - uint16_t hp_cq_polls; - uint16_t eager_rdma_polls; - bool pollme; - volatile bool got_fatal_event; - volatile bool got_port_event; -#if HAVE_XRC -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - struct ibv_xrcd *xrcd; -#else - struct ibv_xrc_domain *xrc_domain; -#endif - int xrc_fd; -#endif - opal_atomic_int32_t non_eager_rdma_endpoints; - opal_atomic_int32_t eager_rdma_buffers_count; - struct mca_btl_base_endpoint_t **eager_rdma_buffers; - /**< frags for control massages */ - opal_free_list_t send_free_control; - /* QP types and attributes that will be used on this device */ - mca_btl_openib_device_qp_t *qps; - /* Maximum value supported by this device for max_inline_data */ - uint32_t max_inline_data; - /* Registration limit and current count */ - uint64_t mem_reg_max, mem_reg_max_total, mem_reg_active; - /* Device is ready for use */ - bool ready_for_use; - /* Async event */ - opal_event_t async_event; -} 
mca_btl_openib_device_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_device_t); - -struct mca_btl_openib_module_pp_qp_t { - int32_t dummy; -}; typedef struct mca_btl_openib_module_pp_qp_t mca_btl_openib_module_pp_qp_t; - -struct mca_btl_openib_module_srq_qp_t { - struct ibv_srq *srq; - opal_atomic_int32_t rd_posted; - opal_atomic_int32_t sd_credits; /* the max number of outstanding sends on a QP when using SRQ */ - /* i.e. the number of frags that can be outstanding (down counter) */ - opal_list_t pending_frags[2]; /**< list of high/low prio frags */ - /** The number of receive buffers that can be post in the current time. - The value may be increased in the IBV_EVENT_SRQ_LIMIT_REACHED - event handler. The value starts from (rd_num / 4) and increased up to rd_num */ - int32_t rd_curr_num; - /** We post additional WQEs only if a number of WQEs (in specific SRQ) is less of this value. - The value increased together with rd_curr_num. The value is unique for every SRQ. */ - int32_t rd_low_local; - /** The flag points if we want to get the - IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ */ - bool srq_limit_event_flag; - /**< In difference of the "--mca enable_srq_resize" parameter that says, if we want(or no) - to start with small num of pre-posted receive buffers (rd_curr_num) and to increase this number by needs - (the max of this value is rd_num * the whole size of SRQ), the "srq_limit_event_flag" says if we want to get limit event - from device if the defined srq limit was reached (signal to the main thread) and we put off this flag if the rd_curr_num - was increased up to rd_num. - In order to prevent lock/unlock operation in the critical path we prefer only put-on - the srq_limit_event_flag in asynchronous thread, because in this way we post receive buffers - in the main thread only and only after posting we set (if srq_limit_event_flag is true) - the limit for IBV_EVENT_SRQ_LIMIT_REACHED event. 
*/ -}; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t; - -struct mca_btl_openib_module_qp_t { - union { - mca_btl_openib_module_pp_qp_t pp_qp; - mca_btl_openib_module_srq_qp_t srq_qp; - } u; -}; typedef struct mca_btl_openib_module_qp_t mca_btl_openib_module_qp_t; - -/** - * IB BTL Interface - */ -struct mca_btl_openib_module_t { - /* Base BTL module */ - mca_btl_base_module_t super; - - bool btl_inited; - bool srqs_created; - - /** Common information about all ports */ - mca_btl_openib_modex_message_t port_info; - - /** Array of CPCs on this port */ - opal_btl_openib_connect_base_module_t **cpcs; - - /** Number of elements in the cpcs array */ - uint8_t num_cpcs; - - mca_btl_openib_device_t *device; - uint8_t port_num; /**< ID of the PORT */ - uint16_t pkey_index; - struct ibv_port_attr ib_port_attr; - uint16_t lid; /**< lid that is actually used (for LMC) */ - int apm_port; /**< Alternative port that may be used for APM */ - uint8_t src_path_bits; /**< offset from base lid (for LMC) */ - - opal_atomic_int32_t num_peers; - - opal_mutex_t ib_lock; /**< module level lock */ - - size_t eager_rdma_frag_size; /**< length of eager frag */ - opal_atomic_int32_t eager_rdma_channels; /**< number of open RDMA channels */ - - mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */ - - mca_btl_openib_module_qp_t * qps; - - int local_procs; /** number of local procs */ - - bool atomic_ops_be; /** atomic result is big endian */ - - bool allowed; /** is this port allowed */ -}; -typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; - -extern mca_btl_openib_module_t mca_btl_openib_module; - -struct mca_btl_base_registration_handle_t { - uint32_t rkey; - uint32_t lkey; -}; - -struct mca_btl_openib_reg_t { - mca_rcache_base_registration_t base; - struct ibv_mr *mr; - mca_btl_base_registration_handle_t btl_handle; -}; -typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t; - -#if OPAL_ENABLE_PROGRESS_THREADS == 1 -extern void* 
mca_btl_openib_progress_thread(opal_object_t*); -#endif - - -/** - * Register a callback function that is called on error.. - * - * @param btl (IN) BTL module - * @return Status indicating if cleanup was successful - */ - -int mca_btl_openib_register_error_cb( - struct mca_btl_base_module_t* btl, - mca_btl_base_module_error_cb_fn_t cbfunc -); - - -/** - * Cleanup any resources held by the BTL. - * - * @param btl BTL instance. - * @return OPAL_SUCCESS or error status on failure. - */ - -extern int mca_btl_openib_finalize( - struct mca_btl_base_module_t* btl -); - - -/** - * PML->BTL notification of change in the process list. - * - * @param btl (IN) BTL module - * @param nprocs (IN) Number of processes - * @param procs (IN) Set of processes - * @param peers (OUT) Set of (optional) peer addressing info. - * @param reachable (IN/OUT) Set of processes that are reachable via this BTL. - * @return OPAL_SUCCESS or error status on failure. - * - */ - -extern int mca_btl_openib_add_procs( - struct mca_btl_base_module_t* btl, - size_t nprocs, - struct opal_proc_t **procs, - struct mca_btl_base_endpoint_t** peers, - opal_bitmap_t* reachable -); - -/** - * PML->BTL notification of change in the process list. - * - * @param btl (IN) BTL instance - * @param nproc (IN) Number of processes. - * @param procs (IN) Set of processes. - * @param peers (IN) Set of peer data structures. - * @return Status indicating if cleanup was successful - * - */ -extern int mca_btl_openib_del_procs( - struct mca_btl_base_module_t* btl, - size_t nprocs, - struct opal_proc_t **procs, - struct mca_btl_base_endpoint_t** peers -); - - -/** - * PML->BTL Initiate a send of the specified size. - * - * @param btl (IN) BTL instance - * @param btl_peer (IN) BTL peer addressing - * @param descriptor (IN) Descriptor of data to be transmitted. - * @param tag (IN) Tag. 
- */ -extern int mca_btl_openib_send( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* descriptor, - mca_btl_base_tag_t tag -); - -/** - * PML->BTL Initiate a immediate send of the specified size. - * - * @param btl (IN) BTL instance - * @param ep (IN) Endpoint - * @param convertor (IN) Datatypes converter - * @param header (IN) PML header - * @param header_size (IN) PML header size - * @param payload_size (IN) Payload size - * @param order (IN) Order - * @param flags (IN) Flags - * @param tag (IN) Tag - * @param descriptor (OUT) Messages descriptor - */ -extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct opal_convertor_t* convertor, - void* header, - size_t header_size, - size_t payload_size, - uint8_t order, - uint32_t flags, - mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t** descriptor -); - -/* forward decaration for internal put/get */ -struct mca_btl_openib_put_frag_t; -struct mca_btl_openib_get_frag_t; - -/** - * @brief Schedule a put fragment with the HCA (internal) - * - * @param btl (IN) BTL instance - * @param ep (IN) BTL endpoint - * @param frag (IN) Fragment prepared by mca_btl_openib_put - * - * If the fragment can not be scheduled due to resource limitations then - * the fragment will be put on the pending put fragment list and retried - * when another get/put fragment has completed. 
- */ -int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - struct mca_btl_openib_put_frag_t *frag); - -/** - * @brief Schedule an RDMA write with the HCA - * - * @param btl (IN) BTL instance - * @param ep (IN) BTL endpoint - * @param local_address (IN) Source address - * @param remote_address (IN) Destination address - * @param local_handle (IN) Registration handle for region containing the region {local_address, size} - * @param remote_handle (IN) Registration handle for region containing the region {remote_address, size} - * @param size (IN) Number of bytes to write - * @param flags (IN) Transfer flags - * @param order (IN) Ordering - * @param cbfunc (IN) Function to call on completion - * @param cbcontext (IN) Context for completion callback - * @param cbdata (IN) Data for completion callback - * - * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed - * @return OPAL_SUCCCESS if the operation was successfully scheduled - * - * This function will attempt to schedule a put operation with the HCA. - */ -int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); - -/** - * @brief Schedule a get fragment with the HCA (internal) - * - * @param btl (IN) BTL instance - * @param ep (IN) BTL endpoint - * @param qp (IN) ID of queue pair to schedule the get on - * @param frag (IN) Fragment prepared by mca_btl_openib_get - * - * If the fragment can not be scheduled due to resource limitations then - * the fragment will be put on the pending get fragment list and retried - * when another get/put fragment has completed. 
- */ -int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - struct mca_btl_openib_get_frag_t *frag); - -/** - * @brief Schedule an RDMA read with the HCA - * - * @param btl (IN) BTL instance - * @param ep (IN) BTL endpoint - * @param local_address (IN) Destination address - * @param remote_address (IN) Source address - * @param local_handle (IN) Registration handle for region containing the region {local_address, size} - * @param remote_handle (IN) Registration handle for region containing the region {remote_address, size} - * @param size (IN) Number of bytes to read - * @param flags (IN) Transfer flags - * @param order (IN) Ordering - * @param cbfunc (IN) Function to call on completion - * @param cbcontext (IN) Context for completion callback - * @param cbdata (IN) Data for completion callback - * - * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed - * @return OPAL_SUCCCESS if the operation was successfully scheduled - * - * This function will attempt to schedule a get operation with the HCA. - */ -int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); - -/** - * Initiate an asynchronous fetching atomic operation. - * Completion Semantics: if this function returns a 1 then the operation - * is complete. a return of OPAL_SUCCESS indicates - * the atomic operation has been queued with the - * network. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param local_address (OUT) Local address to store the result in - * @param remote_address (IN) Remote address perfom operation on to (registered remotely) - * @param local_handle (IN) Local registration handle for region containing - * (local_address, local_address + 8) - * @param remote_handle (IN) Remote registration handle for region containing - * (remote_address, remote_address + 8) - * @param op (IN) Operation to perform - * @param operand (IN) Operand for the operation - * @param flags (IN) Flags for this put operation - * @param order (IN) Ordering - * @param cbfunc (IN) Function to call on completion (if queued) - * @param cbcontext (IN) Context for the callback - * @param cbdata (IN) Data for callback - * - * @retval OPAL_SUCCESS The operation was successfully queued - * @retval 1 The operation is complete - * @retval OPAL_ERROR The operation was NOT successfully queued - * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic - * operation. Try again later - * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to - * alignment restrictions or the operation {op} is not supported - * by the hardware. - * - * After the operation is complete the remote address specified by {remote_address} and - * {remote_handle} will be updated with (*remote_address) = (*remote_address) op operand. - * {local_address} will be updated with the previous value stored in {remote_address}. - * The btl will guarantee consistency of atomic operations performed via the btl. Note, - * however, that not all btls will provide consistency between btl atomic operations and - * cpu atomics. 
- */ -int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, - uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata); - -/** - * Initiate an asynchronous compare and swap operation. - * Completion Semantics: if this function returns a 1 then the operation - * is complete. a return of OPAL_SUCCESS indicates - * the atomic operation has been queued with the - * network. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param local_address (OUT) Local address to store the result in - * @param remote_address (IN) Remote address perfom operation on to (registered remotely) - * @param local_handle (IN) Local registration handle for region containing - * (local_address, local_address + 8) - * @param remote_handle (IN) Remote registration handle for region containing - * (remote_address, remote_address + 8) - * @param compare (IN) Operand for the operation - * @param value (IN) Value to store on success - * @param flags (IN) Flags for this put operation - * @param order (IN) Ordering - * @param cbfunc (IN) Function to call on completion (if queued) - * @param cbcontext (IN) Context for the callback - * @param cbdata (IN) Data for callback - * - * @retval OPAL_SUCCESS The operation was successfully queued - * @retval 1 The operation is complete - * @retval OPAL_ERROR The operation was NOT successfully queued - * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the atomic - * operation. Try again later - * @retval OPAL_ERR_NOT_AVAILABLE Atomic operation can not be performed due to - * alignment restrictions or the operation {op} is not supported - * by the hardware. 
- * - * After the operation is complete the remote address specified by {remote_address} and - * {remote_handle} will be updated with {value} if *remote_address == compare. - * {local_address} will be updated with the previous value stored in {remote_address}. - * The btl will guarantee consistency of atomic operations performed via the btl. Note, - * however, that not all btls will provide consistency between btl atomic operations and - * cpu atomics. - */ -int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, - uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata); - -/** - * Allocate a descriptor. - * - * @param btl (IN) BTL module - * @param size (IN) Requested descriptor size. - */ -extern mca_btl_base_descriptor_t* mca_btl_openib_alloc( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - uint8_t order, - size_t size, - uint32_t flags); - - -/** - * Return a segment allocated by this BTL. - * - * @param btl (IN) BTL module - * @param descriptor (IN) Allocated descriptor. - */ -extern int mca_btl_openib_free( - struct mca_btl_base_module_t* btl, - mca_btl_base_descriptor_t* des); - - -/** - * Pack data and return a descriptor that can be - * used for send/put. 
- * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - */ -mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags - ); - -extern void mca_btl_openib_frag_progress_pending_put_get( - struct mca_btl_base_endpoint_t*, const int); - -/** - * Fault Tolerance Event Notification Function - * - * @param state (IN) Checkpoint State - * @return OPAL_SUCCESS or failure status - */ -extern int mca_btl_openib_ft_event(int state); - - -/** - * Show an error during init, particularly when running out of - * registered memory. - */ -void mca_btl_openib_show_init_error(const char *file, int line, - const char *func, const char *dev); -/** - * Post to Shared Receive Queue with certain priority - * - * @param openib_btl (IN) BTL module - * @param additional (IN) Additional Bytes to reserve - * @param prio (IN) Priority (either BTL_OPENIB_HP_QP or BTL_OPENIB_LP_QP) - * @return OPAL_SUCCESS or failure status - */ - -int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp); - -/** - * Get a transport name of btl by its transport type. - */ - -const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type); - -/** - * Get an endpoint for a process - * - * @param btl (IN) BTL module - * @param proc (IN) opal process object - * - * This function will return an existing endpoint if one exists otherwise it will allocate - * a new endpoint and return it. - */ -struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_module_t *btl, - struct opal_proc_t *proc); - -/** - * Get a transport type of btl. 
- */ - -mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl); - -static inline int qp_cq_prio(const int qp) -{ - if(0 == qp) - return BTL_OPENIB_HP_CQ; /* smallest qp is always HP */ - - /* If the size for this qp is <= the eager limit, make it a - high priority QP. Otherwise, make it a low priority QP. */ - return (mca_btl_openib_component.qp_infos[qp].size <= - mca_btl_openib_component.eager_limit) ? - BTL_OPENIB_HP_CQ : BTL_OPENIB_LP_CQ; -} - -#define BTL_OPENIB_RDMA_QP(QP) \ - ((QP) == mca_btl_openib_component.rdma_qp) - -/** - * Run function as part of opal_progress() - * - * @param[in] fn function to run - * @param[in] arg function data - */ -int mca_btl_openib_run_in_main (void *(*fn)(void *), void *arg); - - -END_C_DECLS - -#endif /* MCA_BTL_IB_H */ diff --git a/opal/mca/btl/openib/btl_openib_async.c b/opal/mca/btl/openib/btl_openib_async.c deleted file mode 100644 index 3957bae2a9..0000000000 --- a/opal/mca/btl/openib/btl_openib_async.c +++ /dev/null @@ -1,508 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved - * Copyright (c) 2013-2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "opal_config.h" - -#include -#include -#include -#include -#include - -#include "opal/util/show_help.h" -#include "opal/util/proc.h" - -#include "opal/mca/btl/base/base.h" -#include "btl_openib.h" -#include "btl_openib_mca.h" -#include "btl_openib_async.h" -#include "btl_openib_proc.h" -#include "btl_openib_endpoint.h" - -static opal_list_t ignore_qp_err_list; -static opal_mutex_t ignore_qp_err_list_lock; -static opal_atomic_int32_t btl_openib_async_device_count = 0; - -struct mca_btl_openib_async_poll { - int active_poll_size; - int poll_size; - struct pollfd *async_pollfd; -}; -typedef struct mca_btl_openib_async_poll mca_btl_openib_async_poll; - -typedef struct { - opal_list_item_t super; - struct ibv_qp *qp; -} mca_btl_openib_qp_list; - -OBJ_CLASS_INSTANCE(mca_btl_openib_qp_list, opal_list_item_t, NULL, NULL); - -static const char *openib_event_to_str (enum ibv_event_type event); - -/* Function converts event to string (name) - * Open Fabris don't have function that do this job :( - */ -static const char *openib_event_to_str (enum ibv_event_type event) -{ - switch (event) { - case IBV_EVENT_CQ_ERR: - return "IBV_EVENT_CQ_ERR"; - case IBV_EVENT_QP_FATAL: - return "IBV_EVENT_QP_FATAL"; - case IBV_EVENT_QP_REQ_ERR: - return "IBV_EVENT_QP_REQ_ERR"; - case IBV_EVENT_QP_ACCESS_ERR: - return "IBV_EVENT_QP_ACCESS_ERR"; - case IBV_EVENT_PATH_MIG: - return "IBV_EVENT_PATH_MIG"; - case IBV_EVENT_PATH_MIG_ERR: - return "IBV_EVENT_PATH_MIG_ERR"; - case IBV_EVENT_DEVICE_FATAL: - return "IBV_EVENT_DEVICE_FATAL"; - case IBV_EVENT_SRQ_ERR: - return "IBV_EVENT_SRQ_ERR"; - case IBV_EVENT_PORT_ERR: - return "IBV_EVENT_PORT_ERR"; - case IBV_EVENT_COMM_EST: - return "IBV_EVENT_COMM_EST"; - case IBV_EVENT_PORT_ACTIVE: - return "IBV_EVENT_PORT_ACTIVE"; - case IBV_EVENT_SQ_DRAINED: - return "IBV_EVENT_SQ_DRAINED"; - case IBV_EVENT_LID_CHANGE: - return "IBV_EVENT_LID_CHANGE"; - case 
IBV_EVENT_PKEY_CHANGE: - return "IBV_EVENT_PKEY_CHANGE"; - case IBV_EVENT_SM_CHANGE: - return "IBV_EVENT_SM_CHANGE"; - case IBV_EVENT_QP_LAST_WQE_REACHED: - return "IBV_EVENT_QP_LAST_WQE_REACHED"; -#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER - case IBV_EVENT_CLIENT_REREGISTER: - return "IBV_EVENT_CLIENT_REREGISTER"; -#endif - case IBV_EVENT_SRQ_LIMIT_REACHED: - return "IBV_EVENT_SRQ_LIMIT_REACHED"; - default: - return "UNKNOWN"; - } -} -/* QP to endpoint */ -static mca_btl_openib_endpoint_t * qp2endpoint(struct ibv_qp *qp, mca_btl_openib_device_t *device) -{ - mca_btl_openib_endpoint_t *ep; - int ep_i, qp_i; - for(ep_i = 0; ep_i < opal_pointer_array_get_size(device->endpoints); ep_i++) { - ep = opal_pointer_array_get_item(device->endpoints, ep_i); - for(qp_i = 0; qp_i < mca_btl_openib_component.num_qps; qp_i++) { - if (qp == ep->qps[qp_i].qp->lcl_qp) - return ep; - } - } - return NULL; -} - -#if OPAL_HAVE_CONNECTX_XRC -/* XRC recive QP to endpoint */ -static mca_btl_openib_endpoint_t * xrc_qp2endpoint(uint32_t qp_num, mca_btl_openib_device_t *device) -{ - mca_btl_openib_endpoint_t *ep; - int ep_i; - for(ep_i = 0; ep_i < opal_pointer_array_get_size(device->endpoints); ep_i++) { - ep = opal_pointer_array_get_item(device->endpoints, ep_i); - if (qp_num == ep->xrc_recv_qp_num) - return ep; - } - return NULL; -} -#endif - -/* Function inits mca_btl_openib_async_poll */ - -/* The main idea of resizing SRQ algorithm - - We create a SRQ with size = rd_num, but for efficient usage of resources - the number of WQEs that we post = rd_curr_num < rd_num and this value is - increased (by needs) in IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. 
in this function), - the event will thrown by device if number of WQEs in SRQ will be less than srq_limit */ -static int btl_openib_async_srq_limit_event(struct ibv_srq* srq) -{ - int qp, rc = OPAL_SUCCESS; - mca_btl_openib_module_t *openib_btl = NULL; - - opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock; - opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table; - - opal_mutex_lock(lock); - - if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(srq_addr_table, - &srq, sizeof(struct ibv_srq*), (void*) &openib_btl)) { - /* If there isn't any element with the key in the table => - we assume that SRQ was destroyed and don't serve the event */ - goto srq_limit_event_exit; - } - - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - if (!BTL_OPENIB_QP_TYPE_PP(qp)) { - if(openib_btl->qps[qp].u.srq_qp.srq == srq) { - break; - } - } - } - - if(qp >= mca_btl_openib_component.num_qps) { - BTL_ERROR(("Open MPI tried to access a shared receive queue (SRQ) on the device %s that was not found. This should not happen, and is a fatal error. 
Your MPI job will now abort.\n", ibv_get_device_name(openib_btl->device->ib_dev))); - rc = OPAL_ERROR; - goto srq_limit_event_exit; - } - - /* dynamically re-size the SRQ to be larger */ - openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1; - - if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >= - mca_btl_openib_component.qp_infos[qp].rd_num) { - openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].rd_num; - openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low; - - openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false; - - goto srq_limit_event_exit; - } - - openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1; - openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true; - -srq_limit_event_exit: - opal_mutex_unlock(lock); - return rc; -} - -/* Function handle async device events */ -static void btl_openib_async_device (int fd, short flags, void *arg) -{ - mca_btl_openib_device_t *device = (mca_btl_openib_device_t *) arg; - struct ibv_async_event event; - int event_type; - - if (ibv_get_async_event((struct ibv_context *)device->ib_dev_context,&event) < 0) { - if (EWOULDBLOCK != errno) { - BTL_ERROR(("Failed to get async event")); - } - - return; - } - - event_type = event.event_type; -#if OPAL_HAVE_CONNECTX_XRC - /* is it XRC event ?*/ - bool xrc_event = false; - if (IBV_XRC_QP_EVENT_FLAG & event.event_type) { - xrc_event = true; - /* Clean the bitnd handel as usual */ - event_type ^= IBV_XRC_QP_EVENT_FLAG; - } -#endif - switch(event_type) { - case IBV_EVENT_PATH_MIG: - BTL_ERROR(("Alternative path migration event reported")); - if (APM_ENABLED) { - BTL_ERROR(("Trying to find additional path...")); -#if OPAL_HAVE_CONNECTX_XRC - if (xrc_event) - mca_btl_openib_load_apm_xrc_rcv(event.element.xrc_qp_num, - xrc_qp2endpoint(event.element.xrc_qp_num, device)); - else -#endif - mca_btl_openib_load_apm(event.element.qp, - qp2endpoint(event.element.qp, device)); - } - break; - case IBV_EVENT_DEVICE_FATAL: - /* Set the flag 
to fatal */ - device->got_fatal_event = true; - /* It is not critical to protect the counter */ - OPAL_THREAD_ADD_FETCH32(&mca_btl_openib_component.error_counter, 1); - /* fall through */ - case IBV_EVENT_CQ_ERR: - case IBV_EVENT_QP_FATAL: - if (event_type == IBV_EVENT_QP_FATAL) { - mca_btl_openib_qp_list *qp_item; - bool in_ignore_list = false; - - BTL_VERBOSE(("QP is in err state %p", (void *)event.element.qp)); - - /* look through ignore list */ - opal_mutex_lock (&ignore_qp_err_list_lock); - OPAL_LIST_FOREACH(qp_item, &ignore_qp_err_list, mca_btl_openib_qp_list) { - if (qp_item->qp == event.element.qp) { - BTL_VERBOSE(("QP %p is in error ignore list", - (void *)event.element.qp)); - in_ignore_list = true; - break; - } - } - opal_mutex_unlock (&ignore_qp_err_list_lock); - - if (in_ignore_list) { - break; - } - } - /* fall through */ - case IBV_EVENT_QP_REQ_ERR: - case IBV_EVENT_QP_ACCESS_ERR: - case IBV_EVENT_PATH_MIG_ERR: - case IBV_EVENT_SRQ_ERR: - opal_show_help("help-mpi-btl-openib.txt", "of error event", - true,opal_process_info.nodename, (int)getpid(), - event_type, - openib_event_to_str((enum ibv_event_type)event_type)); - break; - case IBV_EVENT_PORT_ERR: - opal_show_help("help-mpi-btl-openib.txt", "of error event", - true,opal_process_info.nodename, (int)getpid(), - event_type, - openib_event_to_str((enum ibv_event_type)event_type)); - /* Set the flag to indicate port error */ - device->got_port_event = true; - OPAL_THREAD_ADD_FETCH32(&mca_btl_openib_component.error_counter, 1); - break; - case IBV_EVENT_COMM_EST: - case IBV_EVENT_PORT_ACTIVE: - case IBV_EVENT_SQ_DRAINED: - case IBV_EVENT_LID_CHANGE: - case IBV_EVENT_PKEY_CHANGE: - case IBV_EVENT_SM_CHANGE: - case IBV_EVENT_QP_LAST_WQE_REACHED: -#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER - case IBV_EVENT_CLIENT_REREGISTER: -#endif - break; - /* The event is signaled when number of prepost receive WQEs is going - under predefined threshold - srq_limit */ - case IBV_EVENT_SRQ_LIMIT_REACHED: - (void) 
btl_openib_async_srq_limit_event (event.element.srq); - - break; - default: - opal_show_help("help-mpi-btl-openib.txt", "of unknown event", - true,opal_process_info.nodename, (int)getpid(), - event_type); - } - - ibv_ack_async_event(&event); -} - -static void apm_update_attr(struct ibv_qp_attr *attr, enum ibv_qp_attr_mask *mask) -{ - *mask = IBV_QP_ALT_PATH|IBV_QP_PATH_MIG_STATE; - attr->alt_ah_attr.dlid = attr->ah_attr.dlid + 1; - attr->alt_ah_attr.src_path_bits = attr->ah_attr.src_path_bits + 1; - attr->alt_ah_attr.static_rate = attr->ah_attr.static_rate; - attr->alt_ah_attr.sl = attr->ah_attr.sl; - attr->alt_pkey_index = attr->pkey_index; - attr->alt_port_num = attr->port_num; - attr->alt_timeout = attr->timeout; - attr->path_mig_state = IBV_MIG_REARM; - BTL_VERBOSE(("New APM LMC loaded: alt_src_port:%d, dlid: %d, src_bits %d, old_src_bits: %d, old_dlid %d", - attr->alt_port_num, attr->alt_ah_attr.dlid, - attr->alt_ah_attr.src_path_bits, attr->ah_attr.src_path_bits, attr->ah_attr.dlid)); -} - -static int apm_update_port(mca_btl_openib_endpoint_t *ep, - struct ibv_qp_attr *attr, enum ibv_qp_attr_mask *mask) -{ - size_t port_i; - uint16_t apm_lid = 0; - - if (attr->port_num == ep->endpoint_btl->apm_port) { - /* all ports were used */ - BTL_ERROR(("APM: already all ports were used port_num %d apm_port %d", - attr->port_num, ep->endpoint_btl->apm_port)); - return OPAL_ERROR; - } - /* looking for alternatve lid on remote site */ - for(port_i = 0; port_i < ep->endpoint_proc->proc_port_count; port_i++) { - if (ep->endpoint_proc->proc_ports[port_i].pm_port_info.lid == attr->ah_attr.dlid - mca_btl_openib_component.apm_lmc) { - apm_lid = ep->endpoint_proc->proc_ports[port_i].pm_port_info.apm_lid; - } - } - if (0 == apm_lid) { - /* APM was disabled on one of site ? */ - BTL_VERBOSE(("APM: Was disabled ? 
dlid %d %d %d", attr->ah_attr.dlid, attr->ah_attr.src_path_bits, ep->endpoint_btl->src_path_bits)); - return OPAL_ERROR; - } - /* We guess cthat the LMC is the same on all ports */ - attr->alt_ah_attr.static_rate = attr->ah_attr.static_rate; - attr->alt_ah_attr.sl = attr->ah_attr.sl; - attr->alt_pkey_index = attr->pkey_index; - attr->alt_timeout = attr->timeout; - attr->path_mig_state = IBV_MIG_REARM; - *mask = IBV_QP_ALT_PATH|IBV_QP_PATH_MIG_STATE; - - attr->alt_port_num = ep->endpoint_btl->apm_port; - attr->alt_ah_attr.src_path_bits = ep->endpoint_btl->src_path_bits; - attr->alt_ah_attr.dlid = apm_lid; - - BTL_VERBOSE(("New APM port loaded: alt_src_port:%d, dlid: %d, src_bits: %d:%d, old_dlid %d", - attr->alt_port_num, attr->alt_ah_attr.dlid, - attr->ah_attr.src_path_bits, attr->alt_ah_attr.src_path_bits, - attr->ah_attr.dlid)); - return OPAL_SUCCESS; -} - -/* Load new dlid to the QP */ -void mca_btl_openib_load_apm(struct ibv_qp *qp, mca_btl_openib_endpoint_t *ep) -{ - struct ibv_qp_init_attr qp_init_attr; - struct ibv_qp_attr attr; - enum ibv_qp_attr_mask mask = 0; - struct mca_btl_openib_module_t *btl; - - BTL_VERBOSE(("APM: Loading alternative path")); - assert (NULL != ep); - btl = ep->endpoint_btl; - - if (ibv_query_qp(qp, &attr, mask, &qp_init_attr)) - BTL_ERROR(("Failed to ibv_query_qp, qp num: %d", qp->qp_num)); - - if (mca_btl_openib_component.apm_lmc && - attr.ah_attr.src_path_bits - btl->src_path_bits < mca_btl_openib_component.apm_lmc) { - BTL_VERBOSE(("APM LMC: src: %d btl_src: %d lmc_max: %d", - attr.ah_attr.src_path_bits, - btl->src_path_bits, - mca_btl_openib_component.apm_lmc)); - apm_update_attr(&attr, &mask); - } else { - if (mca_btl_openib_component.apm_ports) { - /* Try to migrate to next port */ - if (OPAL_SUCCESS != apm_update_port(ep, &attr, &mask)) - return; - } else { - BTL_ERROR(("Failed to load alternative path, all %d were used", - attr.ah_attr.src_path_bits - btl->src_path_bits)); - } - } - - if (ibv_modify_qp(qp, &attr, mask)) - 
BTL_ERROR(("Failed to ibv_query_qp, qp num: %d, errno says: %s (%d)", - qp->qp_num, strerror(errno), errno)); -} - -#if OPAL_HAVE_CONNECTX_XRC -void mca_btl_openib_load_apm_xrc_rcv(uint32_t qp_num, mca_btl_openib_endpoint_t *ep) -{ - struct ibv_qp_init_attr qp_init_attr; - struct ibv_qp_attr attr; - enum ibv_qp_attr_mask mask = 0; - struct mca_btl_openib_module_t *btl; - - BTL_VERBOSE(("APM XRC: Loading alternative path")); - assert (NULL != ep); - btl = ep->endpoint_btl; - - if (ibv_query_xrc_rcv_qp(btl->device->xrc_domain, qp_num, &attr, mask, &qp_init_attr)) - BTL_ERROR(("Failed to ibv_query_qp, qp num: %d", qp_num)); - - if (mca_btl_openib_component.apm_lmc && - attr.ah_attr.src_path_bits - btl->src_path_bits < mca_btl_openib_component.apm_lmc) { - apm_update_attr(&attr, &mask); - } else { - if (mca_btl_openib_component.apm_ports) { - /* Try to migrate to next port */ - if (OPAL_SUCCESS != apm_update_port(ep, &attr, &mask)) - return; - } else { - BTL_ERROR(("Failed to load alternative path, all %d were used", - attr.ah_attr.src_path_bits - btl->src_path_bits)); - } - } - - ibv_modify_xrc_rcv_qp(btl->device->xrc_domain, qp_num, &attr, mask); - - /* Maybe the qp already was modified by other process - ignoring error */ -} -#endif - -int mca_btl_openib_async_init (void) -{ - if (!mca_btl_openib_component.use_async_event_thread || - mca_btl_openib_component.async_evbase) { - return OPAL_SUCCESS; - } - - mca_btl_openib_component.async_evbase = opal_progress_thread_init (NULL); - - OBJ_CONSTRUCT(&ignore_qp_err_list, opal_list_t); - OBJ_CONSTRUCT(&ignore_qp_err_list_lock, opal_mutex_t); - - /* Set the error counter to zero */ - mca_btl_openib_component.error_counter = 0; - - return OPAL_SUCCESS; -} - -void mca_btl_openib_async_fini (void) -{ - if (mca_btl_openib_component.async_evbase) { - OPAL_LIST_DESTRUCT(&ignore_qp_err_list); - OBJ_DESTRUCT(&ignore_qp_err_list_lock); - opal_progress_thread_finalize (NULL); - mca_btl_openib_component.async_evbase = NULL; - } -} - 
-void mca_btl_openib_async_add_device (mca_btl_openib_device_t *device) -{ - if (mca_btl_openib_component.async_evbase) { - if (1 == OPAL_THREAD_ADD_FETCH32 (&btl_openib_async_device_count, 1)) { - mca_btl_openib_async_init (); - } - opal_event_set (mca_btl_openib_component.async_evbase, &device->async_event, - device->ib_dev_context->async_fd, OPAL_EV_READ | OPAL_EV_PERSIST, - btl_openib_async_device, device); - opal_event_add (&device->async_event, 0); - } -} - -void mca_btl_openib_async_rem_device (mca_btl_openib_device_t *device) -{ - if (mca_btl_openib_component.async_evbase) { - opal_event_del (&device->async_event); - if (0 == OPAL_THREAD_ADD_FETCH32 (&btl_openib_async_device_count, -1)) { - mca_btl_openib_async_fini (); - } - } -} - -void mca_btl_openib_async_add_qp_ignore (struct ibv_qp *qp) -{ - if (mca_btl_openib_component.async_evbase) { - mca_btl_openib_qp_list *new_qp = OBJ_NEW(mca_btl_openib_qp_list); - if (OPAL_UNLIKELY(NULL == new_qp)) { - /* can allocate a small object. not much more can be done */ - return; - } - - BTL_VERBOSE(("Ignoring errors on QP %p", (void *) qp)); - new_qp->qp = qp; - opal_mutex_lock (&ignore_qp_err_list_lock); - opal_list_append (&ignore_qp_err_list, (opal_list_item_t *) new_qp); - opal_mutex_unlock (&ignore_qp_err_list_lock); - } -} diff --git a/opal/mca/btl/openib/btl_openib_async.h b/opal/mca/btl/openib/btl_openib_async.h deleted file mode 100644 index b62fdbec3f..0000000000 --- a/opal/mca/btl/openib/btl_openib_async.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * received. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * @file - */ - -#ifndef MCA_BTL_OPENIB_ASYNC_H -#define MCA_BTL_OPENIB_ASYNC_H -#include "btl_openib_endpoint.h" - -void mca_btl_openib_load_apm(struct ibv_qp *qp, mca_btl_openib_endpoint_t *ep); -#if OPAL_HAVE_CONNECTX_XRC -void mca_btl_openib_load_apm_xrc_rcv(uint32_t qp_num, mca_btl_openib_endpoint_t *ep); -#endif - -#define APM_ENABLED (0 != mca_btl_openib_component.apm_lmc || 0 != mca_btl_openib_component.apm_ports) - -/** - * Initialize the async event base - */ -int mca_btl_openib_async_init (void); - -/** - * Finalize the async event base - */ -void mca_btl_openib_async_fini (void); - -/** - * Register a device with the async event base - * - * @param[in] device device to register - */ -void mca_btl_openib_async_add_device (mca_btl_openib_device_t *device); - -/** - * Deregister a device with the async event base - * - * @param[in] device device to deregister - */ -void mca_btl_openib_async_rem_device (mca_btl_openib_device_t *device); - -/** - * Ignore error events on a queue pair - * - * @param[in] qp queue pair to ignore - */ -void mca_btl_openib_async_add_qp_ignore (struct ibv_qp *qp); - -#endif diff --git a/opal/mca/btl/openib/btl_openib_atomic.c b/opal/mca/btl/openib/btl_openib_atomic.c deleted file mode 100644 index ec0eb644f1..0000000000 --- a/opal/mca/btl/openib/btl_openib_atomic.c +++ /dev/null @@ -1,140 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_openib.h" -#include "btl_openib_endpoint.h" -#include "btl_openib_proc.h" -#include "btl_openib_xrc.h" - -#if HAVE_DECL_IBV_ATOMIC_HCA - -static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, enum ibv_wr_opcode opcode, - int64_t operand, int64_t operand2, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata) -{ - mca_btl_openib_get_frag_t* frag = NULL; - int qp = order; - int32_t rkey; - int rc; - - frag = to_get_frag(alloc_recv_user_frag()); - if (OPAL_UNLIKELY(NULL == frag)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (MCA_BTL_NO_ORDER == qp) { - qp = mca_btl_openib_component.rdma_qp; - } - - /* set base descriptor flags */ - to_base_frag(frag)->base.order = qp; - /* free this descriptor when the operation is complete */ - to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - - /* set up scatter-gather entry */ - to_com_frag(frag)->sg_entry.length = 8; - to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; - to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address; - to_com_frag(frag)->endpoint = endpoint; - - /* set up rdma callback */ - frag->cb.func = cbfunc; - frag->cb.context = cbcontext; - frag->cb.data = cbdata; - frag->cb.local_handle = local_handle; - - /* set up descriptor */ - frag->sr_desc.wr.atomic.remote_addr = remote_address; - frag->sr_desc.opcode = opcode; - frag->sr_desc.wr.atomic.compare_add = operand; - frag->sr_desc.wr.atomic.swap = operand2; - - rkey = remote_handle->rkey; - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != (opal_proc_local_get()->proc_arch & 
OPAL_ARCH_ISBIGENDIAN)) { - rkey = opal_swap_bytes4 (rkey); - } -#endif - - frag->sr_desc.wr.atomic.rkey = rkey; - - /* NTH: the SRQ# is set in mca_btl_get_internal */ - - if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) { - OPAL_THREAD_LOCK(&endpoint->endpoint_lock); - rc = check_endpoint_state(endpoint, &to_base_frag(frag)->base, &endpoint->pending_get_frags); - OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - if (OPAL_ERR_RESOURCE_BUSY == rc) { - return OPAL_SUCCESS; - } - - if (OPAL_SUCCESS != rc) { - MCA_BTL_IB_FRAG_RETURN (frag); - return rc; - } - } - - rc = mca_btl_openib_get_internal (btl, endpoint, frag); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { - rc = OPAL_SUCCESS; - - OPAL_THREAD_SCOPED_LOCK(&endpoint->endpoint_lock, - opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag)); - } else { - MCA_BTL_IB_FRAG_RETURN (frag); - } - } - - return rc; -} - -int mca_btl_openib_atomic_fop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, - uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata) -{ - - if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op || (MCA_BTL_ATOMIC_FLAG_32BIT & flags))) { - return OPAL_ERR_NOT_SUPPORTED; - } - - return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle, - remote_handle, IBV_WR_ATOMIC_FETCH_AND_ADD, operand, 0, - flags, order, cbfunc, cbcontext, cbdata); -} - -int mca_btl_openib_atomic_cswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, uint64_t 
compare, - uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata) -{ - if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_FLAG_32BIT & flags)) { - return OPAL_ERR_NOT_SUPPORTED; - } - - return mca_btl_openib_atomic_internal (btl, endpoint, local_address, remote_address, local_handle, - remote_handle, IBV_WR_ATOMIC_CMP_AND_SWP, compare, value, - flags, order, cbfunc, cbcontext, cbdata); -} - -#endif diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c deleted file mode 100644 index 587ace9ac7..0000000000 --- a/opal/mca/btl/openib/btl_openib_component.c +++ /dev/null @@ -1,4093 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2018 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2006-2015 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2018 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Bull SAS. 
All rights reserved. - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include -#include -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#include -#include -#include -#include - -#include "opal/mca/memory/memory.h" -#include "opal/mca/event/event.h" -#include "opal/align.h" -#include "opal/util/output.h" -#include "opal/util/argv.h" -#include "opal/mca/timer/base/base.h" -#include "opal/sys/atomic.h" -#include "opal/util/sys_limits.h" -#include "opal/util/argv.h" -#include "opal/memoryhooks/memory.h" -/* Define this before including hwloc.h so that we also get the hwloc - verbs helper header file, too. We have to do this level of - indirection because the hwloc subsystem is a component -- we don't - know its exact path. We have to rely on the framework header files - to find the right hwloc verbs helper file for us. */ -#define OPAL_HWLOC_WANT_VERBS_HELPER 1 -#include "opal/mca/hwloc/hwloc-internal.h" -#include "opal/mca/hwloc/base/base.h" -#include "opal/mca/installdirs/installdirs.h" -#include "opal_stdint.h" -#include "opal/util/show_help.h" -#include "opal/util/printf.h" -#include "opal/mca/btl/btl.h" -#include "opal/mca/btl/base/base.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/rcache/rcache.h" -#include "opal/mca/rcache/base/base.h" -#include "opal/mca/common/cuda/common_cuda.h" -#include "opal/mca/common/verbs/common_verbs.h" -#include "opal/runtime/opal_params.h" -#include "opal/runtime/opal.h" -#include "opal/mca/pmix/pmix.h" -#include "opal/util/proc.h" - -#include "btl_openib.h" -#include "btl_openib_frag.h" -#include "btl_openib_endpoint.h" -#include "btl_openib_eager_rdma.h" -#include "btl_openib_proc.h" -#include "btl_openib_ini.h" -#include "btl_openib_mca.h" -#include "btl_openib_xrc.h" -#include "btl_openib_async.h" -#include "connect/base.h" -#include "btl_openib_ip.h" - -#define 
EPS 1.e-6 -/* - * Local functions - */ -static int btl_openib_component_register(void); -static int btl_openib_component_open(void); -static int btl_openib_component_close(void); -static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool); -static int btl_openib_component_progress(void); -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ -static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl, - mca_btl_openib_endpoint_t *ep, - mca_btl_base_descriptor_t* des, - int status); -#endif /* OPAL_CUDA_SUPPORT */ -/* - * Local variables - */ -static mca_btl_openib_device_t *receive_queues_device = NULL; -static int num_devices_intentionally_ignored = 0; - -mca_btl_openib_component_t mca_btl_openib_component = { - .super = { - /* First, the mca_base_component_t struct containing meta information - about the component itself */ - - .btl_version = { - MCA_BTL_DEFAULT_VERSION("openib"), - .mca_open_component = btl_openib_component_open, - .mca_close_component = btl_openib_component_close, - .mca_register_component_params = btl_openib_component_register, - }, - .btl_data = { - /* The component is checkpoint ready */ - .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - - .btl_init = btl_openib_component_init, - .btl_progress = btl_openib_component_progress, - } -}; - -static int btl_openib_component_register(void) -{ - int ret; - - /* register IB component parameters */ - if (OPAL_SUCCESS != (ret = btl_openib_register_mca_params())) { - return ret; - } - - mca_btl_openib_component.max_send_size = - mca_btl_openib_module.super.btl_max_send_size; - mca_btl_openib_component.eager_limit = - mca_btl_openib_module.super.btl_eager_limit; - - /* if_include and if_exclude need to be mutually exclusive */ - if (OPAL_SUCCESS != - mca_base_var_check_exclusive("ompi", - mca_btl_openib_component.super.btl_version.mca_type_name, - mca_btl_openib_component.super.btl_version.mca_component_name, - "if_include", - 
mca_btl_openib_component.super.btl_version.mca_type_name, - mca_btl_openib_component.super.btl_version.mca_component_name, - "if_exclude")) { - /* Return ERR_NOT_AVAILABLE so that a warning message about - "open" failing is not printed */ - return OPAL_ERR_NOT_AVAILABLE; - } - -#if OPAL_CUDA_SUPPORT - mca_common_cuda_register_mca_variables(); -#endif - - return OPAL_SUCCESS; -} - -/* - * Called by MCA framework to open the component - */ -static int btl_openib_component_open(void) -{ - opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock; - opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table; - - /* Construct hash table that stores pointers to SRQs */ - OBJ_CONSTRUCT(lock, opal_mutex_t); - OBJ_CONSTRUCT(srq_addr_table, opal_hash_table_t); - - /* initialize state */ - mca_btl_openib_component.ib_num_btls = 0; - mca_btl_openib_component.num_default_gid_btls = 0; - mca_btl_openib_component.openib_btls = NULL; - OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t); - mca_btl_openib_component.devices_count = 0; - mca_btl_openib_component.cpc_explicitly_defined = false; - - /* initialize objects */ - OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t); - mca_btl_openib_component.memory_registration_verbose = -1; - -#if OPAL_CUDA_SUPPORT - mca_common_cuda_stage_one_init(); -#endif /* OPAL_CUDA_SUPPORT */ - - return OPAL_SUCCESS; -} - -/* - * component cleanup - sanity checking of queue lengths - */ - -static int btl_openib_component_close(void) -{ - int rc = OPAL_SUCCESS; - - /* remove the async event from the event base */ - mca_btl_openib_async_fini (); - - OBJ_DESTRUCT(&mca_btl_openib_component.srq_manager.lock); - OBJ_DESTRUCT(&mca_btl_openib_component.srq_manager.srq_addr_table); - - opal_btl_openib_connect_base_finalize(); - opal_btl_openib_ini_finalize(); - - if (NULL != mca_btl_openib_component.default_recv_qps) { - free(mca_btl_openib_component.default_recv_qps); - } - - /* close memory 
registration debugging output */ - opal_output_close (mca_btl_openib_component.memory_registration_verbose); - -#if OPAL_CUDA_SUPPORT - mca_common_cuda_fini(); -#endif /* OPAL_CUDA_SUPPORT */ - - return rc; -} - -static void inline pack8(char **dest, uint8_t value) -{ - /* Copy one character */ - **dest = (char) value; - /* Most the dest ahead one */ - ++*dest; -} - -/* - * Register local openib port information with the modex so that it - * can be shared with all other peers. - */ -static int btl_openib_modex_send(void) -{ - int rc, i, j; - int modex_message_size; - char *message, *offset; - size_t size, msg_size; - opal_btl_openib_connect_base_module_t *cpc; - - opal_output(-1, "Starting to modex send"); - if (0 == mca_btl_openib_component.ib_num_btls) { - return 0; - } - modex_message_size = offsetof(mca_btl_openib_modex_message_t, end); - - /* The message is packed into multiple parts: - * 1. a uint8_t indicating the number of modules (ports) in the message - * 2. for each module: - * a. the common module data - * b. a uint8_t indicating how many CPCs follow - * c. for each CPC: - * a. a uint8_t indicating the index of the CPC in the all[] - * array in btl_openib_connect_base.c - * b. a uint8_t indicating the priority of this CPC - * c. a uint8_t indicating the length of the blob to follow - * d. a blob that is only meaningful to that CPC - */ - msg_size = - /* uint8_t for number of modules in the message */ - 1 + - /* For each module: */ - mca_btl_openib_component.ib_allowed_btls * - ( - /* Common module data */ - modex_message_size + - /* uint8_t for how many CPCs follow */ - 1 - ); - /* For each module, add in the size of the per-CPC data */ - for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - if (! 
mca_btl_openib_component.openib_btls[i]->allowed) { - continue; - } - for (j = 0; - j < mca_btl_openib_component.openib_btls[i]->num_cpcs; - ++j) { - msg_size += - /* uint8_t for the index of the CPC */ - 1 + - /* uint8_t for the CPC's priority */ - 1 + - /* uint8_t for the blob length */ - 1 + - /* blob length */ - mca_btl_openib_component.openib_btls[i]->cpcs[j]->data.cbm_modex_message_len; - } - } - message = (char *) malloc(msg_size); - if (NULL == message) { - BTL_ERROR(("Failed malloc")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Pack the number of modules */ - offset = message; - pack8(&offset, mca_btl_openib_component.ib_allowed_btls); - opal_output(-1, "modex sending %d btls (packed: %d, offset now at %d)", mca_btl_openib_component.ib_allowed_btls, *((uint8_t*) message), (int) (offset - message)); - - /* Pack each of the modules */ - for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - - if (! mca_btl_openib_component.openib_btls[i]->allowed) { - continue; - } - /* Pack the modex common message struct. 
*/ - size = modex_message_size; - - (mca_btl_openib_component.openib_btls[i]->port_info).vendor_id = - (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_id; - - (mca_btl_openib_component.openib_btls[i]->port_info).vendor_part_id = - (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_part_id; - - (mca_btl_openib_component.openib_btls[i]->port_info).transport_type = - mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]); - - memcpy(offset, - &(mca_btl_openib_component.openib_btls[i]->port_info), - size); - opal_output(-1, "modex packed btl port modex message: 0x%" PRIx64 ", %d, %d (size: %d)", - mca_btl_openib_component.openib_btls[i]->port_info.subnet_id, - mca_btl_openib_component.openib_btls[i]->port_info.mtu, - mca_btl_openib_component.openib_btls[i]->port_info.lid, - (int) size); - -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - MCA_BTL_OPENIB_MODEX_MSG_HTON(*(mca_btl_openib_modex_message_t *)offset); -#endif - offset += size; - opal_output(-1, "modex packed btl %d: modex message, offset now %d", - i, (int) (offset -message)); - - /* Pack the number of CPCs that follow */ - pack8(&offset, - mca_btl_openib_component.openib_btls[i]->num_cpcs); - opal_output(-1, "modex packed btl %d: to pack %d cpcs (packed: %d, offset now %d)", - i, mca_btl_openib_component.openib_btls[i]->num_cpcs, - *((uint8_t*) (offset - 1)), (int) (offset-message)); - - /* Pack each CPC */ - for (j = 0; - j < mca_btl_openib_component.openib_btls[i]->num_cpcs; - ++j) { - uint8_t u8; - - cpc = mca_btl_openib_component.openib_btls[i]->cpcs[j]; - opal_output(-1, "modex packed btl %d: packing cpc %s", - i, cpc->data.cbm_component->cbc_name); - /* Pack the CPC index */ - u8 = opal_btl_openib_connect_base_get_cpc_index(cpc->data.cbm_component); - pack8(&offset, u8); - opal_output(-1, "packing btl %d: cpc %d: index %d (packed %d, offset now %d)", - i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message)); - /* Pack 
the CPC priority */ - pack8(&offset, cpc->data.cbm_priority); - opal_output(-1, "packing btl %d: cpc %d: priority %d (packed %d, offset now %d)", - i, j, cpc->data.cbm_priority, *((uint8_t*) (offset-1)), (int)(offset-message)); - /* Pack the blob length */ - u8 = cpc->data.cbm_modex_message_len; - pack8(&offset, u8); - opal_output(-1, "packing btl %d: cpc %d: message len %d (packed %d, offset now %d)", - i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message)); - /* If the blob length is > 0, pack the blob */ - if (u8 > 0) { - memcpy(offset, cpc->data.cbm_modex_message, u8); - offset += u8; - opal_output(-1, "packing btl %d: cpc %d: blob packed %d %x (offset now %d)", - i, j, - ((uint32_t*)cpc->data.cbm_modex_message)[0], - ((uint32_t*)cpc->data.cbm_modex_message)[1], - (int)(offset-message)); - } - - /* Sanity check */ - assert((size_t) (offset - message) <= msg_size); - } - } - - /* All done -- send it! */ - OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, - &mca_btl_openib_component.super.btl_version, - message, msg_size); - free(message); - opal_output(-1, "Modex sent! %d calculated, %d actual\n", (int) msg_size, (int) (offset - message)); - - return rc; -} - -/* - * Active Message Callback function on control message. 
- */ - -static void btl_openib_control(mca_btl_base_module_t* btl, - mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, - void* cbdata) -{ - /* don't return credits used for control messages */ - mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; - mca_btl_openib_endpoint_t* ep = to_com_frag(des)->endpoint; - mca_btl_openib_control_header_t *ctl_hdr = - (mca_btl_openib_control_header_t *) to_base_frag(des)->segment.seg_addr.pval; - mca_btl_openib_eager_rdma_header_t *rdma_hdr; - mca_btl_openib_header_coalesced_t *clsc_hdr = - (mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1); - mca_btl_active_message_callback_t* reg; - size_t len = des->des_segments->seg_len - sizeof(*ctl_hdr); - - switch (ctl_hdr->type) { - case MCA_BTL_OPENIB_CONTROL_CREDITS: - assert(0); /* Credit message is handled elsewhere */ - break; - case MCA_BTL_OPENIB_CONTROL_RDMA: - rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr; - - BTL_VERBOSE(("prior to NTOH received rkey %" PRIu32 - ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32, - rdma_hdr->rkey, - rdma_hdr->rdma_start.lval, - rdma_hdr->rdma_start.pval, - rdma_hdr->rdma_start.ival - )); - - if(ep->nbo) { - BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(*rdma_hdr); - } - - BTL_VERBOSE(("received rkey %" PRIu32 - ", rdma_start.lval %" PRIx64 ", pval %p," - " ival %" PRIu32, rdma_hdr->rkey, - rdma_hdr->rdma_start.lval, - rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival)); - - if (ep->eager_rdma_remote.base.pval) { - BTL_ERROR(("Got RDMA connect twice!")); - return; - } - ep->eager_rdma_remote.rkey = rdma_hdr->rkey; - ep->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval; - ep->eager_rdma_remote.tokens=mca_btl_openib_component.eager_rdma_num - 1; - break; - case MCA_BTL_OPENIB_CONTROL_COALESCED: - { - size_t pad = 0; - while(len > 0) { - size_t skip; - mca_btl_openib_header_coalesced_t* unalign_hdr = 0; - mca_btl_base_descriptor_t tmp_des; - mca_btl_base_segment_t tmp_seg; - - assert(len >= sizeof(*clsc_hdr)); - 
- if(ep->nbo) - BTL_OPENIB_HEADER_COALESCED_NTOH(*clsc_hdr); - - skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad); - - tmp_des.des_segments = &tmp_seg; - tmp_des.des_segment_count = 1; - tmp_seg.seg_addr.pval = clsc_hdr + 1; - tmp_seg.seg_len = clsc_hdr->size; - - /* call registered callback */ - reg = mca_btl_base_active_message_trigger + clsc_hdr->tag; - reg->cbfunc( &obtl->super, clsc_hdr->tag, &tmp_des, reg->cbdata ); - len -= (skip + pad); - unalign_hdr = (mca_btl_openib_header_coalesced_t*) - ((unsigned char*)clsc_hdr + skip); - pad = (size_t)BTL_OPENIB_COALESCE_HDR_PADDING(unalign_hdr); - clsc_hdr = (mca_btl_openib_header_coalesced_t*)((unsigned char*)unalign_hdr + - pad); - } - } - break; - case MCA_BTL_OPENIB_CONTROL_CTS: - OPAL_OUTPUT((-1, "received CTS from %s (buffer %p): posted recvs %d, sent cts %d", - opal_get_proc_hostname(ep->endpoint_proc->proc_opal), - (void*) ctl_hdr, - ep->endpoint_posted_recvs, ep->endpoint_cts_sent)); - ep->endpoint_cts_received = true; - - /* Only send the CTS back and mark connected if: - - we have posted our receives (it's possible that we can - get this CTS before this side's CPC has called - cpc_complete()) - - we have not yet sent our CTS - - We don't even want to mark the endpoint connected() until - we have posted our receives because otherwise we will - trigger credit management (because the rd_credits will - still be negative), and Bad Things will happen. 
*/ - if (ep->endpoint_posted_recvs) { - /* need to hold to lock for both send_cts and connected */ - OPAL_THREAD_LOCK(&ep->endpoint_lock); - if (!ep->endpoint_cts_sent) { - mca_btl_openib_endpoint_send_cts(ep); - } - mca_btl_openib_endpoint_connected(ep); - } - break; - default: - BTL_ERROR(("Unknown message type received by BTL")); - break; - } -} - -static int openib_reg_mr (void *reg_data, void *base, size_t size, - mca_rcache_base_registration_t *reg) -{ - mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data; - mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg; - enum ibv_access_flags access_flag = 0; - - if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_READ) { - access_flag |= IBV_ACCESS_REMOTE_READ; - } - - if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_WRITE) { - access_flag |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; - } - - if (reg->access_flags & MCA_RCACHE_ACCESS_LOCAL_WRITE) { - access_flag |= IBV_ACCESS_LOCAL_WRITE; - } - -#if HAVE_DECL_IBV_ATOMIC_HCA - if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_ATOMIC) { - access_flag |= IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_LOCAL_WRITE; - } -#endif - - if (device->mem_reg_max && - device->mem_reg_max < (device->mem_reg_active + size)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - device->mem_reg_active += size; - -#if HAVE_DECL_IBV_ACCESS_SO - if (reg->flags & MCA_RCACHE_FLAGS_SO_MEM) { - access_flag |= IBV_ACCESS_SO; - } -#endif - - openib_reg->mr = ibv_reg_mr(device->ib_pd, base, size, access_flag); - - if (NULL == openib_reg->mr) { - OPAL_OUTPUT_VERBOSE((5, mca_btl_openib_component.memory_registration_verbose, - "ibv_reg_mr() failed: base=%p, bound=%p, size=%d, flags=0x%x, errno=%d", - reg->base, reg->bound, (int) (reg->bound - reg->base + 1), reg->flags, errno)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - openib_reg->btl_handle.lkey = openib_reg->mr->lkey; - openib_reg->btl_handle.rkey = openib_reg->mr->rkey; - - OPAL_OUTPUT_VERBOSE((30, 
mca_btl_openib_component.memory_registration_verbose, - "openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound, - (int) (reg->bound - reg->base + 1), reg->flags)); - -#if OPAL_CUDA_SUPPORT - if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) { - mca_common_cuda_register (base, size, - openib_reg->base.rcache->rcache_component->rcache_version.mca_component_name); - } -#endif - - return OPAL_SUCCESS; -} - -static int openib_dereg_mr(void *reg_data, mca_rcache_base_registration_t *reg) -{ - mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data; - mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg; - - OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose, - "openib_dereg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound, - (int) (reg->bound - reg->base + 1), reg->flags)); - - if(openib_reg->mr != NULL) { - if(ibv_dereg_mr(openib_reg->mr)) { - BTL_ERROR(("%s: error unpinning openib memory errno says %s", - __func__, strerror(errno))); - return OPAL_ERROR; - } - -#if OPAL_CUDA_SUPPORT - if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) { - mca_common_cuda_unregister(openib_reg->base.base, - openib_reg->base.rcache->rcache_component->rcache_version.mca_component_name); - } -#endif - - } - - device->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1); - - openib_reg->mr = NULL; - return OPAL_SUCCESS; -} - -static inline int param_register_uint(const char* param_name, unsigned int default_value, unsigned int *storage) -{ - *storage = default_value; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - param_name, NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - return *storage; -} - -static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, - uint8_t port_num, uint16_t pkey_index, - struct ibv_port_attr *ib_port_attr) -{ - uint16_t lid, i, lmc, lmc_step; 
- mca_btl_openib_module_t *openib_btl; - mca_btl_base_selected_module_t *ib_selected; - union ibv_gid gid; - uint64_t subnet_id; - -/* - * Starting with Open MPI 4.0 we don't support infiniband - * unless the user specifically requested to override this - * policy. For ancient OFED, only allow if user has set - * the MCA parameter. - */ - if (! mca_btl_openib_component.allow_ib -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET - && IBV_LINK_LAYER_INFINIBAND == ib_port_attr->link_layer -#endif - ) { - openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t)); - if(NULL == openib_btl) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - memcpy(openib_btl, &mca_btl_openib_module, - sizeof(mca_btl_openib_module)); - ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); - ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl; - openib_btl->device = device; - openib_btl->port_num = (uint8_t) port_num; - openib_btl->allowed = false; - OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t); - opal_list_append(btl_list, (opal_list_item_t*) ib_selected); - opal_pointer_array_add(device->device_btls, (void*) openib_btl); - ++device->btls; - ++mca_btl_openib_component.ib_num_btls; - if (-1 != mca_btl_openib_component.ib_max_btls && - mca_btl_openib_component.ib_num_btls >= - mca_btl_openib_component.ib_max_btls) { - return OPAL_ERR_VALUE_OUT_OF_BOUNDS; - } - return OPAL_SUCCESS; - } - - - /* Ensure that the requested GID index (via the - btl_openib_gid_index MCA param) is within the GID table - size. 
*/ - if (mca_btl_openib_component.gid_index > - ib_port_attr->gid_tbl_len) { - opal_show_help("help-mpi-btl-openib.txt", "gid index too large", - true, opal_process_info.nodename, - ibv_get_device_name(device->ib_dev), port_num, - mca_btl_openib_component.gid_index, - ib_port_attr->gid_tbl_len); - return OPAL_ERR_NOT_FOUND; - } - BTL_VERBOSE(("looking for %s:%d GID index %d", - ibv_get_device_name(device->ib_dev), port_num, - mca_btl_openib_component.gid_index)); - - /* If we have struct ibv_device.transport_type, then we're >= OFED - v1.2, and the transport could be iWarp or IB. If we don't have - that member, then we're < OFED v1.2, and it can only be IB. */ -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - if (IBV_TRANSPORT_IWARP == device->ib_dev->transport_type) { - subnet_id = mca_btl_openib_get_ip_subnet_id(device->ib_dev, port_num); - BTL_VERBOSE(("my iWARP subnet_id is %016" PRIx64, subnet_id)); - } else { - memset(&gid, 0, sizeof(gid)); - if (0 != ibv_query_gid(device->ib_dev_context, port_num, - mca_btl_openib_component.gid_index, &gid)) { - BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n", - ibv_get_device_name(device->ib_dev), port_num, - mca_btl_openib_component.gid_index)); - return OPAL_ERR_NOT_FOUND; - } - -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET - if (IBV_LINK_LAYER_ETHERNET == ib_port_attr->link_layer) { - subnet_id = mca_btl_openib_component.rroce_enable ? 
0 : - mca_btl_openib_get_ip_subnet_id(device->ib_dev, port_num); - } else { - subnet_id = ntoh64(gid.global.subnet_prefix); - } -#else - subnet_id = ntoh64(gid.global.subnet_prefix); -#endif - - BTL_VERBOSE(("my IB subnet_id for HCA %s port %d is %016" PRIx64, - ibv_get_device_name(device->ib_dev), port_num, subnet_id)); - } -#else - if (0 != ibv_query_gid(device->ib_dev_context, port_num, - mca_btl_openib_component.gid_index, &gid)) { - BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n", - ibv_get_device_name(device->ib_dev), port_num, - mca_btl_openib_component.gid_index)); - return OPAL_ERR_NOT_FOUND; - } - subnet_id = ntoh64(gid.global.subnet_prefix); - BTL_VERBOSE(("my IB-only subnet_id for HCA %s port %d is %016" PRIx64, - ibv_get_device_name(device->ib_dev), port_num, subnet_id)); -#endif - - if(mca_btl_openib_component.num_default_gid_btls > 0 && - IB_DEFAULT_GID_PREFIX == subnet_id && - mca_btl_openib_component.warn_default_gid_prefix) { - opal_show_help("help-mpi-btl-openib.txt", "default subnet prefix", - true, opal_process_info.nodename); - } - - if (IB_DEFAULT_GID_PREFIX == subnet_id) { - mca_btl_openib_component.num_default_gid_btls++; - } - - lmc = (1 << ib_port_attr->lmc); - lmc_step = 1; - - if (0 != mca_btl_openib_component.max_lmc && - mca_btl_openib_component.max_lmc < lmc) { - lmc = mca_btl_openib_component.max_lmc; - } - - /* APM support -- only meaningful if async event support is - enabled. If async events are not enabled, then there's nothing - to listen for the APM event to load the new path, so it's not - worth enabling APM. 
*/ - if (lmc > 1){ - if (-1 == mca_btl_openib_component.apm_lmc) { - lmc_step = lmc; - mca_btl_openib_component.apm_lmc = lmc - 1; - } else if (0 == lmc % (mca_btl_openib_component.apm_lmc + 1)) { - lmc_step = mca_btl_openib_component.apm_lmc + 1; - } else { - opal_show_help("help-mpi-btl-openib.txt", "apm with wrong lmc",true, - mca_btl_openib_component.apm_lmc, lmc); - return OPAL_ERROR; - } - } else { - if (mca_btl_openib_component.apm_lmc) { - /* Disable apm and report warning */ - mca_btl_openib_component.apm_lmc = 0; - opal_show_help("help-mpi-btl-openib.txt", "apm without lmc",true); - } - } - - for(lid = ib_port_attr->lid; - lid < ib_port_attr->lid + lmc; lid += lmc_step){ - for(i = 0; i < mca_btl_openib_component.btls_per_lid; i++){ - char param[40]; - - openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t)); - if(NULL == openib_btl) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - memcpy(openib_btl, &mca_btl_openib_module, - sizeof(mca_btl_openib_module)); - memcpy(&openib_btl->ib_port_attr, ib_port_attr, - sizeof(struct ibv_port_attr)); - ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); - ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl; - openib_btl->device = device; - openib_btl->port_num = (uint8_t) port_num; - openib_btl->pkey_index = pkey_index; - openib_btl->lid = lid; - openib_btl->apm_port = 0; - openib_btl->src_path_bits = lid - ib_port_attr->lid; - - openib_btl->port_info.subnet_id = subnet_id; - openib_btl->port_info.mtu = device->mtu; - openib_btl->port_info.lid = lid; - - openib_btl->cpcs = NULL; - openib_btl->num_cpcs = 0; - openib_btl->local_procs = 0; - - mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control; - mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL; - - if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) { - openib_btl->super.btl_get_limit = 
openib_btl->ib_port_attr.max_msg_sz; - } - - openib_btl->super.btl_get_alignment = 0; - - if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) { - openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz; - } - - openib_btl->super.btl_put_local_registration_threshold = openib_btl->device->max_inline_data; - openib_btl->super.btl_get_local_registration_threshold = 0; - -#if HAVE_DECL_IBV_ATOMIC_HCA - openib_btl->atomic_ops_be = false; - -#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXT_ATOM - /* check that 8-byte atomics are supported */ - if (!(device->ib_exp_dev_attr.ext_atom.log_atomic_arg_sizes & (1<<3ull))) { - openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS; - openib_btl->super.btl_atomic_flags = 0; - openib_btl->super.btl_atomic_fop = NULL; - openib_btl->super.btl_atomic_cswap = NULL; - } -#endif - -#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXP_ATOMIC_CAP - switch (openib_btl->device->ib_exp_dev_attr.exp_atomic_cap) -#else - switch (openib_btl->device->ib_dev_attr.atomic_cap) -#endif - { - case IBV_ATOMIC_GLOB: - openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; - break; -#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE - case IBV_EXP_ATOMIC_HCA_REPLY_BE: - openib_btl->atomic_ops_be = true; - break; -#endif - case IBV_ATOMIC_HCA: - break; - case IBV_ATOMIC_NONE: - default: - /* no atomics or an unsupported atomic type */ - openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS; - openib_btl->super.btl_atomic_flags = 0; - openib_btl->super.btl_atomic_fop = NULL; - openib_btl->super.btl_atomic_cswap = NULL; - } -#endif - - openib_btl->super.btl_put_alignment = 0; - - openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); - - /* Check bandwidth configured for this device */ - sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev)); - param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth); - - /* Check bandwidth configured for this 
device/port */ - sprintf(param, "bandwidth_%s:%d", ibv_get_device_name(device->ib_dev), - port_num); - param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth); - - /* Check bandwidth configured for this device/port/LID */ - sprintf(param, "bandwidth_%s:%d:%d", - ibv_get_device_name(device->ib_dev), port_num, lid); - param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth); - - /* Check latency configured for this device */ - sprintf(param, "latency_%s", ibv_get_device_name(device->ib_dev)); - param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency); - - /* Check latency configured for this device/port */ - sprintf(param, "latency_%s:%d", ibv_get_device_name(device->ib_dev), - port_num); - param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency); - - /* Check latency configured for this device/port/LID */ - sprintf(param, "latency_%s:%d:%d", ibv_get_device_name(device->ib_dev), - port_num, lid); - param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency); - - /* Auto-detect the port bandwidth */ - if (0 == openib_btl->super.btl_bandwidth) { - if (OPAL_SUCCESS != - opal_common_verbs_port_bw(ib_port_attr, - &openib_btl->super.btl_bandwidth)) { - /* If we can't figure out the bandwidth, declare - this port unreachable (do not* return - ERR_VALUE_OF_OUT_OF_BOUNDS; that is reserved - for when we exceed the number of allowable - BTLs). 
*/ - return OPAL_ERR_UNREACH; - } - } - - openib_btl->allowed = true; - - opal_list_append(btl_list, (opal_list_item_t*) ib_selected); - opal_pointer_array_add(device->device_btls, (void*) openib_btl); - ++device->btls; - ++mca_btl_openib_component.ib_num_btls; - ++mca_btl_openib_component.ib_allowed_btls; - if (-1 != mca_btl_openib_component.ib_max_btls && - mca_btl_openib_component.ib_num_btls >= - mca_btl_openib_component.ib_max_btls) { - return OPAL_ERR_VALUE_OUT_OF_BOUNDS; - } - } - } - - return OPAL_SUCCESS; -} - -static void device_construct(mca_btl_openib_device_t *device) -{ - device->ib_dev = NULL; - device->ib_dev_context = NULL; - device->ib_pd = NULL; - device->mpool = NULL; - device->rcache = NULL; -#if OPAL_ENABLE_PROGRESS_THREADS == 1 - device->ib_channel = NULL; -#endif - device->btls = 0; - device->endpoints = NULL; - device->device_btls = NULL; - device->ib_cq[BTL_OPENIB_HP_CQ] = NULL; - device->ib_cq[BTL_OPENIB_LP_CQ] = NULL; - device->cq_size[BTL_OPENIB_HP_CQ] = 0; - device->cq_size[BTL_OPENIB_LP_CQ] = 0; - device->non_eager_rdma_endpoints = 0; - device->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio; - device->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio; - device->pollme = true; - device->eager_rdma_buffers_count = 0; - device->eager_rdma_buffers = NULL; -#if HAVE_XRC - device->xrc_fd = -1; -#endif - device->qps = NULL; - OBJ_CONSTRUCT(&device->device_lock, opal_mutex_t); - OBJ_CONSTRUCT(&device->send_free_control, opal_free_list_t); - device->max_inline_data = 0; - device->ready_for_use = false; -} - -static void device_destruct(mca_btl_openib_device_t *device) -{ - int i; - -#if OPAL_ENABLE_PROGRESS_THREADS == 1 - if (device->progress) { - device->progress = false; - if (pthread_cancel(device->thread.t_handle)) { - BTL_ERROR(("Failed to cancel OpenIB progress thread")); - goto device_error; - } - opal_thread_join(&device->thread, NULL); - } - - if (ibv_destroy_comp_channel(device->ib_channel)) { - 
BTL_VERBOSE(("Failed to close comp_channel")); - goto device_error; - } -#endif - - /* signaling to async_tread to stop poll for this device */ - mca_btl_openib_async_rem_device (device); - - if(device->eager_rdma_buffers) { - int i; - for(i = 0; i < device->eager_rdma_buffers_count; i++) - if(device->eager_rdma_buffers[i]) - OBJ_RELEASE(device->eager_rdma_buffers[i]); - free(device->eager_rdma_buffers); - } - - if (NULL != device->qps) { - for (i = 0; i < mca_btl_openib_component.num_qps; i++) { - OBJ_DESTRUCT(&device->qps[i].send_free); - OBJ_DESTRUCT(&device->qps[i].recv_free); - } - free(device->qps); - } - - OBJ_DESTRUCT(&device->send_free_control); - - /* Release CQs */ - if(device->ib_cq[BTL_OPENIB_HP_CQ] != NULL) { - if (ibv_destroy_cq(device->ib_cq[BTL_OPENIB_HP_CQ])) { - BTL_VERBOSE(("Failed to close HP CQ")); - goto device_error; - } - } - - if(device->ib_cq[BTL_OPENIB_LP_CQ] != NULL) { - if (ibv_destroy_cq(device->ib_cq[BTL_OPENIB_LP_CQ])) { - BTL_VERBOSE(("Failed to close LP CQ")); - goto device_error; - } - } - - if (OPAL_SUCCESS != mca_rcache_base_module_destroy (device->rcache)) { - BTL_VERBOSE(("failed to release registration cache")); - goto device_error; - } - -#if HAVE_XRC - - if (MCA_BTL_XRC_ENABLED) { - if (OPAL_SUCCESS != mca_btl_openib_close_xrc_domain(device)) { - BTL_VERBOSE(("XRC Internal error. Failed to close xrc domain")); - goto device_error; - } - } -#endif - - if (ibv_dealloc_pd(device->ib_pd)) { - BTL_VERBOSE(("Warning! Failed to release PD")); - goto device_error; - } - - OBJ_DESTRUCT(&device->device_lock); - - if (ibv_close_device(device->ib_dev_context)) { - if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) { - BTL_VERBOSE(("Warning! Failed to close device")); - goto device_error; - } else { - BTL_ERROR(("Error! 
Failed to close device")); - goto device_error; - } - } - BTL_VERBOSE(("device was successfully released")); - return; -device_error: - BTL_VERBOSE(("Failed to destroy device resources")); -} - -OBJ_CLASS_INSTANCE(mca_btl_openib_device_t, opal_object_t, device_construct, - device_destruct); - -static int -get_port_list(mca_btl_openib_device_t *device, int *allowed_ports) -{ - int i, j, k, num_ports = 0; - const char *dev_name; - char *name; - - dev_name = ibv_get_device_name(device->ib_dev); - name = (char*) malloc(strlen(dev_name) + 4); - if (NULL == name) { - return 0; - } - - /* Assume that all ports are allowed. num_ports will be adjusted - below to reflect whether this is true or not. */ - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - allowed_ports[num_ports++] = i; - } - num_ports = 0; - if (NULL != mca_btl_openib_component.if_include_list) { - /* If only the device name is given (eg. mtdevice0,mtdevice1) use all - ports */ - i = 0; - while (mca_btl_openib_component.if_include_list[i]) { - if (0 == strcmp(dev_name, - mca_btl_openib_component.if_include_list[i])) { - num_ports = device->ib_dev_attr.phys_port_cnt; - goto done; - } - ++i; - } - /* Include only requested ports on the device */ - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - sprintf(name,"%s:%d",dev_name,i); - for (j = 0; - NULL != mca_btl_openib_component.if_include_list[j]; ++j) { - if (0 == strcmp(name, - mca_btl_openib_component.if_include_list[j])) { - allowed_ports[num_ports++] = i; - break; - } - } - } - } else if (NULL != mca_btl_openib_component.if_exclude_list) { - /* If only the device name is given (eg. 
mtdevice0,mtdevice1) exclude - all ports */ - i = 0; - while (mca_btl_openib_component.if_exclude_list[i]) { - if (0 == strcmp(dev_name, - mca_btl_openib_component.if_exclude_list[i])) { - num_ports = 0; - goto done; - } - ++i; - } - /* Exclude the specified ports on this device */ - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - sprintf(name,"%s:%d",dev_name,i); - for (j = 0; - NULL != mca_btl_openib_component.if_exclude_list[j]; ++j) { - if (0 == strcmp(name, - mca_btl_openib_component.if_exclude_list[j])) { - /* If found, set a sentinel value */ - j = -1; - break; - } - } - /* If we didn't find it, it's ok to include in the list */ - if (-1 != j) { - allowed_ports[num_ports++] = i; - } - } - } else { - num_ports = device->ib_dev_attr.phys_port_cnt; - } - -done: - - /* Remove the following from the error-checking if_list: - - bare device name - - device name suffixed with port number */ - if (NULL != mca_btl_openib_component.if_list) { - for (i = 0; NULL != mca_btl_openib_component.if_list[i]; ++i) { - - /* Look for raw device name */ - if (0 == strcmp(mca_btl_openib_component.if_list[i], dev_name)) { - j = opal_argv_count(mca_btl_openib_component.if_list); - opal_argv_delete(&j, &(mca_btl_openib_component.if_list), - i, 1); - --i; - } - } - for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) { - sprintf(name, "%s:%d", dev_name, i); - for (j = 0; NULL != mca_btl_openib_component.if_list[j]; ++j) { - if (0 == strcmp(mca_btl_openib_component.if_list[j], name)) { - k = opal_argv_count(mca_btl_openib_component.if_list); - opal_argv_delete(&k, &(mca_btl_openib_component.if_list), - j, 1); - --j; - break; - } - } - } - } - - free(name); - - return num_ports; -} - -/* - * Prefer values that are already in the target - */ -static void merge_values(opal_btl_openib_ini_values_t *target, - opal_btl_openib_ini_values_t *src) -{ - if (!target->mtu_set && src->mtu_set) { - target->mtu = src->mtu; - target->mtu_set = true; - } - - if 
(!target->use_eager_rdma_set && src->use_eager_rdma_set) { - target->use_eager_rdma = src->use_eager_rdma; - target->use_eager_rdma_set = true; - } - - if (NULL == target->receive_queues && NULL != src->receive_queues) { - target->receive_queues = strdup(src->receive_queues); - } - - if (!target->max_inline_data_set && src->max_inline_data_set) { - target->max_inline_data = src->max_inline_data; - target->max_inline_data_set = true; - } -} - -static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag) -{ - mca_btl_openib_control_header_t* chdr = - (mca_btl_openib_control_header_t *) to_base_frag(frag)->segment.seg_addr.pval; - return (MCA_BTL_TAG_IB == frag->hdr->tag) && - (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type); -} - -static bool inline is_cts_message(const mca_btl_openib_recv_frag_t *frag) -{ - mca_btl_openib_control_header_t* chdr = - (mca_btl_openib_control_header_t *) to_base_frag(frag)->segment.seg_addr.pval; - return (MCA_BTL_TAG_IB == frag->hdr->tag) && - (MCA_BTL_OPENIB_CONTROL_CTS == chdr->type); -} - -static int32_t atoi_param(char *param, int32_t dflt) -{ - if (NULL == param || '\0' == param[0]) { - return dflt ? 
dflt : 1; - } - - return atoi(param); -} - -static void init_apm_port(mca_btl_openib_device_t *device, int port, uint16_t lid) -{ - int index; - struct mca_btl_openib_module_t *btl; - for(index = 0; index < device->btls; index++) { - btl = (mca_btl_openib_module_t *) opal_pointer_array_get_item(device->device_btls, index); - /* Ok, we already have btl for the fist port, - * second one will be used for APM */ - btl->apm_port = port; - btl->port_info.apm_lid = lid + btl->src_path_bits; - mca_btl_openib_component.apm_ports++; - BTL_VERBOSE(("APM-PORT: Setting alternative port - %d, lid - %d" - ,port ,lid)); - } -} - -static int get_var_source (const char *var_name, mca_base_var_source_t *source) -{ - int vari = mca_base_var_find ("opal", "btl", "openib", var_name); - if (0 > vari) { - return vari; - } - - return mca_base_var_get_value (vari, NULL, source, NULL); -} - -static int setup_qps(void) -{ - char **queues, **params = NULL; - int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0; - uint32_t max_qp_size, max_size_needed; - int32_t min_freelist_size = 0; - int smallest_pp_qp = INT_MAX, ret = OPAL_ERROR; - - queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':'); - if (0 == opal_argv_count(queues)) { - opal_show_help("help-mpi-btl-openib.txt", - "no qps in receive_queues", true, - opal_process_info.nodename, - mca_btl_openib_component.receive_queues); - ret = OPAL_ERROR; - goto error; - } - - while (queues[qp] != NULL) { - if (0 == strncmp("P,", queues[qp], 2)) { - num_pp_qps++; - if (smallest_pp_qp > qp) { - smallest_pp_qp = qp; - } - } else if (0 == strncmp("S,", queues[qp], 2)) { - num_srq_qps++; - } else if (0 == strncmp("X,", queues[qp], 2)) { -#if HAVE_XRC - num_xrc_qps++; -#else - opal_show_help("help-mpi-btl-openib.txt", "No XRC support", true, - opal_process_info.nodename, - mca_btl_openib_component.receive_queues); - ret = OPAL_ERR_NOT_AVAILABLE; - goto error; -#endif - } else { - opal_show_help("help-mpi-btl-openib.txt", - 
"invalid qp type in receive_queues", true, - opal_process_info.nodename, - mca_btl_openib_component.receive_queues, - queues[qp]); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - qp++; - } - -#if HAVE_XRC - /* Current XRC implementation can't used with other QP types - PP - and SRQ */ - if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) { - opal_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true, - opal_process_info.nodename, - mca_btl_openib_component.receive_queues); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - - /* Current XRC implementation can't used with btls_per_lid > 1 */ - if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) { - opal_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID", - true, opal_process_info.nodename, - mca_btl_openib_component.receive_queues, num_xrc_qps); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } -#endif - - mca_btl_openib_component.num_pp_qps = num_pp_qps; - mca_btl_openib_component.num_srq_qps = num_srq_qps; - mca_btl_openib_component.num_xrc_qps = num_xrc_qps; - mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps; - - mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*) - malloc(sizeof(mca_btl_openib_qp_info_t) * - mca_btl_openib_component.num_qps); - if (NULL == mca_btl_openib_component.qp_infos) { - ret = OPAL_ERR_OUT_OF_RESOURCE; - goto error; - } - - qp = 0; -#define P(N) (((N) > count) ? 
NULL : params[(N)]) - while (queues[qp] != NULL) { - int count; - int32_t rd_low, rd_num; - params = opal_argv_split_with_empty(queues[qp], ','); - count = opal_argv_count(params); - - if ('P' == params[0][0]) { - int32_t rd_win, rd_rsv; - if (count < 3 || count > 6) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid pp qp specification", true, - opal_process_info.nodename, queues[qp]); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num = atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - rd_win = atoi_param(P(4), (rd_num - rd_low) * 2); - - if (0 >= rd_win) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid pp qp specification", true, - opal_process_info.nodename, queues[qp]); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - - rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win); - - BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d", - rd_num, rd_low, rd_win, rd_rsv)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num + rd_rsv > min_freelist_size) { - min_freelist_size = rd_num + rd_rsv; - } - - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win; - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv; - if ((rd_num - rd_low) > rd_win) { - opal_show_help("help-mpi-btl-openib.txt", "non optimal rd_win", - true, rd_win, rd_num - rd_low); - } - } else { - int32_t sd_max, rd_init, srq_limit; - if (count < 3 || count > 7) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid srq specification", true, - opal_process_info.nodename, queues[qp]); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ? 
- MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP; - mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0); - rd_num = atoi_param(P(2), 256); - /* by default set rd_low to be 3/4 of rd_num */ - rd_low = atoi_param(P(3), rd_num - (rd_num / 4)); - sd_max = atoi_param(P(4), rd_low / 4); - /* rd_init is initial value for rd_curr_num of all SRQs, 1/4 of rd_num by default */ - rd_init = atoi_param(P(5), rd_num / 4); - /* by default set srq_limit to be 3/16 of rd_init (it's 1/4 of rd_low_local, - the value of rd_low_local we calculate in create_srq function) */ - srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4); - - /* If we set srq_limit less or greater than rd_init - (init value for rd_curr_num) => we receive the IBV_EVENT_SRQ_LIMIT_REACHED - event immediately and the value of rd_curr_num will be increased */ - - /* If we set srq_limit to zero, but size of SRQ greater than 1 => set it to be 1 */ - if((0 == srq_limit) && (1 < rd_num)) { - srq_limit = 1; - } - - BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_max is %d srq_limit is %d", - rd_num, rd_low, sd_max, rd_init, srq_limit)); - - /* Calculate the smallest freelist size that can be allowed */ - if (rd_num > min_freelist_size) { - min_freelist_size = rd_num; - } - - if (rd_num < rd_init) { - opal_show_help("help-mpi-btl-openib.txt", "rd_num must be >= rd_init", - true, opal_process_info.nodename, queues[qp]); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - - if (rd_num < srq_limit) { - opal_show_help("help-mpi-btl-openib.txt", "srq_limit must be > rd_num", - true, opal_process_info.nodename, queues[qp]); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - - mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max; - mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init; - mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = srq_limit; - } - - if (rd_num <= rd_low) { - opal_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low", - true, 
opal_process_info.nodename, queues[qp]); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - mca_btl_openib_component.qp_infos[qp].rd_num = rd_num; - mca_btl_openib_component.qp_infos[qp].rd_low = rd_low; - opal_argv_free(params); - qp++; - } - params = NULL; - - /* Sanity check some sizes */ - - max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size; - max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > - mca_btl_openib_module.super.btl_max_send_size) ? - mca_btl_openib_module.super.btl_eager_limit : - mca_btl_openib_module.super.btl_max_send_size; - - if (max_qp_size < max_size_needed) { - mca_base_var_source_t eager_source = MCA_BASE_VAR_SOURCE_DEFAULT; - mca_base_var_source_t max_send_source = MCA_BASE_VAR_SOURCE_DEFAULT; - - (void) get_var_source ("max_send_size", &max_send_source); - (void) get_var_source ("eager_limit", &eager_source); - - /* the largest queue pair is too small for either the max send size or eager - * limit. check where we got the max_send_size and eager_limit and adjust if - * the user did not specify one or the other. */ - if (mca_btl_openib_module.super.btl_eager_limit > max_qp_size && - MCA_BASE_VAR_SOURCE_DEFAULT == eager_source) { - mca_btl_openib_module.super.btl_eager_limit = max_qp_size; - } - - if (mca_btl_openib_module.super.btl_max_send_size > max_qp_size && - MCA_BASE_VAR_SOURCE_DEFAULT == max_send_source) { - mca_btl_openib_module.super.btl_max_send_size = max_qp_size; - } - - max_size_needed = (mca_btl_openib_module.super.btl_eager_limit > - mca_btl_openib_module.super.btl_max_send_size) ? 
- mca_btl_openib_module.super.btl_eager_limit : - mca_btl_openib_module.super.btl_max_send_size; - } - - if (max_qp_size < max_size_needed) { - opal_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too small", true, - opal_process_info.nodename, max_qp_size, - max_size_needed); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } else if (max_qp_size > max_size_needed) { - opal_show_help("help-mpi-btl-openib.txt", - "biggest qp size is too big", true, - opal_process_info.nodename, max_qp_size, - max_size_needed); - } - - if (mca_btl_openib_component.ib_free_list_max > 0 && - min_freelist_size > mca_btl_openib_component.ib_free_list_max) { - opal_show_help("help-mpi-btl-openib.txt", "freelist too small", true, - opal_process_info.nodename, - mca_btl_openib_component.ib_free_list_max, - min_freelist_size); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - - mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1; - if (mca_btl_openib_component.num_qps > smallest_pp_qp) { - mca_btl_openib_component.credits_qp = smallest_pp_qp; - } else { - mca_btl_openib_component.credits_qp = mca_btl_openib_component.num_qps - 1; - } - - ret = OPAL_SUCCESS; -error: - if (NULL != params) { - opal_argv_free(params); - } - - if (NULL != queues) { - opal_argv_free(queues); - } - - return ret; -} - -/* read a single integer from a linux module parameters file */ -static uint64_t read_module_param(char *file, uint64_t value, uint64_t max) -{ - int fd = open(file, O_RDONLY); - char buffer[64]; - uint64_t ret; - int rc; - - if (0 > fd) { - return value; - } - - rc = read (fd, buffer, 64); - - close (fd); - - if (0 == rc) { - return value; - } - - errno = 0; - ret = strtoull(buffer, NULL, 10); - - if (ret > max) { - /* NTH: probably should report a bogus value */ - ret = max; - } - - return (0 == errno) ? 
ret : value; -} - -/* calculate memory registation limits */ -static uint64_t calculate_total_mem (void) -{ - hwloc_obj_t machine; - int rc; - uint64_t mem, *mptr; - opal_process_name_t wildcard_rank; - - /* first try to retrieve it from PMIx as it may have - * been provided */ - wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid; - wildcard_rank.vpid = OPAL_VPID_WILDCARD; - mptr = &mem; - OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY, - &wildcard_rank, &mptr, OPAL_UINT64); - if (OPAL_SUCCESS == rc) { - return mem; - } - - /* if not available, then ensure that the topology has been - * loaded and try to get it from there */ - if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) { - machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL); - if (NULL == machine) { - return 0; - } -#if HWLOC_API_VERSION < 0x20000 - return machine->memory.total_memory; -#else - return machine->total_memory; -#endif - } - - /* couldn't find it */ - return 0; -} - - -static uint64_t calculate_max_reg (const char *device_name) -{ - struct stat statinfo; - uint64_t mtts_per_seg = 1; - uint64_t num_mtt = 1 << 19; - uint64_t reserved_mtt = 0; - uint64_t max_reg, mem_total; - - mem_total = calculate_total_mem (); - - /* On older OFED(<2.0), may need to turn off this parameter*/ - if (mca_btl_openib_component.allow_max_memory_registration) { - max_reg = 2 * mem_total; - /* Limit us to 87.5% of the registered memory (some fluff for QPs, - file systems, etc) */ - return (max_reg * 7) >> 3; - } - - /* Default to being able to register everything (to ensure that - max_reg is initialized in all cases) */ - max_reg = mem_total; - if (!strncmp(device_name, "mlx5", 4)) { - max_reg = 2 * mem_total; - - } else if (!strncmp(device_name, "mlx4", 4)) { - if (0 == stat("/sys/module/mlx4_core/parameters/log_num_mtt", &statinfo)) { - mtts_per_seg = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1, 63); - num_mtt = 1ull << 
read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1, 63); - if (1 == num_mtt) { - /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */ - num_mtt = 1 << 19; - max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; - } else { - max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; - } - } - - } else if (!strncmp(device_name, "mthca", 5)) { - if (0 == stat("/sys/module/ib_mthca/parameters/num_mtt", &statinfo)) { - mtts_per_seg = 1ull << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1, 63); - num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20, (uint64_t) -1); - reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0, (uint64_t) -1); - - max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg; - } else { - max_reg = mem_total; - } - - } else { - /* Need to update to determine the registration limit for this - configuration */ - max_reg = mem_total; - } - - /* Print a warning if we can't register more than 75% of physical - memory. Abort if the abort_not_enough_reg_mem MCA param was - set. 
*/ - if (max_reg < mem_total * 3 / 4) { - char *action; - - if (mca_btl_openib_component.abort_not_enough_reg_mem) { - action = "Your MPI job will now abort."; - } else { - action = "Your MPI job will continue, but may be behave poorly and/or hang."; - } - opal_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true, - opal_process_info.nodename, (unsigned long)(max_reg >> 20), - (unsigned long)(mem_total >> 20), action); - return 0; /* signal that we can't have enough memory */ - } - - /* Limit us to 87.5% of the registered memory (some fluff for QPs, - file systems, etc) */ - return (max_reg * 7) >> 3; -} - -static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) -{ - mca_rcache_base_resources_t rcache_resources; - mca_btl_openib_device_t *device; - uint8_t i, k = 0; - int ret = -1, port_cnt; - opal_btl_openib_ini_values_t values, default_values; - int *allowed_ports = NULL; - bool need_search; - struct ibv_context *dev_context = NULL; - - /* Open up the device */ - dev_context = ibv_open_device(ib_dev); - if (NULL == dev_context) { - return OPAL_ERR_NOT_SUPPORTED; - } - - /* Find out if this device supports RC QPs */ - if (OPAL_SUCCESS != opal_common_verbs_qp_test(dev_context, - OPAL_COMMON_VERBS_FLAGS_RC)) { - ibv_close_device(dev_context); - BTL_VERBOSE(("openib: RC QPs not supported -- skipping %s", - ibv_get_device_name(ib_dev))); - ++num_devices_intentionally_ignored; - return OPAL_ERR_NOT_SUPPORTED; - } - - device = OBJ_NEW(mca_btl_openib_device_t); - if(NULL == device){ - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - ibv_close_device(dev_context); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - device->mem_reg_active = 0; - device->mem_reg_max_total = calculate_max_reg(ibv_get_device_name(ib_dev)); - device->mem_reg_max = device->mem_reg_max_total; - if(( 0 == device->mem_reg_max) && mca_btl_openib_component.abort_not_enough_reg_mem) { - return OPAL_ERROR; - } - - device->ib_dev = ib_dev; - device->ib_dev_context = 
dev_context; - device->ib_pd = NULL; - device->device_btls = OBJ_NEW(opal_pointer_array_t); - if (OPAL_SUCCESS != opal_pointer_array_init(device->device_btls, 2, INT_MAX, 2)) { - BTL_ERROR(("Failed to initialize device_btls array: %s:%d", __FILE__, __LINE__)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if(NULL == device->ib_dev_context){ - BTL_ERROR(("error obtaining device context for %s errno says %s", - ibv_get_device_name(device->ib_dev), strerror(errno))); - goto error; - } -#if HAVE_DECL_IBV_EXP_QUERY_DEVICE - memset(&device->ib_exp_dev_attr, 0, sizeof(device->ib_exp_dev_attr)); - device->ib_exp_dev_attr.comp_mask = IBV_EXP_DEVICE_ATTR_RESERVED - 1; - if(ibv_exp_query_device(device->ib_dev_context, &device->ib_exp_dev_attr)){ - BTL_ERROR(("error obtaining device attributes for %s errno says %s", - ibv_get_device_name(device->ib_dev), strerror(errno))); - goto error; - } -#endif - if(ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)){ - BTL_ERROR(("error obtaining device attributes for %s errno says %s", - ibv_get_device_name(device->ib_dev), strerror(errno))); - goto error; - } - /* If mca_btl_if_include/exclude were specified, get usable ports */ - allowed_ports = (int*)malloc(device->ib_dev_attr.phys_port_cnt * sizeof(int)); - if (NULL == allowed_ports) { - ret = OPAL_ERR_OUT_OF_RESOURCE; - goto error; - } - - port_cnt = get_port_list(device, allowed_ports); - if (0 == port_cnt) { - ret = OPAL_SUCCESS; - ++num_devices_intentionally_ignored; - goto error; - } - - /* Load in vendor/part-specific device parameters. 
Note that even if - we don't find values for this vendor/part, "values" will be set - indicating that it does not have good values */ - ret = opal_btl_openib_ini_query(device->ib_dev_attr.vendor_id, - device->ib_dev_attr.vendor_part_id, - &values); - if (OPAL_SUCCESS != ret && - OPAL_ERR_NOT_FOUND != ret) { - /* If we get a serious error, propagate it upwards */ - goto error; - } - if (OPAL_ERR_NOT_FOUND == ret) { - /* If we didn't find a matching device in the INI files, output a - warning that we're using default values (unless overridden - that we don't want to see these warnings) */ - if (mca_btl_openib_component.warn_no_device_params_found) { - opal_show_help("help-mpi-btl-openib.txt", - "no device params found", true, - opal_process_info.nodename, - ibv_get_device_name(device->ib_dev), - device->ib_dev_attr.vendor_id, - device->ib_dev_attr.vendor_part_id); - } - } - - /* If we're supposed to ignore devices of this vendor/part ID, - then do so */ - if (values.ignore_device_set && values.ignore_device) { - BTL_VERBOSE(("device %s skipped; ignore_device=1", - ibv_get_device_name(device->ib_dev))); - ret = OPAL_SUCCESS; - ++num_devices_intentionally_ignored; - goto error; - } - - /* Note that even if we don't find default values, "values" will - be set indicating that it does not have good values */ - ret = opal_btl_openib_ini_query(0, 0, &default_values); - if (OPAL_SUCCESS != ret && - OPAL_ERR_NOT_FOUND != ret) { - /* If we get a serious error, propagate it upwards */ - goto error; - } - - /* If we did find values for this device (or in the defaults - section), handle them */ - merge_values(&values, &default_values); - /* If MCA param was set, use it. 
If not, check the INI file - or default to IBV_MTU_1024 */ - if (0 < mca_btl_openib_component.ib_mtu) { - device->mtu = mca_btl_openib_component.ib_mtu; - } else if (values.mtu_set) { - switch (values.mtu) { - case 256: - device->mtu = IBV_MTU_256; - break; - case 512: - device->mtu = IBV_MTU_512; - break; - case 1024: - device->mtu = IBV_MTU_1024; - break; - case 2048: - device->mtu = IBV_MTU_2048; - break; - case 4096: - device->mtu = IBV_MTU_4096; - break; - default: - BTL_ERROR(("invalid MTU value specified in INI file (%d); ignored", values.mtu)); - device->mtu = IBV_MTU_1024 ; - break; - } - } else { - device->mtu = IBV_MTU_1024 ; - } - - /* Allocate the protection domain for the device */ - device->ib_pd = ibv_alloc_pd(device->ib_dev_context); - if(NULL == device->ib_pd){ - BTL_ERROR(("error allocating protection domain for %s errno says %s", - ibv_get_device_name(device->ib_dev), strerror(errno))); - goto error; - } - - /* Figure out what the max_inline_data value should be for all - ports and QPs on this device */ - need_search = false; - if(-2 != mca_btl_openib_component.ib_max_inline_data) { - /* User has explicitly set btl_openib_max_inline_data MCA parameter - Per setup in _mca.c, we know that the MCA param value is guaranteed - to be >= -1 */ - if (-1 == mca_btl_openib_component.ib_max_inline_data) { - need_search = true; - } else { - device->max_inline_data = (uint32_t) - mca_btl_openib_component.ib_max_inline_data; - } - } else if (values.max_inline_data_set) { - if (-1 == values.max_inline_data) { - need_search = true; - } else if (values.max_inline_data >= 0) { - device->max_inline_data = (uint32_t) values.max_inline_data; - } else { - if(default_values.max_inline_data_set && - default_values.max_inline_data >= -1) { - BTL_ERROR(("Invalid max_inline_data value specified " - "in INI file (%d); using default value (%d)", - values.max_inline_data, - default_values.max_inline_data)); - device->max_inline_data = (uint32_t) - 
default_values.max_inline_data; - } else { - BTL_ERROR(("Invalid max_inline_data value specified " - "in INI file (%d)", values.max_inline_data)); - ret = OPAL_ERR_BAD_PARAM; - goto error; - } - } - } - - /* If we don't have a set max inline data size, search for it */ - if (need_search) { - opal_common_verbs_find_max_inline(device->ib_dev, - device->ib_dev_context, - device->ib_pd, - &device->max_inline_data); - } - - /* Should we use RDMA for short / eager messages? First check MCA - param, then check INI file values. */ - if (mca_btl_openib_component.use_eager_rdma >= 0) { - device->use_eager_rdma = mca_btl_openib_component.use_eager_rdma; - } else if (values.use_eager_rdma_set) { - device->use_eager_rdma = values.use_eager_rdma; - } - /* Eager RDMA is not currently supported with progress threads */ - if (device->use_eager_rdma && OPAL_ENABLE_PROGRESS_THREADS) { - device->use_eager_rdma = 0; - opal_show_help("help-mpi-btl-openib.txt", - "eager RDMA and progress threads", true); - } - - opal_asprintf (&rcache_resources.cache_name, "verbs.%" PRIu64, device->ib_dev_attr.node_guid); - rcache_resources.reg_data = (void*)device; - rcache_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t); - rcache_resources.register_mem = openib_reg_mr; - rcache_resources.deregister_mem = openib_dereg_mr; - device->rcache = - mca_rcache_base_module_create (mca_btl_openib_component.ib_rcache_name, - device, &rcache_resources); - if (NULL == device->rcache) { - /* Don't print an error message here -- we'll get one from - mpool_create anyway */ - goto error; - } - - device->mpool = mca_mpool_base_module_lookup (mca_btl_openib_component.ib_mpool_hints); - if (NULL == device->mpool) { - goto error; - } - -#if OPAL_ENABLE_PROGRESS_THREADS - device->ib_channel = ibv_create_comp_channel(device->ib_dev_context); - if (NULL == device->ib_channel) { - BTL_ERROR(("error creating channel for %s errno says %s", - ibv_get_device_name(device->ib_dev), - strerror(errno))); - goto error; - } -#endif 
- - ret = OPAL_SUCCESS; - - /* Note ports are 1 based (i >= 1) */ - for(k = 0; k < port_cnt; k++){ - struct ibv_port_attr ib_port_attr; - i = allowed_ports[k]; - if(ibv_query_port(device->ib_dev_context, i, &ib_port_attr)){ - BTL_ERROR(("error getting port attributes for device %s " - "port number %d errno says %s", - ibv_get_device_name(device->ib_dev), i, strerror(errno))); - break; - } - if(IBV_PORT_ACTIVE == ib_port_attr.state) { - /* Select the lower of the HCA and port active speed. With QLogic - HCAs that are capable of 4K MTU we had an issue when connected - to switches with 2K MTU. This fix is valid for other IB vendors - as well. */ - if (ib_port_attr.active_mtu < device->mtu){ - device->mtu = ib_port_attr.active_mtu; - } - if (mca_btl_openib_component.apm_ports && device->btls > 0) { - init_apm_port(device, i, ib_port_attr.lid); - break; - } - if (0 == mca_btl_openib_component.ib_pkey_val) { - ret = init_one_port(btl_list, device, i, 0, &ib_port_attr); - } else { - uint16_t pkey,j; - for (j = 0; j < device->ib_dev_attr.max_pkeys; j++) { - if(ibv_query_pkey(device->ib_dev_context, i, j, &pkey)){ - BTL_ERROR(("error getting pkey for index %d, device %s " - "port number %d errno says %s", - j, ibv_get_device_name(device->ib_dev), i, strerror(errno))); - } - pkey = ntohs(pkey) & MCA_BTL_IB_PKEY_MASK; - if(pkey == mca_btl_openib_component.ib_pkey_val){ - ret = init_one_port(btl_list, device, i, j, &ib_port_attr); - break; - } - } - } - if (OPAL_SUCCESS != ret) { - /* Out of bounds error indicates that we hit max btl number - * don't propagate the error to the caller */ - if (OPAL_ERR_VALUE_OUT_OF_BOUNDS == ret) { - ret = OPAL_SUCCESS; - } - break; - } - } - } - free(allowed_ports); - allowed_ports = NULL; - - /* If we made a BTL, check APM status and return. 
Otherwise, fall - through and destroy everything */ - if (device->btls > 0) { - /* if apm was enabled it should be > 1 */ - if (1 == mca_btl_openib_component.apm_ports) { - opal_show_help("help-mpi-btl-openib.txt", - "apm not enough ports", true); - mca_btl_openib_component.apm_ports = 0; - } - - /* Check to ensure that all devices used in this process have - compatible receive_queues values (we check elsewhere to see - if all devices used in other processes in this job have - compatible receive_queues values). - - Not only is the check complex, but the reasons behind what - it does (and does not do) are complex. Before explaining - the code below, here's some notes: - - 1. The openib BTL component only supports 1 value of the - receive_queues between all of its modules. - - --> This could be changed to allow every module to have - its own receive_queues. But that would be a big - deal; no one has time to code this up right now. - - 2. The receive_queues value can be specified either as an - MCA parameter or in the INI file. Specifying the value - as an MCA parameter overrides all INI file values - (meaning: that MCA param value will be used for all - openib BTL modules in the process). - - Effectively, the first device through init_one_device() - gets to decide what the receive_queues will be for the all - modules in this process. This is an unfortunate artifact - of the openib BTL startup sequence (see below for more - details). The first device will choose the receive_queues - value from: (in priority order): - - 1. If the btl_openib_receive_queues MCA param was - specified, use that. - 2. If this device has a receive_queues value specified in - the INI file, use that. - 3. Otherwise, use the default MCA param value for - btl_openib_receive_queues. - - If any successive device has a different value specified in - the INI file, we show_help and return up the stack that - this device failed. 
- - In the case that the user does not specify a - mca_btl_openib_receive_queues value, the short description - of what is allowed is that either a) no devices specify a - receive_queues value in the INI file (in which case we use - the default MCA param value), b) all devices specify the - same receive_queues value in the INI value, or c) some/all - devices specify the same receive_queues value in the INI - value as the default MCA param value. - - Let's take some sample cases to explain this more clearly... - - THESE ARE THE "GOOD" CASES - -------------------------- - - Case 1: no INI values - - MCA parameter: not specified - - default receive_queues: value A - - device 0: no receive_queues in INI file - - device 1: no receive_queues in INI file - - device 2: no receive_queues in INI file - --> use receive_queues value A with all devices - - Case 2: all INI values the same (same as default) - - MCA parameter: not specified - - default receive_queues: value A - - device 0: receive_queues value A in the INI file - - device 1: receive_queues value A in the INI file - - device 2: receive_queues value A in the INI file - --> use receive_queues value A with all devices - - Case 3: all INI values the same (but different than default) - - MCA parameter: not specified - - default receive_queues: value A - - device 0: receive_queues value B in the INI file - - device 1: receive_queues value B in the INI file - - device 2: receive_queues value B in the INI file - --> use receive_queues value B with all devices - - Case 4: some INI unspecified, but rest same as default - - MCA parameter: not specified - - default receive_queues: value A - - device 0: receive_queues value A in the INI file - - device 1: no receive_queues in INI file - - device 2: receive_queues value A in the INI file - --> use receive_queues value A with all devices - - Case 5: some INI unspecified (including device 0), but rest same as default - - MCA parameter: not specified - - default receive_queues: 
value A - - device 0: no receive_queues in INI file - - device 1: no receive_queues in INI file - - device 2: receive_queues value A in the INI file - --> use receive_queues value A with all devices - - Case 6: different default/INI values, but MCA param is specified - - MCA parameter: value D - - default receive_queues: value A - - device 0: no receive_queues in INI file - - device 1: receive_queues value B in INI file - - device 2: receive_queues value C in INI file - --> use receive_queues value D with all devices - - What this means is that this selection process is - unfortunately tied to the order of devices. :-( Device 0 - effectively sets what the receive_queues value will be for - that process. If any later device disagrees, that's - problematic and we have to error/abort. - - ALL REMAINING CASES WILL FAIL - ----------------------------- - - Case 7: one INI value (different than default) - - MCA parameter: not specified - - default receive_queues: value A - - device 0: receive_queues value B in INI file - - device 1: no receive_queues in INI file - - device 2: no receive_queues in INI file - --> Jeff thinks that it would be great to use - receive_queues value B with all devices. However, it - shares one of the problems cited in case 8, below. So - we need to fail this scenario; print an error and - abort. - - Case 8: one INI value, different than default - - MCA parameter: not specified - - default receive_queues: value A - - device 0: no receive_queues in INI file - - device 1: receive_queues value B in INI file - - device 2: no receive_queues in INI file - - --> Jeff thinks that it would be great to use - receive_queues value B with all devices. However, it - has (at least) 2 problems: - - 1. The check for local receive_queue compatibility is - done here in init_one_device(). 
By the time we call - init_one_device() for device 1, we have already - called init_one_device() for device 0, meaning that - device 0's QPs have already been created and setup - using the MCA parameter's default receive_queues - value. So if device 1 *changes* the - component.receive_queues value, then device 0 and - device 1 now have different receive_queue sets (more - specifically: the QPs setup for device 0 are now - effectively lost). This is Bad. - - It would be great if we didn't have this restriction - -- either by letting each module have its own - receive_queues value or by scanning all devices and - figuring out a final receive_queues value *before* - actually setting up any QPs. But that's not the - current flow of the code (patches would be greatly - appreciated here, of course!). Unfortunately, no - one has time to code this up right now, so we're - leaving this as explicitly documented for some - future implementer... - - 2. Conside a scenario with server 1 having HCA A/subnet - X, and server 2 having HCA B/subnet X and HCA - C/subnet Y. And let's assume: - - Server 1: - HCA A: no receive_queues in INI file - - Server 2: - HCA B: no receive_queues in INI file - HCA C: receive_queues specified in INI file - - A will therefore use the default receive_queues - value. B and C will use C's INI receive_queues. - But note that modex [currently] only sends around - vendor/part IDs for OpenFabrics devices -- not the - actual receive_queues value (it was felt that - including the final receive_queues string value in - the modex would dramatically increase the size of - the modex). So processes on server 1 will get the - vendor/part ID for HCA B, look it up in the INI - file, see that it has no receive_queues value - specified, and then assume that it uses the default - receive_queues value. Hence, procs on server 1 will - try to connect HCA A-->HCA B with the wrong - receive_queues value. Bad. 
Further, the error - won't be discovered by checks like this because A - won't check D's receive_queues because D is on a - different subnet. - - This could be fixed, of course; either by a) send - the final receive_queues value in the modex (perhaps - compressing or encoding it so that it can be much - shorter than the string -- the current vendor/part - ID stuff takes 8 bytes for each device), or b) - replicating the determination process of each host - in each process (i.e., procs on server 1 would see - both B and C, and use them both to figure out what - the "final" receive_queues value is for B). - Unfortunately, no one has time to code this up right - now, so we're leaving this as explicitly documented - for some future implementer... - - Because of both of these problems, this case is - problematic and must fail with a show_help error. - - Case 9: two devices with same INI value (different than default) - - MCA parameter: not specified - - default receive_queues: value A - - device 0: no receive_queues in INI file - - device 1: receive_queues value B in INI file - - device 2: receive_queues value B in INI file - --> per case 8, fail with a show_help message. - - Case 10: two devices with different INI values - - MCA parameter: not specified - - default receive_queues: value A - - device 0: no receive_queues in INI file - - device 1: receive_queues value B in INI file - - device 2: receive_queues value C in INI file - --> per case 8, fail with a show_help message. 
- - */ - - { - /* we need to read this MCA param at this point in case someone - * altered it via MPI_T */ - mca_base_var_source_t source; - - if (OPAL_SUCCESS != (ret = get_var_source ("receive_queues", &source))) { - BTL_ERROR(("mca_base_var_get_value failed to get value for receive_queues: %s:%d", - __FILE__, __LINE__)); - goto error; - } - - mca_btl_openib_component.receive_queues_source = source; - } - - /* If the MCA param was specified, skip all the checks */ - if (MCA_BASE_VAR_SOURCE_DEFAULT != mca_btl_openib_component.receive_queues_source) { - goto good; - } - - /* If we're the first device and we have a receive_queues - value from the INI file *that is different than the - already-existing default value*, then set the component to - use that. */ - if (0 == mca_btl_openib_component.devices_count) { - if (NULL != values.receive_queues && - 0 != strcmp(values.receive_queues, - mca_btl_openib_component.receive_queues)) { - if (NULL != mca_btl_openib_component.receive_queues) { - free(mca_btl_openib_component.receive_queues); - } - mca_btl_openib_component.receive_queues = - strdup(values.receive_queues); - mca_btl_openib_component.receive_queues_source = - BTL_OPENIB_RQ_SOURCE_DEVICE_INI; - } - } - - /* If we're not the first device, then we have to conform to - either the default value if the first device didn't set - anything, or to whatever the first device decided. */ - else { - /* In all cases, if this device has a receive_queues value - in the INI, then it must agree with - component.receive_queues. 
*/ - if (NULL != values.receive_queues) { - if (0 != strcmp(values.receive_queues, - mca_btl_openib_component.receive_queues)) { - opal_show_help("help-mpi-btl-openib.txt", - "locally conflicting receive_queues", true, - opal_install_dirs.opaldatadir, - opal_process_info.nodename, - ibv_get_device_name(receive_queues_device->ib_dev), - receive_queues_device->ib_dev_attr.vendor_id, - receive_queues_device->ib_dev_attr.vendor_part_id, - mca_btl_openib_component.receive_queues, - ibv_get_device_name(device->ib_dev), - device->ib_dev_attr.vendor_id, - device->ib_dev_attr.vendor_part_id, - values.receive_queues); - ret = OPAL_ERR_RESOURCE_BUSY; - goto error; - } - } - - /* If this device doesn't have an INI receive_queues - value, then if the component.receive_queues value came - from the default, we're ok. But if the - component.receive_queues value came from the 1st - device's INI file, we must error. */ - else if ((mca_base_var_source_t) BTL_OPENIB_RQ_SOURCE_DEVICE_INI == - mca_btl_openib_component.receive_queues_source) { - opal_show_help("help-mpi-btl-openib.txt", - "locally conflicting receive_queues", true, - opal_install_dirs.opaldatadir, - opal_process_info.nodename, - ibv_get_device_name(receive_queues_device->ib_dev), - receive_queues_device->ib_dev_attr.vendor_id, - receive_queues_device->ib_dev_attr.vendor_part_id, - mca_btl_openib_component.receive_queues, - ibv_get_device_name(device->ib_dev), - device->ib_dev_attr.vendor_id, - device->ib_dev_attr.vendor_part_id, - mca_btl_openib_component.default_recv_qps); - ret = OPAL_ERR_RESOURCE_BUSY; - goto error; - } - } - - receive_queues_device = device; - - good: - mca_btl_openib_component.devices_count++; - return OPAL_SUCCESS; - } - -error: - if (OPAL_SUCCESS != ret) { - opal_show_help("help-mpi-btl-openib.txt", - "error in device init", true, - opal_process_info.nodename, - ibv_get_device_name(device->ib_dev)); - } - - if (NULL != allowed_ports) { - free(allowed_ports); - } - OBJ_RELEASE(device); - return 
ret; -} - -static int finish_btl_init(mca_btl_openib_module_t *openib_btl) -{ - int qp; - openib_btl->num_peers = 0; - - /* Initialize module state */ - OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t); - - /* setup the qp structure */ - openib_btl->qps = (mca_btl_openib_module_qp_t*) - calloc(mca_btl_openib_component.num_qps, - sizeof(mca_btl_openib_module_qp_t)); - if (NULL == openib_btl->qps) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* setup all the qps */ - for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - if (!BTL_OPENIB_QP_TYPE_PP(qp)) { - OBJ_CONSTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0], - opal_list_t); - OBJ_CONSTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1], - opal_list_t); - openib_btl->qps[qp].u.srq_qp.sd_credits = - mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; - openib_btl->qps[qp].u.srq_qp.srq = NULL; - } - } - - /* initialize the memory pool using the device */ - openib_btl->super.btl_mpool = openib_btl->device->mpool; - - openib_btl->eager_rdma_channels = 0; - - openib_btl->eager_rdma_frag_size = OPAL_ALIGN( - sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t) + - sizeof(mca_btl_openib_footer_t) + - openib_btl->super.btl_eager_limit, - mca_btl_openib_component.buffer_alignment, size_t); - - opal_output_verbose(1, opal_btl_base_framework.framework_output, - "[rank=%d] openib: using port %s:%d", - OPAL_PROC_MY_NAME.vpid, - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num); - return OPAL_SUCCESS; -} - -struct dev_distance { - struct ibv_device *ib_dev; - float distance; -}; - -static int compare_distance(const void *p1, const void *p2) -{ - const struct dev_distance *d1 = (const struct dev_distance *) p1; - const struct dev_distance *d2 = (const struct dev_distance *) p2; - - if (d1->distance > (d2->distance+EPS)) { - return 1; - } else if ((d1->distance + EPS) < d2->distance) { - return -1; - } else { - return 0; - 
} -} - -static float get_ib_dev_distance(struct ibv_device *dev) -{ - /* If we don't have hwloc, we'll default to a distance of 0, - because we have no way of measuring. */ - float distance = 0; - float a, b; - int i; - hwloc_cpuset_t my_cpuset = NULL, ibv_cpuset = NULL; - hwloc_obj_t my_obj, ibv_obj, node_obj; - struct hwloc_distances_s *hwloc_distances = NULL; - - /* Override any distance logic so all devices are used */ - if (0 != mca_btl_openib_component.ignore_locality || - OPAL_SUCCESS != opal_hwloc_base_get_topology()) { - return distance; - } - -#if HWLOC_API_VERSION >= 0x20000 - unsigned int j, distances_nr = 1; - int ibvindex, myindex; -#endif - - if (NULL == hwloc_distances) { - #if HWLOC_API_VERSION < 0x20000 - hwloc_distances = - (struct hwloc_distances_s*)hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology, - HWLOC_OBJ_NODE); - /* If we got no info, just return 0 */ - if (NULL == hwloc_distances || NULL == hwloc_distances->latency) { - goto out; - } - - #else - if (0 != hwloc_distances_get_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, - &distances_nr, &hwloc_distances, - HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || 0 == distances_nr) { - hwloc_distances = NULL; - goto out; - } - #endif - } - - /* Next, find the NUMA node where this IBV device is located */ - ibv_cpuset = hwloc_bitmap_alloc(); - if (NULL == ibv_cpuset) { - goto out; - } - if (0 != hwloc_ibv_get_device_cpuset(opal_hwloc_topology, dev, ibv_cpuset)) { - goto out; - } - ibv_obj = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, ibv_cpuset); - if (NULL == ibv_obj) { - goto out; - } - - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "hwloc_distances->nbobjs=%d", hwloc_distances->nbobjs); -#if HWLOC_API_VERSION < 0x20000 - for (i = 0; i < (int)(2 * hwloc_distances->nbobjs); i++) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "hwloc_distances->latency[%d]=%f", i, hwloc_distances->latency[i]); - } -#else - for (i = 0; i < 
(int)hwloc_distances->nbobjs; i++) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "hwloc_distances->values[%d]=%"PRIu64, i, hwloc_distances->values[i]); - } -#endif - - /* If ibv_obj is a NUMA node or below, we're good. */ - switch (ibv_obj->type) { - case HWLOC_OBJ_NODE: - case HWLOC_OBJ_SOCKET: -#if HWLOC_API_VERSION < 0x20000 - case HWLOC_OBJ_CACHE: -#else - case HWLOC_OBJ_L1CACHE: - case HWLOC_OBJ_L2CACHE: - case HWLOC_OBJ_L3CACHE: - case HWLOC_OBJ_L4CACHE: - case HWLOC_OBJ_L5CACHE: -#endif - case HWLOC_OBJ_CORE: - case HWLOC_OBJ_PU: - while (NULL != ibv_obj && ibv_obj->type != HWLOC_OBJ_NODE) { - ibv_obj = ibv_obj->parent; - } - break; - - default: - /* If it's above a NUMA node, then I don't know how to compute - the distance... */ - opal_output_verbose(5, opal_btl_base_framework.framework_output, "ibv_obj->type set to NULL"); - ibv_obj = NULL; - break; - } - - /* If we don't have an object for this ibv device, give up */ - if (NULL == ibv_obj) { - goto out; - } - #if HWLOC_API_VERSION >= 0x20000 - /* the new matrix format isn't quite as friendly, so we have to - * do an exhaustive search to find the index of this object - * in that array */ - ibvindex = -1; - for (j=0; j < distances_nr; j++) { - if (ibv_obj == hwloc_distances->objs[j]) { - ibvindex = j; - break; - } - } - if (-1 == ibvindex) { - OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND); - goto out; - } - #endif - - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "ibv_obj->logical_index=%d", ibv_obj->logical_index); - /* This function is only called if the process is bound, so let's - find out where we are bound to. For the moment, we only care - about the NUMA node to which we are bound. 
*/ - my_cpuset = hwloc_bitmap_alloc(); - if (NULL == my_cpuset) { - goto out; - } - if (0 != hwloc_get_cpubind(opal_hwloc_topology, my_cpuset, 0)) { - goto out; - } - my_obj = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, my_cpuset); - if (NULL == my_obj) { - goto out; - } - - /* If my_obj is a NUMA node or below, we're good. */ - switch (my_obj->type) { - case HWLOC_OBJ_NODE: - case HWLOC_OBJ_SOCKET: - #if HWLOC_API_VERSION < 0x20000 - case HWLOC_OBJ_CACHE: - #else - case HWLOC_OBJ_L1CACHE: - case HWLOC_OBJ_L2CACHE: - case HWLOC_OBJ_L3CACHE: - case HWLOC_OBJ_L4CACHE: - case HWLOC_OBJ_L5CACHE: - #endif - case HWLOC_OBJ_CORE: - case HWLOC_OBJ_PU: - while (NULL != my_obj && my_obj->type != HWLOC_OBJ_NODE) { - my_obj = my_obj->parent; - } - if (NULL != my_obj) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "my_obj->logical_index=%d", my_obj->logical_index); - /* Distance may be asymetrical, so calculate both of them - and take the max */ - #if HWLOC_API_VERSION < 0x20000 - a = hwloc_distances->latency[my_obj->logical_index + - (ibv_obj->logical_index * - hwloc_distances->nbobjs)]; - b = hwloc_distances->latency[ibv_obj->logical_index + - (my_obj->logical_index * - hwloc_distances->nbobjs)]; - #else - /* the new matrix format isn't quite as friendly, so we have to - * do an exhaustive search to find the index of this object - * in that array */ - myindex = -1; - for (j=0; j < distances_nr; j++) { - if (my_obj == hwloc_distances->objs[j]) { - myindex = j; - break; - } - } - if (-1 == myindex) { - OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND); - goto out; - } - a = (float)hwloc_distances->values[myindex + (ibvindex * hwloc_distances->nbobjs)]; - b = (float)hwloc_distances->values[ibvindex + (myindex * hwloc_distances->nbobjs)]; - #endif - distance = (a > b) ? a : b; - } - break; - - default: - /* If the obj is above a NUMA node, then we're bound to more than - one NUMA node. Find the max distance. 
*/ - i = 0; - for (node_obj = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology, - ibv_obj->cpuset, - HWLOC_OBJ_NODE, i); - NULL != node_obj; - node_obj = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology, - ibv_obj->cpuset, - HWLOC_OBJ_NODE, ++i)) { - #if HWLOC_API_VERSION < 0x20000 - a = hwloc_distances->latency[node_obj->logical_index + - (ibv_obj->logical_index * - hwloc_distances->nbobjs)]; - b = hwloc_distances->latency[ibv_obj->logical_index + - (node_obj->logical_index * - hwloc_distances->nbobjs)]; - #else - unsigned int j; - j = node_obj->logical_index + (ibv_obj->logical_index * hwloc_distances->nbobjs); - if (j < distances_nr) { - a = (float)hwloc_distances->values[j]; - } else { - goto out; - } - j = ibv_obj->logical_index + (node_obj->logical_index * hwloc_distances->nbobjs); - if (j < distances_nr) { - b = (float)hwloc_distances->values[j]; - } else { - goto out; - } - #endif - a = (a > b) ? a : b; - distance = (a > distance) ? a : distance; - } - break; - } - - out: - if (NULL != ibv_cpuset) { - hwloc_bitmap_free(ibv_cpuset); - } - if (NULL != my_cpuset) { - hwloc_bitmap_free(my_cpuset); - } - -#if HWLOC_API_VERSION >= 0x20000 - if (NULL != hwloc_distances) { - hwloc_distances_release(opal_hwloc_topology, hwloc_distances); - } -#endif - return distance; -} - -static struct dev_distance * -sort_devs_by_distance(struct ibv_device **ib_devs, int count) -{ - int i; - struct dev_distance *devs = (struct dev_distance *) malloc(count * sizeof(struct dev_distance)); - if (NULL == devs) { - return NULL; - } - - for (i = 0; i < count; i++) { - devs[i].ib_dev = ib_devs[i]; - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "Checking distance from this process to device=%s", ibv_get_device_name(ib_devs[i])); - /* If we're not bound, just assume that the device is close. */ - devs[i].distance = 0; - if (opal_process_info.cpuset) { - /* If this process is bound to one or more PUs, we can get - an accurate distance. 
*/ - devs[i].distance = get_ib_dev_distance(ib_devs[i]); - } - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "Process is %s: distance to device is %f", - (opal_process_info.cpuset ? "bound" : "not bound"), devs[i].distance); - } - - qsort(devs, count, sizeof(struct dev_distance), compare_distance); - - return devs; -} - - -/* - * IB component initialization: - * (1) read interface list from kernel and compare against component parameters - * then create a BTL instance for selected interfaces - * (2) setup IB listen socket for incoming connection attempts - * (3) register BTL parameters with the MCA - */ - -static mca_btl_base_module_t** -btl_openib_component_init(int *num_btl_modules, - bool enable_progress_threads, - bool enable_mpi_threads) -{ - struct ibv_device **ib_devs; - mca_btl_base_module_t** btls = NULL; - int i, ret, num_devs, length; - opal_list_t btl_list; - mca_btl_openib_module_t * openib_btl; - mca_btl_base_selected_module_t* ib_selected; - opal_list_item_t* item; - mca_btl_openib_frag_init_data_t *init_data; - struct dev_distance *dev_sorted; - float distance; - int index; - bool found; - mca_base_var_source_t source; - int list_count = 0; - - /* initialization */ - *num_btl_modules = 0; - num_devs = 0; - - /* If we got this far, then setup the memory alloc hook (because - we're most likely going to be using this component). The hook - is to be set up as early as possible in this function since we - want most of the allocated resources be aligned. - */ - opal_memory->memoryc_set_alignment(32, mca_btl_openib_module.super.btl_eager_limit); - - /* Per https://svn.open-mpi.org/trac/ompi/ticket/1305, check to - see if $sysfsdir/class/infiniband exists. If it does not, - assume that the RDMA hardware drivers are not loaded, and - therefore we don't want OpenFabrics verbs support in this OMPI - job. No need to print a warning. 
*/ - if (!opal_common_verbs_check_basics()) { - goto no_btls; - } - - /* Read in INI files with device-specific parameters */ - if (OPAL_SUCCESS != (ret = opal_btl_openib_ini_init())) { - goto no_btls; - } - - index = mca_base_var_find("ompi", "btl", "openib", "max_inline_data"); - if (index >= 0) { - if (OPAL_SUCCESS == mca_base_var_get_value(index, NULL, &source, NULL)) { - if (-1 == mca_btl_openib_component.ib_max_inline_data && - MCA_BASE_VAR_SOURCE_DEFAULT == source) { - /* If the user has not explicitly set this MCA parameter - use max_inline_data value specified in the - device-specific parameters INI file */ - mca_btl_openib_component.ib_max_inline_data = -2; - } - } - } - - OBJ_CONSTRUCT(&mca_btl_openib_component.send_free_coalesced, opal_free_list_t); - OBJ_CONSTRUCT(&mca_btl_openib_component.send_user_free, opal_free_list_t); - OBJ_CONSTRUCT(&mca_btl_openib_component.recv_user_free, opal_free_list_t); - - init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t)); - if (NULL == init_data) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - goto no_btls; - } - - init_data->order = mca_btl_openib_component.rdma_qp; - init_data->list = &mca_btl_openib_component.send_user_free; - - /* Align fragments on 8-byte boundaries (instead of 2) to fix bus errors that - occur on some 32-bit platforms. Depending on the size of the fragment this - will waste 2-6 bytes of space per frag. In most cases this shouldn't waste - any space. 
*/ - if (OPAL_SUCCESS != opal_free_list_init ( - &mca_btl_openib_component.send_user_free, - sizeof(mca_btl_openib_put_frag_t), 8, - OBJ_CLASS(mca_btl_openib_put_frag_t), - 0, 0, - mca_btl_openib_component.ib_free_list_num, - mca_btl_openib_component.ib_free_list_max, - mca_btl_openib_component.ib_free_list_inc, - NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) { - goto no_btls; - } - - init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t)); - if (NULL == init_data) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - goto no_btls; - } - - init_data->order = mca_btl_openib_component.rdma_qp; - init_data->list = &mca_btl_openib_component.recv_user_free; - - if(OPAL_SUCCESS != opal_free_list_init ( - &mca_btl_openib_component.recv_user_free, - sizeof(mca_btl_openib_get_frag_t), 8, - OBJ_CLASS(mca_btl_openib_get_frag_t), - 0, 0, - mca_btl_openib_component.ib_free_list_num, - mca_btl_openib_component.ib_free_list_max, - mca_btl_openib_component.ib_free_list_inc, - NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) { - goto no_btls; - } - - init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t)); - if (NULL == init_data) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - goto no_btls; - } - length = sizeof(mca_btl_openib_coalesced_frag_t); - - init_data->list = &mca_btl_openib_component.send_free_coalesced; - - if(OPAL_SUCCESS != opal_free_list_init ( - &mca_btl_openib_component.send_free_coalesced, - length, 8, OBJ_CLASS(mca_btl_openib_coalesced_frag_t), - 0, 0, mca_btl_openib_component.ib_free_list_num, - mca_btl_openib_component.ib_free_list_max, - mca_btl_openib_component.ib_free_list_inc, - NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) { - goto no_btls; - } - - /* If fork support is requested, try to enable it */ - if (OPAL_SUCCESS != (ret = opal_common_verbs_fork_test())) { - goto no_btls; - } - - /* Parse the include and exclude lists, checking for 
errors */ - mca_btl_openib_component.if_include_list = - mca_btl_openib_component.if_exclude_list = - mca_btl_openib_component.if_list = NULL; - - if (NULL != mca_btl_openib_component.if_include) - list_count++; - if (NULL != mca_btl_openib_component.if_exclude) - list_count++; - if (NULL != mca_btl_openib_component.ipaddr_include) - list_count++; - if (NULL != mca_btl_openib_component.ipaddr_exclude) - list_count++; - - if (list_count > 1) { - opal_show_help("help-mpi-btl-openib.txt", - "specified include and exclude", true, - NULL == mca_btl_openib_component.if_include ? - "" : mca_btl_openib_component.if_include, - NULL == mca_btl_openib_component.if_exclude ? - "" : mca_btl_openib_component.if_exclude, - NULL == mca_btl_openib_component.ipaddr_include ? - "" :mca_btl_openib_component.ipaddr_include, - NULL == mca_btl_openib_component.ipaddr_exclude ? - "" :mca_btl_openib_component.ipaddr_exclude, - NULL); - goto no_btls; - } else if (NULL != mca_btl_openib_component.if_include) { - mca_btl_openib_component.if_include_list = - opal_argv_split(mca_btl_openib_component.if_include, ','); - mca_btl_openib_component.if_list = - opal_argv_copy(mca_btl_openib_component.if_include_list); - } else if (NULL != mca_btl_openib_component.if_exclude) { - mca_btl_openib_component.if_exclude_list = - opal_argv_split(mca_btl_openib_component.if_exclude, ','); - mca_btl_openib_component.if_list = - opal_argv_copy(mca_btl_openib_component.if_exclude_list); - } - - ib_devs = opal_ibv_get_device_list(&num_devs); - - if(0 == num_devs || NULL == ib_devs) { - goto no_btls; - } - - dev_sorted = sort_devs_by_distance(ib_devs, num_devs); - if (NULL == dev_sorted) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - goto no_btls; - } - - OBJ_CONSTRUCT(&btl_list, opal_list_t); - OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t); - - distance = dev_sorted[0].distance; - for (found = false, i = 0; - i < num_devs && (-1 == mca_btl_openib_component.ib_max_btls || - 
mca_btl_openib_component.ib_num_btls < - mca_btl_openib_component.ib_max_btls); i++) { - if (0 != mca_btl_openib_component.ib_num_btls && - (dev_sorted[i].distance - distance) > EPS) { - opal_output_verbose(1, opal_btl_base_framework.framework_output, - "[rank=%d] openib: skipping device %s; it is too far away", - OPAL_PROC_MY_NAME.vpid, - ibv_get_device_name(dev_sorted[i].ib_dev)); - break; - } - - /* Only take devices that match the type specified by - btl_openib_device_type */ - switch (mca_btl_openib_component.device_type) { - case BTL_OPENIB_DT_IB: -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - if (IBV_TRANSPORT_IWARP == dev_sorted[i].ib_dev->transport_type) { - BTL_VERBOSE(("openib: only taking infiniband devices -- skipping %s", - ibv_get_device_name(dev_sorted[i].ib_dev))); - continue; - } -#endif - break; - - case BTL_OPENIB_DT_IWARP: -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - if (IBV_TRANSPORT_IB == dev_sorted[i].ib_dev->transport_type) { - BTL_VERBOSE(("openib: only taking iwarp devices -- skipping %s", - ibv_get_device_name(dev_sorted[i].ib_dev))); - continue; - } -#else - opal_show_help("help-mpi-btl-openib.txt", "no iwarp support", - true); -#endif - break; - - case BTL_OPENIB_DT_ALL: - break; - } - - found = true; - ret = init_one_device(&btl_list, dev_sorted[i].ib_dev); - if (OPAL_ERR_NOT_SUPPORTED == ret) { - ++num_devices_intentionally_ignored; - continue; - } else if (OPAL_SUCCESS != ret) { - free(dev_sorted); - goto no_btls; - } - } - free(dev_sorted); - if (!found) { - opal_show_help("help-mpi-btl-openib.txt", "no devices right type", - true, opal_process_info.nodename, - ((BTL_OPENIB_DT_IB == mca_btl_openib_component.device_type) ? - "InfiniBand" : - (BTL_OPENIB_DT_IWARP == mca_btl_openib_component.device_type) ? - "iWARP" : "")); - goto no_btls; - } - - /* If we got back from checking all the devices and find that - there are still items in the component.if_list, that means that - they didn't exist. 
Show an appropriate warning if the warning - was not disabled. */ - - if (0 != opal_argv_count(mca_btl_openib_component.if_list) && - mca_btl_openib_component.warn_nonexistent_if) { - char *str = opal_argv_join(mca_btl_openib_component.if_list, ','); - opal_show_help("help-mpi-btl-openib.txt", "nonexistent port", - true, opal_process_info.nodename, - ((NULL != mca_btl_openib_component.if_include) ? - "in" : "ex"), str); - free(str); - } - - if(0 == mca_btl_openib_component.ib_num_btls) { - /* If there were unusable devices that weren't specifically - ignored, warn about it */ - if (num_devices_intentionally_ignored < num_devs) { - opal_show_help("help-mpi-btl-openib.txt", - "no active ports found", true, - opal_process_info.nodename); - } - goto no_btls; - } - - if (0 < mca_btl_openib_component.ib_allowed_btls) { - /* Now that we know we have devices and ports that we want to use, - init CPC components */ - if (OPAL_SUCCESS != (ret = opal_btl_openib_connect_base_init())) { - goto no_btls; - } - - /* Setup the BSRQ QP's based on the final value of - mca_btl_openib_component.receive_queues. */ - if (OPAL_SUCCESS != setup_qps()) { - goto no_btls; - } - if (mca_btl_openib_component.num_srq_qps > 0 || - mca_btl_openib_component.num_xrc_qps > 0) { - opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table; - if(OPAL_SUCCESS != opal_hash_table_init( - srq_addr_table, (mca_btl_openib_component.num_srq_qps + - mca_btl_openib_component.num_xrc_qps) * - mca_btl_openib_component.ib_num_btls)) { - BTL_ERROR(("SRQ internal error. 
Failed to allocate SRQ addr hash table")); - goto no_btls; - } - } - - /* For XRC: - * from this point we know if MCA_BTL_XRC_ENABLED it true or false */ - - /* Init XRC IB Addr hash table */ - if (MCA_BTL_XRC_ENABLED) { - OBJ_CONSTRUCT(&mca_btl_openib_component.ib_addr_table, - opal_hash_table_t); - } - } - - /* Allocate space for btl modules */ - mca_btl_openib_component.openib_btls = - (mca_btl_openib_module_t **) malloc(sizeof(mca_btl_openib_module_t*) * - mca_btl_openib_component.ib_num_btls); - if(NULL == mca_btl_openib_component.openib_btls) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - goto no_btls; - } - btls = (struct mca_btl_base_module_t **) - malloc(mca_btl_openib_component.ib_num_btls * - sizeof(struct mca_btl_base_module_t*)); - if(NULL == btls) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - goto no_btls; - } - - /* Copy the btl module structs into a contiguous array and fully - initialize them */ - i = 0; - while (NULL != (item = opal_list_remove_first(&btl_list))) { - ib_selected = (mca_btl_base_selected_module_t*)item; - openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module; - - if (openib_btl->allowed) { - /* Search for a CPC that can handle this port */ - ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl); - /* If we get NOT_SUPPORTED, then no CPC was found for this - port. But that's not a fatal error -- just keep going; - let's see if we find any usable openib modules or not. */ - if (OPAL_ERR_NOT_SUPPORTED == ret) { - continue; - } else if (OPAL_SUCCESS != ret) { - /* All others *are* fatal. 
Note that we already did a - show_help in the lower layer */ - goto no_btls; - } - - if (mca_btl_openib_component.max_hw_msg_size > 0 && - (uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) { - BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")", - mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz)); - } - - if (finish_btl_init(openib_btl) != OPAL_SUCCESS) { - goto no_btls; - } - } - - mca_btl_openib_component.openib_btls[i] = openib_btl; - OBJ_RELEASE(ib_selected); - btls[i] = &openib_btl->super; - ++i; - } - /* If we got nothing, then error out */ - if (0 == i) { - goto no_btls; - } - /* Otherwise reset to the number of openib modules that we - actually got */ - mca_btl_openib_component.ib_num_btls = i; - - btl_openib_modex_send(); - - *num_btl_modules = mca_btl_openib_component.ib_num_btls; - opal_ibv_free_device_list(ib_devs); - if (NULL != mca_btl_openib_component.if_include_list) { - opal_argv_free(mca_btl_openib_component.if_include_list); - mca_btl_openib_component.if_include_list = NULL; - } - if (NULL != mca_btl_openib_component.if_exclude_list) { - opal_argv_free(mca_btl_openib_component.if_exclude_list); - mca_btl_openib_component.if_exclude_list = NULL; - } - -#if OPAL_CUDA_SUPPORT - if (mca_btl_openib_component.cuda_want_gdr && (0 == opal_leave_pinned)) { - opal_show_help("help-mpi-btl-openib.txt", - "CUDA_gdr_and_nopinned", true, - opal_process_info.nodename); - goto no_btls; - } -#endif /* OPAL_CUDA_SUPPORT */ - - mca_btl_openib_component.memory_registration_verbose = opal_output_open(NULL); - opal_output_set_verbosity (mca_btl_openib_component.memory_registration_verbose, - mca_btl_openib_component.memory_registration_verbose_level); - - /* setup the fork warning message as we are sensitive - * to memory corruption issues when fork is called - */ - opal_warn_fork(); - return btls; - - no_btls: - /* If we fail early enough in the setup, we just 
modex around that - there are no openib BTL's in this process and return NULL. */ - - mca_btl_openib_component.ib_num_btls = 0; - mca_btl_openib_component.ib_allowed_btls = 0; - btl_openib_modex_send(); - if (NULL != btls) { - free(btls); - } - return NULL; -} - -/* - * Progress the no_credits_pending_frags lists on all qp's - */ -static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep) -{ - int qp, pri, rc, len; - opal_list_item_t *frag; - - OPAL_THREAD_LOCK(&ep->endpoint_lock); - - /* Traverse all QPs and all priorities */ - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { - for (pri = 0; pri < 2; ++pri) { - /* Note that entries in the no_credits_pending_frags list - may be eager RDMA or send fragments. So be sure to - check that we have at least 1 RDMA or send credit. - - This loop needs a little explaining. :-\ - - In the body of the loop, we call _endpoint_post_send(). - The frag will either be successfully sent, or it will - be [re]added to the no_credit_pending_frags list. So - if we keep trying to drain the no_credits_pending_frag - list, we could end up in an infinite loop. So instead, - we get the initial length of the list and ensure to run - through every entry at least once. This attempts to - send *every* frag once and catches the case where a - frag may be on the RDMA list, but because of - coalescing, is now too big for RDMA and defaults over - to sending -- but then we're out of send credits, so it - doesn't go. But if we *do* still have some RDMA - credits and there are RDMA frags on the list behind - this now-too-big frag, they'll get a chance to go. - - Specifically, the condition in this for loop is as follows: - - - len > 0: ensure to go through all entries in the list once - - the 2nd part of the conditional checks to see if we - have any credits at all. 
Specifically, do we have - any RDMA credits or any send credits, *or* are we on - an SRQ, in which case we define that we *always* have - credits (because the hardware will continually - retransmit for us). - */ - for (len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]); - len > 0 && - (ep->eager_rdma_remote.tokens > 0 || - ep->qps[qp].u.pp_qp.sd_credits > 0 || - !BTL_OPENIB_QP_TYPE_PP(qp)); --len) { - frag = opal_list_remove_first(&ep->qps[qp].no_credits_pending_frags[pri]); - assert (NULL != frag); - - /* If _endpoint_post_send() fails because of - RESOURCE_BUSY, then the frag was re-added to the - no_credits_pending list. Specifically: either the - frag was initially an RDMA frag, but there were no - RDMA credits so it fell through the trying to send, - but we had no send credits and therefore re-added - the frag to the no_credits list, or the frag was a - send frag initially (and the same sequence - occurred, starting at the send frag out-of-credits - scenario). In this case, just continue and try the - rest of the frags in the list. - - If it fails because of another error, return the - error upward. 
*/ - rc = mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag)); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc && - OPAL_ERR_RESOURCE_BUSY != rc)) { - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return rc; - } - } - } - } - - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return OPAL_SUCCESS; -} - -void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, - const int qp) -{ - mca_btl_openib_module_t* openib_btl = ep->endpoint_btl; - opal_list_item_t *frag; - size_t i, len = opal_list_get_size(&ep->pending_get_frags); - int rc; - - for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - frag = opal_list_remove_first(&(ep->pending_get_frags)); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (NULL == frag) - break; - rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep, - to_get_frag(frag)); - if (OPAL_ERR_OUT_OF_RESOURCE == rc) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_prepend (&ep->pending_get_frags, frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - break; - } - } - - len = opal_list_get_size(&ep->pending_put_frags); - for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0; i++) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - frag = opal_list_remove_first(&(ep->pending_put_frags)); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (NULL == frag) - break; - rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep, - to_put_frag(frag)); - if (OPAL_ERR_OUT_OF_RESOURCE == rc) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_prepend (&ep->pending_put_frags, frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - break; - } - } -} - -static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, - mca_btl_openib_endpoint_t *ep, - mca_btl_openib_recv_frag_t *frag, - size_t byte_len) -{ - mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base; - mca_btl_openib_header_t *hdr = frag->hdr; - int rqp = to_base_frag(frag)->base.order, cqp; - uint16_t 
rcredits = 0, credits; - bool is_credit_msg; - - if(ep->nbo) { - BTL_OPENIB_HEADER_NTOH(*hdr); - } - - /* advance the segment address past the header and subtract from the - * length.*/ - des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t); - - if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) { - /* call registered callback */ - mca_btl_active_message_callback_t* reg; - -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ - /* The COPY_ASYNC flag should not be set */ - assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC)); -#endif /* OPAL_CUDA_SUPPORT */ - reg = mca_btl_base_active_message_trigger + hdr->tag; - reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata ); -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ - if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) { - /* Since ASYNC flag is set, we know this descriptor is being used - * for asynchronous copy and cannot be freed yet. Therefore, set - * up callback for PML to call when complete, add argument into - * descriptor and return. */ - des->des_cbfunc = btl_openib_handle_incoming_completion; - to_in_frag(des)->endpoint = ep; - return OPAL_SUCCESS; - } -#endif /* OPAL_CUDA_SUPPORT */ - if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) { - cqp = (hdr->credits >> 11) & 0x0f; - hdr->credits &= 0x87ff; - } else { - cqp = rqp; - } - if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) { - rcredits = BTL_OPENIB_CREDITS(hdr->credits); - hdr->credits = 0; - } - } else { - mca_btl_openib_rdma_credits_header_t *chdr = - (mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval; - if(ep->nbo) { - BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr); - } - cqp = chdr->qpn; - rcredits = chdr->rdma_credits; - } - - credits = hdr->credits; - - if(hdr->cm_seen) - OPAL_THREAD_ADD_FETCH32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen); - - /* Now return fragment. Don't touch hdr after this point! 
*/ - if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) { - mca_btl_openib_eager_rdma_local_t *erl = &ep->eager_rdma_local; - OPAL_THREAD_LOCK(&erl->lock); - MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr); - while(erl->tail != erl->head) { - mca_btl_openib_recv_frag_t *tf; - tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(ep, erl->tail); - if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(tf)) - break; - OPAL_THREAD_ADD_FETCH32(&erl->credits, 1); - MCA_BTL_OPENIB_RDMA_NEXT_INDEX(erl->tail); - } - OPAL_THREAD_UNLOCK(&erl->lock); - } else { - if (is_cts_message(frag)) { - /* If this was a CTS, free it here (it was - malloc'ed+ibv_reg_mr'ed -- so it should *not* be - FRAG_RETURN'ed). */ - int rc = opal_btl_openib_connect_base_free_cts(ep); - if (OPAL_SUCCESS != rc) { - return rc; - } - } else { - /* Otherwise, FRAG_RETURN it and repost if necessary */ - MCA_BTL_IB_FRAG_RETURN(frag); - if (BTL_OPENIB_QP_TYPE_PP(rqp)) { - if (OPAL_UNLIKELY(is_credit_msg)) { - OPAL_THREAD_ADD_FETCH32(&ep->qps[cqp].u.pp_qp.cm_received, 1); - } else { - OPAL_THREAD_ADD_FETCH32(&ep->qps[rqp].u.pp_qp.rd_posted, -1); - } - mca_btl_openib_endpoint_post_rr(ep, cqp); - } else { - mca_btl_openib_module_t *btl = ep->endpoint_btl; - OPAL_THREAD_ADD_FETCH32(&btl->qps[rqp].u.srq_qp.rd_posted, -1); - mca_btl_openib_post_srr(btl, rqp); - } - } - } - - assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits); - - /* If we got any credits (RDMA or send), then try to progress all - the no_credits_pending_frags lists */ - if (rcredits > 0) { - OPAL_THREAD_ADD_FETCH32(&ep->eager_rdma_remote.tokens, rcredits); - } - if (credits > 0) { - OPAL_THREAD_ADD_FETCH32(&ep->qps[cqp].u.pp_qp.sd_credits, credits); - } - if (rcredits + credits > 0) { - int rc; - - if (OPAL_SUCCESS != - (rc = progress_no_credits_pending_frags(ep))) { - return rc; - } - } - - send_credits(ep, cqp); - - return OPAL_SUCCESS; -} - -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ -/** - * Called by the PML when the copying of the data out of the fragment - * is complete. 
- */ -static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t *ep, - mca_btl_base_descriptor_t* des, - int status) -{ - mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des; - mca_btl_openib_header_t *hdr = frag->hdr; - int rqp = to_base_frag(frag)->base.order, cqp; - uint16_t rcredits = 0, credits; - - ep = to_in_frag (des)->endpoint; - - OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des)); - - if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) { - cqp = (hdr->credits >> 11) & 0x0f; - hdr->credits &= 0x87ff; - } else { - cqp = rqp; - } - if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) { - rcredits = BTL_OPENIB_CREDITS(hdr->credits); - hdr->credits = 0; - } - - credits = hdr->credits; - - if(hdr->cm_seen) - OPAL_THREAD_ADD_FETCH32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen); - - /* We should not be here with eager, control, or credit messages */ - assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA); - assert(0 == is_cts_message(frag)); - assert(0 == is_credit_message(frag)); - /* HACK - clear out flags. 
Must be better way */ - des->des_flags = 0; - /* Otherwise, FRAG_RETURN it and repost if necessary */ - MCA_BTL_IB_FRAG_RETURN(frag); - if (BTL_OPENIB_QP_TYPE_PP(rqp)) { - OPAL_THREAD_ADD_FETCH32(&ep->qps[rqp].u.pp_qp.rd_posted, -1); - mca_btl_openib_endpoint_post_rr(ep, cqp); - } else { - mca_btl_openib_module_t *btl = ep->endpoint_btl; - OPAL_THREAD_ADD_FETCH32(&btl->qps[rqp].u.srq_qp.rd_posted, -1); - mca_btl_openib_post_srr(btl, rqp); - } - - assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits); - - /* If we got any credits (RDMA or send), then try to progress all - the no_credits_pending_frags lists */ - if (rcredits > 0) { - OPAL_THREAD_ADD_FETCH32(&ep->eager_rdma_remote.tokens, rcredits); - } - if (credits > 0) { - OPAL_THREAD_ADD_FETCH32(&ep->qps[cqp].u.pp_qp.sd_credits, credits); - } - if (rcredits + credits > 0) { - int rc; - - if (OPAL_SUCCESS != - (rc = progress_no_credits_pending_frags(ep))) { - /* This is a fatal issue so call into PML and let it know. 
*/ - mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, - NULL, NULL); - return; - } - } - - send_credits(ep, cqp); - -} -#endif /* OPAL_CUDA_SUPPORT */ - -static char* btl_openib_component_status_to_string(enum ibv_wc_status status) -{ - switch(status) { - case IBV_WC_SUCCESS: - return "SUCCESS"; - break; - case IBV_WC_LOC_LEN_ERR: - return "LOCAL LENGTH ERROR"; - break; - case IBV_WC_LOC_QP_OP_ERR: - return "LOCAL QP OPERATION ERROR"; - break; - case IBV_WC_LOC_PROT_ERR: - return "LOCAL PROTOCOL ERROR"; - break; - case IBV_WC_WR_FLUSH_ERR: - return "WORK REQUEST FLUSHED ERROR"; - break; - case IBV_WC_MW_BIND_ERR: - return "MEMORY WINDOW BIND ERROR"; - break; - case IBV_WC_BAD_RESP_ERR: - return "BAD RESPONSE ERROR"; - break; - case IBV_WC_LOC_ACCESS_ERR: - return "LOCAL ACCESS ERROR"; - break; - case IBV_WC_REM_INV_REQ_ERR: - return "INVALID REQUEST ERROR"; - break; - case IBV_WC_REM_ACCESS_ERR: - return "REMOTE ACCESS ERROR"; - break; - case IBV_WC_REM_OP_ERR: - return "REMOTE OPERATION ERROR"; - break; - case IBV_WC_RETRY_EXC_ERR: - return "RETRY EXCEEDED ERROR"; - break; - case IBV_WC_RNR_RETRY_EXC_ERR: - return "RECEIVER NOT READY RETRY EXCEEDED ERROR"; - break; - case IBV_WC_LOC_RDD_VIOL_ERR: - return "LOCAL RDD VIOLATION ERROR"; - break; - case IBV_WC_REM_INV_RD_REQ_ERR: - return "INVALID READ REQUEST ERROR"; - break; - case IBV_WC_REM_ABORT_ERR: - return "REMOTE ABORT ERROR"; - break; - case IBV_WC_INV_EECN_ERR: - return "INVALID EECN ERROR"; - break; - case IBV_WC_INV_EEC_STATE_ERR: - return "INVALID EEC STATE ERROR"; - break; - case IBV_WC_FATAL_ERR: - return "FATAL ERROR"; - break; - case IBV_WC_RESP_TIMEOUT_ERR: - return "RESPONSE TIMEOUT ERROR"; - break; - case IBV_WC_GENERAL_ERR: - return "GENERAL ERROR"; - break; - default: - return "STATUS UNDEFINED"; - break; - } -} - -static void -progress_pending_frags_wqe(mca_btl_base_endpoint_t *ep, const int qpn) -{ - int 
ret; - opal_list_item_t *frag; - mca_btl_openib_qp_t *qp = ep->qps[qpn].qp; - - OPAL_THREAD_LOCK(&ep->endpoint_lock); - for(int i = 0; i < 2; i++) { - while(qp->sd_wqe > 0) { - mca_btl_base_endpoint_t *tmp_ep; - frag = opal_list_remove_first(&ep->qps[qpn].no_wqe_pending_frags[i]); - if(NULL == frag) - break; -#if OPAL_ENABLE_DEBUG - assert(0 == frag->opal_list_item_refcount); -#endif - tmp_ep = to_com_frag(frag)->endpoint; - ret = mca_btl_openib_endpoint_post_send(tmp_ep, to_send_frag(frag)); - if (OPAL_SUCCESS != ret) { - /* NTH: this handles retrying if we are out of credits but other errors are not - * handled (maybe abort?). */ - if (OPAL_ERR_RESOURCE_BUSY != ret) { - opal_list_prepend (&ep->qps[qpn].no_wqe_pending_frags[i], (opal_list_item_t *) frag); - } - break; - } - } - } - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); -} - -static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl, - const int qp) -{ - opal_list_item_t *frag; - int i; - - assert(BTL_OPENIB_QP_TYPE_SRQ(qp) || BTL_OPENIB_QP_TYPE_XRC(qp)); - - for(i = 0; i < 2; i++) { - while(openib_btl->qps[qp].u.srq_qp.sd_credits > 0) { - OPAL_THREAD_LOCK(&openib_btl->ib_lock); - frag = opal_list_remove_first( - &openib_btl->qps[qp].u.srq_qp.pending_frags[i]); - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - - if(NULL == frag) - break; - - mca_btl_openib_endpoint_send(to_com_frag(frag)->endpoint, - to_send_frag(frag)); - } - } -} - -static char *cq_name[] = {"HP CQ", "LP CQ"}; -static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, - struct ibv_wc *wc) -{ - static int flush_err_printed[] = {0, 0}; - mca_btl_openib_com_frag_t* frag; - mca_btl_base_descriptor_t *des; - mca_btl_openib_endpoint_t* endpoint; - mca_btl_openib_module_t *openib_btl = NULL; - const opal_proc_t* remote_proc = NULL; - int qp, btl_ownership; - int n; - - des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id; - frag = to_com_frag(des); - - /* For receive fragments "order" contains QP idx the fragment was 
posted - * to. For send fragments "order" contains QP idx the fragment was send - * through */ - qp = des->order; - - if (IBV_WC_RECV == wc->opcode && (wc->wc_flags & IBV_WC_WITH_IMM)) { -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - wc->imm_data = ntohl(wc->imm_data); -#endif - frag->endpoint = (mca_btl_openib_endpoint_t*) - opal_pointer_array_get_item(device->endpoints, wc->imm_data); - } - - endpoint = frag->endpoint; - - assert (NULL != endpoint); - - openib_btl = endpoint->endpoint_btl; - - if(wc->status != IBV_WC_SUCCESS) { - OPAL_OUTPUT((-1, "Got WC: ERROR")); - goto error; - } - - /* Handle work completions */ - switch(wc->opcode) { - case IBV_WC_RDMA_READ: - case IBV_WC_COMP_SWAP: - case IBV_WC_FETCH_ADD: - OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE")); - - OPAL_THREAD_ADD_FETCH32(&endpoint->get_tokens, 1); - - mca_btl_openib_get_frag_t *get_frag = to_get_frag(des); - - /* check if atomic result needs to be byte swapped (mlx5) */ - if (openib_btl->atomic_ops_be && IBV_WC_RDMA_READ != wc->opcode) { - *((int64_t *) frag->sg_entry.addr) = ntoh64 (*((int64_t *) frag->sg_entry.addr)); - } - - get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, - get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data, - OPAL_SUCCESS); - /* fall through */ - case IBV_WC_RDMA_WRITE: - if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) { - mca_btl_openib_put_frag_t *put_frag = to_put_frag(des); - - put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, - put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data, - OPAL_SUCCESS); - put_frag->cb.func = NULL; - } - /* fall through */ - case IBV_WC_SEND: - OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND")); - if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { - opal_list_item_t *i; - while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { - btl_ownership = 
(to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint, - &to_base_frag(i)->base, OPAL_SUCCESS); - if( btl_ownership ) { - mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base); - } - } - } - /* Process a completed send/put/get */ - btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { - des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS); - } - if( btl_ownership ) { - mca_btl_openib_free(&openib_btl->super, des); - } - - /* return send wqe */ - qp_put_wqe(endpoint, qp); - - /* return wqes that were sent before this frag */ - n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des)); - - if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) { - OPAL_THREAD_ADD_FETCH32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n); - - /* new SRQ credit available. Try to progress pending frags*/ - progress_pending_frags_srq(openib_btl, qp); - } - /* new wqe or/and get token available. 
Try to progress pending frags */ - progress_pending_frags_wqe(endpoint, qp); - mca_btl_openib_frag_progress_pending_put_get(endpoint, qp); - break; - case IBV_WC_RECV: - OPAL_OUTPUT((-1, "Got WC: RDMA_RECV, qp %d, src qp %d, WR ID %" PRIx64, - wc->qp_num, wc->src_qp, wc->wr_id)); - - /* Process a RECV */ - if(btl_openib_handle_incoming(openib_btl, endpoint, to_recv_frag(frag), - wc->byte_len) != OPAL_SUCCESS) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, - NULL, NULL); - break; - } - - /* decide if it is time to setup an eager rdma channel */ - if(!endpoint->eager_rdma_local.base.pval && endpoint->use_eager_rdma && - wc->byte_len < mca_btl_openib_component.eager_limit && - openib_btl->eager_rdma_channels < - mca_btl_openib_component.max_eager_rdma && - OPAL_THREAD_ADD_FETCH32(&endpoint->eager_recv_count, 1) == - mca_btl_openib_component.eager_rdma_threshold) { - mca_btl_openib_endpoint_connect_eager_rdma(endpoint); - } - break; - default: - BTL_ERROR(("Unhandled work completion opcode is %d", wc->opcode)); - if(openib_btl) - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, - NULL, NULL); - break; - } - - return; - -error: - if(endpoint->endpoint_proc && endpoint->endpoint_proc->proc_opal) - remote_proc = endpoint->endpoint_proc->proc_opal; - - /* For iWARP, the TCP connection is tied to the QP once the QP is - * in RTS. And destroying the QP is thus tied to connection - * teardown for iWARP. To destroy the connection in iWARP you - * must move the QP out of RTS, either into CLOSING for a nice - * graceful close (e.g., via rdma_disconnect()), or to ERROR if - * you want to be rude (e.g., just destroying the QP without - * disconnecting first). In both cases, all pending non-completed - * SQ and RQ WRs will automatically be flushed. 
- */ -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - if (IBV_WC_WR_FLUSH_ERR == wc->status && - IBV_TRANSPORT_IWARP == device->ib_dev->transport_type) { - return; - } -#endif - - if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) { - BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s" - "status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d", - cq_name[cq], btl_openib_component_status_to_string(wc->status), - wc->status, wc->wr_id, - wc->opcode, wc->vendor_err, qp)); - } - - if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status || - IBV_WC_RETRY_EXC_ERR == wc->status) { - const char *peer_hostname; - peer_hostname = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal); - const char *device_name = - ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device); - - if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status) { - // The show_help checker script gets confused if the topic - // is an inline logic check, so separate it into two calls - // to show_help. 
- if (BTL_OPENIB_QP_TYPE_PP(qp)) { - opal_show_help("help-mpi-btl-openib.txt", - "pp rnr retry exceeded", - true, - opal_process_info.nodename, - device_name, - peer_hostname); - } else { - opal_show_help("help-mpi-btl-openib.txt", - "srq rnr retry exceeded", - true, - opal_process_info.nodename, - device_name, - peer_hostname); - } - } else if (IBV_WC_RETRY_EXC_ERR == wc->status) { - opal_show_help("help-mpi-btl-openib.txt", - "pp retry exceeded", true, - opal_process_info.nodename, - device_name, peer_hostname); - } - } - - if(openib_btl) { - /* return send wqe */ - qp_put_wqe(endpoint, qp); - - /* return wqes that were sent before this frag */ - n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des)); - - /* force emptying the pending frags toward the dead endpoint - * in progress_pending_frags* below */ - endpoint->endpoint_state = MCA_BTL_IB_FAILED; - - if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) { - BTL_VERBOSE(("frag %p returning %d credits", (void*) frag, 1+n)); - OPAL_THREAD_FETCH_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n); - /* new SRQ credit available. Try to progress pending frags*/ - progress_pending_frags_srq(openib_btl, qp); - } - /* new wqe or/and get token available. 
Try to progress pending frags */ - progress_pending_frags_wqe(endpoint, qp); - mca_btl_openib_frag_progress_pending_put_get(endpoint, qp); - - if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { - des->des_cbfunc(&openib_btl->super, endpoint, des, wc->status); - } - if (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) { - mca_btl_openib_free(&openib_btl->super, des); - } - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, - (struct opal_proc_t*)remote_proc, NULL); - } -} - -static int poll_device(mca_btl_openib_device_t* device, int count) -{ - int ne = 0, cq; - uint32_t hp_iter = 0; - struct ibv_wc wc[MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT]; - int i; - - device->pollme = false; - for(cq = 0; cq < 2 && hp_iter < mca_btl_openib_component.cq_poll_progress;) - { - ne = ibv_poll_cq(device->ib_cq[cq], mca_btl_openib_component.cq_poll_batch, wc); - if(0 == ne) { - /* don't check low prio cq if there was something in high prio cq, - * but for each cq_poll_ratio hp cq polls poll lp cq once */ - if(count && device->hp_cq_polls) - break; - cq++; - device->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio; - continue; - } - - if(ne < 0) - goto error; - - count++; - - if(BTL_OPENIB_HP_CQ == cq) { - device->pollme = true; - hp_iter++; - device->hp_cq_polls--; - } - - for (i = 0; i < ne; i++) - handle_wc(device, cq, &wc[i]); - } - - return count; -error: - BTL_ERROR(("error polling %s with %d errno says %s", cq_name[cq], ne, - strerror(errno))); - return count; -} - -#if OPAL_ENABLE_PROGRESS_THREADS -void* mca_btl_openib_progress_thread(opal_object_t* arg) -{ - opal_thread_t* thread = (opal_thread_t*)arg; - mca_btl_openib_device_t* device = thread->t_arg; - struct ibv_cq *ev_cq; - void *ev_ctx; - - /* This thread enter in a cancel enabled state */ - pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL ); - pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL ); - - opal_output(-1, "WARNING: the openib btl progress thread code *does not yet work*. 
Your run is likely to hang, crash, break the kitchen sink, and/or eat your cat. You have been warned."); - - while (device->progress) { -#if 0 - while(ompi_progress_threads()) { - while(ompi_progress_threads()) - sched_yield(); - usleep(100); /* give app a chance to re-enter library */ - } -#endif - - if(ibv_get_cq_event(device->ib_channel, &ev_cq, &ev_ctx)) - BTL_ERROR(("Failed to get CQ event with error %s", - strerror(errno))); - if(ibv_req_notify_cq(ev_cq, 0)) { - BTL_ERROR(("Couldn't request CQ notification with error %s", - strerror(errno))); - } - - ibv_ack_cq_events(ev_cq, 1); - - while(poll_device(device, 0)); - } - - return PTHREAD_CANCELED; -} -#endif - -static int progress_one_device(mca_btl_openib_device_t *device) -{ - int i, c, count = 0, ret; - mca_btl_openib_recv_frag_t* frag; - mca_btl_openib_endpoint_t* endpoint; - uint32_t non_eager_rdma_endpoints = 0; - - c = device->eager_rdma_buffers_count; - non_eager_rdma_endpoints += (device->non_eager_rdma_endpoints + device->pollme); - - for(i = 0; i < c; i++) { - endpoint = device->eager_rdma_buffers[i]; - - if(!endpoint) - continue; - - OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock); - frag = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint, - endpoint->eager_rdma_local.head); - - if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(frag)) { - uint32_t size; - mca_btl_openib_module_t *btl = endpoint->endpoint_btl; - - opal_atomic_mb(); - - if(endpoint->nbo) { - BTL_OPENIB_FOOTER_NTOH(*frag->ftr); - } - size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr); -#if OPAL_ENABLE_DEBUG - if (frag->ftr->seq != endpoint->eager_rdma_local.seq) - BTL_ERROR(("Eager RDMA wrong SEQ: received %d expected %d", - frag->ftr->seq, - endpoint->eager_rdma_local.seq)); - endpoint->eager_rdma_local.seq++; -#endif - MCA_BTL_OPENIB_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.head); - - OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock); - frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) - - size - BTL_OPENIB_FTR_PADDING(size) + 
sizeof(mca_btl_openib_footer_t)); - to_base_frag(frag)->segment.seg_addr.pval = - ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); - - ret = btl_openib_handle_incoming(btl, to_com_frag(frag)->endpoint, - frag, size - sizeof(mca_btl_openib_footer_t)); - if (ret != OPAL_SUCCESS) { - btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); - return 0; - } - - count++; - } else - OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock); - } - - device->eager_rdma_polls--; - - if(0 == count || non_eager_rdma_endpoints != 0 || !device->eager_rdma_polls) { - count += poll_device(device, count); - device->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio; - } - - return count; -} - -/* - * IB component progress. - */ -static int btl_openib_component_progress(void) -{ - int i; - int count = 0; - - if(OPAL_UNLIKELY(mca_btl_openib_component.use_async_event_thread && - mca_btl_openib_component.error_counter)) { - goto error; - } - - for(i = 0; i < mca_btl_openib_component.devices_count; i++) { - mca_btl_openib_device_t *device = - (mca_btl_openib_device_t *) opal_pointer_array_get_item(&mca_btl_openib_component.devices, i); - if (NULL != device) { - count += progress_one_device(device); - } - } - -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */ - /* Check to see if there are any outstanding dtoh CUDA events that - * have completed. If so, issue the PML callbacks on the fragments. - * The only thing that gets completed here are asynchronous copies - * so there is no need to free anything. 
- */ - { - int local_count = 0; - mca_btl_base_descriptor_t *frag; - while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) { - OPAL_OUTPUT((-1, "btl_openib: event completed on frag=%p", (void *)frag)); - frag->des_cbfunc(NULL, NULL, frag, OPAL_SUCCESS); - local_count++; - } - count += local_count; - } - if (count > 0) { - OPAL_OUTPUT((-1, "btl_openib: DONE with openib progress, count=%d", count)); - } -#endif /* OPAL_CUDA_SUPPORT */ - - return count; - -error: - /* Set the fatal counter to zero */ - mca_btl_openib_component.error_counter = 0; - /* Lets find all error events */ - for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - mca_btl_openib_module_t* openib_btl = - mca_btl_openib_component.openib_btls[i]; - if(openib_btl->device->got_fatal_event) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, - NULL, NULL); - } - if(openib_btl->device->got_port_event) { - /* These are non-fatal so just ignore it. */ - openib_btl->device->got_port_event = false; - } - } - return count; -} - -int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp) -{ - int rd_low_local = openib_btl->qps[qp].u.srq_qp.rd_low_local; - int rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num; - int num_post, i, rc; - struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL; - - assert(!BTL_OPENIB_QP_TYPE_PP(qp)); - - OPAL_THREAD_LOCK(&openib_btl->ib_lock); - if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low_local) { - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - return OPAL_SUCCESS; - } - num_post = rd_curr_num - openib_btl->qps[qp].u.srq_qp.rd_posted; - - if (0 == num_post) { - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - return OPAL_SUCCESS; - } - - for(i = 0; i < num_post; i++) { - opal_free_list_item_t* item; - item = opal_free_list_wait (&openib_btl->device->qps[qp].recv_free); - to_base_frag(item)->base.order = qp; - to_com_frag(item)->endpoint = NULL; - if(NULL == wr) - wr = wr_list = &to_recv_frag(item)->rd_desc; - 
else - wr = wr->next = &to_recv_frag(item)->rd_desc; - } - - wr->next = NULL; - - rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr); - if(OPAL_LIKELY(0 == rc)) { - struct ibv_srq_attr srq_attr; - - OPAL_THREAD_ADD_FETCH32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post); - - if(true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) { - srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num; - srq_attr.max_sge = 1; - srq_attr.srq_limit = mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit; - - openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false; - if(ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, IBV_SRQ_LIMIT)) { - BTL_ERROR(("Failed to request limit event for srq on %s. " - "Fatal error, stoping asynch event thread", - ibv_get_device_name(openib_btl->device->ib_dev))); - - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - return OPAL_ERROR; - } - } - - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - return OPAL_SUCCESS; - } - - for(i = 0; wr_list && wr_list != bad_wr; i++, wr_list = wr_list->next); - - BTL_ERROR(("error posting receive descriptors to shared receive " - "queue %d (%d from %d)", qp, i, num_post)); - - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - return OPAL_ERROR; -} - - -struct mca_btl_openib_event_t { - opal_event_t super; - void *(*fn)(void *); - void *arg; - opal_event_t *event; -}; - -typedef struct mca_btl_openib_event_t mca_btl_openib_event_t; - -static void *mca_btl_openib_run_once_cb (int fd, int flags, void *context) -{ - mca_btl_openib_event_t *event = (mca_btl_openib_event_t *) context; - void *ret; - - ret = event->fn (event->arg); - opal_event_del (&event->super); - free (event); - return ret; -} - -int mca_btl_openib_run_in_main (void *(*fn)(void *), void *arg) -{ - mca_btl_openib_event_t *event = malloc (sizeof (mca_btl_openib_event_t)); - - if (OPAL_UNLIKELY(NULL == event)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - event->fn = fn; - event->arg = arg; - - opal_event_set 
(opal_sync_event_base, &event->super, -1, OPAL_EV_READ, - mca_btl_openib_run_once_cb, event); - - opal_event_active (&event->super, OPAL_EV_READ, 1); - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/openib/btl_openib_eager_rdma.h b/opal/mca/btl/openib/btl_openib_eager_rdma.h deleted file mode 100644 index 808178a457..0000000000 --- a/opal/mca/btl/openib/btl_openib_eager_rdma.h +++ /dev/null @@ -1,118 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_OPENIB_EAGER_RDMA_BUF_H -#define MCA_BTL_OPENIB_EAGER_RDMA_BUF_H - -#include "opal_config.h" -#include "btl_openib.h" - -BEGIN_C_DECLS - -struct mca_btl_openib_eager_rdma_local_t { - opal_ptr_t base; /**< buffer for RDMAing eager messages */ - void *alloc_base; /**< allocated base */ - mca_btl_openib_recv_frag_t *frags; - mca_btl_openib_reg_t *reg; - uint16_t head; /**< RDMA buffer to poll */ - uint16_t tail; /**< Needed for credit managment */ - opal_atomic_int32_t credits; /**< number of RDMA credits */ - int32_t rd_win; -#if OPAL_ENABLE_DEBUG - uint32_t seq; -#endif - opal_mutex_t lock; /**< guard access to RDMA buffer */ - int32_t rd_low; -}; -typedef struct mca_btl_openib_eager_rdma_local_t mca_btl_openib_eager_rdma_local_t; - -struct mca_btl_openib_eager_rdma_remote_t { - opal_ptr_t base; /**< address of remote buffer */ - uint32_t rkey; /**< RKey for accessing remote buffer */ - opal_atomic_int32_t head; /**< RDMA buffer to post to */ - opal_atomic_int32_t tokens; /**< number of rdma tokens */ -#if OPAL_ENABLE_DEBUG - uint32_t seq; -#endif -}; -typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remote_t; - -#define MCA_BTL_OPENIB_RDMA_FRAG(F) \ - (openib_frag_type(F) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA) - -#define 
EAGER_RDMA_BUFFER_REMOTE (0) -#define EAGER_RDMA_BUFFER_LOCAL (0xff) - -#ifdef WORDS_BIGENDIAN -#define MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(F) ((F)->u.size >> 8) -#define MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(F, S) \ - ((F)->u.size = (S) << 8) -#else -#define MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(F) ((F)->u.size & 0x00ffffff) -#define MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(F, S) \ - ((F)->u.size = (S) & 0x00ffffff) -#endif - -#define MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(F) \ - (((volatile uint8_t*)(F)->ftr->u.buf)[3] != EAGER_RDMA_BUFFER_REMOTE) - -#define MCA_BTL_OPENIB_RDMA_FRAG_REMOTE(F) \ - (!MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(F)) - -#define MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(F) do { \ - ((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_REMOTE; \ - }while (0) - -#define MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(F) do { \ - ((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_LOCAL; \ - }while (0) - -#define MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(E, I) \ - (&(E)->eager_rdma_local.frags[(I)]) - -#define MCA_BTL_OPENIB_RDMA_NEXT_INDEX(I) do { \ - (I) = ((I) + 1); \ - if((I) == \ - mca_btl_openib_component.eager_rdma_num) \ - (I) = 0; \ - } while (0) - - -#if OPAL_ENABLE_DEBUG - -/** - * @brief read and increment the remote head index and generate a sequence - * number - */ - -#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD, SEQ) \ - do { \ - (SEQ) = OPAL_THREAD_ADD_FETCH32(&(HEAD), 1) - 1; \ - (OLD_HEAD) = (SEQ) % mca_btl_openib_component.eager_rdma_num; \ - } while(0) - -#else - -/** - * @brief read and increment the remote head index - */ - -#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD) \ - do { \ - (OLD_HEAD) = (OPAL_THREAD_ADD_FETCH32((opal_atomic_int32_t *) &(HEAD), 1) - 1) % mca_btl_openib_component.eager_rdma_num; \ - } while(0) - -#endif - -END_C_DECLS -#endif - diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c deleted file mode 100644 index 6fbf276d9f..0000000000 --- a/opal/mca/btl/openib/btl_openib_endpoint.c +++ /dev/null @@ 
-1,1056 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved. - * Copyright (c) 2010-2011 IBM Corporation. All rights reserved. - * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved - * Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#ifdef HAVE_SYS_TIME_H -#include -#endif -#include -#include -#include - -#include "opal_stdint.h" -#include "opal/util/output.h" -#include "opal/util/proc.h" -#include "opal/util/show_help.h" -#include "opal/class/opal_free_list.h" - -#include "btl_openib_endpoint.h" -#include "btl_openib_proc.h" -#include "btl_openib_xrc.h" -#include "btl_openib_async.h" -#include "connect/connect.h" - -static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint); -static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); - -static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep, - mca_btl_openib_send_frag_t *frag) -{ - int qp = to_base_frag(frag)->base.order; - int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY); - - if(qp_get_wqe(ep, qp) < 0) { - qp_put_wqe(ep, qp); - opal_list_append(&ep->qps[qp].no_wqe_pending_frags[prio], - (opal_list_item_t *)frag); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - return OPAL_SUCCESS; -} - -/* this function is called with endpoint->endpoint_lock held */ -int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint, - mca_btl_openib_send_frag_t *frag) -{ - int prio = to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY; - mca_btl_openib_header_t *hdr = frag->hdr; - mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base; - int qp, ib_rc, rc; - bool do_rdma = false; - size_t size; - - if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER)) - des->order = frag->qp_idx; - - qp = des->order; - - if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS) - return OPAL_ERR_RESOURCE_BUSY; - - size = des->des_segments->seg_len + frag->coalesced_length; - - rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size, - &do_rdma, frag, true); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - qp_put_wqe(endpoint, qp); - return OPAL_ERR_RESOURCE_BUSY; - } - - 
qp_reset_signal_count(endpoint, qp); - ib_rc = post_send(endpoint, frag, do_rdma, 1); - - if(!ib_rc) - return OPAL_SUCCESS; - - if(endpoint->nbo) - BTL_OPENIB_HEADER_NTOH(*hdr); - - mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag); - - qp_put_wqe(endpoint, qp); - - BTL_ERROR(("error posting send request error %d: %s. size = %lu\n", - ib_rc, strerror(ib_rc), size)); - return OPAL_ERROR; -} - - - -OBJ_CLASS_INSTANCE(mca_btl_openib_endpoint_t, - opal_list_item_t, mca_btl_openib_endpoint_construct, - mca_btl_openib_endpoint_destruct); - -/* - * Initialize state of the endpoint instance. - * - */ -static mca_btl_openib_qp_t *endpoint_alloc_qp(void) -{ - mca_btl_openib_qp_t *qp = (mca_btl_openib_qp_t *) calloc(1, sizeof(mca_btl_openib_qp_t)); - if(!qp) { - BTL_ERROR(("Failed to allocate memory for qp")); - return NULL; - } - - OBJ_CONSTRUCT(&qp->lock, opal_mutex_t); - - return qp; -} - -static void -endpoint_init_qp_pp(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp) -{ - mca_btl_openib_qp_info_t *qp_info = &mca_btl_openib_component.qp_infos[qp]; - ep_qp->qp = endpoint_alloc_qp(); - ep_qp->qp->users++; - - /* local credits are set here such that on initial posting - * of the receive buffers we end up with zero credits to return - * to our peer. The peer initializes his sd_credits to reflect this - * below. 
Note that this may be a problem for iWARP as the sender - * now has credits even if the receive buffers are not yet posted - */ - ep_qp->u.pp_qp.rd_credits = -qp_info->rd_num; - - ep_qp->u.pp_qp.rd_posted = 0; - ep_qp->u.pp_qp.cm_sent = 0; - ep_qp->u.pp_qp.cm_return = -qp_info->u.pp_qp.rd_rsv; - ep_qp->u.pp_qp.cm_received = qp_info->u.pp_qp.rd_rsv; - - /* initialize the local view of credits */ - ep_qp->u.pp_qp.sd_credits = qp_info->rd_num; - - /* number of available send WQEs */ - ep_qp->qp->sd_wqe = qp_info->rd_num; -} - -static void -endpoint_init_qp_srq(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp) -{ - ep_qp->qp = endpoint_alloc_qp(); - ep_qp->qp->users++; - - /* number of available send WQEs */ - ep_qp->qp->sd_wqe = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; -} - -static void -endpoint_init_qp_xrc(mca_btl_base_endpoint_t *ep, const int qp) -{ - int max = ep->endpoint_btl->device->ib_dev_attr.max_qp_wr - - (mca_btl_openib_component.use_eager_rdma ? - mca_btl_openib_component.max_eager_rdma : 0); - mca_btl_openib_endpoint_qp_t *ep_qp = &ep->qps[qp]; - int32_t wqe, incr = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max; - int rc; - - opal_mutex_lock (&ep->ib_addr->addr_lock); - - ep_qp->qp = ep->ib_addr->qp; - if (ep->ib_addr->max_wqe + incr > max) { - /* make sure that we don't overrun maximum supported by device */ - incr = max - ep->ib_addr->max_wqe; - } - - wqe = ep->ib_addr->max_wqe + incr + - (mca_btl_openib_component.use_eager_rdma ? 
- mca_btl_openib_component.max_eager_rdma : 0); - - ep->ib_addr->max_wqe += incr; - - if (NULL != ep_qp->qp->lcl_qp) { - struct ibv_qp_attr qp_attr; - - /* if this is modified the code in udcm_xrc_send_qp_create may - * need to be updated as well */ - qp_attr.cap.max_recv_wr = 0; - qp_attr.cap.max_send_wr = wqe; - qp_attr.cap.max_inline_data = ep->endpoint_btl->device->max_inline_data; - qp_attr.cap.max_send_sge = 1; - qp_attr.cap.max_recv_sge = 1; /* we do not use SG list */ - rc = ibv_modify_qp (ep_qp->qp->lcl_qp, &qp_attr, IBV_QP_CAP); - if (0 == rc) { - opal_atomic_add_fetch_32 (&ep_qp->qp->sd_wqe, incr); - } - } else { - ep_qp->qp->sd_wqe = ep->ib_addr->max_wqe; - } - ep_qp->qp->users++; - opal_mutex_unlock (&ep->ib_addr->addr_lock); -} - -static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp) -{ - mca_btl_openib_endpoint_qp_t *ep_qp = &ep->qps[qp]; - - ep_qp->rd_credit_send_lock = 0; - ep_qp->credit_frag = NULL; - - OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[0], opal_list_t); - OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[1], opal_list_t); - - OBJ_CONSTRUCT(&ep_qp->no_credits_pending_frags[0], opal_list_t); - OBJ_CONSTRUCT(&ep_qp->no_credits_pending_frags[1], opal_list_t); - - switch(BTL_OPENIB_QP_TYPE(qp)) { - case MCA_BTL_OPENIB_PP_QP: - endpoint_init_qp_pp(ep_qp, qp); - break; - case MCA_BTL_OPENIB_SRQ_QP: - endpoint_init_qp_srq(ep_qp, qp); - break; - case MCA_BTL_OPENIB_XRC_QP: - if (NULL == ep->ib_addr->qp) { - ep->ib_addr->qp = endpoint_alloc_qp(); - } - endpoint_init_qp_xrc(ep, qp); - break; - default: - BTL_ERROR(("Wrong QP type")); - return; - } - - ep_qp->qp->sd_wqe_inflight = 0; - ep_qp->qp->wqe_count = QP_TX_BATCH_COUNT; -} - -void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl, - mca_btl_base_endpoint_t *ep, - opal_btl_openib_connect_base_module_t *local_cpc, - mca_btl_openib_proc_modex_t *remote_proc_info, - opal_btl_openib_connect_base_module_data_t *remote_cpc_data) -{ - int qp; - - ep->endpoint_btl = btl; - 
ep->use_eager_rdma = btl->device->use_eager_rdma & - mca_btl_openib_component.use_eager_rdma; - ep->subnet_id = btl->port_info.subnet_id; - ep->endpoint_local_cpc = local_cpc; - ep->endpoint_remote_cpc_data = remote_cpc_data; - - ep->rem_info.rem_lid = remote_proc_info->pm_port_info.lid; - ep->rem_info.rem_subnet_id = remote_proc_info->pm_port_info.subnet_id; - ep->rem_info.rem_mtu = remote_proc_info->pm_port_info.mtu; - opal_output(-1, "Got remote LID, subnet, MTU: %d, 0x%" PRIx64 ", %d", - ep->rem_info.rem_lid, - ep->rem_info.rem_subnet_id, - ep->rem_info.rem_mtu); - - ep->rem_info.rem_vendor_id = (remote_proc_info->pm_port_info).vendor_id; - ep->rem_info.rem_vendor_part_id = (remote_proc_info->pm_port_info).vendor_part_id; - - ep->rem_info.rem_transport_type = - (mca_btl_openib_transport_type_t) (remote_proc_info->pm_port_info).transport_type; - - for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - endpoint_init_qp(ep, qp); - } -} - -static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) -{ - /* setup qp structures */ - endpoint->qps = (mca_btl_openib_endpoint_qp_t*) - calloc(mca_btl_openib_component.num_qps, - sizeof(mca_btl_openib_endpoint_qp_t)); - if (MCA_BTL_XRC_ENABLED) { - endpoint->rem_info.rem_qps = (mca_btl_openib_rem_qp_info_t*) - calloc(1, sizeof(mca_btl_openib_rem_qp_info_t)); - endpoint->rem_info.rem_srqs = (mca_btl_openib_rem_srq_info_t*) - calloc(mca_btl_openib_component.num_xrc_qps, - sizeof(mca_btl_openib_rem_srq_info_t)); - } else { - endpoint->rem_info.rem_qps = (mca_btl_openib_rem_qp_info_t*) - calloc(mca_btl_openib_component.num_qps, - sizeof(mca_btl_openib_rem_qp_info_t)); - endpoint->rem_info.rem_srqs = NULL; - } - - endpoint->ib_addr = NULL; -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - endpoint->xrc_recv_qp = NULL; -#else - endpoint->xrc_recv_qp_num = 0; -#endif - endpoint->endpoint_btl = 0; - endpoint->endpoint_proc = 0; - endpoint->endpoint_local_cpc = NULL; - endpoint->endpoint_remote_cpc_data = NULL; - 
endpoint->endpoint_initiator = false; - endpoint->endpoint_tstamp = 0.0; - endpoint->endpoint_state = MCA_BTL_IB_CLOSED; - endpoint->endpoint_retries = 0; - OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t); - OBJ_CONSTRUCT(&endpoint->pending_lazy_frags, opal_list_t); - OBJ_CONSTRUCT(&endpoint->pending_get_frags, opal_list_t); - OBJ_CONSTRUCT(&endpoint->pending_put_frags, opal_list_t); - - endpoint->get_tokens = mca_btl_openib_component.ib_qp_ous_rd_atom; - - /* initialize RDMA eager related parts */ - endpoint->eager_recv_count = 0; - memset(&endpoint->eager_rdma_remote, 0, - sizeof(mca_btl_openib_eager_rdma_remote_t)); - memset(&endpoint->eager_rdma_local, 0, - sizeof(mca_btl_openib_eager_rdma_local_t)); - OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t); - - endpoint->rem_info.rem_lid = 0; - endpoint->rem_info.rem_subnet_id = 0; - endpoint->rem_info.rem_mtu = 0; - endpoint->nbo = false; - endpoint->use_eager_rdma = false; - endpoint->eager_rdma_remote.tokens = 0; - endpoint->eager_rdma_local.credits = 0; - endpoint->endpoint_cts_mr = NULL; - endpoint->endpoint_cts_frag.super.super.base.super.registration = NULL; - endpoint->endpoint_cts_frag.super.super.base.super.ptr = NULL; - endpoint->endpoint_posted_recvs = false; - endpoint->endpoint_cts_received = false; - endpoint->endpoint_cts_sent = false; -} - -/* - * Destroy a endpoint - * - */ - -static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) -{ - bool pval_clean = false; - int qp; - - /* If the CPC has an endpoint_finalize function, call it */ - if (NULL != endpoint->endpoint_local_cpc->cbm_endpoint_finalize) { - endpoint->endpoint_local_cpc->cbm_endpoint_finalize(endpoint); - } - - /* Release CTS buffer */ - opal_btl_openib_connect_base_free_cts(endpoint); - - /* Release memory resources */ - do { - void *_tmp_ptr = NULL; - /* Make sure that mca_btl_openib_endpoint_connect_eager_rdma () - * was not in "connect" or "bad" flow (failed to allocate memory) - * and 
changed the pointer back to NULL - */ - if(!opal_atomic_compare_exchange_strong_ptr((opal_atomic_intptr_t *) &endpoint->eager_rdma_local.base.pval, (intptr_t *) &_tmp_ptr, 1)) { - if (NULL != endpoint->eager_rdma_local.reg) { - endpoint->endpoint_btl->device->rcache->rcache_deregister (endpoint->endpoint_btl->device->rcache, - &endpoint->eager_rdma_local.reg->base); - endpoint->eager_rdma_local.reg = NULL; - } - - void *alloc_base = opal_atomic_swap_ptr (&endpoint->eager_rdma_local.alloc_base, NULL); - if (alloc_base) { - endpoint->endpoint_btl->super.btl_mpool->mpool_free (endpoint->endpoint_btl->super.btl_mpool, alloc_base); - pval_clean = true; - } - } else { - pval_clean=true; - } - } while (!pval_clean); - - /* Close opened QPs if we have them*/ - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[0]); - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[1]); - OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[0]); - OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[1]); - - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( - &endpoint->qps[qp].no_wqe_pending_frags[0]); - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( - &endpoint->qps[qp].no_wqe_pending_frags[1]); - OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[0]); - OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[1]); - - - if(--endpoint->qps[qp].qp->users != 0) - continue; - - if(endpoint->qps[qp].qp->lcl_qp != NULL) - if(ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp)) - BTL_ERROR(("Failed to destroy QP:%d\n", qp)); - - free(endpoint->qps[qp].qp); - } - - /* free the qps */ - free(endpoint->qps); - endpoint->qps = NULL; - - free(endpoint->rem_info.rem_qps); - free(endpoint->rem_info.rem_srqs); - - /* unregister xrc recv qp */ -#if HAVE_XRC -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - if (NULL != endpoint->xrc_recv_qp) { - if(ibv_destroy_qp(endpoint->xrc_recv_qp)) { - BTL_ERROR(("Failed to unregister 
XRC recv QP:%d\n", endpoint->xrc_recv_qp->qp_num)); - } else { - BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", endpoint->xrc_recv_qp->qp_num)); - } - } -#else - if (0 != endpoint->xrc_recv_qp_num) { - if(ibv_unreg_xrc_rcv_qp(endpoint->endpoint_btl->device->xrc_domain, - endpoint->xrc_recv_qp_num)) { - BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", endpoint->xrc_recv_qp_num)); - } else { - BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", endpoint->xrc_recv_qp_num)); - } - } -#endif -#endif - - OBJ_DESTRUCT(&endpoint->endpoint_lock); - /* Clean pending lists */ - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags); - OBJ_DESTRUCT(&endpoint->pending_lazy_frags); - - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags); - OBJ_DESTRUCT(&endpoint->pending_get_frags); - - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags); - OBJ_DESTRUCT(&endpoint->pending_put_frags); -} - - -/* - * Called when the connect module has created all the qp's on an - * endpoint and needs to have some receive buffers posted. 
- */ -int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint) -{ - int qp; - - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { - if (BTL_OPENIB_QP_TYPE_PP(qp)) { - mca_btl_openib_endpoint_post_rr_nolock(endpoint, qp); - } else { - mca_btl_openib_post_srr(endpoint->endpoint_btl, qp); - } - } - - return OPAL_SUCCESS; -} - -static void cts_sent(mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status) -{ - /* Nothing to do/empty function (we can't pass in a NULL pointer - for the des_cbfunc) */ - OPAL_OUTPUT((-1, "CTS send to %s completed", - opal_get_proc_hostname(ep->endpoint_proc->proc_opal))); -} - -/* - * Send CTS control fragment - */ -void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint) -{ - mca_btl_openib_send_control_frag_t *sc_frag; - mca_btl_base_descriptor_t *base_des; - mca_btl_openib_frag_t *openib_frag; - mca_btl_openib_com_frag_t *com_frag; - mca_btl_openib_control_header_t *ctl_hdr; - int rc; - - OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), - mca_btl_openib_component.credits_qp, - endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num)); - sc_frag = alloc_control_frag(endpoint->endpoint_btl); - if (OPAL_UNLIKELY(NULL == sc_frag)) { - BTL_ERROR(("Failed to allocate control buffer")); - mca_btl_openib_endpoint_invoke_error(endpoint); - return; - } - - /* I dislike using the "to_()" macros; I prefer using the - explicit member fields to ensure I get the types right. Since - this is not a performance-criticial part of the code, it's - ok. 
*/ - com_frag = &(sc_frag->super.super); - openib_frag = &(com_frag->super); - base_des = &(openib_frag->base); - - base_des->des_cbfunc = cts_sent; - base_des->des_cbdata = NULL; - base_des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - base_des->order = mca_btl_openib_component.credits_qp; - openib_frag->segment.seg_len = sizeof(mca_btl_openib_control_header_t); - com_frag->endpoint = endpoint; - - sc_frag->hdr->tag = MCA_BTL_TAG_IB; - sc_frag->hdr->cm_seen = 0; - sc_frag->hdr->credits = 0; - - ctl_hdr = (mca_btl_openib_control_header_t*) - openib_frag->segment.seg_addr.pval; - ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS; - - /* Send the fragment */ - if (OPAL_SUCCESS != (rc = mca_btl_openib_endpoint_post_send(endpoint, sc_frag))) { - if( OPAL_ERR_RESOURCE_BUSY != rc ) { - BTL_ERROR(("Failed to post CTS send")); - mca_btl_openib_endpoint_invoke_error(endpoint); - } - } else { - endpoint->endpoint_cts_sent = true; - } -} - -/* - * Called when the CPC has established a connection on an endpoint - */ -void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint) -{ - /* If the CPC uses the CTS protocol, then start it up */ - if (endpoint->endpoint_local_cpc->cbm_uses_cts) { - int transport_type_ib_p = 0; - /* Post our receives, which will make credit management happy - (i.e., rd_credits will be 0) */ - if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_recvs(endpoint)) { - BTL_ERROR(("Failed to post receive buffers")); - mca_btl_openib_endpoint_invoke_error(endpoint); - return; - } - endpoint->endpoint_posted_recvs = true; - - /* If this is IB, send the CTS immediately. If this is iWARP, - then only send the CTS if this endpoint was the initiator - of the connection (the receiver will send its CTS when it - receives this side's CTS). Also send the CTS if we already - received the peer's CTS (e.g., if this process was slow to - call cpc_complete(). 
*/ -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type); -#endif - OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), - transport_type_ib_p, - endpoint->endpoint_initiator, - endpoint->endpoint_cts_received)); - if (transport_type_ib_p || - endpoint->endpoint_initiator || - endpoint->endpoint_cts_received) { - mca_btl_openib_endpoint_send_cts(endpoint); - - /* If we've already got the CTS from the other side, then - mark us as connected */ - if (endpoint->endpoint_cts_received) { - OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); - mca_btl_openib_endpoint_connected(endpoint); - } else { - /* the caller hold the lock and expects us to drop it */ - OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - } - } - - OPAL_OUTPUT((-1, "cpc_complete to %s -- done", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); - return; - } - - /* Otherwise, just set the endpoint to "connected" */ - mca_btl_openib_endpoint_connected(endpoint); -} - -/* - * called when the connect module has completed setup of an endpoint - */ -void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) -{ - opal_list_item_t *frag_item, *ep_item; - mca_btl_openib_send_frag_t *frag; - mca_btl_openib_endpoint_t *ep; - bool master = false; - int rc; - - if (MCA_BTL_XRC_ENABLED) { - opal_mutex_lock (&endpoint->ib_addr->addr_lock); - if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) { - /* We are not xrc master */ - /* set our qp pointer to master qp */ - master = false; - } else { - /* I'm master of XRC */ - endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTED; - master = true; - } - } - - /* Run over all qps and load alternative path */ - if (APM_ENABLED) { - int i; - if 
(MCA_BTL_XRC_ENABLED) { - if (master) { - mca_btl_openib_load_apm(endpoint->ib_addr->qp->lcl_qp, endpoint); - } - } else { - for(i = 0; i < mca_btl_openib_component.num_qps; i++) { - mca_btl_openib_load_apm(endpoint->qps[i].qp->lcl_qp, endpoint); - } - } - } - - endpoint->endpoint_state = MCA_BTL_IB_CONNECTED; - endpoint->endpoint_btl->device->non_eager_rdma_endpoints++; - - if(MCA_BTL_XRC_ENABLED) { - if (master) { - while (NULL != (ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep))) { - ep = (mca_btl_openib_endpoint_t *)ep_item; - if (OPAL_SUCCESS != - opal_btl_openib_connect_base_start(endpoint->endpoint_local_cpc, ep)) { - BTL_ERROR(("Failed to connect pending endpoint\n")); - } - } - } - opal_mutex_unlock (&endpoint->ib_addr->addr_lock); - } - - - /* Process pending packet on the endpoint */ - - /* While there are frags in the list, process them */ - while (NULL != (frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags)))) { - frag = to_send_frag(frag_item); - /* We need to post this one */ - - if(OPAL_SUCCESS != (rc = mca_btl_openib_endpoint_post_send(endpoint, frag))) { - /* if we are out of resources, let's try to reschedule everything later */ - if( OPAL_ERR_RESOURCE_BUSY != rc ) { - BTL_ERROR(("Error posting send")); - } - } - } - OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - - /* if upper layer called put or get before connection moved to connected - * state then we restart them here */ - mca_btl_openib_frag_progress_pending_put_get(endpoint, - mca_btl_openib_component.rdma_qp); -} - -/* - * Attempt to send a fragment using a given endpoint. If the endpoint is not - * connected, queue the fragment and start the connection as required. 
- */ -int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep, - mca_btl_openib_send_frag_t* frag) -{ - int rc; - - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, &to_base_frag(frag)->base, - &ep->pending_lazy_frags); - - if(OPAL_LIKELY(OPAL_SUCCESS == rc)) { - rc = mca_btl_openib_endpoint_post_send(ep, frag); - } - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_UNLIKELY(OPAL_ERR_RESOURCE_BUSY == rc)) { - rc = OPAL_SUCCESS; - } - - return rc; -} - -/** - * Return control fragment. - */ - -static void mca_btl_openib_endpoint_credits( - mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status) -{ - - int qp; - - mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(des); - - qp = frag->qp_idx; - - /* we don't acquire a WQE for credit message - so decrement. - * Note: doing it for QP used for credit management */ - (void) qp_get_wqe(ep, des->order); - - if(check_send_credits(ep, qp) || check_eager_rdma_credits(ep)) - mca_btl_openib_endpoint_send_credits(ep, qp); - else { - BTL_OPENIB_CREDITS_SEND_UNLOCK(ep, qp); - /* check one more time if credits are available after unlock */ - send_credits(ep, qp); - } -} - -/** - * Return credits to peer - */ - -void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, - const int qp) -{ - mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; - mca_btl_openib_send_control_frag_t* frag; - mca_btl_openib_rdma_credits_header_t *credits_hdr; - int rc; - bool do_rdma = false; - int32_t cm_return; - - frag = endpoint->qps[qp].credit_frag; - - if(OPAL_UNLIKELY(NULL == frag)) { - frag = alloc_control_frag(openib_btl); - frag->qp_idx = qp; - endpoint->qps[qp].credit_frag = frag; - /* set those once and forever */ - to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp; - to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits; - to_base_frag(frag)->base.des_cbdata = NULL; - 
to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;; - to_com_frag(frag)->endpoint = endpoint; - frag->hdr->tag = MCA_BTL_TAG_IB; - to_base_frag(frag)->segment.seg_len = - sizeof(mca_btl_openib_rdma_credits_header_t); - } - - assert(frag->qp_idx == qp); - credits_hdr = (mca_btl_openib_rdma_credits_header_t*) - to_base_frag(frag)->segment.seg_addr.pval; - if(OPAL_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) { - do_rdma = true; - } else { - if(OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) > - (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) { - OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); - BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp); - return; - } - } - - BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits); - - frag->hdr->cm_seen = 0; - BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return); - if(cm_return > 255) { - frag->hdr->cm_seen = 255; - cm_return -= 255; - OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return); - } else { - frag->hdr->cm_seen = cm_return; - } - - BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits); - credits_hdr->qpn = qp; - credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS; - - if(endpoint->nbo) - BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr); - - qp_reset_signal_count(endpoint, qp); - if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0) - return; - - if(endpoint->nbo) { - BTL_OPENIB_HEADER_NTOH(*frag->hdr); - BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr); - } - BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp); - OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.rd_credits, - frag->hdr->credits); - OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_local.credits, - credits_hdr->rdma_credits); - if(do_rdma) - OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_remote.tokens, 1); - else - OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); - - 
BTL_ERROR(("error posting send request errno %d says %s", rc, - strerror(errno))); -} - -/* local callback function for completion of eager rdma connect */ -static void mca_btl_openib_endpoint_eager_rdma_connect_cb( - mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - mca_btl_openib_device_t *device = endpoint->endpoint_btl->device; - OPAL_THREAD_ADD_FETCH32(&device->non_eager_rdma_endpoints, -1); - assert(device->non_eager_rdma_endpoints >= 0); - MCA_BTL_IB_FRAG_RETURN(descriptor); -} - -/* send the eager rdma connect message to the remote endpoint */ -static int mca_btl_openib_endpoint_send_eager_rdma( - mca_btl_base_endpoint_t* endpoint) -{ - mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; - mca_btl_openib_eager_rdma_header_t *rdma_hdr; - mca_btl_openib_send_control_frag_t* frag; - int rc; - - frag = alloc_control_frag(openib_btl); - if(NULL == frag) { - return -1; - } - - to_base_frag(frag)->base.des_cbfunc = - mca_btl_openib_endpoint_eager_rdma_connect_cb; - to_base_frag(frag)->base.des_cbdata = NULL; - to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp; - to_base_frag(frag)->segment.seg_len = - sizeof(mca_btl_openib_eager_rdma_header_t); - to_com_frag(frag)->endpoint = endpoint; - - frag->hdr->tag = MCA_BTL_TAG_IB; - rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval; - rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA; - rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey; - rdma_hdr->rdma_start.lval = opal_ptr_ptol(endpoint->eager_rdma_local.base.pval); - BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 - ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n", - rdma_hdr->rkey, - rdma_hdr->rdma_start.lval, - rdma_hdr->rdma_start.pval, - rdma_hdr->rdma_start.ival, - 
rdma_hdr->control.type, - (int) sizeof(mca_btl_openib_eager_rdma_header_t) - )); - - if(endpoint->nbo) { - BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr)); - - BTL_VERBOSE(("after HTON: sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n", - rdma_hdr->rkey, - rdma_hdr->rdma_start.lval, - rdma_hdr->rdma_start.pval, - rdma_hdr->rdma_start.ival - )); - } - rc = mca_btl_openib_endpoint_send(endpoint, frag); - if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) - return OPAL_SUCCESS; - - MCA_BTL_IB_FRAG_RETURN(frag); - BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno))); - return rc; -} - -/* Setup eager RDMA buffers and notify the remote endpoint*/ -void mca_btl_openib_endpoint_connect_eager_rdma( - mca_btl_openib_endpoint_t* endpoint) -{ - mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; - char *buf, *alloc_base; - mca_btl_openib_recv_frag_t *headers_buf; - int i, rc; - uint32_t flag = MCA_RCACHE_FLAGS_CACHE_BYPASS; - void *_tmp_ptr = NULL; - - /* Set local rdma pointer to 1 temporarily so other threads will not try - * to enter the function */ - if(!opal_atomic_compare_exchange_strong_ptr ((opal_atomic_intptr_t *) &endpoint->eager_rdma_local.base.pval, (intptr_t *) &_tmp_ptr, 1)) { - return; - } - - headers_buf = (mca_btl_openib_recv_frag_t*) - malloc(sizeof(mca_btl_openib_recv_frag_t) * - mca_btl_openib_component.eager_rdma_num); - - if(NULL == headers_buf) - goto unlock_rdma_local; - -#if HAVE_DECL_IBV_ACCESS_SO - /* Solaris implements the Relaxed Ordering feature defined in the - PCI Specification. With this in mind any memory region which - relies on a buffer being written in a specific order, for - example the eager rdma connections created in this routinue, - must set a strong order flag when registering the memory for - rdma operations. - - The following flag will be interpreted and the appropriate - steps will be taken when the memory is registered in - openib_reg_mr(). 
*/ - flag |= MCA_RCACHE_FLAGS_SO_MEM; -#endif - - alloc_base = buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool, - openib_btl->eager_rdma_frag_size * - mca_btl_openib_component.eager_rdma_num, - mca_btl_openib_component.buffer_alignment, - 0); - - if(!buf) - goto free_headers_buf; - - rc = openib_btl->device->rcache->rcache_register (openib_btl->device->rcache, buf, openib_btl->eager_rdma_frag_size * - mca_btl_openib_component.eager_rdma_num, flag, MCA_RCACHE_ACCESS_ANY, - (mca_rcache_base_registration_t**)&endpoint->eager_rdma_local.reg); - if (OPAL_SUCCESS != rc) { - openib_btl->super.btl_mpool->mpool_free (openib_btl->super.btl_mpool, alloc_base); - goto free_headers_buf; - } - - buf = buf + openib_btl->eager_rdma_frag_size - - sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit - - sizeof(mca_btl_openib_header_t); - - for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) { - opal_free_list_item_t *item; - mca_btl_openib_recv_frag_t * frag; - mca_btl_openib_frag_init_data_t init_data; - - item = (opal_free_list_item_t*)&headers_buf[i]; - item->registration = (mca_rcache_base_registration_t *)endpoint->eager_rdma_local.reg; - item->ptr = buf + i * openib_btl->eager_rdma_frag_size; - OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t); - - init_data.order = mca_btl_openib_component.credits_qp; - init_data.list = NULL; - - mca_btl_openib_frag_init(item, &init_data); - frag = to_recv_frag(item); - to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA; - to_com_frag(frag)->endpoint = endpoint; - frag->ftr = (mca_btl_openib_footer_t*) - ((char*)to_base_frag(frag)->segment.seg_addr.pval + - mca_btl_openib_component.eager_limit); - - MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr); - } - - endpoint->eager_rdma_local.frags = headers_buf; - - endpoint->eager_rdma_local.rd_win = - mca_btl_openib_component.eager_rdma_num >> 2; - endpoint->eager_rdma_local.rd_win = - 
endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1; - - /* set local rdma pointer to real value */ - endpoint->eager_rdma_local.base.pval = buf; - endpoint->eager_rdma_local.alloc_base = alloc_base; - - if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) { - mca_btl_openib_device_t *device = endpoint->endpoint_btl->device; - mca_btl_openib_endpoint_t **p; - void *_tmp_ptr; - OBJ_RETAIN(endpoint); - assert(((opal_object_t*)endpoint)->obj_reference_count == 2); - do { - _tmp_ptr = NULL; - p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count]; - } while(!opal_atomic_compare_exchange_strong_ptr ((opal_atomic_intptr_t *) p, (intptr_t *) &_tmp_ptr, (intptr_t) endpoint)); - - OPAL_THREAD_ADD_FETCH32(&openib_btl->eager_rdma_channels, 1); - /* from this point progress function starts to poll new buffer */ - OPAL_THREAD_ADD_FETCH32(&device->eager_rdma_buffers_count, 1); - return; - } - - openib_btl->device->rcache->rcache_deregister (openib_btl->device->rcache, - (mca_rcache_base_registration_t*)endpoint->eager_rdma_local.reg); - openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, buf); -free_headers_buf: - free(headers_buf); -unlock_rdma_local: - /* set local rdma pointer back to zero. Will retry later */ - endpoint->eager_rdma_local.base.pval = NULL; - endpoint->eager_rdma_local.frags = NULL; -} - -/* - * Invoke an error on the btl associated with an endpoint. If we - * don't have an endpoint, then just use the first one on the - * component list of BTLs. 
- */ -void *mca_btl_openib_endpoint_invoke_error(void *context) -{ - mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t*) context; - mca_btl_openib_module_t *btl = NULL; - - if (NULL == endpoint) { - int i; - for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i) { - if (NULL != mca_btl_openib_component.openib_btls[i] && - NULL != mca_btl_openib_component.openib_btls[i]->error_cb) { - btl = mca_btl_openib_component.openib_btls[i]; - break; - } - } - } else { - btl = endpoint->endpoint_btl; - endpoint->endpoint_state = MCA_BTL_IB_FAILED; - } - - /* If we didn't find a BTL, then just bail :-( */ - if (NULL == btl || NULL == btl->error_cb) { - opal_show_help("help-mpi-btl-openib.txt", - "cannot raise btl error", true, - opal_process_info.nodename, - __FILE__, __LINE__); - exit(1); - } - - /* Invoke the callback to the upper layer */ - btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); - - /* Will likely never get here */ - return NULL; -} diff --git a/opal/mca/btl/openib/btl_openib_endpoint.h b/opal/mca/btl/openib/btl_openib_endpoint.h deleted file mode 100644 index b3901c5665..0000000000 --- a/opal/mca/btl/openib/btl_openib_endpoint.h +++ /dev/null @@ -1,720 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2018 Los Alamos National Security, LLC. All rights - * reserved. 
- * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 NVIDIA Corporation. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_IB_ENDPOINT_H -#define MCA_BTL_IB_ENDPOINT_H - -#include -#include -#include "opal/class/opal_list.h" -#include "opal/mca/event/event.h" -#include "opal/util/output.h" -#include "opal/mca/btl/btl.h" -#include "opal/mca/btl/base/btl_base_error.h" -#include "btl_openib.h" -#include "btl_openib_frag.h" -#include "btl_openib_eager_rdma.h" -#include "connect/base.h" - -#define QP_TX_BATCH_COUNT 64 - -#define QP_TX_BATCH_COUNT 64 - -BEGIN_C_DECLS - -struct mca_btl_openib_frag_t; -struct mca_btl_openib_proc_modex_t; - -/** - * State of IB endpoint connection. - */ - -typedef enum { - /* Defines the state in which this BTL instance - * has started the process of connection */ - MCA_BTL_IB_CONNECTING, - - /* Waiting for ack from endpoint */ - MCA_BTL_IB_CONNECT_ACK, - - /*Waiting for final connection ACK from endpoint */ - MCA_BTL_IB_WAITING_ACK, - - /* Connected ... both sender & receiver have - * buffers associated with this connection */ - MCA_BTL_IB_CONNECTED, - - /* Connection is closed, there are no resources - * associated with this */ - MCA_BTL_IB_CLOSED, - - /* Maximum number of retries have been used. 
- * Report failure on send to upper layer */ - MCA_BTL_IB_FAILED -} mca_btl_openib_endpoint_state_t; - -typedef struct mca_btl_openib_rem_qp_info_t { - uint32_t rem_qp_num; - /* Remote QP number */ - uint32_t rem_psn; - /* Remote processes port sequence number */ -} mca_btl_openib_rem_qp_info_t; - -typedef struct mca_btl_openib_rem_srq_info_t { - /* Remote SRQ number */ - uint32_t rem_srq_num; -} mca_btl_openib_rem_srq_info_t; - -typedef struct mca_btl_openib_rem_info_t { - /* Local identifier of the remote process */ - uint16_t rem_lid; - /* subnet id of remote process */ - uint64_t rem_subnet_id; - /* MTU of remote process */ - uint32_t rem_mtu; - /* index of remote endpoint in endpoint array */ - uint32_t rem_index; - /* Remote QPs */ - mca_btl_openib_rem_qp_info_t *rem_qps; - /* Remote xrc_srq info, used only with XRC connections */ - mca_btl_openib_rem_srq_info_t *rem_srqs; - /* Vendor id of remote HCA */ - uint32_t rem_vendor_id; - /* Vendor part id of remote HCA */ - uint32_t rem_vendor_part_id; - /* Transport type of remote port */ - mca_btl_openib_transport_type_t rem_transport_type; -} mca_btl_openib_rem_info_t; - - -/** - * Agggregates all per peer qp info for an endpoint - */ -typedef struct mca_btl_openib_endpoint_pp_qp_t { - opal_atomic_int32_t sd_credits; /**< this rank's view of the credits - * available for sending: - * this is the credits granted by the - * remote peer which has some relation to the - * number of receive buffers posted remotely - */ - opal_atomic_int32_t rd_posted; /**< number of descriptors posted to the nic*/ - opal_atomic_int32_t rd_credits; /**< number of credits to return to peer */ - opal_atomic_int32_t cm_received; /**< Credit messages received */ - opal_atomic_int32_t cm_return; /**< how may credits to return */ - opal_atomic_int32_t cm_sent; /**< Outstanding number of credit messages */ -} mca_btl_openib_endpoint_pp_qp_t; - - -/** - * Aggregates all srq qp info for an endpoint - */ -typedef struct 
mca_btl_openib_endpoint_srq_qp_t { - int32_t dummy; -} mca_btl_openib_endpoint_srq_qp_t; - -typedef struct mca_btl_openib_qp_t { - struct ibv_qp *lcl_qp; - uint32_t lcl_psn; - opal_atomic_int32_t sd_wqe; /**< number of available send wqe entries */ - opal_atomic_int32_t sd_wqe_inflight; - int wqe_count; - int users; - opal_mutex_t lock; -} mca_btl_openib_qp_t; - -typedef struct mca_btl_openib_endpoint_qp_t { - mca_btl_openib_qp_t *qp; - opal_list_t no_credits_pending_frags[2]; /**< put fragment here if there is no credits - available */ - opal_list_t no_wqe_pending_frags[2]; /**< put fragments here if there is no wqe - available */ - opal_atomic_int32_t rd_credit_send_lock; /**< Lock credit send fragment */ - mca_btl_openib_send_control_frag_t *credit_frag; - size_t ib_inline_max; /**< max size of inline send*/ - union { - mca_btl_openib_endpoint_srq_qp_t srq_qp; - mca_btl_openib_endpoint_pp_qp_t pp_qp; - } u; -} mca_btl_openib_endpoint_qp_t; - -/** - * An abstraction that represents a connection to a endpoint process. - * An instance of mca_btl_base_endpoint_t is associated w/ each process - * and BTL pair at startup. However, connections to the endpoint - * are established dynamically on an as-needed basis: - */ - -struct mca_btl_base_endpoint_t { - opal_list_item_t super; - - /** BTL module that created this connection */ - struct mca_btl_openib_module_t* endpoint_btl; - - /** proc structure corresponding to endpoint */ - struct mca_btl_openib_proc_t* endpoint_proc; - - /** local CPC to connect to this endpoint */ - opal_btl_openib_connect_base_module_t *endpoint_local_cpc; - - /** hook for local CPC to hang endpoint-specific data */ - void *endpoint_local_cpc_data; - - /** If endpoint_local_cpc->cbm_uses_cts is true and this endpoint - is iWARP, then endpoint_initiator must be true on the side - that actually initiates the QP, false on the other side. This - bool is used to know which way to send the first CTS - message. 
*/ - bool endpoint_initiator; - - /** pointer to remote proc's CPC data (essentially its CPC modex - message) */ - opal_btl_openib_connect_base_module_data_t *endpoint_remote_cpc_data; - - /** current state of the connection */ - mca_btl_openib_endpoint_state_t endpoint_state; - - /** number of connection retries attempted */ - size_t endpoint_retries; - - /** timestamp of when the first connection was attempted */ - double endpoint_tstamp; - - /** lock for concurrent access to endpoint state */ - opal_mutex_t endpoint_lock; - - /** list of pending frags due to lazy connection establishment - for this endpotint */ - opal_list_t pending_lazy_frags; - - mca_btl_openib_endpoint_qp_t *qps; -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - struct ibv_qp *xrc_recv_qp; -#else - uint32_t xrc_recv_qp_num; /* in xrc we will use it as recv qp */ -#endif - uint32_t xrc_recv_psn; - - /** list of pending rget ops */ - opal_list_t pending_get_frags; - /** list of pending rput ops */ - opal_list_t pending_put_frags; - - /** number of available get tokens */ - opal_atomic_int32_t get_tokens; - - /** subnet id of this endpoint*/ - uint64_t subnet_id; - /** used only for xrc; pointer to struct that keeps remote port - info */ - struct ib_address_t *ib_addr; - - /** number of eager received */ - opal_atomic_int32_t eager_recv_count; - /** info about remote RDMA buffer */ - mca_btl_openib_eager_rdma_remote_t eager_rdma_remote; - /** info about local RDMA buffer */ - mca_btl_openib_eager_rdma_local_t eager_rdma_local; - /** index of the endpoint in endpoints array */ - int32_t index; - - /** does the endpoint require network byte ordering? */ - bool nbo; - /** use eager rdma for this peer? 
*/ - bool use_eager_rdma; - - /** information about the remote port */ - mca_btl_openib_rem_info_t rem_info; - - /** Frag for initial wireup CTS protocol; will be NULL if CPC - indicates that it does not want to use CTS */ - mca_btl_openib_recv_frag_t endpoint_cts_frag; - /** Memory registration info for the CTS frag */ - struct ibv_mr *endpoint_cts_mr; - - /** Whether we've posted receives on this EP or not (only used in - CTS protocol) */ - bool endpoint_posted_recvs; - - /** Whether we've received the CTS from the peer or not (only used - in CTS protocol) */ - bool endpoint_cts_received; - - /** Whether we've send out CTS to the peer or not (only used in - CTS protocol) */ - bool endpoint_cts_sent; -}; - -typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; -typedef mca_btl_base_endpoint_t mca_btl_openib_endpoint_t; - -OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t); - -static inline int32_t qp_get_wqe(mca_btl_openib_endpoint_t *ep, const int qp) -{ - return OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].qp->sd_wqe, -1); -} - -static inline int32_t qp_put_wqe(mca_btl_openib_endpoint_t *ep, const int qp) -{ - return OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].qp->sd_wqe, 1); -} - - -static inline int32_t qp_inc_inflight_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag) -{ - frag->n_wqes_inflight = 0; - return OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].qp->sd_wqe_inflight, 1); -} - -static inline void qp_inflight_wqe_to_frag(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag) -{ - - frag->n_wqes_inflight = ep->qps[qp].qp->sd_wqe_inflight; - ep->qps[qp].qp->sd_wqe_inflight = 0; -} - -static inline int qp_frag_to_wqe(mca_btl_openib_endpoint_t *ep, const int qp, mca_btl_openib_com_frag_t *frag) -{ - int n; - n = frag->n_wqes_inflight; - OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].qp->sd_wqe, n); - frag->n_wqes_inflight = 0; - - return n; -} - -static inline int qp_need_signal(mca_btl_openib_endpoint_t *ep, const int qp, size_t 
size, int rdma) -{ - - /* note that size here is payload only */ - if (ep->qps[qp].qp->sd_wqe <= 0 || - size + sizeof(mca_btl_openib_header_t) + (rdma ? sizeof(mca_btl_openib_footer_t) : 0) > ep->qps[qp].ib_inline_max || - (!BTL_OPENIB_QP_TYPE_PP(qp) && ep->endpoint_btl->qps[qp].u.srq_qp.sd_credits <= 0)) { - ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT; - return 1; - } - - if (0 < --ep->qps[qp].qp->wqe_count) { - return 0; - } - - ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT; - return 1; -} - -static inline void qp_reset_signal_count(mca_btl_openib_endpoint_t *ep, const int qp) -{ - ep->qps[qp].qp->wqe_count = QP_TX_BATCH_COUNT; -} - - - -int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t*, - mca_btl_openib_send_frag_t*); -int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*, - mca_btl_openib_send_frag_t*); -void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int); -void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*); -int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*); - -/* the endpoint lock must be held with OPAL_THREAD_LOCK for both CTS and cpc complete */ -void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint); -void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t*); - -void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*); -void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*, - mca_btl_base_endpoint_t*, - opal_btl_openib_connect_base_module_t *local_cpc, - struct mca_btl_openib_proc_modex_t *remote_proc_info, - opal_btl_openib_connect_base_module_data_t *remote_cpc_data); - -/* - * Invoke an error on the btl associated with an endpoint. If we - * don't have an endpoint, then just use the first one on the - * component list of BTLs. 
- */ -void *mca_btl_openib_endpoint_invoke_error(void *endpoint); - -static inline int post_recvs(mca_btl_base_endpoint_t *ep, const int qp, - const int num_post) -{ - int i, rc; - struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL; - mca_btl_openib_module_t *openib_btl = ep->endpoint_btl; - - if(0 == num_post) - return OPAL_SUCCESS; - - for(i = 0; i < num_post; i++) { - opal_free_list_item_t* item; - item = opal_free_list_wait (&openib_btl->device->qps[qp].recv_free); - to_base_frag(item)->base.order = qp; - to_com_frag(item)->endpoint = ep; - if(NULL == wr) - wr = wr_list = &to_recv_frag(item)->rd_desc; - else - wr = wr->next = &to_recv_frag(item)->rd_desc; - OPAL_OUTPUT((-1, "Posting recv (QP num %d): WR ID %p, SG addr %p, len %d, lkey %d", - ep->qps[qp].qp->lcl_qp->qp_num, - (void*) ((uintptr_t*)wr->wr_id), - (void*)((uintptr_t*) wr->sg_list[0].addr), - wr->sg_list[0].length, - wr->sg_list[0].lkey)); - } - - wr->next = NULL; - - rc = ibv_post_recv(ep->qps[qp].qp->lcl_qp, wr_list, &bad_wr); - if (0 == rc) - return OPAL_SUCCESS; - - BTL_ERROR(("error %d posting receive on qp %d", rc, qp)); - return OPAL_ERROR; -} - -static inline int mca_btl_openib_endpoint_post_rr_nolock( - mca_btl_base_endpoint_t *ep, const int qp) -{ - int rd_rsv = mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; - int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num; - int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low; - int cqp = mca_btl_openib_component.credits_qp, rc; - int cm_received = 0, num_post = 0; - - assert(BTL_OPENIB_QP_TYPE_PP(qp)); - - if(ep->qps[qp].u.pp_qp.rd_posted <= rd_low) - num_post = rd_num - ep->qps[qp].u.pp_qp.rd_posted; - - assert(num_post >= 0); - - if(ep->qps[qp].u.pp_qp.cm_received >= (rd_rsv >> 2)) - cm_received = ep->qps[qp].u.pp_qp.cm_received; - - if((rc = post_recvs(ep, qp, num_post)) != OPAL_SUCCESS) { - return rc; - } - OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].u.pp_qp.rd_posted, num_post); - 
OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].u.pp_qp.rd_credits, num_post); - - /* post buffers for credit management on credit management qp */ - if((rc = post_recvs(ep, cqp, cm_received)) != OPAL_SUCCESS) { - return rc; - } - OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].u.pp_qp.cm_return, cm_received); - OPAL_THREAD_ADD_FETCH32(&ep->qps[qp].u.pp_qp.cm_received, -cm_received); - - assert(ep->qps[qp].u.pp_qp.rd_credits <= rd_num && - ep->qps[qp].u.pp_qp.rd_credits >= 0); - - return OPAL_SUCCESS; -} - -static inline int mca_btl_openib_endpoint_post_rr( - mca_btl_base_endpoint_t *ep, const int qp) -{ - int ret; - OPAL_THREAD_LOCK(&ep->endpoint_lock); - ret = mca_btl_openib_endpoint_post_rr_nolock(ep, qp); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - return ret; -} - -static inline __opal_attribute_always_inline__ bool btl_openib_credits_send_trylock (mca_btl_openib_endpoint_t *ep, int qp) -{ - int32_t _tmp_value = 0; - return OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_32(&ep->qps[qp].rd_credit_send_lock, &_tmp_value, 1); -} - -#define BTL_OPENIB_CREDITS_SEND_UNLOCK(E, Q) \ - OPAL_ATOMIC_SWAP_32 (&(E)->qps[(Q)].rd_credit_send_lock, 0) -#define BTL_OPENIB_GET_CREDITS(FROM, TO) \ - TO = OPAL_ATOMIC_SWAP_32(&FROM, 0) - - -static inline bool check_eager_rdma_credits(const mca_btl_openib_endpoint_t *ep) -{ - return (ep->eager_rdma_local.credits > ep->eager_rdma_local.rd_win) ? true : - false; -} - -static inline bool -check_send_credits(const mca_btl_openib_endpoint_t *ep, const int qp) -{ - - if(!BTL_OPENIB_QP_TYPE_PP(qp)) - return false; - - return (ep->qps[qp].u.pp_qp.rd_credits >= - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win) ? 
true : false; -} - -static inline void send_credits(mca_btl_openib_endpoint_t *ep, int qp) -{ - if(BTL_OPENIB_QP_TYPE_PP(qp)) { - if(check_send_credits(ep, qp)) - goto try_send; - } else { - qp = mca_btl_openib_component.credits_qp; - } - - if(!check_eager_rdma_credits(ep)) - return; - -try_send: - if(btl_openib_credits_send_trylock(ep, qp)) - mca_btl_openib_endpoint_send_credits(ep, qp); -} - -static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep, - mca_btl_base_descriptor_t *des, opal_list_t *pending_list) -{ - int rc = OPAL_ERR_RESOURCE_BUSY; - - switch(ep->endpoint_state) { - case MCA_BTL_IB_CLOSED: - rc = ep->endpoint_local_cpc->cbm_start_connect(ep->endpoint_local_cpc, ep); - if (OPAL_SUCCESS == rc) { - rc = OPAL_ERR_RESOURCE_BUSY; - } - /* fall through */ - default: - opal_list_append(pending_list, (opal_list_item_t *)des); - break; - case MCA_BTL_IB_FAILED: - rc = OPAL_ERR_UNREACH; - break; - case MCA_BTL_IB_CONNECTED: - rc = OPAL_SUCCESS; - break; - } - - return rc; -} - -static inline __opal_attribute_always_inline__ int -ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp, int do_signal) -{ - if (do_signal) { - return IBV_SEND_SIGNALED | - ((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0); - } else { - return ((size <= qp->ib_inline_max) ? 
IBV_SEND_INLINE : 0); - } -} - -static inline int -acquire_eager_rdma_send_credit(mca_btl_openib_endpoint_t *endpoint) -{ - if(OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_remote.tokens, -1) < 0) { - OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_remote.tokens, 1); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - return OPAL_SUCCESS; -} - -static inline int post_send(mca_btl_openib_endpoint_t *ep, - mca_btl_openib_send_frag_t *frag, const bool rdma, int do_signal) -{ - mca_btl_openib_module_t *openib_btl = ep->endpoint_btl; - mca_btl_base_segment_t *seg = &to_base_frag(frag)->segment; - struct ibv_sge *sg = &to_com_frag(frag)->sg_entry; - struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc; - struct ibv_send_wr *bad_wr; - int qp = to_base_frag(frag)->base.order; - - sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) + - (rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length; - - sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]), do_signal); - - if(ep->nbo) - BTL_OPENIB_HEADER_HTON(*frag->hdr); - - if(rdma) { - int32_t head; - mca_btl_openib_footer_t* ftr = - (mca_btl_openib_footer_t*)(((char*)frag->hdr) + sg->length + - BTL_OPENIB_FTR_PADDING(sg->length) - sizeof(mca_btl_openib_footer_t)); - sr_desc->opcode = IBV_WR_RDMA_WRITE; - MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length); - MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr); -#if OPAL_ENABLE_DEBUG - /* NTH: generate the sequence from the remote head index to ensure that the - * wrong sequence isn't set. The way this code used to look the sequence number - * and head were updated independently and it led to false positives for incorrect - * sequence numbers. 
*/ - MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head, ftr->seq); -#else - MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head); -#endif - if(ep->nbo) - BTL_OPENIB_FOOTER_HTON(*ftr); - - sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey; - sr_desc->wr.rdma.remote_addr = - ep->eager_rdma_remote.base.lval + - head * openib_btl->eager_rdma_frag_size + - sizeof(mca_btl_openib_header_t) + - mca_btl_openib_component.eager_limit + - sizeof(mca_btl_openib_footer_t); - sr_desc->wr.rdma.remote_addr -= sg->length + BTL_OPENIB_FTR_PADDING(sg->length); - } else { - if(BTL_OPENIB_QP_TYPE_PP(qp)) { - sr_desc->opcode = IBV_WR_SEND; - } else { - sr_desc->opcode = IBV_WR_SEND_WITH_IMM; -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - sr_desc->imm_data = htonl(ep->rem_info.rem_index); -#else - sr_desc->imm_data = ep->rem_info.rem_index; -#endif - } - } - -#if HAVE_XRC -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - if(BTL_OPENIB_QP_TYPE_XRC(qp)) - sr_desc->qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; -#else - if(BTL_OPENIB_QP_TYPE_XRC(qp)) - sr_desc->xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif -#endif - assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr); - - if (sr_desc->send_flags & IBV_SEND_SIGNALED) { - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - } else { - qp_inc_inflight_wqe(ep, qp, to_com_frag(frag)); - } - - return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr); -} - -/* called with the endpoint lock held */ -static inline int mca_btl_openib_endpoint_credit_acquire (struct mca_btl_base_endpoint_t *endpoint, int qp, - int prio, size_t size, bool *do_rdma, - mca_btl_openib_send_frag_t *frag, bool queue_frag) -{ - mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl; - mca_btl_openib_header_t *hdr = frag->hdr; - size_t eager_limit; - int32_t cm_return; - - eager_limit = mca_btl_openib_component.eager_limit + - sizeof(mca_btl_openib_header_coalesced_t) + - 
sizeof(mca_btl_openib_control_header_t); - - if (!(prio && size < eager_limit && acquire_eager_rdma_send_credit(endpoint) == OPAL_SUCCESS)) { - *do_rdma = false; - prio = !prio; - - if (BTL_OPENIB_QP_TYPE_PP(qp)) { - if (OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) { - OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1); - if (queue_frag) { - opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio], - (opal_list_item_t *)frag); - } - - return OPAL_ERR_OUT_OF_RESOURCE; - } - } else { - if(OPAL_THREAD_ADD_FETCH32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) { - OPAL_THREAD_ADD_FETCH32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); - if (queue_frag) { - OPAL_THREAD_LOCK(&openib_btl->ib_lock); - opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio], - (opal_list_item_t *)frag); - OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - } - - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - } else { - /* High priority frag. Try to send over eager RDMA */ - *do_rdma = true; - } - - /* Set all credits */ - BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits); - if (hdr->credits) { - hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG; - } - - if (!*do_rdma) { - if (BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) { - BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits); - } - } else { - hdr->credits |= (qp << 11); - } - - BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return); - /* cm_seen is only 8 bytes, but cm_return is 32 bytes */ - if(cm_return > 255) { - hdr->cm_seen = 255; - cm_return -= 255; - OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return); - } else { - hdr->cm_seen = cm_return; - } - - return OPAL_SUCCESS; -} - -/* called with the endpoint lock held. 
*/ -static inline void mca_btl_openib_endpoint_credit_release (struct mca_btl_base_endpoint_t *endpoint, int qp, - bool do_rdma, mca_btl_openib_send_frag_t *frag) -{ - mca_btl_openib_header_t *hdr = frag->hdr; - - if (BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) { - OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_local.credits, BTL_OPENIB_CREDITS(hdr->credits)); - } - - if (do_rdma) { - OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_remote.tokens, 1); - } else { - if(BTL_OPENIB_QP_TYPE_PP(qp)) { - OPAL_THREAD_ADD_FETCH32 (&endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits); - OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1); - } else if BTL_OPENIB_QP_TYPE_SRQ(qp){ - mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl; - OPAL_THREAD_ADD_FETCH32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); - } - } -} - -END_C_DECLS - -#endif diff --git a/opal/mca/btl/openib/btl_openib_frag.c b/opal/mca/btl/openib/btl_openib_frag.c deleted file mode 100644 index d8d94b939b..0000000000 --- a/opal/mca/btl/openib/btl_openib_frag.c +++ /dev/null @@ -1,222 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_openib.h" -#include "btl_openib_frag.h" -#include "btl_openib_eager_rdma.h" - -int mca_btl_openib_frag_init(opal_free_list_item_t* item, void* ctx) -{ - mca_btl_openib_frag_init_data_t* init_data = (mca_btl_openib_frag_init_data_t *) ctx; - mca_btl_openib_frag_t *frag = to_base_frag(item); - - if(MCA_BTL_OPENIB_FRAG_RECV == frag->type) { - to_recv_frag(frag)->qp_idx = init_data->order; - to_com_frag(frag)->sg_entry.length = - mca_btl_openib_component.qp_infos[init_data->order].size + - sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t); - } - - if(MCA_BTL_OPENIB_FRAG_SEND == frag->type) - to_send_frag(frag)->qp_idx = init_data->order; - - frag->list = init_data->list; - - return OPAL_SUCCESS; -} - -static void base_constructor(mca_btl_openib_frag_t *frag) -{ - frag->base.order = MCA_BTL_NO_ORDER; -} - -static void com_constructor(mca_btl_openib_com_frag_t *frag) -{ - mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - mca_btl_openib_reg_t* reg = - (mca_btl_openib_reg_t*)base_frag->base.super.registration; - - frag->registration = reg; - - if(reg) { - frag->sg_entry.lkey = reg->mr->lkey; - } - frag->n_wqes_inflight = 0; -} - -static void out_constructor(mca_btl_openib_out_frag_t *frag) -{ - mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - - base_frag->base.des_segments = &base_frag->segment; - base_frag->base.des_segment_count = 1; - - frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag; - frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry; - frag->sr_desc.num_sge = 1; - frag->sr_desc.opcode = IBV_WR_SEND; - frag->sr_desc.send_flags = IBV_SEND_SIGNALED; - frag->sr_desc.next = NULL; -} - -static void in_constructor(mca_btl_openib_in_frag_t *frag) -{ - mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - - base_frag->base.des_segments = &base_frag->segment; - 
base_frag->base.des_segment_count = 1; -} - -static void send_constructor(mca_btl_openib_send_frag_t *frag) -{ - mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - - base_frag->type = MCA_BTL_OPENIB_FRAG_SEND; - - frag->chdr = (mca_btl_openib_header_t*)base_frag->base.super.ptr; - frag->hdr = (mca_btl_openib_header_t*) - (((unsigned char*)base_frag->base.super.ptr) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t)); - base_frag->segment.seg_addr.pval = frag->hdr + 1; - to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr; - frag->coalesced_length = 0; - OBJ_CONSTRUCT(&frag->coalesced_frags, opal_list_t); -} - -static void recv_constructor(mca_btl_openib_recv_frag_t *frag) -{ - mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - - base_frag->type = MCA_BTL_OPENIB_FRAG_RECV; - - frag->hdr = (mca_btl_openib_header_t*)base_frag->base.super.ptr; - base_frag->segment.seg_addr.pval = - ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); - to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr; - - frag->rd_desc.wr_id = (uint64_t)(uintptr_t)frag; - frag->rd_desc.sg_list = &to_com_frag(frag)->sg_entry; - frag->rd_desc.num_sge = 1; - frag->rd_desc.next = NULL; -} - -static void send_control_constructor(mca_btl_openib_send_control_frag_t *frag) -{ - to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_CONTROL; - /* adjusting headers because there is no coalesce header in control messages */ - frag->hdr = frag->chdr; - to_base_frag(frag)->segment.seg_addr.pval = frag->hdr + 1; - to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr; -} - -static void put_constructor(mca_btl_openib_put_frag_t *frag) -{ - to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER; - to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; - frag->cb.func = NULL; -} - -static void get_constructor(mca_btl_openib_get_frag_t *frag) -{ - to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_RECV_USER; - - 
frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag; - frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry; - frag->sr_desc.num_sge = 1; - frag->sr_desc.opcode = IBV_WR_RDMA_READ; - frag->sr_desc.send_flags = IBV_SEND_SIGNALED; - frag->sr_desc.next = NULL; -} - -static void coalesced_constructor(mca_btl_openib_coalesced_frag_t *frag) -{ - mca_btl_openib_frag_t *base_frag = to_base_frag(frag); - - base_frag->type = MCA_BTL_OPENIB_FRAG_COALESCED; - - base_frag->base.des_segments = &base_frag->segment; - base_frag->base.des_segment_count = 1; -} - -OBJ_CLASS_INSTANCE( - mca_btl_openib_frag_t, - mca_btl_base_descriptor_t, - base_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_com_frag_t, - mca_btl_openib_frag_t, - com_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_out_frag_t, - mca_btl_openib_com_frag_t, - out_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_in_frag_t, - mca_btl_openib_com_frag_t, - in_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_send_frag_t, - mca_btl_openib_out_frag_t, - send_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_recv_frag_t, - mca_btl_openib_in_frag_t, - recv_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_send_control_frag_t, - mca_btl_openib_send_frag_t, - send_control_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_put_frag_t, - mca_btl_openib_out_frag_t, - put_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_get_frag_t, - mca_btl_openib_in_frag_t, - get_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_coalesced_frag_t, - mca_btl_openib_frag_t, - coalesced_constructor, - NULL); diff --git a/opal/mca/btl/openib/btl_openib_frag.h b/opal/mca/btl/openib/btl_openib_frag.h deleted file mode 100644 index d140fe4a8a..0000000000 --- a/opal/mca/btl/openib/btl_openib_frag.h +++ /dev/null @@ -1,422 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana 
University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_IB_FRAG_H -#define MCA_BTL_IB_FRAG_H - -#include "opal_config.h" -#include "opal/align.h" -#include "opal/mca/btl/btl.h" - -#include - -BEGIN_C_DECLS - -struct mca_btl_openib_reg_t; - -struct mca_btl_openib_header_t { - mca_btl_base_tag_t tag; - uint8_t cm_seen; - uint16_t credits; -#if OPAL_OPENIB_PAD_HDR - uint8_t padding[4]; -#endif -}; -typedef struct mca_btl_openib_header_t mca_btl_openib_header_t; -#define BTL_OPENIB_RDMA_CREDITS_FLAG (1<<15) -#define BTL_OPENIB_IS_RDMA_CREDITS(I) ((I)&BTL_OPENIB_RDMA_CREDITS_FLAG) -#define BTL_OPENIB_CREDITS(I) ((I)&~BTL_OPENIB_RDMA_CREDITS_FLAG) - -#define BTL_OPENIB_HEADER_HTON(h) \ -do { \ - (h).credits = htons((h).credits); \ -} while (0) - -#define BTL_OPENIB_HEADER_NTOH(h) \ -do { \ - (h).credits = ntohs((h).credits); \ -} while (0) - -typedef struct mca_btl_openib_header_coalesced_t { - mca_btl_base_tag_t tag; - uint32_t size; - uint32_t alloc_size; -#if OPAL_OPENIB_PAD_HDR - uint8_t padding[4]; -#endif -} mca_btl_openib_header_coalesced_t; - -#define BTL_OPENIB_HEADER_COALESCED_NTOH(h) \ - do { \ - (h).size = ntohl((h).size); \ - (h).alloc_size = ntohl((h).alloc_size); \ - } while(0) - 
-#define BTL_OPENIB_HEADER_COALESCED_HTON(h) \ - do { \ - (h).size = htonl((h).size); \ - (h).alloc_size = htonl((h).alloc_size); \ - } while(0) - -#if OPAL_OPENIB_PAD_HDR -/* BTL_OPENIB_FTR_PADDING - * This macro is used to keep the pointer to openib footers aligned for - * systems like SPARC64 that take a big performance hit when addresses - * are not aligned (and by default sigbus instead of coercing the type on - * an unaligned address). - * - * We assure alignment of a packet's structures when OPAL_OPENIB_PAD_HDR - * is set to 1. When this is the case then several structures are padded - * to assure alignment and the mca_btl_openib_footer_t structure itself - * will uses the BTL_OPENIB_FTR_PADDING macro to shift the location of the - * pointer to assure proper alignment after the PML Header and data. - * For example sending a 1 byte data packet the memory layout without - * footer alignment would look something like the following: - * - * 0x00 : mca_btl_openib_coalesced_header_t (12 bytes + 4 byte pad) - * 0x10 : mca_btl_openib_control_header_t (1 byte + 7 byte pad) - * 0x18 : mca_btl_openib_header_t (4 bytes + 4 byte pad) - * 0x20 : PML Header and data (16 bytes PML + 1 byte data) - * 0x29 : mca_btl_openib_footer_t (4 bytes + 4 byte pad) - * 0x31 : end of packet - * - * By applying the BTL_OPENIB_FTR_PADDING() in the progress_one_device - * and post_send routines we adjust the pointer to mca_btl_openib_footer_t - * from 0x29 to 0x2C thus correctly aligning the start of the - * footer pointer. This adjustment will cause the padding field of - * mca_btl_openib_footer_t to overlap with the neighboring memory but since - * we never use the padding we do not end up inadvertently overwriting - * memory that does not belong to the fragment. 
- */ -#define BTL_OPENIB_FTR_PADDING(size) \ - OPAL_ALIGN_PAD_AMOUNT(size, sizeof(uint64_t)) - -/* BTL_OPENIB_ALIGN_COALESCE_HDR - * This macro is used in btl_openib.c, while creating a coalesce fragment, - * to align the coalesce headers. - */ -#define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) \ - OPAL_ALIGN_PTR(ptr, sizeof(uint32_t), unsigned char*) - -/* BTL_OPENIB_COALESCE_HDR_PADDING - * This macro is used in btl_openib_component.c, while parsing an incoming - * coalesce fragment, to determine the padding amount used to align the - * mca_btl_openib_coalesce_hdr_t. - */ -#define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) \ - OPAL_ALIGN_PAD_AMOUNT(ptr, sizeof(uint32_t)) -#else -#define BTL_OPENIB_FTR_PADDING(size) 0 -#define BTL_OPENIB_ALIGN_COALESCE_HDR(ptr) ptr -#define BTL_OPENIB_COALESCE_HDR_PADDING(ptr) 0 -#endif - -struct mca_btl_openib_footer_t { -#if OPAL_ENABLE_DEBUG - uint32_t seq; -#endif - union { - uint32_t size; - uint8_t buf[4]; - } u; -#if OPAL_OPENIB_PAD_HDR -#if OPAL_ENABLE_DEBUG - /* this footer needs to be of a 8-byte multiple so by adding the - * seq field you throw this off and you cannot just remove the - * padding because the padding is needed in order to adjust the alignment - * and not overwrite other packets. 
- */ - uint8_t padding[12]; -#else - uint8_t padding[8]; -#endif -#endif -}; -typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t; - -#ifdef WORDS_BIGENDIAN -#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr) -#else -#define MCA_BTL_OPENIB_FTR_SIZE_REVERSE(ftr) \ - do { \ - uint8_t tmp = (ftr).u.buf[0]; \ - (ftr).u.buf[0]=(ftr).u.buf[2]; \ - (ftr).u.buf[2]=tmp; \ - } while (0) -#endif - -#if OPAL_ENABLE_DEBUG -#define BTL_OPENIB_FOOTER_SEQ_HTON(h) ((h).seq = htonl((h).seq)) -#define BTL_OPENIB_FOOTER_SEQ_NTOH(h) ((h).seq = ntohl((h).seq)) -#else -#define BTL_OPENIB_FOOTER_SEQ_HTON(h) -#define BTL_OPENIB_FOOTER_SEQ_NTOH(h) -#endif - -#define BTL_OPENIB_FOOTER_HTON(h) \ - do { \ - BTL_OPENIB_FOOTER_SEQ_HTON(h); \ - MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \ - } while (0) - -#define BTL_OPENIB_FOOTER_NTOH(h) \ - do { \ - BTL_OPENIB_FOOTER_SEQ_NTOH(h); \ - MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \ - } while (0) - -#define MCA_BTL_OPENIB_CONTROL_CREDITS 0 -#define MCA_BTL_OPENIB_CONTROL_RDMA 1 -#define MCA_BTL_OPENIB_CONTROL_COALESCED 2 -#define MCA_BTL_OPENIB_CONTROL_CTS 3 - -struct mca_btl_openib_control_header_t { - uint8_t type; -#if OPAL_OPENIB_PAD_HDR - uint8_t padding[7]; -#endif -}; -typedef struct mca_btl_openib_control_header_t mca_btl_openib_control_header_t; - -struct mca_btl_openib_eager_rdma_header_t { - mca_btl_openib_control_header_t control; - uint32_t rkey; - opal_ptr_t rdma_start; -}; -typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t; - -#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON(h) \ - do { \ - (h).rkey = htonl((h).rkey); \ - (h).rdma_start.lval = hton64((h).rdma_start.lval); \ - } while (0) - -#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h) \ - do { \ - (h).rkey = ntohl((h).rkey); \ - (h).rdma_start.lval = ntoh64((h).rdma_start.lval); \ - } while (0) - - -struct mca_btl_openib_rdma_credits_header_t { - mca_btl_openib_control_header_t control; -#if OPAL_OPENIB_PAD_HDR - uint8_t padding[1]; -#endif - uint8_t 
qpn; - uint16_t rdma_credits; -}; -typedef struct mca_btl_openib_rdma_credits_header_t mca_btl_openib_rdma_credits_header_t; - -#define BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(h) \ -do { \ - (h).rdma_credits = htons((h).rdma_credits); \ -} while (0) - -#define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h) \ -do { \ - (h).rdma_credits = ntohs((h).rdma_credits); \ -} while (0) - -enum mca_btl_openib_frag_type_t { - MCA_BTL_OPENIB_FRAG_RECV, - MCA_BTL_OPENIB_FRAG_RECV_USER, - MCA_BTL_OPENIB_FRAG_SEND, - MCA_BTL_OPENIB_FRAG_SEND_USER, - MCA_BTL_OPENIB_FRAG_EAGER_RDMA, - MCA_BTL_OPENIB_FRAG_CONTROL, - MCA_BTL_OPENIB_FRAG_COALESCED -}; -typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t; - -#define openib_frag_type(f) (to_base_frag(f)->type) -/** - * IB fragment derived type. - */ -/* base openib frag */ -typedef struct mca_btl_openib_frag_t { - mca_btl_base_descriptor_t base; - mca_btl_base_segment_t segment; - mca_btl_openib_frag_type_t type; - opal_free_list_t* list; -} mca_btl_openib_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t); - -#define to_base_frag(f) ((mca_btl_openib_frag_t*)(f)) - -/* frag used for communication */ -typedef struct mca_btl_openib_com_frag_t { - mca_btl_openib_frag_t super; - struct ibv_sge sg_entry; - struct mca_btl_openib_reg_t *registration; - struct mca_btl_base_endpoint_t *endpoint; - /* number of unsignaled frags sent before this frag. 
*/ - uint32_t n_wqes_inflight; -} mca_btl_openib_com_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t); - -#define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f)) - -typedef struct mca_btl_openib_out_frag_t { - mca_btl_openib_com_frag_t super; - struct ibv_send_wr sr_desc; -} mca_btl_openib_out_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_out_frag_t); - -#define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f)) - -typedef struct mca_btl_openib_com_frag_t mca_btl_openib_in_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t); - -#define to_in_frag(f) ((mca_btl_openib_in_frag_t*)(f)) - -typedef struct mca_btl_openib_send_frag_t { - mca_btl_openib_out_frag_t super; - mca_btl_openib_header_t *hdr, *chdr; - mca_btl_openib_footer_t *ftr; - uint8_t qp_idx; - uint32_t coalesced_length; - opal_list_t coalesced_frags; -} mca_btl_openib_send_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t); - -#define to_send_frag(f) ((mca_btl_openib_send_frag_t*)(f)) - -typedef struct mca_btl_openib_recv_frag_t { - mca_btl_openib_in_frag_t super; - mca_btl_openib_header_t *hdr; - mca_btl_openib_footer_t *ftr; - struct ibv_recv_wr rd_desc; - uint8_t qp_idx; -} mca_btl_openib_recv_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t); - -#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f)) - -typedef struct mca_btl_openib_put_frag_t { - mca_btl_openib_out_frag_t super; - struct { - mca_btl_base_rdma_completion_fn_t func; - mca_btl_base_registration_handle_t *local_handle; - void *context; - void *data; - } cb; -} mca_btl_openib_put_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); - -#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f)) - -typedef struct mca_btl_openib_get_frag_t { - mca_btl_openib_in_frag_t super; - struct ibv_send_wr sr_desc; - struct { - mca_btl_base_rdma_completion_fn_t func; - mca_btl_base_registration_handle_t *local_handle; - void *context; - void *data; - } cb; -} mca_btl_openib_get_frag_t; 
-OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t); - -#define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f)) - -typedef struct mca_btl_openib_send_frag_t mca_btl_openib_send_control_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t); - -#define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f)) - -typedef struct mca_btl_openib_coalesced_frag_t { - mca_btl_openib_frag_t super; - mca_btl_openib_send_frag_t *send_frag; - mca_btl_openib_header_coalesced_t *hdr; - bool sent; -} mca_btl_openib_coalesced_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t); - -#define to_coalesced_frag(f) ((mca_btl_openib_coalesced_frag_t*)(f)) - -/* - * Allocate an IB send descriptor - * - */ - -static inline mca_btl_openib_send_control_frag_t * -alloc_control_frag(mca_btl_openib_module_t *btl) -{ - return to_send_control_frag(opal_free_list_wait (&btl->device->send_free_control)); -} - -static inline uint8_t frag_size_to_order(mca_btl_openib_module_t* btl, - size_t size) -{ - int qp; - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) - if(mca_btl_openib_component.qp_infos[qp].size >= size) - return qp; - - return MCA_BTL_NO_ORDER; -} - -static inline mca_btl_openib_com_frag_t *alloc_send_user_frag(void) -{ - return to_com_frag(opal_free_list_get (&mca_btl_openib_component.send_user_free)); -} - -static inline mca_btl_openib_com_frag_t *alloc_recv_user_frag(void) -{ - return to_com_frag(opal_free_list_get (&mca_btl_openib_component.recv_user_free)); -} - -static inline mca_btl_openib_coalesced_frag_t *alloc_coalesced_frag(void) -{ - return to_coalesced_frag(opal_free_list_get (&mca_btl_openib_component.send_free_coalesced)); -} - -#define MCA_BTL_IB_FRAG_RETURN(frag) \ - do { \ - opal_free_list_return (to_base_frag(frag)->list, \ - (opal_free_list_item_t*)(frag)); \ - } while(0) - -#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list) \ - do { \ - opal_list_item_t *_frag_item; \ - while (NULL != (_frag_item = opal_list_remove_first(list))) 
{ \ - MCA_BTL_IB_FRAG_RETURN(_frag_item); \ - } \ - } while (0) - -struct mca_btl_openib_module_t; - -struct mca_btl_openib_frag_init_data_t { - uint8_t order; - opal_free_list_t* list; -}; -typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t; - -int mca_btl_openib_frag_init(opal_free_list_item_t* item, void* ctx); - - -END_C_DECLS -#endif diff --git a/opal/mca/btl/openib/btl_openib_get.c b/opal/mca/btl/openib/btl_openib_get.c deleted file mode 100644 index 6dc73bc6e4..0000000000 --- a/opal/mca/btl/openib/btl_openib_get.c +++ /dev/null @@ -1,167 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved - * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_openib.h" -#include "btl_openib_frag.h" -#include "btl_openib_endpoint.h" -#include "btl_openib_proc.h" -#include "btl_openib_xrc.h" - -/* - * RDMA READ remote buffer to local buffer address. - */ - -int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - mca_btl_openib_get_frag_t* frag = NULL; - int qp = order; - int rc; - - if (OPAL_UNLIKELY(size > btl->btl_get_limit)) { - return OPAL_ERR_BAD_PARAM; - } - - frag = to_get_frag(alloc_recv_user_frag()); - if (OPAL_UNLIKELY(NULL == frag)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - if (MCA_BTL_NO_ORDER == qp) { - qp = mca_btl_openib_component.rdma_qp; - } - - /* set base descriptor flags */ - to_base_frag(frag)->base.order = qp; - /* free this descriptor when the operation is complete */ - to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - - /* set up scatter-gather entry */ - to_com_frag(frag)->sg_entry.length = size; - to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; - to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address; - to_com_frag(frag)->endpoint = ep; - - /* set up rdma callback */ - frag->cb.func = cbfunc; - frag->cb.context = cbcontext; - frag->cb.data = cbdata; - frag->cb.local_handle = local_handle; - - /* set up descriptor */ - frag->sr_desc.wr.rdma.remote_addr = remote_address; - /* the opcode may have been changed by an atomic operation */ - frag->sr_desc.opcode = IBV_WR_RDMA_READ; - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - 
frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey); - } else -#endif - { - frag->sr_desc.wr.rdma.rkey = remote_handle->rkey; - } - - if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_ERR_RESOURCE_BUSY == rc) { - return OPAL_SUCCESS; - } - - if (OPAL_SUCCESS != rc) { - MCA_BTL_IB_FRAG_RETURN (frag); - return rc; - } - } - - rc = mca_btl_openib_get_internal (btl, ep, frag); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { - rc = OPAL_SUCCESS; - - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - } else { - MCA_BTL_IB_FRAG_RETURN (frag); - } - } - - return rc; -} - -int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - mca_btl_openib_get_frag_t *frag) -{ - int qp = to_base_frag(frag)->base.order; - struct ibv_send_wr *bad_wr; - -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { - /* NTH: the remote SRQ number is only available once the endpoint is connected. By - * setting the value here instead of mca_btl_openib_get we guarantee the rem_srqs - * array is initialized. 
*/ -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; -#else - frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif - } -#endif - - /* check for a send wqe */ - if (qp_get_wqe(ep, qp) < 0) { - qp_put_wqe(ep, qp); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* check for a get token */ - if (OPAL_THREAD_ADD_FETCH32(&ep->get_tokens,-1) < 0) { - qp_put_wqe(ep, qp); - OPAL_THREAD_ADD_FETCH32(&ep->get_tokens,1); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - - if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) { - qp_put_wqe(ep, qp); - OPAL_THREAD_ADD_FETCH32(&ep->get_tokens,1); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/openib/btl_openib_ini.c b/opal/mca/btl/openib/btl_openib_ini.c deleted file mode 100644 index ec74193c0f..0000000000 --- a/opal/mca/btl/openib/btl_openib_ini.c +++ /dev/null @@ -1,664 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2008 Mellanox Technologies. All rights reserved. - * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 Intel, Inc. 
All rights reserved - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include -#include - -#ifdef HAVE_UNISTD_H -#include -#endif - -#include "opal/util/show_help.h" -#include "opal/util/string_copy.h" - -#include "btl_openib.h" -#include "btl_openib_lex.h" -#include "btl_openib_ini.h" - -static const char *ini_filename = NULL; -static bool initialized = false; -static opal_list_t devices; -static char *key_buffer = NULL; -static size_t key_buffer_len = 0; - - -/* - * Struct to hold the section name, vendor ID, and list of vendor part - * ID's and a corresponding set of values (parsed from an INI file). - */ -typedef struct parsed_section_values_t { - char *name; - - uint32_t *vendor_ids; - int vendor_ids_len; - - uint32_t *vendor_part_ids; - int vendor_part_ids_len; - - opal_btl_openib_ini_values_t values; -} parsed_section_values_t; - -/* - * Struct to hold the final values. 
Different from above in a few ways: - * - * - The vendor and part IDs will always be set properly - * - There will only be one part ID (i.e., the above struct is - * exploded into multiple of these for each of searching) - * - There is a super of opal_list_item_t so that we can have a list - * of these - */ -typedef struct device_values_t { - opal_list_item_t super; - - char *section_name; - uint32_t vendor_id; - uint32_t vendor_part_id; - - opal_btl_openib_ini_values_t values; -} device_values_t; - -static void device_values_constructor(device_values_t *s); -static void device_values_destructor(device_values_t *s); - -OBJ_CLASS_INSTANCE(device_values_t, - opal_list_item_t, - device_values_constructor, - device_values_destructor); - - -/* - * Local functions - */ -static int parse_file(char *filename); -static int parse_line(parsed_section_values_t *item); -static void reset_section(bool had_previous_value, parsed_section_values_t *s); -static void reset_values(opal_btl_openib_ini_values_t *v); -static int save_section(parsed_section_values_t *s); - - -/* - * Read the INI files for device-specific values and save them in - * internal data structures for later lookup. - */ -int opal_btl_openib_ini_init(void) -{ - int ret = OPAL_ERR_NOT_FOUND; - char *colon; - char separator = ':'; - - OBJ_CONSTRUCT(&devices, opal_list_t); - - colon = strchr(mca_btl_openib_component.device_params_file_names, separator); - if (NULL == colon) { - /* If we've only got 1 file (i.e., no colons found), parse it - and be done */ - ret = parse_file(mca_btl_openib_component.device_params_file_names); - } else { - /* Otherwise, loop over all the files and parse them */ - char *orig = strdup(mca_btl_openib_component.device_params_file_names); - char *str = orig; - - while (NULL != (colon = strchr(str, ':'))) { - *colon = '\0'; - ret = parse_file(str); - /* Note that NOT_FOUND and SUCCESS are not fatal errors - and we keep going. 
Other errors are treated as - fatal */ - if (OPAL_ERR_NOT_FOUND != ret && OPAL_SUCCESS != ret) { - break; - } - str = colon + 1; - } - /* Parse the last file if we didn't have a fatal error above */ - if (OPAL_ERR_NOT_FOUND != ret && OPAL_SUCCESS != ret) { - ret = parse_file(str); - } - - /* All done */ - free(orig); - } - - /* Return SUCCESS unless we got a fatal error */ - - initialized = true; - return (OPAL_SUCCESS == ret || OPAL_ERR_NOT_FOUND == ret) ? - OPAL_SUCCESS : ret; -} - - -/* - * The component found a device and is querying to see if an INI file - * specified any parameters for it. - */ -int opal_btl_openib_ini_query(uint32_t vendor_id, uint32_t vendor_part_id, - opal_btl_openib_ini_values_t *values) -{ - int ret; - device_values_t *h; - - if (!initialized) { - if (OPAL_SUCCESS != (ret = opal_btl_openib_ini_init())) { - return ret; - } - } - - if (mca_btl_openib_component.verbose) { - BTL_OUTPUT(("Querying INI files for vendor 0x%04x, part ID %d", - vendor_id, vendor_part_id)); - } - - reset_values(values); - - /* Iterate over all the saved devices */ - OPAL_LIST_FOREACH(h, &devices, device_values_t) { - if (vendor_id == h->vendor_id && - vendor_part_id == h->vendor_part_id) { - /* Found it! */ - /* NOTE: There is a bug in the PGI 6.2 series that causes - the compiler to choke when copying structs containing - bool members by value. So do a memcpy here instead. 
*/ - memcpy(values, &h->values, sizeof(h->values)); - if (mca_btl_openib_component.verbose) { - BTL_OUTPUT(("Found corresponding INI values: %s", - h->section_name)); - } - return OPAL_SUCCESS; - } - } - - /* If we fall through to here, we didn't find it */ - if (mca_btl_openib_component.verbose) { - BTL_OUTPUT(("Did not find corresponding INI values")); - } - return OPAL_ERR_NOT_FOUND; -} - - -/* - * The component is shutting down; release all internal state - */ -int opal_btl_openib_ini_finalize(void) -{ - if (initialized) { - OPAL_LIST_DESTRUCT(&devices); - initialized = true; - } - - return OPAL_SUCCESS; -} - -/**************************************************************************/ - -/* - * Parse a single file - */ -static int parse_file(char *filename) -{ - int val; - int ret = OPAL_SUCCESS; - bool showed_no_section_warning = false; - bool showed_unexpected_tokens_warning = false; - parsed_section_values_t section; - - reset_section(false, §ion); - - /* Open the file */ - ini_filename = filename; - btl_openib_ini_yyin = fopen(filename, "r"); - if (NULL == btl_openib_ini_yyin) { - opal_show_help("help-mpi-btl-openib.txt", "ini file:file not found", - true, filename); - ret = OPAL_ERR_NOT_FOUND; - goto cleanup; - } - - /* Do the parsing */ - btl_openib_ini_parse_done = false; - btl_openib_ini_yynewlines = 1; - btl_openib_ini_init_buffer(btl_openib_ini_yyin); - while (!btl_openib_ini_parse_done) { - val = btl_openib_ini_yylex(); - switch (val) { - case BTL_OPENIB_INI_PARSE_DONE: - /* This will also set btl_openib_ini_parse_done to true, so just - break here */ - break; - - case BTL_OPENIB_INI_PARSE_NEWLINE: - /* blank line! ignore it */ - break; - - case BTL_OPENIB_INI_PARSE_SECTION: - /* We're starting a new section; if we have previously - parsed a section, go see if we can use its values. 
*/ - save_section(§ion); - - reset_section(true, §ion); - section.name = strdup(btl_openib_ini_yytext); - break; - - case BTL_OPENIB_INI_PARSE_SINGLE_WORD: - if (NULL == section.name) { - /* Warn that there is no current section, and ignore - this parameter */ - if (!showed_no_section_warning) { - opal_show_help("help-mpi-btl-openib.txt", "ini file:not in a section", true); - showed_no_section_warning = true; - } - /* Parse it and then dump it */ - parse_line(§ion); - reset_section(true, §ion); - } else { - parse_line(§ion); - } - break; - - default: - /* anything else is an error */ - if (!showed_unexpected_tokens_warning) { - opal_show_help("help-mpi-btl-openib.txt", "ini file:unexpected token", true); - showed_unexpected_tokens_warning = true; - } - break; - } - } - save_section(§ion); - fclose(btl_openib_ini_yyin); - btl_openib_ini_yylex_destroy (); - -cleanup: - reset_section(true, §ion); - if (NULL != key_buffer) { - free(key_buffer); - key_buffer = NULL; - key_buffer_len = 0; - } - return ret; -} - - -/* - * Parse a single line in the INI file - */ -static int parse_line(parsed_section_values_t *sv) -{ - int val, ret = OPAL_SUCCESS; - char *value = NULL; - bool showed_unknown_field_warning = false; - - /* Save the name name */ - if (key_buffer_len < strlen(btl_openib_ini_yytext) + 1) { - char *tmp; - key_buffer_len = strlen(btl_openib_ini_yytext) + 1; - tmp = (char *) realloc(key_buffer, key_buffer_len); - if (NULL == tmp) { - free(key_buffer); - key_buffer_len = 0; - key_buffer = NULL; - return OPAL_ERR_TEMP_OUT_OF_RESOURCE; - } - key_buffer = tmp; - } - opal_string_copy(key_buffer, btl_openib_ini_yytext, key_buffer_len); - - /* The first thing we have to see is an "=" */ - val = btl_openib_ini_yylex(); - if (btl_openib_ini_parse_done || BTL_OPENIB_INI_PARSE_EQUAL != val) { - opal_show_help("help-mpi-btl-openib.txt", "ini file:expected equals", true); - return OPAL_ERROR; - } - - /* Next we get the value */ - val = btl_openib_ini_yylex(); - if 
(BTL_OPENIB_INI_PARSE_SINGLE_WORD != val && BTL_OPENIB_INI_PARSE_VALUE != val) { - return OPAL_ERROR; - } - - value = strdup(btl_openib_ini_yytext); - - /* Now we need to see the newline */ - val = btl_openib_ini_yylex(); - - /* If we did not get EOL or EOF, something is wrong */ - if (BTL_OPENIB_INI_PARSE_NEWLINE != val && BTL_OPENIB_INI_PARSE_DONE != val) { - opal_show_help("help-mpi-btl-openib.txt", "ini file:expected newline", true); - free(value); - return OPAL_ERROR; - } - - /* Ok, we got a good parse. Now figure out what it is and save - the value. Note that the flex already took care of trimming - all whitespace at the beginning and ending of the value. */ - - if (0 == strcasecmp(key_buffer, "vendor_id")) { - if (OPAL_SUCCESS != (ret = opal_btl_openib_ini_intify_list(value, &sv->vendor_ids, - &sv->vendor_ids_len))) { - return ret; - } - } - - else if (0 == strcasecmp(key_buffer, "vendor_part_id")) { - if (OPAL_SUCCESS != (ret = opal_btl_openib_ini_intify_list(value, &sv->vendor_part_ids, - &sv->vendor_part_ids_len))) { - return ret; - } - } - - else if (0 == strcasecmp(key_buffer, "mtu")) { - /* Single value */ - sv->values.mtu = (uint32_t) opal_btl_openib_ini_intify(value); - sv->values.mtu_set = true; - } - - else if (0 == strcasecmp(key_buffer, "use_eager_rdma")) { - /* Single value */ - sv->values.use_eager_rdma = (uint32_t) opal_btl_openib_ini_intify(value); - sv->values.use_eager_rdma_set = true; - } - - else if (0 == strcasecmp(key_buffer, "receive_queues")) { - /* Single value (already strdup'ed) */ - sv->values.receive_queues = value; - value = NULL; - } - - else if (0 == strcasecmp(key_buffer, "max_inline_data")) { - /* Single value */ - sv->values.max_inline_data = (int32_t) opal_btl_openib_ini_intify(value); - sv->values.max_inline_data_set = true; - } - - else if (0 == strcasecmp(key_buffer, "rdmacm_reject_causes_connect_error")) { - /* Single value */ - sv->values.rdmacm_reject_causes_connect_error = - (bool) opal_btl_openib_ini_intify(value); 
- sv->values.rdmacm_reject_causes_connect_error_set = true; - } - - else if (0 == strcasecmp(key_buffer, "ignore_device")) { - /* Single value */ - sv->values.ignore_device = (bool) opal_btl_openib_ini_intify(value); - sv->values.ignore_device_set = true; - } - - else { - /* Have no idea what this parameter is. Not an error -- just - ignore it */ - if (!showed_unknown_field_warning) { - opal_show_help("help-mpi-btl-openib.txt", - "ini file:unknown field", true, - ini_filename, btl_openib_ini_yynewlines, - key_buffer); - showed_unknown_field_warning = true; - } - } - - /* All done */ - - if (NULL != value) { - free(value); - } - return ret; -} - - -/* - * Construct an device_values_t and set all of its values to known states - */ -static void device_values_constructor(device_values_t *s) -{ - s->section_name = NULL; - s->vendor_id = 0; - s->vendor_part_id = 0; - reset_values(&s->values); -} - - -/* - * Destruct an device_values_t and free any memory that it has - */ -static void device_values_destructor(device_values_t *s) -{ - if (NULL != s->section_name) { - free(s->section_name); - } - if (NULL != s->values.receive_queues) { - free(s->values.receive_queues); - } -} - - -/* - * Reset a parsed section; free any memory that it may have had - */ -static void reset_section(bool had_previous_value, parsed_section_values_t *s) -{ - if (had_previous_value) { - if (NULL != s->name) { - free(s->name); - } - if (NULL != s->vendor_ids) { - free(s->vendor_ids); - } - if (NULL != s->vendor_part_ids) { - free(s->vendor_part_ids); - } - } - - s->name = NULL; - s->vendor_ids = NULL; - s->vendor_ids_len = 0; - s->vendor_part_ids = NULL; - s->vendor_part_ids_len = 0; - - reset_values(&s->values); -} - - -/* - * Reset the values to known states - */ -static void reset_values(opal_btl_openib_ini_values_t *v) -{ - v->mtu = 0; - v->mtu_set = false; - - v->use_eager_rdma = 0; - v->use_eager_rdma_set = false; - - v->receive_queues = NULL; - - v->max_inline_data = 0; - 
v->max_inline_data_set = false; - - v->rdmacm_reject_causes_connect_error = false; - v->rdmacm_reject_causes_connect_error_set = false; - - v->ignore_device = false; - v->ignore_device_set = false; -} - - -/* - * If we have a valid section, see if we have a matching section - * somewhere (i.e., same vendor ID and vendor part ID). If we do, - * update the values. If not, save the values in a new instance and - * add it to the list. - */ -static int save_section(parsed_section_values_t *s) -{ - int i, j; - device_values_t *h; - bool found; - - /* Is the parsed section valid? */ - if (NULL == s->name || 0 == s->vendor_ids_len || - 0 == s->vendor_part_ids_len) { - return OPAL_ERR_BAD_PARAM; - } - - /* Iterate over each of the vendor/part IDs in the parsed - values */ - for (i = 0; i < s->vendor_ids_len; ++i) { - for (j = 0; j < s->vendor_part_ids_len; ++j) { - found = false; - - /* Iterate over all the saved devices */ - OPAL_LIST_FOREACH(h, &devices, device_values_t) { - if (s->vendor_ids[i] == h->vendor_id && - s->vendor_part_ids[j] == h->vendor_part_id) { - /* Found a match. Update any newly-set values. 
*/ - if (s->values.mtu_set) { - h->values.mtu = s->values.mtu; - h->values.mtu_set = true; - } - - if (s->values.use_eager_rdma_set) { - h->values.use_eager_rdma = s->values.use_eager_rdma; - h->values.use_eager_rdma_set = true; - } - - if (NULL != s->values.receive_queues) { - h->values.receive_queues = - strdup(s->values.receive_queues); - } - - if (s->values.max_inline_data_set) { - h->values.max_inline_data = s->values.max_inline_data; - h->values.max_inline_data_set = true; - } - - if (s->values.rdmacm_reject_causes_connect_error_set) { - h->values.rdmacm_reject_causes_connect_error = - s->values.rdmacm_reject_causes_connect_error; - h->values.rdmacm_reject_causes_connect_error_set = - true; - } - - if (s->values.ignore_device_set) { - h->values.ignore_device = s->values.ignore_device; - h->values.ignore_device_set = true; - } - - found = true; - break; - } - } - - /* Did we find/update it in the exising list? If not, - create a new one. */ - if (!found) { - h = OBJ_NEW(device_values_t); - h->section_name = strdup(s->name); - h->vendor_id = s->vendor_ids[i]; - h->vendor_part_id = s->vendor_part_ids[j]; - /* NOTE: There is a bug in the PGI 6.2 series that - causes the compiler to choke when copying structs - containing bool members by value. So do a memcpy - here instead. 
*/ - memcpy(&h->values, &s->values, sizeof(s->values)); - /* Need to strdup the string, though */ - if (NULL != s->values.receive_queues) { - h->values.receive_queues = strdup(s->values.receive_queues); - } - opal_list_append(&devices, &h->super); - } - } - } - - /* All done */ - - return OPAL_SUCCESS; -} - - -/* - * Do string-to-integer conversion, for both hex and decimal numbers - */ -int opal_btl_openib_ini_intify(char *str) -{ - return strtol (str, NULL, 0); -} - - -/* - * Take a comma-delimited list and infity them all - */ -int opal_btl_openib_ini_intify_list(char *value, uint32_t **values, int *len) -{ - char *comma; - char *str = value; - - *len = 0; - - /* Comma-delimited list of values */ - comma = strchr(str, ','); - if (NULL == comma) { - /* If we only got one value (i.e., no comma found), then - just make an array of one value and save it */ - *values = (uint32_t *) malloc(sizeof(uint32_t)); - if (NULL == *values) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - *values[0] = (uint32_t) opal_btl_openib_ini_intify(str); - *len = 1; - } else { - int newsize = 1; - - /* Count how many values there are and allocate enough space - for them */ - while (NULL != comma) { - ++newsize; - str = comma + 1; - comma = strchr(str, ','); - } - *values = (uint32_t *) malloc(sizeof(uint32_t) * newsize); - if (NULL == *values) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Iterate over the values and save them */ - str = value; - comma = strchr(str, ','); - while (NULL != comma) { - *comma = '\0'; - (*values)[*len] = (uint32_t) opal_btl_openib_ini_intify(str); - ++(*len); - str = comma + 1; - comma = strchr(str, ','); - } - /* Get the last value (i.e., the value after the last - comma, because it won't have been snarfed in the - loop) */ - (*values)[*len] = (uint32_t) opal_btl_openib_ini_intify(str); - ++(*len); - } - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/openib/btl_openib_ini.h b/opal/mca/btl/openib/btl_openib_ini.h deleted file mode 100644 index 
95131ace97..0000000000 --- a/opal/mca/btl/openib/btl_openib_ini.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2008 Mellanox Technologies. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * @file - */ - -#ifndef MCA_PTL_IB_PARAMS_H -#define MCA_PTL_IB_PARAMS_H - -#include "btl_openib.h" - - -/* - * Struct to hold the settable values that may be specified in the INI - * file - */ -typedef struct opal_btl_openib_ini_values_t { - uint32_t mtu; - bool mtu_set; - - uint32_t use_eager_rdma; - bool use_eager_rdma_set; - - char *receive_queues; - - int32_t max_inline_data; - bool max_inline_data_set; - - bool rdmacm_reject_causes_connect_error; - bool rdmacm_reject_causes_connect_error_set; - - bool ignore_device; - bool ignore_device_set; -} opal_btl_openib_ini_values_t; - - -BEGIN_C_DECLS - - /** - * Read in the INI files containing device params - */ - int opal_btl_openib_ini_init(void); - - /** - * Query the read-in params for a given device - */ - int opal_btl_openib_ini_query(uint32_t vendor_id, - uint32_t vendor_part_id, - opal_btl_openib_ini_values_t *values); - - /** - * Shut down / release all internal state - */ - int opal_btl_openib_ini_finalize(void); - - /** - * string to int convertors with dec/hex autodetection - */ - int opal_btl_openib_ini_intify(char *string); - int opal_btl_openib_ini_intify_list(char *str, uint32_t **values, int *len); - -END_C_DECLS - -#endif diff --git a/opal/mca/btl/openib/btl_openib_ip.c b/opal/mca/btl/openib/btl_openib_ip.c deleted file mode 100644 index 8a9e5992ec..0000000000 --- a/opal/mca/btl/openib/btl_openib_ip.c +++ /dev/null @@ -1,433 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2008 Chelsio, Inc. All rights reserved. - * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010 Oracle and/or its affiliates. 
All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. - * Copyright (c) 2017 Los Alamos National Security, LLC. All rights - * reserved. - * - * Additional copyrights may follow - * - * $HEADER$ - * - * @file - */ - -#include "opal_config.h" - -#include - -#if OPAL_HAVE_RDMACM -#include -#include -#include - -#include "opal/util/argv.h" -#include "opal/util/if.h" -#include "opal/util/proc.h" -#include "opal/util/show_help.h" - -#include "connect/connect.h" -#endif -/* Always want to include this file */ -#include "btl_openib_endpoint.h" -#include "btl_openib_ip.h" -#if OPAL_HAVE_RDMACM - -/* - * The cruft below maintains the linked list of rdma ipv4 addresses and their - * associated rdma device names and device port numbers. - */ -struct rdma_addr_list { - opal_list_item_t super; - uint32_t addr; - uint32_t subnet; - char addr_str[16]; - char dev_name[IBV_SYSFS_NAME_MAX]; - uint8_t dev_port; -}; -typedef struct rdma_addr_list rdma_addr_list_t; - -static OBJ_CLASS_INSTANCE(rdma_addr_list_t, opal_list_item_t, - NULL, NULL); -static opal_list_t *myaddrs = NULL; - -#if OPAL_ENABLE_DEBUG -static char *stringify(uint32_t addr) -{ - static char line[64]; - memset(line, 0, sizeof(line)); - snprintf(line, sizeof(line) - 1, "%d.%d.%d.%d (0x%x)", -#if defined(WORDS_BIGENDIAN) - (addr >> 24), - (addr >> 16) & 0xff, - (addr >> 8) & 0xff, - addr & 0xff, -#else - addr & 0xff, - (addr >> 8) & 0xff, - (addr >> 16) & 0xff, - (addr >> 24), -#endif - addr); - return line; -} -#endif - -/* Note that each device port can have multiple IP addresses - * associated with it (aka IP aliasing). However, the openib module - * only knows about (device,port) tuples -- not IP addresses (only the - * RDMA CM CPC knows which IP addresses are associated with each - * (device,port) tuple). Thus, any searching of device list for the - * IP Address or subnets may not work as one might expect. 
The - * current behavior is to return the IP address (or subnet) of the - * *first* instance of the device on the list. This behavior is - * uniform for subnet and IP addresses and thus should not cause any - * mismatches. If this behavior is not preferred by the user, the MCA - * parameters to include/exclude specific IP addresses can be used to - * precisely specify which addresses are used (e.g., to effect - * specific subnet routing). - */ -uint64_t mca_btl_openib_get_ip_subnet_id(struct ibv_device *ib_dev, - uint8_t port) -{ - struct rdma_addr_list *addr; - - /* In the off chance that the user forces a non-RDMACM CPC and an - * IP-based mechanism, the list will be uninitialized. Return 0 - * to prevent crashes, and the lack of it actually working will be - * caught at a later stage. - */ - if (NULL == myaddrs) { - return 0; - } - - OPAL_LIST_FOREACH(addr, myaddrs, struct rdma_addr_list) { - if (!strcmp(addr->dev_name, ib_dev->name) && - port == addr->dev_port) { - return addr->subnet; - } - } - - return 0; -} - -/* This function should not be necessary, as rdma_get_local_addr would - * be more correct in returning the IP address given the cm_id (and - * not necessitate having to do a list look up). Unfortunately, the - * subnet and IP address look up needs to match or there could be a - * mismatch if IP Aliases are being used. For more information on - * this, please read comment above mca_btl_openib_get_ip_subnet_id. 
- */ -uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs, - uint8_t port) -{ - struct rdma_addr_list *addr; - - /* Sanity check */ - if (NULL == myaddrs) { - return 0; - } - - BTL_VERBOSE(("Looking for %s:%d in IP address list", - ibv_get_device_name(verbs->device), port)); - OPAL_LIST_FOREACH(addr, myaddrs, struct rdma_addr_list) { - if (!strcmp(addr->dev_name, verbs->device->name) && - port == addr->dev_port) { - BTL_VERBOSE(("FOUND: %s:%d is %s", - ibv_get_device_name(verbs->device), port, - stringify(addr->addr))); - return addr->addr; - } - } - return 0; -} - -static int dev_specified(char *name, int port) -{ - char **list; - - if (NULL != mca_btl_openib_component.if_include) { - int i; - - list = opal_argv_split(mca_btl_openib_component.if_include, ','); - for (i = 0; NULL != list[i]; i++) { - char **temp = opal_argv_split(list[i], ':'); - if (0 == strcmp(name, temp[0]) && - (NULL == temp[1] || port == atoi(temp[1]))) { - return 0; - } - } - - return 1; - } - - if (NULL != mca_btl_openib_component.if_exclude) { - int i; - - list = opal_argv_split(mca_btl_openib_component.if_exclude, ','); - for (i = 0; NULL != list[i]; i++) { - char **temp = opal_argv_split(list[i], ':'); - if (0 == strcmp(name, temp[0]) && - (NULL == temp[1] || port == atoi(temp[1]))) { - return 1; - } - } - } - - return 0; -} - -static int ipaddr_specified(struct sockaddr_in *ipaddr, uint32_t netmask) -{ - uint32_t all = ~((uint32_t) 0); - - if (NULL != mca_btl_openib_component.ipaddr_include) { - char **list; - int i; - - list = opal_argv_split(mca_btl_openib_component.ipaddr_include, ','); - for (i = 0; NULL != list[i]; i++) { - uint32_t subnet, list_subnet; - struct in_addr ipae; - char **temp = opal_argv_split(list[i], '/'); - - if (NULL == temp || NULL == temp[0] || NULL == temp[1] || - NULL != temp[2]) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid ipaddr_inexclude", true, "include", - opal_process_info.nodename, list[i], - "Invalid specification (missing 
\"/\")"); - if (NULL != temp) { - opal_argv_free(temp); - } - continue; - } - - if (1 != inet_pton(ipaddr->sin_family, temp[0], &ipae)) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid ipaddr_inexclude", true, "include", - opal_process_info.nodename, list[i], - "Invalid specification (inet_pton() failed)"); - opal_argv_free(temp); - continue; - } - list_subnet = ntohl(ipae.s_addr) & ~(all >> atoi(temp[1])); - subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(all >> netmask); - opal_argv_free(temp); - - if (subnet == list_subnet) { - return 0; - } - } - - return 1; - } - - if (NULL != mca_btl_openib_component.ipaddr_exclude) { - char **list; - int i; - - list = opal_argv_split(mca_btl_openib_component.ipaddr_exclude, ','); - for (i = 0; NULL != list[i]; i++) { - uint32_t subnet, list_subnet; - struct in_addr ipae; - char **temp = opal_argv_split(list[i], '/'); - - if (NULL == temp || NULL == temp[0] || NULL == temp[1] || - NULL != temp[2]) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid ipaddr_inexclude", true, "exclude", - opal_process_info.nodename, list[i], - "Invalid specification (missing \"/\")"); - if (NULL != temp) { - opal_argv_free(temp); - } - continue; - } - - if (1 != inet_pton(ipaddr->sin_family, temp[0], &ipae)) { - opal_show_help("help-mpi-btl-openib.txt", - "invalid ipaddr_inexclude", true, "exclude", - opal_process_info.nodename, list[i], - "Invalid specification (inet_pton() failed)"); - opal_argv_free(temp); - continue; - } - list_subnet = ntohl(ipae.s_addr) & ~(all >> atoi(temp[1])); - subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(all >> netmask); - opal_argv_free(temp); - - if (subnet == list_subnet) { - return 1; - } - } - } - - return 0; -} - -static int add_rdma_addr(struct sockaddr *ipaddr, uint32_t netmask) -{ - struct sockaddr_in *sinp; - struct rdma_cm_id *cm_id; - struct rdma_event_channel *ch; - int rc = OPAL_SUCCESS; - struct rdma_addr_list *myaddr; - uint32_t all = ~((uint32_t) 0); - - /* Ensure that this IP address is not 
in 127.0.0.1/8. If it is, - skip it because we never want loopback addresses to be - considered RDMA devices that remote peers can use to connect - to. - - This check is necessary because of a change that almost went - into RDMA CM in OFED 1.5.1. We asked for a delay so that we - could get a release of Open MPI out that includes the - 127-ignoring logic; hence, this change will likely be in a - future version of OFED (perhaps OFED 1.6?). - - OMPI uses rdma_bind_addr() to determine if a local IP address - is an RDMA device or not. If it succeeds and we get a non-NULL - verbs pointer back in the return, we say that it's a valid RDMA - device. Up through OFED 1.5, rdma_bind_addr(127.0.0.1), would - succeed, but the verbs pointer returned would be NULL. Hence, - we knew it was loopback, and therefore we skipped it. - - The proposed RDMA CM change would return a non-NULL/valid verbs - pointer when binding to 127.0.0.1/8. This, of course, screws - up OMPI because we then advertise 127.0.0.1 in the modex as an - address that remote peers can use to contact this process via - RDMA. Hence, we have to specifically exclude 127.0.0.1/8 -- - don't even both trying to rdma_bind_addr() to it because we - know we don't want loopback addresses at all. */ - sinp = (struct sockaddr_in *)ipaddr; - if ((sinp->sin_addr.s_addr & htonl(0xff000000)) == htonl(0x7f000000)) { - rc = OPAL_SUCCESS; - goto out1; - } - - ch = rdma_create_event_channel(); - if (NULL == ch) { - BTL_VERBOSE(("failed creating RDMA CM event channel")); - rc = OPAL_ERROR; - goto out1; - } - - rc = rdma_create_id(ch, &cm_id, NULL, RDMA_PS_TCP); - if (rc) { - BTL_VERBOSE(("rdma_create_id returned %d", rc)); - rc = OPAL_ERROR; - goto out2; - } - - /* Bind the newly created cm_id to the IP address. 
This will, - amongst other things, verify that the device is verbs - capable */ - rc = rdma_bind_addr(cm_id, ipaddr); - if (rc || !cm_id->verbs) { - rc = OPAL_SUCCESS; - goto out3; - } - - /* Verify that the device has not been excluded */ - rc = dev_specified(cm_id->verbs->device->name, cm_id->port_num); - if (rc) { - rc = OPAL_SUCCESS; - goto out3; - } - - /* Verify that the device has a valid IP address */ - if (0 == ((struct sockaddr_in *)ipaddr)->sin_addr.s_addr || - ipaddr_specified((struct sockaddr_in *)ipaddr, netmask)) { - rc = OPAL_SUCCESS; - goto out3; - } - - myaddr = OBJ_NEW(rdma_addr_list_t); - if (NULL == myaddr) { - BTL_ERROR(("malloc failed!")); - rc = OPAL_ERROR; - goto out3; - } - - myaddr->addr = sinp->sin_addr.s_addr; - myaddr->subnet = ntohl(myaddr->addr) & ~(all >> netmask); - inet_ntop(sinp->sin_family, &sinp->sin_addr, - myaddr->addr_str, sizeof(myaddr->addr_str)); - memcpy(myaddr->dev_name, cm_id->verbs->device->name, IBV_SYSFS_NAME_MAX); - myaddr->dev_port = cm_id->port_num; - BTL_VERBOSE(("Adding addr %s (0x%x) subnet 0x%x as %s:%d", - myaddr->addr_str, myaddr->addr, myaddr->subnet, - myaddr->dev_name, myaddr->dev_port)); - - opal_list_append(myaddrs, &(myaddr->super)); - -out3: - rdma_destroy_id(cm_id); -out2: - rdma_destroy_event_channel(ch); -out1: - return rc; -} - -int mca_btl_openib_build_rdma_addr_list(void) -{ - int rc = OPAL_SUCCESS, i; - - myaddrs = OBJ_NEW(opal_list_t); - if (NULL == myaddrs) { - BTL_ERROR(("malloc failed!")); - return OPAL_ERROR; - } - - for (i = opal_ifbegin(); i >= 0; i = opal_ifnext(i)) { - struct sockaddr ipaddr; - uint32_t netmask; - - opal_ifindextoaddr(i, &ipaddr, sizeof(struct sockaddr)); - opal_ifindextomask(i, &netmask, sizeof(uint32_t)); - - if (ipaddr.sa_family == AF_INET) { - rc = add_rdma_addr(&ipaddr, netmask); - if (OPAL_SUCCESS != rc) { - break; - } - } - } - return rc; -} - -void mca_btl_openib_free_rdma_addr_list(void) -{ - if (NULL != myaddrs) { - OPAL_LIST_RELEASE(myaddrs); - myaddrs = 
NULL; - } -} - -#else -/* !OPAL_HAVE_RDMACM case */ - -uint64_t mca_btl_openib_get_ip_subnet_id(struct ibv_device *ib_dev, - uint8_t port) -{ - return 0; -} - -uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs, - uint8_t port) -{ - return 0; -} - -int mca_btl_openib_build_rdma_addr_list(void) -{ - return OPAL_SUCCESS; -} - -void mca_btl_openib_free_rdma_addr_list(void) -{ -} -#endif diff --git a/opal/mca/btl/openib/btl_openib_ip.h b/opal/mca/btl/openib/btl_openib_ip.h deleted file mode 100644 index 1cb7afd5ac..0000000000 --- a/opal/mca/btl/openib/btl_openib_ip.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2008 Chelsio, Inc. All rights reserved. - * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. - * - * Additional copyrights may follow - * - * $HEADER$ - * - * @file - */ - -#ifndef MCA_BTL_OPENIB_IP_H -#define MCA_BTL_OPENIB_IP_H - -#include "opal_config.h" - -BEGIN_C_DECLS - -/** - * Get an IP equivalent of a subnet ID. - * - * @param ib_dev (IN) IBV device - * @return Value of the IPv4 Address bitwise-and'ed with the Netmask - */ -extern uint64_t mca_btl_openib_get_ip_subnet_id(struct ibv_device *ib_dev, - uint8_t port); - -/** - * Get the IPv4 address of the specified HCA/RNIC device and physical port. - * - * @param verbs (IN) cm_id verbs of the IBV device - * @param port (IN) physical port of the IBV device - * @return IPv4 Address - */ -extern uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs, - uint8_t port); - -/** - * Create a list of all available IBV devices and each device's - * relevant information. This is necessary for - * mca_btl_openib_rdma_get_ipv4addr to work. - * - * @return OPAL_SUCCESS or failure status - */ -extern int mca_btl_openib_build_rdma_addr_list(void); - -/** - * Free the list of all available IBV devices created by - * mca_btl_openib_build_rdma_addr_list. 
- */ -extern void mca_btl_openib_free_rdma_addr_list(void); - -END_C_DECLS - -#endif diff --git a/opal/mca/btl/openib/btl_openib_lex.h b/opal/mca/btl/openib/btl_openib_lex.h deleted file mode 100644 index 8ae931cb72..0000000000 --- a/opal/mca/btl/openib/btl_openib_lex.h +++ /dev/null @@ -1,74 +0,0 @@ -/* -*- C -*- - * - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_OPENIB_INI_LEX_H_ -#define BTL_OPENIB_INI_LEX_H_ - -#include "opal_config.h" - -#ifdef malloc -#undef malloc -#endif -#ifdef realloc -#undef realloc -#endif -#ifdef free -#undef free -#endif - -#include - -BEGIN_C_DECLS - -int btl_openib_ini_yylex(void); -int btl_openib_ini_init_buffer(FILE *file); -int btl_openib_ini_yylex_destroy(void); - -extern FILE *btl_openib_ini_yyin; -extern bool btl_openib_ini_parse_done; -extern char *btl_openib_ini_yytext; -extern int btl_openib_ini_yynewlines; - -/* - * Make lex-generated files not issue compiler warnings - */ -#define YY_STACK_USED 0 -#define YY_ALWAYS_INTERACTIVE 0 -#define YY_NEVER_INTERACTIVE 0 -#define YY_MAIN 0 -#define YY_NO_UNPUT 1 -#define YY_SKIP_YYWRAP 1 - -enum { - BTL_OPENIB_INI_PARSE_DONE, - BTL_OPENIB_INI_PARSE_ERROR, - - BTL_OPENIB_INI_PARSE_NEWLINE, - BTL_OPENIB_INI_PARSE_SECTION, - BTL_OPENIB_INI_PARSE_EQUAL, - BTL_OPENIB_INI_PARSE_SINGLE_WORD, - BTL_OPENIB_INI_PARSE_VALUE, - - BTL_OPENIB_INI_PARSE_MAX -}; - -END_C_DECLS - -#endif 
diff --git a/opal/mca/btl/openib/btl_openib_lex.l b/opal/mca/btl/openib/btl_openib_lex.l deleted file mode 100644 index 2e6df13f27..0000000000 --- a/opal/mca/btl/openib/btl_openib_lex.l +++ /dev/null @@ -1,148 +0,0 @@ -%option nounput -%option noinput - -%{ /* -*- C -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif - -#include "btl_openib_lex.h" - -BEGIN_C_DECLS - -/* - * local functions - */ -static int btl_openib_ini_yywrap(void); - -END_C_DECLS - -/* - * global variables - */ -int btl_openib_ini_yynewlines = 1; -bool btl_openib_ini_parse_done = false; -char *btl_openib_ini_string = NULL; - -%} - -WHITE [\f\t\v ] -CHAR [A-Za-z0-9_\-\.] 
-NAME_CHAR [A-Za-z0-9_\-\.\\\/] - -%x comment -%x section_name -%x section_end -%x value - -%% - -{WHITE}*\n { ++btl_openib_ini_yynewlines; - return BTL_OPENIB_INI_PARSE_NEWLINE; } -#.*\n { ++btl_openib_ini_yynewlines; - return BTL_OPENIB_INI_PARSE_NEWLINE; } -"//".*\n { ++btl_openib_ini_yynewlines; - return BTL_OPENIB_INI_PARSE_NEWLINE; } - -"/*" { BEGIN(comment); - return BTL_OPENIB_INI_PARSE_NEWLINE; } -[^*\n]* ; /* Eat up non '*'s */ -"*"+[^*/\n]* ; /* Eat '*'s not followed by a '/' */ -\n { ++btl_openib_ini_yynewlines; - return BTL_OPENIB_INI_PARSE_NEWLINE; } -"*"+"/" { BEGIN(INITIAL); /* Done with block comment */ - return BTL_OPENIB_INI_PARSE_NEWLINE; } - -{WHITE}*\[{WHITE}* { BEGIN(section_name); } -({NAME_CHAR}|{WHITE})*{NAME_CHAR}/{WHITE}*\] { - BEGIN(section_end); - return BTL_OPENIB_INI_PARSE_SECTION; } -\n { ++btl_openib_ini_yynewlines; - return BTL_OPENIB_INI_PARSE_ERROR; } -. { return BTL_OPENIB_INI_PARSE_ERROR; } -{WHITE}*\]{WHITE}*\n { BEGIN(INITIAL); - ++btl_openib_ini_yynewlines; - return BTL_OPENIB_INI_PARSE_NEWLINE; } - -{WHITE}*"="{WHITE}* { BEGIN(value); - return BTL_OPENIB_INI_PARSE_EQUAL; } -{WHITE}+ ; /* whitespace */ -{CHAR}+ { return BTL_OPENIB_INI_PARSE_SINGLE_WORD; } - -{WHITE}*\n { BEGIN(INITIAL); - ++btl_openib_ini_yynewlines; - return BTL_OPENIB_INI_PARSE_NEWLINE; } -[^\n]*[^\t \n]/[\t ]* { - return BTL_OPENIB_INI_PARSE_VALUE; } - -. { return BTL_OPENIB_INI_PARSE_ERROR; } - -%% - -/* Old flex (2.5.4a? 
and older) does not define a destroy function */ -#if !defined(YY_FLEX_SUBMINOR_VERSION) -#define YY_FLEX_SUBMINOR_VERSION 0 -#endif - -#if (YY_FLEX_MAJOR_VERSION < 2) || (YY_FLEX_MAJOR_VERSION == 2 && (YY_FLEX_MINOR_VERSION < 5 || (YY_FLEX_MINOR_VERSION == 5 && YY_FLEX_SUBMINOR_VERSION < 5))) -int btl_openib_ini_yylex_destroy(void) -{ - if (NULL != YY_CURRENT_BUFFER) { - yy_delete_buffer(YY_CURRENT_BUFFER); -#if defined(YY_CURRENT_BUFFER_LVALUE) - YY_CURRENT_BUFFER_LVALUE = NULL; -#else - YY_CURRENT_BUFFER = NULL; -#endif /* YY_CURRENT_BUFFER_LVALUE */ - } - return YY_NULL; -} -#endif - -static int btl_openib_ini_yywrap(void) -{ - btl_openib_ini_parse_done = true; - return 1; -} - - -/* - * Ensure that we have a valid yybuffer to use. Specifically, if this - * scanner is invoked a second time, finish_parsing() (above) will - * have been executed, and the current buffer will have been freed. - * Flex doesn't recognize this fact because as far as it's concerned, - * its internal state was already initialized, so it thinks it should - * have a valid buffer. Hence, here we ensure to give it a valid - * buffer. - */ -int btl_openib_ini_init_buffer(FILE *file) -{ - YY_BUFFER_STATE buf = yy_create_buffer(file, YY_BUF_SIZE); - yy_switch_to_buffer(buf); - - return 0; -} diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c deleted file mode 100644 index 7896287c62..0000000000 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ /dev/null @@ -1,799 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2018 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014-2016 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include "opal/util/bit_ops.h" -#include "opal/util/printf.h" -#include "opal/mca/common/verbs/common_verbs.h" -#include "opal/mca/installdirs/installdirs.h" -#include "opal/util/os_dirpath.h" -#include "opal/util/output.h" -#include "opal/util/show_help.h" -#include "opal/util/proc.h" - -#include "btl_openib.h" -#include "btl_openib_mca.h" -#include "btl_openib_ini.h" -#include "connect/base.h" - -#ifdef HAVE_IBV_FORK_INIT -#define OPAL_HAVE_IBV_FORK_INIT 1 -#else -#define OPAL_HAVE_IBV_FORK_INIT 0 -#endif - -/* - * Local flags - */ -enum { - REGINT_NEG_ONE_OK = 0x01, - REGINT_GE_ZERO = 0x02, - REGINT_GE_ONE = 0x04, - REGINT_NONZERO = 0x08, - - REGINT_MAX = 0x88 -}; - - -enum { - REGSTR_EMPTY_OK = 0x01, - - REGSTR_MAX = 0x88 -}; - -static mca_base_var_enum_value_t ib_mtu_values[] = { - {IBV_MTU_256, "256B"}, - {IBV_MTU_512, "512B"}, - {IBV_MTU_1024, "1k"}, - {IBV_MTU_2048, "2k"}, - {IBV_MTU_4096, "4k"}, - {0, NULL} -}; - -static mca_base_var_enum_value_t device_type_values[] = { - {BTL_OPENIB_DT_IB, "infiniband"}, - {BTL_OPENIB_DT_IB, "ib"}, - {BTL_OPENIB_DT_IWARP, 
"iwarp"}, - {BTL_OPENIB_DT_IWARP, "iw"}, - {BTL_OPENIB_DT_ALL, "all"}, - {0, NULL} -}; - -static int btl_openib_cq_size; -static bool btl_openib_have_fork_support = OPAL_HAVE_IBV_FORK_INIT; - -/* - * utility routine for string parameter registration - */ -static int reg_string(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - const char* default_value, char **storage, - int flags) -{ - int index; - - assert (NULL != storage); - - /* The MCA variable system will not change this pointer */ - *storage = (char *) default_value; - index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib", - deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGSTR_EMPTY_OK) && (NULL == *storage || 0 == strlen(*storage))) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OPAL_ERR_BAD_PARAM; - } - - return OPAL_SUCCESS; -} - - -/* - * utility routine for integer parameter registration - */ -static int reg_int(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - int default_value, int *storage, int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib", - deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == *storage) { - return OPAL_SUCCESS; - } - - if ((0 != (flags & REGINT_GE_ZERO) && *storage < 0) || - (0 != (flags & 
REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OPAL_ERR_BAD_PARAM; - } - - return OPAL_SUCCESS; -} - -/* - * utility routine for integer parameter registration - */ -static int reg_uint(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - unsigned int default_value, unsigned int *storage, - int flags) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_UNSIGNED_INT, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib", - deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - if ((0 != (flags & REGINT_GE_ONE) && *storage < 1) || - (0 != (flags & REGINT_NONZERO) && 0 == *storage)) { - opal_output(0, "Bad parameter value for parameter \"%s\"", - param_name); - return OPAL_ERR_BAD_PARAM; - } - - return OPAL_SUCCESS; -} - -/* - * utility routine for integer parameter registration - */ -static int reg_bool(const char* param_name, - const char* deprecated_param_name, - const char* param_desc, - bool default_value, bool *storage) -{ - int index; - - *storage = default_value; - index = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); - if (NULL != deprecated_param_name) { - (void) mca_base_var_register_synonym(index, "ompi", "btl", "openib", - deprecated_param_name, - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } - - return OPAL_SUCCESS; -} - -/* - * Register and check all MCA parameters - */ -int btl_openib_register_mca_params(void) -{ - mca_base_var_enum_t *new_enum; - char *default_qps; - uint32_t mid_qp_size; 
- char *msg, *str; - int ret, tmp; - - ret = OPAL_SUCCESS; -#define CHECK(expr) do {\ - tmp = (expr); \ - if (OPAL_SUCCESS != tmp) ret = tmp; \ - } while (0) - - /* register openib component parameters */ - CHECK(reg_bool("verbose", NULL, - "Output some verbose OpenIB BTL information " - "(0 = no output, nonzero = output)", false, - &mca_btl_openib_component.verbose)); - - CHECK(reg_bool("warn_no_device_params_found", - "warn_no_hca_params_found", - "Warn when no device-specific parameters are found in the INI file specified by the btl_openib_device_param_files MCA parameter " - "(0 = do not warn; any other value = warn)", - true, &mca_btl_openib_component.warn_no_device_params_found)); - - CHECK(reg_bool("warn_default_gid_prefix", NULL, - "Warn when there is more than one active ports and at least one of them connected to the network with only default GID prefix configured " - "(0 = do not warn; any other value = warn)", - true, &mca_btl_openib_component.warn_default_gid_prefix)); - - CHECK(reg_bool("warn_nonexistent_if", NULL, - "Warn if non-existent devices and/or ports are specified in the btl_openib_if_[in|ex]clude MCA parameters " - "(0 = do not warn; any other value = warn)", - true, &mca_btl_openib_component.warn_nonexistent_if)); - - /* If we print a warning about not having enough registered memory - available, do we want to abort? */ - CHECK(reg_bool("abort_not_enough_reg_mem", NULL, - "If there is not enough registered memory available on the system for Open MPI to function properly, Open MPI will issue a warning. 
If this MCA parameter is set to true, then Open MPI will also abort all MPI jobs " - "(0 = warn, but do not abort; any other value = warn and abort)", - false, &mca_btl_openib_component.abort_not_enough_reg_mem)); - - CHECK(reg_uint("poll_cq_batch", NULL, - "Retrieve up to poll_cq_batch completions from CQ", - MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT, &mca_btl_openib_component.cq_poll_batch, - REGINT_GE_ONE)); - - opal_asprintf(&str, "%s/mca-btl-openib-device-params.ini", - opal_install_dirs.opaldatadir); - if (NULL == str) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - CHECK(reg_string("device_param_files", "hca_param_files", - "Colon-delimited list of INI-style files that contain device vendor/part-specific parameters (use semicolon for Windows)", - str, &mca_btl_openib_component.device_params_file_names, - 0)); - free(str); - - (void)mca_base_var_enum_create("btl_openib_device_types", device_type_values, &new_enum); - mca_btl_openib_component.device_type = BTL_OPENIB_DT_ALL; - tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "device_type", "Specify to only use IB or iWARP " - "network adapters (infiniband = only use InfiniBand " - "HCAs; iwarp = only use iWARP NICs; all = use any " - "available adapters)", MCA_BASE_VAR_TYPE_INT, new_enum, - 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_btl_openib_component.device_type); - if (0 > tmp) ret = tmp; - OBJ_RELEASE(new_enum); - - /* - * Provide way for using to override policy of ignoring IB HCAs - */ - - mca_btl_openib_component.allow_ib = false; - tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "allow_ib", - "Override policy since Open MPI 4.0 of ignoring IB HCAs for openib BTL", - MCA_BASE_VAR_TYPE_BOOL, NULL, - 0, 0, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_btl_openib_component.allow_ib); - - CHECK(reg_int("max_btls", NULL, - "Maximum number of device ports to use " - "(-1 = use all available, otherwise must be >= 1)", - 
-1, &mca_btl_openib_component.ib_max_btls, - REGINT_NEG_ONE_OK | REGINT_GE_ONE)); - CHECK(reg_int("free_list_num", NULL, - "Initial size of free lists " - "(must be >= 1)", - 8, &mca_btl_openib_component.ib_free_list_num, - REGINT_GE_ONE)); - CHECK(reg_int("free_list_max", NULL, - "Maximum size of free lists " - "(-1 = infinite, otherwise must be >= 0)", - -1, &mca_btl_openib_component.ib_free_list_max, - REGINT_NEG_ONE_OK | REGINT_GE_ONE)); - CHECK(reg_int("free_list_inc", NULL, - "Increment size of free lists " - "(must be >= 1)", - 32, &mca_btl_openib_component.ib_free_list_inc, - REGINT_GE_ONE)); - CHECK(reg_string("mpool_hints", NULL, "hints for selecting a memory pool (default: none)", - NULL, &mca_btl_openib_component.ib_mpool_hints, - 0)); - CHECK(reg_string("rcache", NULL, - "Name of the registration cache to be used (it is unlikely that you will ever want to change this)", - "grdma", &mca_btl_openib_component.ib_rcache_name, - 0)); - CHECK(reg_int("reg_mru_len", NULL, - "Length of the registration cache most recently used list " - "(must be >= 1)", - 16, (int*) &mca_btl_openib_component.reg_mru_len, - REGINT_GE_ONE)); - - CHECK(reg_int("cq_size", "ib_cq_size", - "Minimum size of the OpenFabrics completion queue " - "(CQs are automatically sized based on the number " - "of peer MPI processes; this value determines the " - "*minimum* size of all CQs)", - 8192, &btl_openib_cq_size, REGINT_GE_ONE)); - mca_btl_openib_component.ib_cq_size[BTL_OPENIB_LP_CQ] = - mca_btl_openib_component.ib_cq_size[BTL_OPENIB_HP_CQ] = (uint32_t) btl_openib_cq_size; - - CHECK(reg_int("max_inline_data", "ib_max_inline_data", - "Maximum size of inline data segment " - "(-1 = run-time probe to discover max value, otherwise must be >= 0). 
" - "If not explicitly set, use max_inline_data from " - "the INI file containing device-specific parameters", - -1, &mca_btl_openib_component.ib_max_inline_data, - REGINT_NEG_ONE_OK | REGINT_GE_ZERO)); - - CHECK(reg_uint("pkey", "ib_pkey_val", - "OpenFabrics partition key (pkey) value. " - "Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB partition key value (0x7fff)", - 0, &mca_btl_openib_component.ib_pkey_val, 0)); - - CHECK(reg_uint("psn", "ib_psn", - "OpenFabrics packet sequence starting number " - "(must be >= 0)", - 0, &mca_btl_openib_component.ib_psn, 0)); - - CHECK(reg_uint("ib_qp_ous_rd_atom", NULL, - "InfiniBand outstanding atomic reads " - "(must be >= 0)", - 4, &mca_btl_openib_component.ib_qp_ous_rd_atom, 0)); - - opal_asprintf(&msg, "OpenFabrics MTU, in bytes (if not specified in INI files). Valid values are: %d=256 bytes, %d=512 bytes, %d=1024 bytes, %d=2048 bytes, %d=4096 bytes", - IBV_MTU_256, - IBV_MTU_512, - IBV_MTU_1024, - IBV_MTU_2048, - IBV_MTU_4096); - if (NULL == msg) { - /* Don't try to recover from this */ - return OPAL_ERR_OUT_OF_RESOURCE; - } - mca_btl_openib_component.ib_mtu = 0; - (void) mca_base_var_enum_create("btl_openib_mtus", ib_mtu_values, &new_enum); - tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "mtu", msg, MCA_BASE_VAR_TYPE_INT, new_enum, - 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_btl_openib_component.ib_mtu); - if (0 <= tmp) { - (void) mca_base_var_register_synonym(tmp, "ompi", "btl", "openib", "ib_mtu", - MCA_BASE_VAR_SYN_FLAG_DEPRECATED); - } else { - ret = tmp; - } - - OBJ_RELEASE(new_enum); - free(msg); - - CHECK(reg_uint("ib_min_rnr_timer", NULL, "InfiniBand minimum " - "\"receiver not ready\" timer, in seconds " - "(must be >= 0 and <= 31)", - 25, &mca_btl_openib_component.ib_min_rnr_timer, 0)); - - CHECK(reg_uint("ib_timeout", NULL, - "InfiniBand transmit timeout, plugged into 
formula: 4.096 microseconds * (2^btl_openib_ib_timeout) " - "(must be >= 0 and <= 31)", - 20, &mca_btl_openib_component.ib_timeout, 0)); - - CHECK(reg_uint("ib_retry_count", NULL, - "InfiniBand transmit retry count " - "(must be >= 0 and <= 7)", - 7, &mca_btl_openib_component.ib_retry_count, 0)); - - CHECK(reg_uint("ib_rnr_retry", NULL, - "InfiniBand \"receiver not ready\" " - "retry count; applies *only* to SRQ/XRC queues. PP queues " - "use RNR retry values of 0 because Open MPI performs " - "software flow control to guarantee that RNRs never occur " - "(must be >= 0 and <= 7; 7 = \"infinite\")", - 7, &mca_btl_openib_component.ib_rnr_retry, 0)); - - CHECK(reg_uint("ib_max_rdma_dst_ops", NULL, "InfiniBand maximum pending RDMA " - "destination operations " - "(must be >= 0)", - 4, &mca_btl_openib_component.ib_max_rdma_dst_ops, 0)); - - CHECK(reg_uint("ib_service_level", NULL, "InfiniBand service level " - "(must be >= 0 and <= 15)", - 0, &mca_btl_openib_component.ib_service_level, 0)); - -#if (ENABLE_DYNAMIC_SL) - CHECK(reg_uint("ib_path_record_service_level", NULL, - "Enable getting InfiniBand service level from PathRecord " - "(must be >= 0, 0 = disabled, positive = try to get the " - "service level from PathRecord)", - 0, &mca_btl_openib_component.ib_path_record_service_level, 0)); -#endif - - CHECK(reg_int("use_eager_rdma", NULL, "Use RDMA for eager messages " - "(-1 = use device default, 0 = do not use eager RDMA, " - "1 = use eager RDMA)", - -1, &mca_btl_openib_component.use_eager_rdma, 0)); - - CHECK(reg_int("eager_rdma_threshold", NULL, - "Use RDMA for short messages after this number of " - "messages are received from a given peer " - "(must be >= 1)", - 16, &mca_btl_openib_component.eager_rdma_threshold, REGINT_GE_ONE)); - - CHECK(reg_int("max_eager_rdma", NULL, "Maximum number of peers allowed to use " - "RDMA for short messages (RDMA is used for all long " - "messages, except if explicitly disabled, such as " - "with the \"dr\" pml) " - "(must be >= 
0)", - 16, &mca_btl_openib_component.max_eager_rdma, REGINT_GE_ZERO)); - - CHECK(reg_int("eager_rdma_num", NULL, "Number of RDMA buffers to allocate " - "for small messages " - "(must be >= 1)", - 16, &mca_btl_openib_component.eager_rdma_num, REGINT_GE_ONE)); - mca_btl_openib_component.eager_rdma_num++; - - CHECK(reg_uint("btls_per_lid", NULL, "Number of BTLs to create for each " - "InfiniBand LID " - "(must be >= 1)", - 1, &mca_btl_openib_component.btls_per_lid, REGINT_GE_ONE)); - - CHECK(reg_uint("max_lmc", NULL, "Maximum number of LIDs to use for each device port " - "(must be >= 0, where 0 = use all available)", - 1, &mca_btl_openib_component.max_lmc, 0)); - - CHECK(reg_int("enable_apm_over_lmc", NULL, "Maximum number of alternative paths for each device port " - "(must be >= -1, where 0 = disable apm, -1 = all available alternative paths )", - 0, &mca_btl_openib_component.apm_lmc, REGINT_NEG_ONE_OK|REGINT_GE_ZERO)); - - CHECK(reg_int("enable_apm_over_ports", NULL, "Enable alternative path migration (APM) over different ports of the same device " - "(must be >= 0, where 0 = disable APM over ports, 1 = enable APM over ports of the same device)", - 0, &mca_btl_openib_component.apm_ports, REGINT_GE_ZERO)); - - CHECK(reg_bool("use_async_event_thread", NULL, - "If nonzero, use the thread that will handle InfiniBand asynchronous events", - true, &mca_btl_openib_component.use_async_event_thread)); - - CHECK(reg_bool("enable_srq_resize", NULL, - "Enable/Disable on demand SRQ resize. 
" - "(0 = without resizing, nonzero = with resizing)", 1, - &mca_btl_openib_component.enable_srq_resize)); - -#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET - CHECK(reg_bool("rroce_enable", NULL, - "Enable/Disable routing between different subnets" - "(0 = disable, nonzero = enable)", false, - &mca_btl_openib_component.rroce_enable)); -#endif - - CHECK(reg_uint("buffer_alignment", NULL, - "Preferred communication buffer alignment, in bytes " - "(must be > 0 and power of two)", - 64, &mca_btl_openib_component.buffer_alignment, 0)); - - CHECK(reg_bool("use_message_coalescing", NULL, - "If nonzero, use message coalescing", false, - &mca_btl_openib_component.use_message_coalescing)); - - CHECK(reg_uint("cq_poll_ratio", NULL, - "How often to poll high priority CQ versus low priority CQ", - 100, &mca_btl_openib_component.cq_poll_ratio, REGINT_GE_ONE)); - CHECK(reg_uint("eager_rdma_poll_ratio", NULL, - "How often to poll eager RDMA channel versus CQ", - 100, &mca_btl_openib_component.eager_rdma_poll_ratio, REGINT_GE_ONE)); - CHECK(reg_uint("hp_cq_poll_per_progress", NULL, - "Max number of completion events to process for each call " - "of BTL progress engine", - 10, &mca_btl_openib_component.cq_poll_progress, REGINT_GE_ONE)); - - CHECK(reg_uint("max_hw_msg_size", NULL, - "Maximum size (in bytes) of a single fragment of a long message when using the RDMA protocols (must be > 0 and <= hw capabilities).", - 0, &mca_btl_openib_component.max_hw_msg_size, 0)); - - CHECK(reg_bool("allow_max_memory_registration", NULL, - "Allow maximum possible memory to register with HCA", - 1, &mca_btl_openib_component.allow_max_memory_registration)); - - /* Help debug memory registration issues */ - CHECK(reg_int("memory_registration_verbose", NULL, - "Output some verbose memory registration information " - "(0 = no output, nonzero = output)", 0, - &mca_btl_openib_component.memory_registration_verbose_level, 0)); - - CHECK(reg_int("ignore_locality", NULL, - "Ignore any locality information and use all 
devices " - "(0 = use locality informaiton and use only close devices, nonzero = ignore locality information)", 0, - &mca_btl_openib_component.ignore_locality, REGINT_GE_ZERO)); - - /* Info only */ - tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "have_fork_support", - "Whether the OpenFabrics stack supports applications that invoke the \"fork()\" system call or not (0 = no, 1 = yes). " - "Note that this value does NOT indicate whether the system being run on supports \"fork()\" with OpenFabrics applications or not.", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_CONSTANT, - &btl_openib_have_fork_support); - - mca_btl_openib_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT; - - mca_btl_openib_module.super.btl_eager_limit = 12 * 1024; - mca_btl_openib_module.super.btl_rndv_eager_limit = 12 * 1024; - mca_btl_openib_module.super.btl_max_send_size = 64 * 1024; - mca_btl_openib_module.super.btl_rdma_pipeline_send_length = 1024 * 1024; - mca_btl_openib_module.super.btl_rdma_pipeline_frag_size = 1024 * 1024; - mca_btl_openib_module.super.btl_min_rdma_pipeline_size = 256 * 1024; - mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | - MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA | - MCA_BTL_FLAGS_SEND; -#if HAVE_DECL_IBV_ATOMIC_HCA - mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS; - mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP; -#endif - - /* Default to bandwidth auto-detection */ - mca_btl_openib_module.super.btl_bandwidth = 0; - mca_btl_openib_module.super.btl_latency = 4; -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ - /* Default is enabling CUDA asynchronous send copies */ - CHECK(reg_bool("cuda_async_send", NULL, - "Enable or disable CUDA async send copies " - "(true = async; false = sync)", - true, 
&mca_btl_openib_component.cuda_async_send)); - - /* Default is enabling CUDA asynchronous receive copies */ - CHECK(reg_bool("cuda_async_recv", NULL, - "Enable or disable CUDA async recv copies " - "(true = async; false = sync)", - false, &mca_btl_openib_component.cuda_async_recv)); - /* Also make the max send size larger for better GPU buffer performance */ - mca_btl_openib_module.super.btl_max_send_size = 128 * 1024; - /* Turn of message coalescing - not sure if it works with GPU buffers */ - mca_btl_openib_component.use_message_coalescing = 0; - - /* Indicates if library was built with GPU Direct RDMA support. Not changeable. */ - mca_btl_openib_component.cuda_have_gdr = OPAL_INT_TO_BOOL(OPAL_CUDA_GDR_SUPPORT); - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_cuda_gdr", - "Whether CUDA GPU Direct RDMA support is built into library or not", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &mca_btl_openib_component.cuda_have_gdr); - - /* Indicates if driver has GPU Direct RDMA support. Not changeable. 
*/ - if (OPAL_SUCCESS == opal_os_dirpath_access("/sys/kernel/mm/memory_peers/nv_mem/version", S_IRUSR)) { - mca_btl_openib_component.driver_have_gdr = 1; - } else { - mca_btl_openib_component.driver_have_gdr = 0; - } - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_driver_gdr", - "Whether Infiniband driver has GPU Direct RDMA support", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_CONSTANT, - &mca_btl_openib_component.driver_have_gdr); - - /* Default for GPU Direct RDMA is off for now */ - CHECK(reg_bool("want_cuda_gdr", NULL, - "Enable or disable CUDA GPU Direct RDMA support " - "(true = enabled; false = disabled)", - false, &mca_btl_openib_component.cuda_want_gdr)); - - if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) { - opal_show_help("help-mpi-btl-openib.txt", - "CUDA_no_gdr_support", true, - opal_process_info.nodename); - return OPAL_ERROR; - } - if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.driver_have_gdr) { - opal_show_help("help-mpi-btl-openib.txt", - "driver_no_gdr_support", true, - opal_process_info.nodename); - return OPAL_ERROR; - } -#if OPAL_CUDA_GDR_SUPPORT - if (mca_btl_openib_component.cuda_want_gdr) { - mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET; - mca_btl_openib_module.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number - indicates set it to minimum */ - mca_btl_openib_module.super.btl_cuda_rdma_limit = 30000; /* default switchover is 30,000 to pipeline */ - } else { - mca_btl_openib_module.super.btl_cuda_eager_limit = 0; /* Turns off any of the GPU Direct RDMA code */ - mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */ - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ -#endif /* OPAL_CUDA_SUPPORT */ - CHECK(mca_btl_base_param_register( - &mca_btl_openib_component.super.btl_version, - &mca_btl_openib_module.super)); - - /* setup all the qp stuff */ 
- /* round mid_qp_size to smallest power of two */ - mid_qp_size = opal_next_poweroftwo (mca_btl_openib_module.super.btl_eager_limit / 4) >> 1; - - /* mid_qp_size = MAX (mid_qp_size, 1024); ?! */ - if(mid_qp_size <= 128) { - mid_qp_size = 1024; - } - - opal_asprintf(&default_qps, - "S,128,256,192,128:S,%u,1024,1008,64:S,%u,1024,1008,64:S,%u,1024,1008,64", - mid_qp_size, - (uint32_t)mca_btl_openib_module.super.btl_eager_limit, - (uint32_t)mca_btl_openib_module.super.btl_max_send_size); - if (NULL == default_qps) { - /* Don't try to recover from this */ - return OPAL_ERR_OUT_OF_RESOURCE; - } - if (NULL != mca_btl_openib_component.default_recv_qps) { - free(mca_btl_openib_component.default_recv_qps); - } - mca_btl_openib_component.default_recv_qps = default_qps; - CHECK(reg_string("receive_queues", NULL, - "Colon-delimited, comma-delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4", - default_qps, &mca_btl_openib_component.receive_queues, - 0 - )); - - CHECK(reg_string("if_include", NULL, - "Comma-delimited list of devices/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with btl_openib_if_exclude.", - NULL, &mca_btl_openib_component.if_include, - 0)); - - CHECK(reg_string("if_exclude", NULL, - "Comma-delimited list of device/ports to be excluded (empty value means to not exclude any ports). Mutually exclusive with btl_openib_if_include.", - NULL, &mca_btl_openib_component.if_exclude, - 0)); - - CHECK(reg_string("ipaddr_include", NULL, - "Comma-delimited list of IP Addresses to be used (e.g. \"192.168.1.0/24\"). Mutually exclusive with btl_openib_ipaddr_exclude.", - NULL, &mca_btl_openib_component.ipaddr_include, - 0)); - - CHECK(reg_string("ipaddr_exclude", NULL, - "Comma-delimited list of IP Addresses to be excluded (e.g. \"192.168.1.0/24\"). 
Mutually exclusive with btl_openib_ipaddr_include.", - NULL, &mca_btl_openib_component.ipaddr_exclude, - 0)); - - CHECK(reg_int("gid_index", NULL, - "GID index to use on verbs device ports", - 0, &mca_btl_openib_component.gid_index, - REGINT_GE_ZERO)); - - CHECK(reg_bool("allow_different_subnets", NULL, - "Allow connecting processes from different IB subnets." - "(0 = do not allow; 1 = allow)", - false, &mca_btl_openib_component.allow_different_subnets)); - - /* Register any MCA params for the connect pseudo-components */ - if (OPAL_SUCCESS == ret) { - ret = opal_btl_openib_connect_base_register(); - } - - return btl_openib_verify_mca_params(); -} - -int btl_openib_verify_mca_params (void) -{ - if (mca_btl_openib_component.cq_poll_batch > MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT) { - mca_btl_openib_component.cq_poll_batch = MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT; - } - -#if !HAVE_IBV_FORK_INIT - if (1 == mca_btl_openib_component.want_fork_support) { - opal_show_help("help-mpi-btl-openib.txt", - "ibv_fork requested but not supported", true, - opal_process_info.nodename); - return OPAL_ERR_BAD_PARAM; - } -#endif - - mca_btl_openib_component.ib_pkey_val &= MCA_BTL_IB_PKEY_MASK; - - if (mca_btl_openib_component.ib_min_rnr_timer > 31) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "btl_openib_ib_min_rnr_timer > 31", - "btl_openib_ib_min_rnr_timer reset to 31"); - mca_btl_openib_component.ib_min_rnr_timer = 31; - } - - if (mca_btl_openib_component.ib_timeout > 31) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "btl_openib_ib_timeout > 31", - "btl_openib_ib_timeout reset to 31"); - mca_btl_openib_component.ib_timeout = 31; - } - - if (mca_btl_openib_component.ib_retry_count > 7) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "btl_openib_ib_retry_count > 7", - "btl_openib_ib_retry_count reset to 7"); - mca_btl_openib_component.ib_retry_count = 7; - } - - if 
(mca_btl_openib_component.ib_rnr_retry > 7) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "btl_openib_ib_rnr_retry > 7", - "btl_openib_ib_rnr_retry reset to 7"); - mca_btl_openib_component.ib_rnr_retry = 7; - } - - if (mca_btl_openib_component.ib_service_level > 15) { - opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value", - true, "btl_openib_ib_service_level > 15", - "btl_openib_ib_service_level reset to 15"); - mca_btl_openib_component.ib_service_level = 15; - } - - if(mca_btl_openib_component.buffer_alignment <= 1 || - (mca_btl_openib_component.buffer_alignment & (mca_btl_openib_component.buffer_alignment - 1))) { - opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment", - true, mca_btl_openib_component.buffer_alignment, opal_process_info.nodename, 64); - mca_btl_openib_component.buffer_alignment = 64; - } - -#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */ - if (mca_btl_openib_component.cuda_async_send) { - mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND; - } else { - mca_btl_openib_module.super.btl_flags &= ~MCA_BTL_FLAGS_CUDA_COPY_ASYNC_SEND; - } - - if (mca_btl_openib_component.cuda_async_recv) { - mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV; - } else { - mca_btl_openib_module.super.btl_flags &= ~MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV; - } -#if 0 /* Disable this check for now while fork support code is worked out. */ - /* Cannot have fork support and GDR on at the same time. If the user asks for both, - * then print a message and return error. If the user does not explicitly ask for - * fork support, then turn it off in the presence of GDR. 
*/ - if (mca_btl_openib_component.cuda_want_gdr && mca_btl_openib_component.cuda_have_gdr && - mca_btl_openib_component.driver_have_gdr) { - if (1 == opal_common_verbs_want_fork_support) { - opal_show_help("help-mpi-btl-openib.txt", "no_fork_with_gdr", - true, opal_process_info.nodename); - return OPAL_ERR_BAD_PARAM; - } - } -#endif /* Workaround */ - if (0 != mca_btl_openib_module.super.btl_cuda_max_send_size) { - opal_show_help("help-mpi-btl-openib.txt", "do_not_set_openib_value", - true, opal_process_info.nodename); - mca_btl_openib_module.super.btl_cuda_max_send_size = 0; - } -#endif - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/openib/btl_openib_mca.h b/opal/mca/btl/openib/btl_openib_mca.h deleted file mode 100644 index 49635644e8..0000000000 --- a/opal/mca/btl/openib/btl_openib_mca.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_IB_MCA_H -#define MCA_BTL_IB_MCA_H - -BEGIN_C_DECLS - - /** - * Function to register MCA params and check for sane values - */ - int btl_openib_register_mca_params(void); - int btl_openib_verify_mca_params (void); - -END_C_DECLS -#endif diff --git a/opal/mca/btl/openib/btl_openib_proc.c b/opal/mca/btl/openib/btl_openib_proc.c deleted file mode 100644 index 9e891fb55c..0000000000 --- a/opal/mca/btl/openib/btl_openib_proc.c +++ /dev/null @@ -1,405 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2015-2018 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "opal/util/arch.h" -#include "opal/mca/pmix/pmix.h" - -#include "btl_openib.h" -#include "btl_openib_proc.h" -#include "connect/base.h" -#include "connect/connect.h" - -static void mca_btl_openib_proc_btl_construct(mca_btl_openib_proc_btlptr_t* elem); -static void mca_btl_openib_proc_btl_destruct(mca_btl_openib_proc_btlptr_t* elem); - -OBJ_CLASS_INSTANCE(mca_btl_openib_proc_btlptr_t, - opal_list_item_t, mca_btl_openib_proc_btl_construct, - mca_btl_openib_proc_btl_destruct); - -static void mca_btl_openib_proc_btl_construct(mca_btl_openib_proc_btlptr_t* elem) -{ - elem->openib_btl = NULL; -} - -static void mca_btl_openib_proc_btl_destruct(mca_btl_openib_proc_btlptr_t* elem) -{ - elem->openib_btl = NULL; -} - -static void mca_btl_openib_proc_construct(mca_btl_openib_proc_t* proc); -static void mca_btl_openib_proc_destruct(mca_btl_openib_proc_t* proc); - -OBJ_CLASS_INSTANCE(mca_btl_openib_proc_t, - opal_list_item_t, mca_btl_openib_proc_construct, - mca_btl_openib_proc_destruct); - -void mca_btl_openib_proc_construct(mca_btl_openib_proc_t* ib_proc) -{ - ib_proc->proc_opal = 0; - ib_proc->proc_ports = NULL; - ib_proc->proc_port_count = 0; - ib_proc->proc_endpoints = 0; - ib_proc->proc_endpoint_count = 0; - OBJ_CONSTRUCT(&ib_proc->proc_lock, opal_mutex_t); - OBJ_CONSTRUCT(&ib_proc->openib_btls, 
opal_list_t); -} - -/* - * Cleanup ib proc instance - */ - -void mca_btl_openib_proc_destruct(mca_btl_openib_proc_t* ib_proc) -{ - /* release resources */ - if(NULL != ib_proc->proc_endpoints) { - free(ib_proc->proc_endpoints); - } - if (NULL != ib_proc->proc_ports) { - int i, j; - for (i = 0; i < ib_proc->proc_port_count; ++i) { - for (j = 0; j < ib_proc->proc_ports[i].pm_cpc_data_count; ++j) { - if (NULL != ib_proc->proc_ports[i].pm_cpc_data[j].cbm_modex_message) { - free(ib_proc->proc_ports[i].pm_cpc_data[j].cbm_modex_message); - } - } - } - free(ib_proc->proc_ports); - } - OBJ_DESTRUCT(&ib_proc->proc_lock); - - OPAL_LIST_DESTRUCT(&ib_proc->openib_btls); -} - - -/* - * Look for an existing IB process instances based on the associated - * opal_proc_t instance. - */ -static mca_btl_openib_proc_t* ibproc_lookup_no_lock(opal_proc_t* proc) -{ - mca_btl_openib_proc_t* ib_proc; - - OPAL_LIST_FOREACH(ib_proc, &mca_btl_openib_component.ib_procs, mca_btl_openib_proc_t) { - if(ib_proc->proc_opal == proc) { - return ib_proc; - } - } - return NULL; -} - -static mca_btl_openib_proc_t* ibproc_lookup_and_lock(opal_proc_t* proc) -{ - mca_btl_openib_proc_t* ib_proc; - - /* get the process from the list */ - opal_mutex_lock(&mca_btl_openib_component.ib_lock); - ib_proc = ibproc_lookup_no_lock(proc); - opal_mutex_unlock(&mca_btl_openib_component.ib_lock); - if( NULL != ib_proc ){ - /* if we were able to find it - lock it. - * NOTE: we want to lock it outside of list locked region */ - opal_mutex_lock(&ib_proc->proc_lock); - } - return ib_proc; -} - -static void inline unpack8(char **src, uint8_t *value) -{ - /* Copy one character */ - *value = (uint8_t) **src; - /* Most the src ahead one */ - ++*src; -} - -/* - * Create a IB process structure. There is a one-to-one correspondence - * between a opal_proc_t and a mca_btl_openib_proc_t instance. 
We - * cache additional data (specifically the list of - * mca_btl_openib_endpoint_t instances, and published addresses) - * associated w/ a given destination on this datastructure. - */ - -mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc) -{ - mca_btl_openib_proc_t *ib_proc = NULL, *ib_proc_ret = NULL; - size_t msg_size; - uint32_t size; - int rc, i, j; - void *message; - char *offset; - int modex_message_size; - mca_btl_openib_modex_message_t dummy; - bool is_new = false; - - /* Check if we have already created a IB proc - * structure for this ompi process */ - ib_proc = ibproc_lookup_and_lock(proc); - if (NULL != ib_proc) { - /* Gotcha! */ - return ib_proc; - } - - /* All initialization has to be an atomic operation. we do the following assumption: - * - we let all concurent threads to try to do the initialization; - * - when one has finished it locks ib_lock and checks if corresponding - * process is still missing; - * - if so - new proc is added, otherwise - initialized proc struct is released. - */ - - /* First time, gotta create a new IB proc - * out of the opal_proc ... 
*/ - ib_proc = OBJ_NEW(mca_btl_openib_proc_t); - if (NULL == ib_proc) { - return NULL; - } - - /* Initialize number of peer */ - ib_proc->proc_endpoint_count = 0; - ib_proc->proc_opal = proc; - - /* query for the peer address info */ - OPAL_MODEX_RECV(rc, &mca_btl_openib_component.super.btl_version, - &proc->proc_name, &message, &msg_size); - if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("[%s:%d] opal_modex_recv failed for peer %s", - __FILE__, __LINE__, - OPAL_NAME_PRINT(proc->proc_name))); - goto no_err_exit; - } - if (0 == msg_size) { - goto no_err_exit; - } - - /* Message was packed in btl_openib_component.c; the format is - listed in a comment in that file */ - modex_message_size = ((char *) &(dummy.end)) - ((char*) &dummy); - - /* Unpack the number of modules in the message */ - offset = (char *) message; - unpack8(&offset, &(ib_proc->proc_port_count)); - BTL_VERBOSE(("unpack: %d btls", ib_proc->proc_port_count)); - if (ib_proc->proc_port_count > 0) { - ib_proc->proc_ports = (mca_btl_openib_proc_modex_t *) - malloc(sizeof(mca_btl_openib_proc_modex_t) * - ib_proc->proc_port_count); - } else { - ib_proc->proc_ports = NULL; - } - - /* Loop over unpacking all the ports */ - for (i = 0; i < ib_proc->proc_port_count; i++) { - - /* Unpack the modex comment message struct */ - size = modex_message_size; - memcpy(&(ib_proc->proc_ports[i].pm_port_info), offset, size); -#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT - MCA_BTL_OPENIB_MODEX_MSG_NTOH(ib_proc->proc_ports[i].pm_port_info); -#endif - offset += size; - BTL_VERBOSE(("unpacked btl %d: modex message, offset now %d", - i, (int)(offset-((char*)message)))); - - /* Unpack the number of CPCs that follow */ - unpack8(&offset, &(ib_proc->proc_ports[i].pm_cpc_data_count)); - BTL_VERBOSE(("unpacked btl %d: number of cpcs to follow %d (offset now %d)", - i, ib_proc->proc_ports[i].pm_cpc_data_count, - (int)(offset-((char*)message)))); - ib_proc->proc_ports[i].pm_cpc_data = 
(opal_btl_openib_connect_base_module_data_t *) - calloc(ib_proc->proc_ports[i].pm_cpc_data_count, - sizeof(opal_btl_openib_connect_base_module_data_t)); - if (NULL == ib_proc->proc_ports[i].pm_cpc_data) { - goto err_exit; - } - - /* Unpack the CPCs */ - for (j = 0; j < ib_proc->proc_ports[i].pm_cpc_data_count; ++j) { - uint8_t u8; - opal_btl_openib_connect_base_module_data_t *cpcd; - cpcd = ib_proc->proc_ports[i].pm_cpc_data + j; - unpack8(&offset, &u8); - BTL_VERBOSE(("unpacked btl %d: cpc %d: index %d (offset now %d)", - i, j, u8, (int)(offset-(char*)message))); - cpcd->cbm_component = - opal_btl_openib_connect_base_get_cpc_byindex(u8); - BTL_VERBOSE(("unpacked btl %d: cpc %d: component %s", - i, j, cpcd->cbm_component->cbc_name)); - - unpack8(&offset, &cpcd->cbm_priority); - unpack8(&offset, &cpcd->cbm_modex_message_len); - BTL_VERBOSE(("unpacked btl %d: cpc %d: priority %d, msg len %d (offset now %d)", - i, j, cpcd->cbm_priority, - cpcd->cbm_modex_message_len, - (int)(offset-(char*)message))); - if (cpcd->cbm_modex_message_len > 0) { - cpcd->cbm_modex_message = malloc(cpcd->cbm_modex_message_len); - if (NULL == cpcd->cbm_modex_message) { - BTL_ERROR(("Failed to malloc")); - goto err_exit; - } - memcpy(cpcd->cbm_modex_message, offset, - cpcd->cbm_modex_message_len); - offset += cpcd->cbm_modex_message_len; - BTL_VERBOSE(("unpacked btl %d: cpc %d: blob unpacked %d %x (offset now %d)", - i, j, - ((uint32_t*)cpcd->cbm_modex_message)[0], - ((uint32_t*)cpcd->cbm_modex_message)[1], - (int)(offset-((char*)message)))); - } - } - } - - if (0 == ib_proc->proc_port_count) { - ib_proc->proc_endpoints = NULL; - goto no_err_exit; - } else { - ib_proc->proc_endpoints = (volatile mca_btl_base_endpoint_t**) - malloc(ib_proc->proc_port_count * - sizeof(mca_btl_base_endpoint_t*)); - } - if (NULL == ib_proc->proc_endpoints) { - goto err_exit; - } - - BTL_VERBOSE(("unpacking done!")); - - /* Finally add this process to the initialized procs list */ - 
opal_mutex_lock(&mca_btl_openib_component.ib_lock); - - ib_proc_ret = ibproc_lookup_no_lock(proc); - if (NULL == ib_proc_ret) { - /* if process can't be found in this list - insert it locked - * it is safe to lock ib_proc here because this thread is - * the only one who knows about it so far */ - opal_mutex_lock(&ib_proc->proc_lock); - opal_list_append(&mca_btl_openib_component.ib_procs, &ib_proc->super); - ib_proc_ret = ib_proc; - is_new = true; - } else { - /* otherwise - release module_proc */ - OBJ_RELEASE(ib_proc); - } - opal_mutex_unlock(&mca_btl_openib_component.ib_lock); - - /* if we haven't insert the process - lock it here so we - * won't lock mca_btl_openib_component.ib_lock */ - if( !is_new ){ - opal_mutex_lock(&ib_proc_ret->proc_lock); - } - - return ib_proc_ret; - -err_exit: - - BTL_ERROR(("%d: error exit from mca_btl_openib_proc_create", OPAL_PROC_MY_NAME.vpid)); - -no_err_exit: - - OBJ_RELEASE(ib_proc); - return NULL; -} - -int mca_btl_openib_proc_remove(opal_proc_t *proc, - mca_btl_base_endpoint_t *endpoint) -{ - size_t i; - mca_btl_openib_proc_t* ib_proc = NULL; - - /* Remove endpoint from the openib BTL version of the proc as - well */ - ib_proc = ibproc_lookup_and_lock(proc); - if (NULL != ib_proc) { - for (i = 0; i < ib_proc->proc_endpoint_count; ++i) { - if (ib_proc->proc_endpoints[i] == endpoint) { - ib_proc->proc_endpoints[i] = NULL; - if (i == ib_proc->proc_endpoint_count - 1) { - --ib_proc->proc_endpoint_count; - } - opal_mutex_unlock(&ib_proc->proc_lock); - return OPAL_SUCCESS; - } - } - } - - return OPAL_ERR_NOT_FOUND; -} - -/* - * Note that this routine must be called with the lock on the process - * already held. Insert a btl instance into the proc array and assign - * it an address. 
- */ -int mca_btl_openib_proc_insert(mca_btl_openib_proc_t* module_proc, - mca_btl_base_endpoint_t* module_endpoint) -{ - /* insert into endpoint array */ - - -#ifndef WORDS_BIGENDIAN - /* if we are little endian and our peer is not so lucky, then we - need to put all information sent to him in big endian (aka - Network Byte Order) and expect all information received to - be in NBO. Since big endian machines always send and receive - in NBO, we don't care so much about that case. */ - if (module_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) { - module_endpoint->nbo = true; - } -#endif - - /* only allow eager rdma if the peers agree on the size of a long */ - if((module_proc->proc_opal->proc_arch & OPAL_ARCH_LONGISxx) != - (opal_proc_local_get()->proc_arch & OPAL_ARCH_LONGISxx)) { - module_endpoint->use_eager_rdma = false; - } - - module_endpoint->endpoint_proc = module_proc; - module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint; - return OPAL_SUCCESS; -} - -int mca_btl_openib_proc_reg_btl(mca_btl_openib_proc_t* ib_proc, - mca_btl_openib_module_t* openib_btl) -{ - mca_btl_openib_proc_btlptr_t* elem; - - OPAL_LIST_FOREACH(elem, &ib_proc->openib_btls, mca_btl_openib_proc_btlptr_t) { - if(elem->openib_btl == openib_btl) { - /* this is normal return meaning that this BTL has already touched this ib_proc */ - return OPAL_ERR_RESOURCE_BUSY; - } - } - - elem = OBJ_NEW(mca_btl_openib_proc_btlptr_t); - if( NULL == elem ){ - return OPAL_ERR_OUT_OF_RESOURCE; - } - elem->openib_btl = openib_btl; - opal_list_append(&ib_proc->openib_btls, &elem->super); - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/openib/btl_openib_proc.h b/opal/mca/btl/openib/btl_openib_proc.h deleted file mode 100644 index 576018e5aa..0000000000 --- a/opal/mca/btl/openib/btl_openib_proc.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. 
All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Mellanox Technologies. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_IB_PROC_H -#define MCA_BTL_IB_PROC_H - -#include "opal/class/opal_object.h" -#include "opal/util/proc.h" -#include "btl_openib.h" -#include "btl_openib_endpoint.h" - -BEGIN_C_DECLS - -/* Must forward reference this to avoid include file loop */ -struct opal_btl_openib_connect_base_module_data_t; - -/** - * Data received from the modex. For each openib BTL module/port in - * the peer, we'll receive two things: - * - * 1. Data about the peer's port - * 2. An array of CPCs that the peer has available on that port, each - * of which has its own meta data - * - * Hence, these two items need to be bundled together; - */ -typedef struct mca_btl_openib_proc_modex_t { - /** Information about the peer's port */ - mca_btl_openib_modex_message_t pm_port_info; - - /** Array of the peer's CPCs available on this port */ - opal_btl_openib_connect_base_module_data_t *pm_cpc_data; - - /** Length of the pm_cpc_data array */ - uint8_t pm_cpc_data_count; -} mca_btl_openib_proc_modex_t; - -/** - * The list element to hold pointers to openin_btls that are using this - * ib_proc. 
- */ - -struct mca_btl_openib_proc_btlptr_t { - opal_list_item_t super; - mca_btl_openib_module_t* openib_btl; -}; -typedef struct mca_btl_openib_proc_btlptr_t mca_btl_openib_proc_btlptr_t; - -OBJ_CLASS_DECLARATION(mca_btl_openib_proc_btlptr_t); - -/** - * Represents the state of a remote process and the set of addresses - * that it exports. Also cache an instance of mca_btl_base_endpoint_t for - * each - * BTL instance that attempts to open a connection to the process. - */ -struct mca_btl_openib_proc_t { - /** allow proc to be placed on a list */ - opal_list_item_t super; - - /** pointer to corresponding opal_proc_t */ - const opal_proc_t *proc_opal; - - /** modex messages from this proc; one for each port in the peer */ - mca_btl_openib_proc_modex_t *proc_ports; - - /** length of proc_ports array */ - uint8_t proc_port_count; - - /** list of openib_btl's that touched this proc **/ - opal_list_t openib_btls; - - /** array of endpoints that have been created to access this proc */ - volatile struct mca_btl_base_endpoint_t **proc_endpoints; - - /** number of endpoints (length of proc_endpoints array) */ - volatile size_t proc_endpoint_count; - - /** lock to protect against concurrent access to proc state */ - opal_mutex_t proc_lock; -}; -typedef struct mca_btl_openib_proc_t mca_btl_openib_proc_t; - -OBJ_CLASS_DECLARATION(mca_btl_openib_proc_t); - -mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc); -int mca_btl_openib_proc_insert(mca_btl_openib_proc_t*, mca_btl_base_endpoint_t*); -int mca_btl_openib_proc_remove(opal_proc_t* proc, - mca_btl_base_endpoint_t* module_endpoint); -int mca_btl_openib_proc_reg_btl(mca_btl_openib_proc_t* ib_proc, - mca_btl_openib_module_t* openib_btl); - - -END_C_DECLS - -#endif diff --git a/opal/mca/btl/openib/btl_openib_put.c b/opal/mca/btl/openib/btl_openib_put.c deleted file mode 100644 index 83260e0544..0000000000 --- a/opal/mca/btl/openib/btl_openib_put.c +++ /dev/null @@ -1,175 +0,0 @@ -/* -*- Mode: C; 
c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2006-2007 Voltaire All rights reserved. - * Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved - * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_openib.h" -#include "btl_openib_frag.h" -#include "btl_openib_endpoint.h" -#include "btl_openib_proc.h" -#include "btl_openib_xrc.h" -/* - * RDMA WRITE local buffer to remote buffer address. 
- */ - -int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, - uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) -{ - mca_btl_openib_put_frag_t *frag = NULL; - int rc, qp = order; - - if (MCA_BTL_NO_ORDER == qp) { - qp = mca_btl_openib_component.rdma_qp; - } - - if (OPAL_UNLIKELY((btl->btl_put_local_registration_threshold < size && !local_handle) || !remote_handle || - size > btl->btl_put_limit)) { - return OPAL_ERR_BAD_PARAM; - } - - frag = to_put_frag(alloc_send_user_frag ()); - if (OPAL_UNLIKELY(NULL == frag)) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* set base descriptor flags */ - to_base_frag(frag)->base.order = qp; - /* free this descriptor when the operation is complete */ - to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - - /* set up scatter-gather entry */ - to_com_frag(frag)->sg_entry.length = size; - - if (local_handle) { - to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; - } else { - /* lkey is not required for inline RDMA write */ - to_com_frag(frag)->sg_entry.lkey = 0; - } - - to_com_frag(frag)->sg_entry.addr = (uint64_t)(intptr_t) local_address; - to_com_frag(frag)->endpoint = ep; - - /* set up rdma callback */ - frag->cb.func = cbfunc; - frag->cb.context = cbcontext; - frag->cb.data = cbdata; - frag->cb.local_handle = local_handle; - - /* post descriptor */ - to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; - to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address; - - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - 
to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey); - } else -#endif - { - to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey; - } - - if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if (OPAL_ERR_RESOURCE_BUSY == rc) { - /* descriptor was queued pending connection */ - return OPAL_SUCCESS; - } - - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - MCA_BTL_IB_FRAG_RETURN (frag); - return rc; - } - } - - rc = mca_btl_openib_put_internal (btl, ep, frag); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) { - rc = OPAL_SUCCESS; - - /* queue the fragment for when resources are available */ - OPAL_THREAD_LOCK(&ep->endpoint_lock); - opal_list_append(&ep->pending_put_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - } else { - MCA_BTL_IB_FRAG_RETURN (frag); - } - } - - return rc; -} - -int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, - mca_btl_openib_put_frag_t *frag) -{ - int qp = to_base_frag(frag)->base.order; - struct ibv_send_wr *bad_wr; - int rc; - - /* NTH: the inline send size and remote SRQ number are only available once the endpoint is - * connected. 
By setting these values here instead of mca_btl_openib_put we guarantee - * both fields are initialized */ - to_out_frag(frag)->sr_desc.send_flags = ib_send_flags (to_com_frag(frag)->sg_entry.length, - &(ep->qps[qp]), 1); - -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { -#if OPAL_HAVE_CONNECTX_XRC - to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; -#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS - to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; -#else -#error "that should never happen" -#endif - } -#endif - - /* check for a send wqe */ - if (qp_get_wqe(ep, qp) < 0) { - qp_put_wqe(ep, qp); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); - qp_reset_signal_count(ep, qp); - - if (0 != (rc = ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr))) { - qp_put_wqe(ep, qp); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/openib/btl_openib_xrc.c b/opal/mca/btl/openib/btl_openib_xrc.c deleted file mode 100644 index df88aa78eb..0000000000 --- a/opal/mca/btl/openib/btl_openib_xrc.c +++ /dev/null @@ -1,211 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include -#include -#include -#include -#ifdef HAVE_UNISTD_H -#include -#endif -#include - -#include "opal/mca/btl/base/base.h" -#include "opal/util/printf.h" -#include "btl_openib_xrc.h" -#include "btl_openib.h" - -#if HAVE_XRC -#define SIZE_OF3(A, B, C) (sizeof(A) + sizeof(B) + sizeof(C)) - -static void ib_address_constructor(ib_address_t *ib_addr); -static void ib_address_destructor(ib_address_t *ib_addr); - -OBJ_CLASS_INSTANCE(ib_address_t, - opal_list_item_t, - ib_address_constructor, - ib_address_destructor); - -/* This func. opens XRC domain */ -int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_device_t *device) -{ - int len; - char *xrc_file_name; - const char *dev_name; -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - struct ibv_xrcd_init_attr xrcd_attr; -#endif - - dev_name = ibv_get_device_name(device->ib_dev); - len = opal_asprintf(&xrc_file_name, - "%s"OPAL_PATH_SEP"openib_xrc_domain_%s", - opal_process_info.job_session_dir, dev_name); - if (0 > len) { - BTL_ERROR(("Failed to allocate memomry for XRC file name: %s\n", - strerror(errno))); - return OPAL_ERROR; - } - - device->xrc_fd = open(xrc_file_name, O_CREAT, S_IWUSR|S_IRUSR); - if (0 > device->xrc_fd) { - BTL_ERROR(("Failed to open XRC domain file %s, errno says %s\n", - xrc_file_name,strerror(errno))); - free(xrc_file_name); - return OPAL_ERROR; - } -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - memset(&xrcd_attr, 0, sizeof xrcd_attr); - xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; - xrcd_attr.fd = device->xrc_fd; - xrcd_attr.oflags = O_CREAT; - device->xrcd = ibv_open_xrcd(device->ib_dev_context, &xrcd_attr); - if (NULL == device->xrcd) { -#else - device->xrc_domain = ibv_open_xrc_domain(device->ib_dev_context, device->xrc_fd, O_CREAT); - if (NULL == device->xrc_domain) { -#endif - BTL_ERROR(("Failed to open XRC domain\n")); - close(device->xrc_fd); - 
free(xrc_file_name); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -/* This func. closes XRC domain */ -int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device) -{ -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - if (NULL == device->xrcd) { -#else - if (NULL == device->xrc_domain) { -#endif - /* No XRC domain, just exit */ - return OPAL_SUCCESS; - } -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - if (ibv_close_xrcd(device->xrcd)) { -#else - if (ibv_close_xrc_domain(device->xrc_domain)) { -#endif - BTL_ERROR(("Failed to close XRC domain, errno %d says %s\n", - device->xrc_fd, strerror(errno))); - return OPAL_ERROR; - } - /* do we need to check exit status */ - if (close(device->xrc_fd)) { - BTL_ERROR(("Failed to close XRC file descriptor, errno %d says %s\n", - device->xrc_fd, strerror(errno))); - return OPAL_ERROR; - } - return OPAL_SUCCESS; -} - -static void ib_address_constructor(ib_address_t *ib_addr) -{ - ib_addr->key = NULL; - ib_addr->subnet_id = 0; - ib_addr->lid = 0; - ib_addr->status = MCA_BTL_IB_ADDR_CLOSED; - ib_addr->qp = NULL; - ib_addr->max_wqe = 0; - /* NTH: make the addr_lock recursive because mca_btl_openib_endpoint_connected can call - * into the CPC with the lock held. The alternative would be to drop the lock but the - * lock is never obtained in a critical path. 
*/ - OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_recursive_mutex_t); - OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t); -} - -static void ib_address_destructor(ib_address_t *ib_addr) -{ - if (NULL != ib_addr->key) { - free(ib_addr->key); - } - OBJ_DESTRUCT(&ib_addr->addr_lock); - OBJ_DESTRUCT(&ib_addr->pending_ep); -} - -static int ib_address_init(ib_address_t *ib_addr, uint16_t lid, uint64_t s_id, opal_jobid_t ep_jobid) -{ - ib_addr->key = malloc(SIZE_OF3(s_id, lid, ep_jobid)); - if (NULL == ib_addr->key) { - BTL_ERROR(("Failed to allocate memory for key\n")); - return OPAL_ERROR; - } - memset(ib_addr->key, 0, SIZE_OF3(s_id, lid, ep_jobid)); - /* creating the key = lid + s_id + ep_jobid */ - memcpy(ib_addr->key, &lid, sizeof(lid)); - memcpy((void*)((char*)ib_addr->key + sizeof(lid)), &s_id, sizeof(s_id)); - memcpy((void*)((char*)ib_addr->key + sizeof(lid) + sizeof(s_id)), - &ep_jobid, sizeof(ep_jobid)); - /* caching lid and subnet id */ - ib_addr->subnet_id = s_id; - ib_addr->lid = lid; - - return OPAL_SUCCESS; -} - -/* Create new entry in hash table for subnet_id and lid, - * update the endpoint pointer. - * Before call to this function you need to protect with - */ -int mca_btl_openib_ib_address_add_new (uint16_t lid, uint64_t s_id, - opal_jobid_t ep_jobid, mca_btl_openib_endpoint_t *ep) -{ - void *tmp; - int ret = OPAL_SUCCESS; - struct ib_address_t *ib_addr = OBJ_NEW(ib_address_t); - - ret = ib_address_init(ib_addr, lid, s_id, ep_jobid); - if (OPAL_SUCCESS != ret ) { - BTL_ERROR(("XRC Internal error. 
Failed to init ib_addr\n")); - OBJ_DESTRUCT(ib_addr); - return ret; - } - /* is it already in the table ?*/ - OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock); - if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(&mca_btl_openib_component.ib_addr_table, - ib_addr->key, - SIZE_OF3(s_id, lid, ep_jobid), &tmp)) { - /* It is new one, lets put it on the table */ - ret = opal_hash_table_set_value_ptr(&mca_btl_openib_component.ib_addr_table, - ib_addr->key, SIZE_OF3(s_id, lid, ep_jobid), (void*)ib_addr); - if (OPAL_SUCCESS != ret) { - BTL_ERROR(("XRC Internal error." - " Failed to add element to mca_btl_openib_component.ib_addr_table\n")); - OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); - OBJ_DESTRUCT(ib_addr); - return ret; - } - /* update the endpoint with pointer to ib address */ - ep->ib_addr = ib_addr; - } else { - /* so we have this one in the table, just add the pointer to the endpoint */ - ep->ib_addr = (ib_address_t *)tmp; - assert(lid == ep->ib_addr->lid && s_id == ep->ib_addr->subnet_id); - OBJ_DESTRUCT(ib_addr); - } - OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock); - - return ret; -} -#endif diff --git a/opal/mca/btl/openib/btl_openib_xrc.h b/opal/mca/btl/openib/btl_openib_xrc.h deleted file mode 100644 index 30313471ad..0000000000 --- a/opal/mca/btl/openib/btl_openib_xrc.h +++ /dev/null @@ -1,58 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2016 Los Alamos National Security, LLC. All rights - * reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * @file - */ - -#ifndef MCA_BTL_OPENIB_XRC_H -#define MCA_BTL_OPENIB_XRC_H -#include "btl_openib.h" -#include "btl_openib_endpoint.h" - -#if HAVE_XRC -#define MCA_BTL_XRC_ENABLED (mca_btl_openib_component.num_xrc_qps) -#else -#define MCA_BTL_XRC_ENABLED 0 -#endif - -typedef enum { - MCA_BTL_IB_ADDR_CONNECTING = 100, - MCA_BTL_IB_ADDR_CONNECTED, - MCA_BTL_IB_ADDR_CLOSED -} mca_btl_openib_ib_addr_state_t; - -struct ib_address_t { - opal_list_item_t super; - void *key; /* the key with size 80bit - [subnet(64) LID(16bit)] */ - uint64_t subnet_id; /* caching subnet_id */ - uint16_t lid; /* caching lid */ - opal_list_t pending_ep; /* list of endpoints that use this ib_address */ - mca_btl_openib_qp_t *qp; /* pointer to qp that will be used - for communication with the - destination */ - uint32_t remote_xrc_rcv_qp_num; /* remote xrc qp number */ - opal_mutex_t addr_lock; /* protection */ - mca_btl_openib_ib_addr_state_t status; /* ib port status */ - int32_t max_wqe; -}; -typedef struct ib_address_t ib_address_t; - -int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_device_t *device); -int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device); -int mca_btl_openib_ib_address_add_new (uint16_t lid, uint64_t s_id, - opal_jobid_t ep_jobid, mca_btl_openib_endpoint_t *ep); - -#endif diff --git a/opal/mca/btl/openib/common_sym_whitelist.txt b/opal/mca/btl/openib/common_sym_whitelist.txt deleted file mode 100644 index 7c16ac478d..0000000000 --- a/opal/mca/btl/openib/common_sym_whitelist.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore symbols in this component that are auto-generated and we -# can't do anything about them (e.g., flex/bison symbols). 
-btl_openib_ini_yyleng -btl_openib_ini_yytext diff --git a/opal/mca/btl/openib/configure.m4 b/opal/mca/btl/openib/configure.m4 deleted file mode 100644 index d91c8edd78..0000000000 --- a/opal/mca/btl/openib/configure.m4 +++ /dev/null @@ -1,117 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2008-2011 Mellanox Technologies. All rights reserved. -# Copyright (c) 2011 Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2013 NVIDIA Corporation. All rights reserved. -# Copyright (c) 2015 Research Organization for Information Science -# and Technology (RIST). All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_btl_openib_POST_CONFIG([should_build]) -# ------------------------------------------ -AC_DEFUN([MCA_opal_btl_openib_POST_CONFIG], [ - AM_CONDITIONAL([MCA_btl_openib_have_xrc], [test $1 -eq 1 && test "x$btl_openib_have_xrc" = "x1"]) - AM_CONDITIONAL([MCA_btl_openib_have_rdmacm], [test $1 -eq 1 && test "x$btl_openib_have_rdmacm" = "x1"]) - AM_CONDITIONAL([MCA_btl_openib_have_dynamic_sl], [test $1 -eq 1 && test "x$btl_openib_have_opensm_devel" = "x1"]) - AM_CONDITIONAL([MCA_btl_openib_have_udcm], [test $1 -eq 1 && test "x$btl_openib_have_udcm" = "x1"]) -]) - - -# MCA_btl_openib_CONFIG([action-if-can-copalle], -# [action-if-cant-copalle]) -# ------------------------------------------------ -AC_DEFUN([MCA_opal_btl_openib_CONFIG],[ - AC_CONFIG_FILES([opal/mca/btl/openib/Makefile]) - - OPAL_VAR_SCOPE_PUSH([cpcs btl_openib_LDFLAGS_save btl_openib_LIBS_save]) - cpcs="oob" - - OPAL_CHECK_OPENFABRICS([btl_openib], - [btl_openib_happy="yes" - OPAL_CHECK_OPENFABRICS_CM([btl_openib])], - [btl_openib_happy="no"]) - OPAL_CHECK_EXP_VERBS([btl_openib], [], []) - - AS_IF([test "$btl_openib_happy" = "yes"], - [# With the new openib flags, look for ibv_fork_init - btl_openib_LDFLAGS_save="$LDFLAGS" - btl_openib_LIBS_save="$LIBS" - LDFLAGS="$LDFLAGS $btl_openib_LDFLAGS" - LIBS="$LIBS $btl_openib_LIBS" - AC_CHECK_FUNCS([ibv_fork_init]) - LDFLAGS="$btl_openib_LDFLAGS_save" - LIBS="$btl_openib_LIBS_save" - $1], - [$2]) - - AS_IF([test "$btl_openib_happy" = "yes"], - [if test "x$btl_openib_have_xrc" = "x1"; then - cpcs="$cpcs xoob" - fi - if test "x$btl_openib_have_rdmacm" = "x1"; then - cpcs="$cpcs rdmacm" - if test "$enable_openib_rdmacm_ibaddr" = "yes"; then - AC_MSG_CHECKING([IB addressing]) - AC_EGREP_CPP( - yes, - [ - #include - #ifdef AF_IB - yes - #endif - ], - [ - AC_CHECK_HEADERS( - [rdma/rsocket.h], - [ - AC_MSG_RESULT([yes]) - AC_DEFINE(BTL_OPENIB_RDMACM_IB_ADDR, 1, rdmacm IB_AF addressing support) 
- ], - [ - AC_MSG_RESULT([no]) - AC_DEFINE(BTL_OPENIB_RDMACM_IB_ADDR, 0, rdmacm without IB_AF addressing support) - AC_MSG_WARN([There is no IB_AF addressing support by lib rdmacm.]) - ] - )], - [ - AC_MSG_RESULT([no]) - AC_DEFINE(BTL_OPENIB_RDMACM_IB_ADDR, 0, rdmacm without IB_AF addressing support) - AC_MSG_WARN([There is no IB_AF addressing support by lib rdmacm.]) - ]) - else - AC_DEFINE(BTL_OPENIB_RDMACM_IB_ADDR, 0, rdmacm without IB_AF addressing support) - fi - fi - if test "x$btl_openib_have_udcm" = "x1"; then - cpcs="$cpcs udcm" - fi - AC_MSG_CHECKING([which openib btl cpcs will be built]) - AC_MSG_RESULT([$cpcs])]) - - # make sure that CUDA-aware checks have been done - AC_REQUIRE([OPAL_CHECK_CUDA]) - - # substitute in the things needed to build openib - AC_SUBST([btl_openib_CFLAGS]) - AC_SUBST([btl_openib_CPPFLAGS]) - AC_SUBST([btl_openib_LDFLAGS]) - AC_SUBST([btl_openib_LIBS]) - - OPAL_VAR_SCOPE_POP -])dnl diff --git a/opal/mca/btl/openib/connect/base.h b/opal/mca/btl/openib/connect/base.h deleted file mode 100644 index dcf82c6974..0000000000 --- a/opal/mca/btl/openib/connect/base.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013 Mellanox Technologies, Inc. - * All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_OPENIB_CONNECT_BASE_H -#define BTL_OPENIB_CONNECT_BASE_H - -#include "opal/mca/btl/openib/connect/connect.h" - -#ifdef OPAL_HAVE_RDMAOE -#define BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl) \ - (((IBV_TRANSPORT_IB != ((btl)->device->ib_dev->transport_type)) || \ - (IBV_LINK_LAYER_ETHERNET == ((btl)->ib_port_attr.link_layer))) ? \ - true : false) -#else -#define BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl) \ - ((IBV_TRANSPORT_IB != ((btl)->device->ib_dev->transport_type)) ? 
\ - true : false) -#endif - -BEGIN_C_DECLS - -/* - * Forward declaration to resolve circular dependency - */ -struct mca_btl_base_endpoint_t; - -/* - * Open function - */ -int opal_btl_openib_connect_base_register(void); - -/* - * Component-wide CPC init - */ -int opal_btl_openib_connect_base_init(void); - -/* - * Query CPCs to see if they want to run on a specific module - */ -int opal_btl_openib_connect_base_select_for_local_port - (mca_btl_openib_module_t *btl); - -/* - * Forward reference to avoid an include file loop - */ -struct mca_btl_openib_proc_modex_t; - -/* - * Select function - */ -int opal_btl_openib_connect_base_find_match - (mca_btl_openib_module_t *btl, - struct mca_btl_openib_proc_modex_t *peer_port, - opal_btl_openib_connect_base_module_t **local_cpc, - opal_btl_openib_connect_base_module_data_t **remote_cpc_data); - -/* - * Find a CPC's index so that we can send it in the modex - */ -int opal_btl_openib_connect_base_get_cpc_index - (opal_btl_openib_connect_base_component_t *cpc); - -/* - * Lookup a CPC by its index (received from the modex) - */ -opal_btl_openib_connect_base_component_t * - opal_btl_openib_connect_base_get_cpc_byindex(uint8_t index); - -/* - * Allocate a CTS frag - */ -int opal_btl_openib_connect_base_alloc_cts( - struct mca_btl_base_endpoint_t *endpoint); - -/* - * Free a CTS frag - */ -int opal_btl_openib_connect_base_free_cts( - struct mca_btl_base_endpoint_t *endpoint); - -/* - * Start a new connection to an endpoint - */ -int opal_btl_openib_connect_base_start( - opal_btl_openib_connect_base_module_t *cpc, - struct mca_btl_base_endpoint_t *endpoint); - - -/* - * Component-wide CPC finalize - */ -void opal_btl_openib_connect_base_finalize(void); - -END_C_DECLS - -#endif diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_base.c b/opal/mca/btl/openib/connect/btl_openib_connect_base.c deleted file mode 100644 index 0db8c62b83..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_base.c +++ /dev/null @@ 
-1,541 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved. - * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "opal_config.h" - -#include "btl_openib.h" -#include "btl_openib_proc.h" -#include "connect/base.h" -#include "connect/btl_openib_connect_empty.h" -#if OPAL_HAVE_RDMACM -#include "connect/btl_openib_connect_rdmacm.h" -#endif -#if OPAL_HAVE_UDCM -#include "connect/btl_openib_connect_udcm.h" -#endif - -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/util/proc.h" -#include "opal/util/show_help.h" -#include "opal/util/printf.h" - -#include "opal/util/sys_limits.h" -#include "opal/align.h" - -/* - * Array of all possible connection functions - */ -static opal_btl_openib_connect_base_component_t *all[] = { - /* Always have an entry here so that the CP indexes will always be - the same: OOB has been removed, so use the "empty" CPC */ - &opal_btl_openib_connect_empty, - - /* Always have an entry here so that the CP indexes will always be - the same: XOOB has been removed, so use the "empty" CPC */ - &opal_btl_openib_connect_empty, - - /* Always have an entry here so that the CP indexes will always be - the same: if RDMA CM is not available, use the "empty" CPC */ -#if OPAL_HAVE_RDMACM - &opal_btl_openib_connect_rdmacm, -#else - &opal_btl_openib_connect_empty, -#endif - - /* Always have an entry here so that the CP indexes will always be - the same: if UD CM is not enabled, use the "empty" CPC */ 
-#if OPAL_HAVE_UDCM - &opal_btl_openib_connect_udcm, -#else - &opal_btl_openib_connect_empty, -#endif - - NULL -}; - -/* increase this count if any more cpcs are added */ -static opal_btl_openib_connect_base_component_t *available[5]; -static int num_available = 0; - -static char *btl_openib_cpc_include; -static char *btl_openib_cpc_exclude; - -/* - * Register MCA parameters - */ -int opal_btl_openib_connect_base_register(void) -{ - int i, j, save; - char **temp = NULL, *string = NULL, *all_cpc_names = NULL; - - /* Make an MCA parameter to select which connect module to use */ - for (i = 0; NULL != all[i]; ++i) { - /* The CPC name "empty" is reserved for "fake" CPC modules */ - if (0 != strcmp(all[i]->cbc_name, "empty")) { - opal_argv_append_nosize(&temp, all[i]->cbc_name); - } - } - all_cpc_names = opal_argv_join(temp, ','); - opal_argv_free(temp); - opal_asprintf(&string, - "Method used to select OpenFabrics connections (valid values: %s)", - all_cpc_names); - - btl_openib_cpc_include = NULL; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "cpc_include", string, MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &btl_openib_cpc_include); - free(string); - - opal_asprintf(&string, - "Method used to exclude OpenFabrics connections (valid values: %s)", - all_cpc_names); - - btl_openib_cpc_exclude = NULL; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "cpc_exclude", string, MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &btl_openib_cpc_exclude); - free(string); - - /* Parse the if_[in|ex]clude paramters to come up with a list of - CPCs that are available */ - /* If we have an "include" list, then find all those CPCs and put - them in available[] */ - if (NULL != btl_openib_cpc_include) { - mca_btl_openib_component.cpc_explicitly_defined = true; - temp = opal_argv_split(btl_openib_cpc_include, ','); - for 
(save = j = 0; NULL != temp[j]; ++j) { - for (i = 0; NULL != all[i]; ++i) { - if (0 == strcmp(temp[j], all[i]->cbc_name)) { - opal_output(-1, "include: saving %s", all[i]->cbc_name); - available[save++] = all[i]; - ++num_available; - break; - } - } - if (NULL == all[i]) { - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "cpc name not found", true, - "include", opal_process_info.nodename, - "include", btl_openib_cpc_include, temp[j], - all_cpc_names); - opal_argv_free(temp); - free(all_cpc_names); - return OPAL_ERR_NOT_FOUND; - } - } - opal_argv_free(temp); - } - - /* Otherwise, if we have an "exclude" list, take all the CPCs that - are not in that list and put them in available[] */ - else if (NULL != btl_openib_cpc_exclude) { - mca_btl_openib_component.cpc_explicitly_defined = true; - temp = opal_argv_split(btl_openib_cpc_exclude, ','); - /* First: error check -- ensure that all the names are valid */ - for (j = 0; NULL != temp[j]; ++j) { - for (i = 0; NULL != all[i]; ++i) { - if (0 == strcmp(temp[j], all[i]->cbc_name)) { - break; - } - } - if (NULL == all[i]) { - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "cpc name not found", true, - "exclude", opal_process_info.nodename, - "exclude", btl_openib_cpc_exclude, temp[j], - all_cpc_names); - opal_argv_free(temp); - free(all_cpc_names); - return OPAL_ERR_NOT_FOUND; - } - } - - /* Now do the exclude */ - for (save = i = 0; NULL != all[i]; ++i) { - for (j = 0; NULL != temp[j]; ++j) { - if (0 == strcmp(temp[j], all[i]->cbc_name)) { - break; - } - } - if (NULL == temp[j]) { - opal_output(-1, "exclude: saving %s", all[i]->cbc_name); - available[save++] = all[i]; - ++num_available; - } - } - opal_argv_free(temp); - } - - /* If there's no include/exclude list, copy all[] into available[] */ - else { - opal_output(-1, "no include or exclude: saving all"); - memcpy(available, all, sizeof(all)); - num_available = (sizeof(all) / - sizeof(opal_btl_openib_connect_base_module_t *)) - 1; - } - - /* Call the register 
function on all the CPCs so that they may - setup any MCA params specific to the connection type */ - for (i = 0; NULL != available[i]; ++i) { - if (NULL != available[i]->cbc_register) { - available[i]->cbc_register(); - } - } - - free (all_cpc_names); - return OPAL_SUCCESS; -} - -/* - * Called once during openib BTL component initialization to allow CPC - * components to initialize. - */ -int opal_btl_openib_connect_base_init(void) -{ - int i, rc; - - /* Call each available CPC component's open function, if it has - one. If the CPC component open function returns OPAL_SUCCESS, - keep it. If it returns ERR_NOT_SUPPORTED, remove it from the - available[] array. If it returns something else, return that - error upward. */ - for (i = num_available = 0; NULL != available[i]; ++i) { - if (NULL == available[i]->cbc_init) { - available[num_available++] = available[i]; - opal_output(-1, "found available cpc (NULL init): %s", - all[i]->cbc_name); - continue; - } - - rc = available[i]->cbc_init(); - if (OPAL_SUCCESS == rc) { - available[num_available++] = available[i]; - opal_output(-1, "found available cpc (SUCCESS init): %s", - all[i]->cbc_name); - continue; - } else if (OPAL_ERR_NOT_SUPPORTED == rc) { - continue; - } else { - return rc; - } - } - available[num_available] = NULL; - - return (num_available > 0) ? OPAL_SUCCESS : OPAL_ERR_NOT_AVAILABLE; -} - - -/* - * Find all the CPCs that are eligible for a single local port (i.e., - * openib module). - */ -int opal_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *btl) -{ - char *msg = NULL; - int i, rc, cpc_index, len; - opal_btl_openib_connect_base_module_t **cpcs; - - cpcs = (opal_btl_openib_connect_base_module_t **) calloc(num_available, - sizeof(opal_btl_openib_connect_base_module_t *)); - if (NULL == cpcs) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Go through all available CPCs and query them to see if they - want to run on this module. If they do, save them to a running - array. 
*/ - for (len = 1, i = 0; NULL != available[i]; ++i) { - len += strlen(available[i]->cbc_name) + 2; - } - msg = (char *) malloc(len); - if (NULL == msg) { - free(cpcs); - return OPAL_ERR_OUT_OF_RESOURCE; - } - msg[0] = '\0'; - for (cpc_index = i = 0; NULL != available[i]; ++i) { - if (i > 0) { - strcat(msg, ", "); - } - strcat(msg, available[i]->cbc_name); - - rc = available[i]->cbc_query(btl, &cpcs[cpc_index]); - if (OPAL_ERR_NOT_SUPPORTED == rc || OPAL_ERR_UNREACH == rc) { - continue; - } else if (OPAL_SUCCESS != rc) { - free(cpcs); - free(msg); - return rc; - } - opal_output(-1, "match cpc for local port: %s", - available[i]->cbc_name); - - /* If the CPC wants to use the CTS protocol, check to ensure - that QP 0 is PP; if it's not, we can't use this CPC (or the - CTS protocol) */ - if (cpcs[cpc_index]->cbm_uses_cts && - !BTL_OPENIB_QP_TYPE_PP(0)) { - BTL_VERBOSE(("this CPC only supports when the first btl_openib_receive_queues QP is a PP QP")); - continue; - } - - /* This CPC has indicated that it wants to run on this openib - BTL module. Woo hoo! */ - ++cpc_index; - } - - /* If we got an empty array, then no CPCs were eligible. Doh! */ - if (0 == cpc_index) { - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "no cpcs for port", true, - opal_process_info.nodename, - ibv_get_device_name(btl->device->ib_dev), - btl->port_num, msg); - free(cpcs); - free(msg); - return OPAL_ERR_NOT_SUPPORTED; - } - free(msg); - - /* We got at least one eligible CPC; save the array into the - module's port_info */ - btl->cpcs = cpcs; - btl->num_cpcs = cpc_index; - - return OPAL_SUCCESS; -} - -/* - * This function is invoked when determining whether we have a CPC in - * common with a specific remote port. We already know that the - * subnet ID is the same between a specific local port and the target - * remote port; now we need to know if we can find a CPC in common - * between the two. - * - * If yes, be sure to find the *same* CPC on both sides. 
We know - * which CPCs are available on each side, and we know the priorities - * that were assigned on both sides. So find a CPC that is common to - * both sides and has the highest overall priority (between both - * sides). - * - * Return the matching CPC, or NULL if not found. - */ -int -opal_btl_openib_connect_base_find_match(mca_btl_openib_module_t *btl, - mca_btl_openib_proc_modex_t *peer_port, - opal_btl_openib_connect_base_module_t **ret_local_cpc, - opal_btl_openib_connect_base_module_data_t **ret_remote_cpc_data) -{ - int i, j, max = -1; - opal_btl_openib_connect_base_module_t *local_cpc, *local_selected = NULL; - opal_btl_openib_connect_base_module_data_t *local_cpcd, *remote_cpcd, - *remote_selected = NULL; - - /* Iterate over all the CPCs on the local module */ - for (i = 0; i < btl->num_cpcs; ++i) { - local_cpc = btl->cpcs[i]; - local_cpcd = &(local_cpc->data); - - /* Iterate over all the CPCs on the remote port */ - for (j = 0; j < peer_port->pm_cpc_data_count; ++j) { - remote_cpcd = &(peer_port->pm_cpc_data[j]); - - /* Are the components the same? */ - if (local_cpcd->cbm_component == remote_cpcd->cbm_component) { - /* If so, update the max priority found so far */ - if (max < local_cpcd->cbm_priority) { - max = local_cpcd->cbm_priority; - local_selected = local_cpc; - remote_selected = remote_cpcd; - } - if (max < remote_cpcd->cbm_priority) { - max = remote_cpcd->cbm_priority; - local_selected = local_cpc; - remote_selected = remote_cpcd; - } - } - } - } - - /* All done! 
*/ - if (NULL != local_selected) { - *ret_local_cpc = local_selected; - *ret_remote_cpc_data = remote_selected; - opal_output(-1, "find_match: found match!"); - return OPAL_SUCCESS; - } else { - opal_output(-1, "find_match: did NOT find match!"); - return OPAL_ERR_NOT_FOUND; - } -} - -/* - * Lookup a CPC component's index in the all[] array so that we can - * send it int the modex - */ -int opal_btl_openib_connect_base_get_cpc_index(opal_btl_openib_connect_base_component_t *cpc) -{ - int i; - for (i = 0; NULL != all[i]; ++i) { - if (all[i] == cpc) { - return i; - } - } - - /* Not found */ - return -1; -} - -/* - * Lookup a CPC by its index (received from the modex) - */ -opal_btl_openib_connect_base_component_t * -opal_btl_openib_connect_base_get_cpc_byindex(uint8_t index) -{ - return (index >= (sizeof(all) / - sizeof(opal_btl_openib_connect_base_module_t *))) ? - NULL : all[index]; -} - -int opal_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint) -{ - opal_free_list_item_t *fli; - int length = sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t) + - sizeof(mca_btl_openib_footer_t) + - mca_btl_openib_component.qp_infos[mca_btl_openib_component.credits_qp].size; - - int align_it = 0; - int page_size; - - page_size = opal_getpagesize(); - if (length >= page_size / 2) { align_it = 1; } - if (align_it) { -// I think this is only active for ~64k+ buffers anyway, but I'm not -// positive, so I'm only increasing the buffer size and alignment if -// it's not too small. That way we'd avoid wasting excessive memory -// in case this code was active for tiny buffers. 
- length = OPAL_ALIGN(length, page_size, int); - } - - /* Explicitly don't use the mpool registration */ - fli = &(endpoint->endpoint_cts_frag.super.super.base.super); - fli->registration = NULL; - if (!align_it) { - fli->ptr = malloc(length); - } else { - posix_memalign((void**)&(fli->ptr), page_size, length); - } - if (NULL == fli->ptr) { - BTL_ERROR(("malloc failed")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - endpoint->endpoint_cts_mr = - ibv_reg_mr(endpoint->endpoint_btl->device->ib_pd, - fli->ptr, length, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | - IBV_ACCESS_REMOTE_READ); - OPAL_OUTPUT((-1, "registered memory %p, length %d", fli->ptr, length)); - if (NULL == endpoint->endpoint_cts_mr) { - free(fli->ptr); - BTL_ERROR(("Failed to reg mr!")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - /* NOTE: We do not need to register this memory with the - opal_memory subsystem, because this is OMPI-controlled memory - -- we do not need to worry about this memory being freed out - from underneath us. 
*/ - - /* Copy the lkey where it needs to go */ - endpoint->endpoint_cts_frag.super.sg_entry.lkey = - endpoint->endpoint_cts_mr->lkey; - endpoint->endpoint_cts_frag.super.sg_entry.length = length; - - /* Construct the rest of the recv_frag_t */ - OBJ_CONSTRUCT(&(endpoint->endpoint_cts_frag), mca_btl_openib_recv_frag_t); - endpoint->endpoint_cts_frag.super.super.base.order = - mca_btl_openib_component.credits_qp; - endpoint->endpoint_cts_frag.super.endpoint = endpoint; - OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), - (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr, - endpoint->endpoint_cts_frag.super.sg_entry.length, - endpoint->endpoint_cts_frag.super.sg_entry.lkey)); - - return OPAL_SUCCESS; -} - -int opal_btl_openib_connect_base_free_cts(mca_btl_base_endpoint_t *endpoint) -{ - /* NOTE: We don't need to deregister this memory with opal_memory - because it was not registered there in the first place (see - comment above, near call to ibv_reg_mr). */ - if (NULL != endpoint->endpoint_cts_mr) { - ibv_dereg_mr(endpoint->endpoint_cts_mr); - endpoint->endpoint_cts_mr = NULL; - } - if (NULL != endpoint->endpoint_cts_frag.super.super.base.super.ptr) { - free(endpoint->endpoint_cts_frag.super.super.base.super.ptr); - endpoint->endpoint_cts_frag.super.super.base.super.ptr = NULL; - OPAL_OUTPUT((-1, "Freeing CTS frag")); - } - - return OPAL_SUCCESS; -} - -/* - * Called to start a connection - */ -int opal_btl_openib_connect_base_start( - opal_btl_openib_connect_base_module_t *cpc, - mca_btl_base_endpoint_t *endpoint) -{ - /* If the CPC uses the CTS protocol, provide a frag buffer for the - CPC to post. Must allocate these frags up here in the main - thread because the FREE_LIST_WAIT is not thread safe. 
*/ - if (cpc->cbm_uses_cts) { - int rc; - rc = opal_btl_openib_connect_base_alloc_cts(endpoint); - if (OPAL_SUCCESS != rc) { - return rc; - } - } - - return cpc->cbm_start_connect(cpc, endpoint); -} - -/* - * Called during openib btl component close - */ -void opal_btl_openib_connect_base_finalize(void) -{ - int i; - - for (i = 0 ; i < num_available ; ++i) { - if (NULL != available[i]->cbc_finalize) { - available[i]->cbc_finalize(); - } - } -} diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_empty.c b/opal/mca/btl/openib/connect/btl_openib_connect_empty.c deleted file mode 100644 index 4a6dce26f8..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_empty.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "btl_openib.h" -#include "btl_openib_endpoint.h" -#include "connect/connect.h" - -static void empty_component_register(void); -static int empty_component_init(void); -static int empty_component_query(mca_btl_openib_module_t *btl, - opal_btl_openib_connect_base_module_t **cpc); - -opal_btl_openib_connect_base_component_t opal_btl_openib_connect_empty = { - "empty", - empty_component_register, - empty_component_init, - empty_component_query, - NULL -}; - -static void empty_component_register(void) -{ - /* Nothing to do */ -} - -static int empty_component_init(void) -{ - /* Never let this CPC run */ - return OPAL_ERR_NOT_SUPPORTED; -} - -static int empty_component_query(mca_btl_openib_module_t *btl, - opal_btl_openib_connect_base_module_t **cpc) -{ - /* Never let this CPC run */ - return OPAL_ERR_NOT_SUPPORTED; -} diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_empty.h b/opal/mca/btl/openib/connect/btl_openib_connect_empty.h deleted file mode 100644 index 2c72fefa46..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_empty.h +++ /dev/null @@ -1,20 
+0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_OPENIB_CONNECT_EMPTY_H -#define BTL_OPENIB_CONNECT_EMPTY_H - -#include "opal_config.h" - -#include "connect/connect.h" - -extern opal_btl_openib_connect_base_component_t opal_btl_openib_connect_empty; - -#endif diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c deleted file mode 100644 index 4312becfe5..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ /dev/null @@ -1,2324 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2008 Chelsio, Inc. All rights reserved. - * Copyright (c) 2008 Mellanox Technologies. All rights reserved. - * Copyright (c) 2009 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif -#include -#ifdef HAVE_SYS_SOCKET_H -#include -#endif -#ifdef HAVE_SYS_IOCTL_H -#include -#endif -#ifdef HAVE_ARPA_INET_H -#include -#endif -#ifdef HAVE_NETINET_IN_H -#include -#endif -#ifdef HAVE_NET_IF_H -#include -#endif -#include -#include -#include -#include -#include -#ifdef HAVE_DIRENT_H -#include -#endif -#include - -#include "opal/util/output.h" -#include "opal/util/error.h" -#include "opal/util/show_help.h" -#include "opal/util/proc.h" -#include "opal/util/printf.h" -#include "opal/runtime/opal_progress_threads.h" - -#include "btl_openib_proc.h" -#include "btl_openib_endpoint.h" -#include "connect/connect.h" -#include "btl_openib_ip.h" -#include "btl_openib_ini.h" - -#if BTL_OPENIB_RDMACM_IB_ADDR -#include -#include -#include -#include -#include -#include -#endif - -#define mymin(a, b) ((a) < (b) ? (a) : (b)) - -static void rdmacm_component_register(void); -static int rdmacm_component_init(void); -static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, - opal_btl_openib_connect_base_module_t **cpc); -static int rdmacm_component_finalize(void); - -opal_btl_openib_connect_base_component_t opal_btl_openib_connect_rdmacm = { - "rdmacm", - rdmacm_component_register, - rdmacm_component_init, - rdmacm_component_query, - rdmacm_component_finalize -}; - -/* - * A single instance of this data structure is shared between one - * id_context_t for each BSRQ qp on an endpoint. 
- */ -typedef struct { - opal_list_item_t super; - mca_btl_openib_endpoint_t *endpoint; - mca_btl_openib_module_t *openib_btl; - /* Dummy QP only used when we expect the connection to be - rejected */ - struct ibv_cq *dummy_cq; -#if BTL_OPENIB_RDMACM_IB_ADDR - union ibv_gid gid; - uint64_t service_id; -#else - uint32_t ipaddr; - uint16_t tcp_port; -#endif - /* server==false means that this proc initiated the connection; - server==true means that this proc accepted the incoming - connection. Note that this may be different than the "one way" - / i_initiate() direction -- it is possible for server==false - and i_initiate() to return false; it means that this proc - initially initiated the connection, but we expect it to be - rejected. */ - bool server; - - /* Whether this contents struct has been saved on the client list - or not */ - bool on_client_list; - - /* A list of all the id_context_t's that are using this - rdmacm_contents_t */ - opal_list_t ids; -} rdmacm_contents_t; - -static void rdmacm_contents_constructor(rdmacm_contents_t *contents); -static void rdmacm_contents_destructor(rdmacm_contents_t *contents); -OBJ_CLASS_INSTANCE(rdmacm_contents_t, opal_list_item_t, - rdmacm_contents_constructor, - rdmacm_contents_destructor); - -typedef struct { - int device_max_qp_rd_atom; - int device_max_qp_init_rd_atom; -#if BTL_OPENIB_RDMACM_IB_ADDR - uint8_t gid[16]; - uint64_t service_id; -#else - uint32_t ipaddr; - uint16_t tcp_port; -#endif - uint8_t end; -} modex_message_t; - -typedef struct { - int rdmacm_counter; -} rdmacm_endpoint_local_cpc_data_t; - -/* - * There are one of these for each RDMA CM ID. Because of BSRQ, there - * can be multiple of these for one endpoint, so all the - * id_context_t's on a single endpoing share a single - * rdmacm_contents_t. 
- */ -typedef struct { - opal_list_item_t super; - rdmacm_contents_t *contents; - mca_btl_openib_endpoint_t *endpoint; - uint8_t qpnum; - bool already_disconnected; - uint16_t route_retry_count; - struct rdma_cm_id *id; -} id_context_t; - -static void id_context_constructor(id_context_t *context); -static void id_context_destructor(id_context_t *context); -OBJ_CLASS_INSTANCE(id_context_t, opal_list_item_t, - id_context_constructor, - id_context_destructor); - -typedef struct { -#if BTL_OPENIB_RDMACM_IB_ADDR - /* - * According to infiniband spec a "Consumer Private Data" begings from 36th up - * to 91th byte (so the limit is 56 bytes) and first 36 bytes - * intended for lib RDMA CM header (sometimes not all of these bytes are used) - * so we must take into account that in case of AF_IB user private data pointer - * points to a header and not to a "Consumer Private Data". - */ - uint8_t librdmacm_header[36]; - uint64_t rem_port; -#else - uint16_t rem_port; -#endif - uint32_t rem_index; - uint8_t qpnum; - opal_process_name_t rem_name; -} __opal_attribute_packed__ private_data_t; - -#if !BTL_OPENIB_RDMACM_IB_ADDR -/* Used to send a specific show_help message from the service_thread - to the main thread (because we can't call show_help from the - service_thread) */ -typedef struct { - char device_name[32]; - uint32_t peer_ip_addr; - uint32_t peer_tcp_port; -} cant_find_endpoint_context_t; -#endif - -static opal_list_t server_listener_list; -static opal_list_t client_list; -static opal_mutex_t client_list_lock; -static struct rdma_event_channel *event_channel = NULL; -static int rdmacm_priority = 30; -static unsigned int rdmacm_port = 0; - -#if !BTL_OPENIB_RDMACM_IB_ADDR -static uint32_t rdmacm_addr = 0; -#endif - -static int rdmacm_resolve_timeout = 30000; -static int rdmacm_resolve_max_retry_count = 20; -static bool rdmacm_reject_causes_connect_error = false; -static pthread_cond_t rdmacm_disconnect_cond; -static pthread_mutex_t rdmacm_disconnect_lock; -static volatile 
int disconnect_callbacks = 0; -static bool rdmacm_component_initialized = false; -static opal_event_base_t *rdmacm_event_base = NULL; -static opal_event_t rdmacm_event; - -/* Calculate the *real* length of the message (not aligned/rounded - up) */ -static int message_len = offsetof(modex_message_t, end); - -/* Rejection reasons */ -typedef enum { - REJECT_WRONG_DIRECTION, - REJECT_TRY_AGAIN -} reject_reason_t; - -static void id_context_constructor(id_context_t *context) -{ - context->already_disconnected = false; - context->id = NULL; - context->contents = NULL; - context->endpoint = NULL; - context->qpnum = 255; - context->route_retry_count = 0; -} - -static void id_context_destructor(id_context_t *context) -{ - if (NULL != context->id) { - rdma_destroy_id(context->id); - context->id = NULL; - } - if (NULL != context->contents) { - OBJ_RELEASE(context->contents); - } -} - -static void rdmacm_contents_constructor(rdmacm_contents_t *contents) -{ - contents->endpoint = NULL; - contents->openib_btl = NULL; - contents->dummy_cq = NULL; -#if BTL_OPENIB_RDMACM_IB_ADDR - contents->service_id = 0; -#else - contents->ipaddr = 0; - contents->tcp_port = 0; -#endif - contents->server = false; - contents->on_client_list = false; - OBJ_CONSTRUCT(&(contents->ids), opal_list_t); -} - -static void rdmacm_contents_destructor(rdmacm_contents_t *contents) -{ - OBJ_DESTRUCT(&(contents->ids)); -} - -/* - * Invoked by main thread - * - * Sets up any rdma_cm specific commandline params - */ -static void rdmacm_component_register(void) -{ - /* the priority is initialized in the declaration above */ - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_rdmacm_priority", - "The selection method priority for rdma_cm", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rdmacm_priority); - if (rdmacm_priority > 100) { - rdmacm_priority = 100; - } else if (rdmacm_priority < 0) { - rdmacm_priority = 0; - } - - 
rdmacm_port = 0; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_rdmacm_port", - "The selection method port for rdma_cm", - MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rdmacm_port); - if (rdmacm_port & ~0xfffful) { - opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", - "illegal tcp port", true, (int) rdmacm_port); - rdmacm_port = 0; - } - - rdmacm_resolve_timeout = 30000; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_rdmacm_resolve_timeout", - "The timeout (in miliseconds) for address and route resolution", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rdmacm_resolve_timeout); - if (0 > rdmacm_resolve_timeout) { - opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", - "illegal timeout", true, rdmacm_resolve_timeout); - rdmacm_resolve_timeout = 30000; - } - - rdmacm_resolve_max_retry_count = 20; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_rdmacm_retry_count", - "Maximum number of times rdmacm will retry route resolution", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rdmacm_resolve_max_retry_count); - if (0 > rdmacm_resolve_max_retry_count) { - opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", - "illegal retry count", true, rdmacm_resolve_max_retry_count); - rdmacm_resolve_max_retry_count = 20; - } - - rdmacm_reject_causes_connect_error = false; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_rdmacm_reject_causes_connect_error", - "The drivers for some devices are buggy such that an RDMA REJECT action may result in a CONNECT_ERROR event instead of a REJECTED event. 
Setting this MCA parameter to true tells Open MPI to treat CONNECT_ERROR events on connections where a REJECT is expected as a REJECT (default: false)", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rdmacm_reject_causes_connect_error); -} - -/* - * Helper function for when we are debugging - */ -static char *stringify(uint32_t addr) -{ - char *line = (char *) malloc(64); - opal_asprintf(&line, "%d.%d.%d.%d (0x%x)", -#if defined(WORDS_BIGENDIAN) - (addr >> 24), - (addr >> 16) & 0xff, - (addr >> 8) & 0xff, - addr & 0xff, -#else - addr & 0xff, - (addr >> 8) & 0xff, - (addr >> 16) & 0xff, - (addr >> 24), -#endif - addr); - return line; -} - -/* - * Invoked by service thread - * - * This function traverses the list of endpoints associated with the - * device and determines which of them the remote side is attempting - * to connect to. This is determined based on the local endpoint's - * modex message recevied and the IP address and port associated with - * the rdma_cm event id - */ -static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *contents, - opal_process_name_t rem_name) -{ - mca_btl_openib_module_t *btl = contents->openib_btl; - mca_btl_openib_endpoint_t *ep = NULL; - opal_proc_t *opal_proc; - - opal_proc = opal_proc_for_name (rem_name); - if (NULL == opal_proc) { - BTL_ERROR(("could not get proc associated with remote peer %s", - opal_process_name_print (rem_name))); - return NULL; - } - - ep = mca_btl_openib_get_ep (&btl->super, opal_proc); - if (NULL == ep) { - BTL_ERROR(("could not find endpoint for peer %s", - opal_process_name_print (rem_name))); - } - - return ep; -} - -/* - * Returns max inlne size for qp #N - */ -static uint32_t max_inline_size(int qp, mca_btl_openib_device_t *device) -{ - if (mca_btl_openib_component.qp_infos[qp].size <= device->max_inline_data) { - /* If qp message size is smaller than max_inline_data, - * we should enable inline messages */ - return 
mca_btl_openib_component.qp_infos[qp].size; - } else if (mca_btl_openib_component.rdma_qp == qp || 0 == qp) { - /* If qp message size is bigger that max_inline_data, we - * should enable inline messages only for RDMA QP (for PUT/GET - * fin messages) and for the first qp */ - return device->max_inline_data; - } - /* Otherwise it is no reason for inline */ - return 0; -} - - -/* - * Invoked by both main and service threads - */ -static int rdmacm_setup_qp(rdmacm_contents_t *contents, - mca_btl_openib_endpoint_t *endpoint, - struct rdma_cm_id *id, - int qpnum) -{ - struct ibv_qp_init_attr attr; - struct ibv_qp *qp; - struct ibv_srq *srq = NULL; - int credits = 0, reserved = 0, max_recv_wr, max_send_wr; - size_t req_inline; - - if (qpnum == mca_btl_openib_component.credits_qp) { - int qp; - - for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - if(BTL_OPENIB_QP_TYPE_PP(qp)) { - reserved += mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; - } - } - credits = mca_btl_openib_component.num_qps; - } - - if (BTL_OPENIB_QP_TYPE_PP(qpnum)) { - max_recv_wr = mca_btl_openib_component.qp_infos[qpnum].rd_num + reserved; - max_send_wr = mca_btl_openib_component.qp_infos[qpnum].rd_num + credits; - } else { - srq = endpoint->endpoint_btl->qps[qpnum].u.srq_qp.srq; - max_recv_wr = reserved; - max_send_wr = mca_btl_openib_component.qp_infos[qpnum].u.srq_qp.sd_max + credits; - } - - memset(&attr, 0, sizeof(attr)); - attr.qp_type = IBV_QPT_RC; - attr.send_cq = contents->openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ]; - attr.recv_cq = contents->openib_btl->device->ib_cq[qp_cq_prio(qpnum)]; - attr.srq = srq; - if(BTL_OPENIB_QP_TYPE_PP(qpnum)) { - /* Add one for the CTS receive frag that will be posted */ - attr.cap.max_recv_wr = max_recv_wr + 1; - } else { - attr.cap.max_recv_wr = 0; - } - attr.cap.max_send_wr = max_send_wr; - attr.cap.max_inline_data = req_inline = - max_inline_size(qpnum, contents->openib_btl->device); - attr.cap.max_send_sge = 1; - attr.cap.max_recv_sge = 1; 
/* we do not use SG list */ - - { - /* JMS Temprary gross hack: we *must* use rdma_create_cp() - (vs. ibv_create_qp()) because strange things happen on IB - if we don't. However, rdma_create_cp() wants us to use - rdma_get_devices() (and therefore the pd that they have - allocated). In order to get v1.3 out the door, we're - bypassing this functionality - we're temporarily overriding - the device context cached on the ID with our own, so that - our pd will match. We need to fix this to properly get the - pd from the RDMA CM and use that, etc. */ - struct ibv_context *temp = id->verbs; - id->verbs = contents->openib_btl->device->ib_pd->context; - if (0 != rdma_create_qp(id, contents->openib_btl->device->ib_pd, - &attr)) { - BTL_ERROR(("Failed to create qp with %d", qpnum)); - goto out; - } - qp = id->qp; - id->verbs = temp; - } - - endpoint->qps[qpnum].qp->lcl_qp = qp; - endpoint->qps[qpnum].credit_frag = NULL; - if (attr.cap.max_inline_data < req_inline) { - endpoint->qps[qpnum].ib_inline_max = attr.cap.max_inline_data; - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "inline truncated", true, - opal_process_info.nodename, - ibv_get_device_name(contents->openib_btl->device->ib_dev), - contents->openib_btl->port_num, - req_inline, attr.cap.max_inline_data); - } else { - endpoint->qps[qpnum].ib_inline_max = req_inline; - } - id->qp = qp; - - return OPAL_SUCCESS; - -out: - return OPAL_ERROR; -} - - -/* - * Invoked by both main and service threads - * - * To avoid all kinds of nasty race conditions, we only allow - * connections to be made in one direction. So use a simple - * (arbitrary) test to decide which direction is allowed to initiate - * the connection: the process with the lower IP address wins. If the - * IP addresses are the same (i.e., the MPI procs are on the same - * node), then the process with the lower TCP port wins. 
- */ -static bool i_initiate(uint64_t local_port, uint64_t remote_port, -#if BTL_OPENIB_RDMACM_IB_ADDR - union ibv_gid *local_gid, union ibv_gid *remote_gid) -{ -#else - uint32_t local_ipaddr, uint32_t remote_ipaddr) -{ -#if OPAL_ENABLE_DEBUG - char *a = stringify(local_ipaddr); - char *b = stringify(remote_ipaddr); -#endif -#endif - -#if BTL_OPENIB_RDMACM_IB_ADDR - if (local_gid->global.subnet_prefix < remote_gid->global.subnet_prefix || - (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix && - local_gid->global.interface_id < remote_gid->global.interface_id) || - (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix && - local_gid->global.interface_id == remote_gid->global.interface_id && -#else - if (local_ipaddr > remote_ipaddr || - (local_ipaddr == remote_ipaddr && -#endif - local_port < remote_port)) { -#if !BTL_OPENIB_RDMACM_IB_ADDR - OPAL_OUTPUT((-1, "i_initiate (I WIN): local ipaddr %s, remote ipaddr %s", - a, b)); -#if OPAL_ENABLE_DEBUG - free(a); - free(b); -#endif -#endif - return true; - } -#if !BTL_OPENIB_RDMACM_IB_ADDR - OPAL_OUTPUT((-1, "i_initiate (I lose): local ipaddr %s, remote ipaddr %s", - a, b)); -#if OPAL_ENABLE_DEBUG - free(a); - free(b); -#endif -#endif - return false; -} - -#if BTL_OPENIB_RDMACM_IB_ADDR -static int get_rdma_addr(char *src, char *dst, - struct rdma_addrinfo **rdma_addr, - int server) -{ - int rc; - struct rdma_addrinfo hints, *sres, *dres; - - memset(&hints, 0, sizeof hints); - - hints.ai_family = AF_IB; - hints.ai_port_space = RDMA_PS_TCP; - hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY | RAI_PASSIVE; - - rc = rdma_getaddrinfo(src, NULL, &hints, &sres); - if (0 != rc) { - return OPAL_ERROR; - } - - if (server) { - *rdma_addr = sres; - return OPAL_SUCCESS; - } - - hints.ai_src_len = sres->ai_src_len; - hints.ai_src_addr = sres->ai_src_addr; - - hints.ai_flags &= ~RAI_PASSIVE; - - rc = rdma_getaddrinfo(dst, NULL, &hints, &dres); - if (0 != rc) { - rdma_freeaddrinfo(sres); - return 
OPAL_ERROR; - } - - rdma_freeaddrinfo(sres); - *rdma_addr = dres; - - return OPAL_SUCCESS; -} -#endif - -/* - * Invoked by main thread - */ -static int rdmacm_client_connect_one(rdmacm_contents_t *contents, - modex_message_t *message, - int num) -{ - int rc; - id_context_t *context; -#if BTL_OPENIB_RDMACM_IB_ADDR - char src_addr[32], dst_addr[32]; - struct rdma_addrinfo *rdma_addr; -#else - struct sockaddr_in src_in, dest_in; - -#if OPAL_ENABLE_DEBUG - char *a, *b; -#endif -#endif - - /* We'll need to access some data in the event handler. We can - * encapsulate it in this data struct and attach it to the id being - * created below. The event->id will contain this same pointer. - */ - context = OBJ_NEW(id_context_t); - if (NULL == context) { - BTL_ERROR(("malloc error")); - goto out; - } - - context->contents = contents; - OBJ_RETAIN(contents); - context->qpnum = num; - context->endpoint = contents->endpoint; - - rc = rdma_create_id(event_channel, &(context->id), - context, RDMA_PS_TCP); - if (0 != rc) { - BTL_ERROR(("Failed to create a rdma id with %d", rc)); - goto out1; - } -#if !BTL_OPENIB_RDMACM_IB_ADDR - /* Source address (we must specify this to ensure that the traffic - goes out on the device+port that we expect it go out). */ - memset(&src_in, 0, sizeof(src_in)); - src_in.sin_family = AF_INET; - src_in.sin_addr.s_addr = contents->ipaddr; - src_in.sin_port = 0; - - /* Destination address */ - memset(&dest_in, 0, sizeof(dest_in)); - dest_in.sin_family = AF_INET; - dest_in.sin_addr.s_addr = message->ipaddr; - dest_in.sin_port = message->tcp_port; - - /* Once the route to the remote system is discovered, a - * RDMA_CM_EVENT_ADDR_RESOLVED event will occur on the local event - * handler. 
- */ - OPAL_OUTPUT((-1, "MAIN Resolving id: from IP %s:%d to IP %s:%d", - a = stringify(contents->ipaddr), - contents->tcp_port, - b = stringify(message->ipaddr), - message->tcp_port)); -#if OPAL_ENABLE_DEBUG - free(a); - free(b); -#endif -#endif - /* This is odd an worth explaining: when we place the context on - the ids list, we need to add an extra RETAIN to the context. - The reason is because of a race condition. Let's explain - through a few cases: - - 1. Normal termination: client side endpoint_finalize removes - the context from the ids list, has its service thread call - rdma_disconnect(), and then RELEASE. A DISCONNECT event - will occur on both sides; the client DISCONNECT will invoke - RELEASE again on the context. Note that the DISCONNECT - event may occur *very* quickly on the client side, so the - order of these two RELEASEs is not known. The destructor - will invoke rdma_destroy_id() -- we obviously can't have - this happen before both actions complete. Hence, - refcounting (and the additional RETAIN) saves us. - - Note that the server side never had the context on the ids - list, so it never had an extra RETAIN. So the DISCONNECT on - the server side will only invoke one RELEASE. - - 2. Abnormal termination: if the server side terminates - improperly (e.g., user's app segv's), then the kernel from - the server side will send a DISCONNECT event to the client - before the item has been removed from the ids list. This - will cause an assertion failure in debug builds (because - we'll be trying to RELEASE an opal_list_item_t that is still - on a list), and possibly other badness in optimized builds - because we'll be traversing a freed opal_list_item_t in - endpoint_finalize. So the extra RETAIN here right when we - put the item on the list prevents it from actually being - released in the client until BOTH the endpoint_finalize - occurs *and* the DISCONNECT event arrives. - - Asynchronous programming is fun! 
- */ - OBJ_RETAIN(context); - opal_list_append(&(contents->ids), &(context->super)); -#if BTL_OPENIB_RDMACM_IB_ADDR - if (NULL == inet_ntop(AF_INET6, contents->gid.raw, - src_addr, sizeof src_addr)) { - BTL_ERROR(("local addr string creating fail")); - goto out1; - } - - if (NULL == inet_ntop(AF_INET6, message->gid, - dst_addr, sizeof dst_addr)) { - BTL_ERROR(("remote addr string creating fail")); - goto out1; - } - - rc = get_rdma_addr(src_addr, dst_addr, &rdma_addr, 0); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("server: create rdma addr error")); - goto out1; - } - - ((struct sockaddr_ib *) (rdma_addr->ai_dst_addr))->sib_sid = message->service_id; -#endif - rc = rdma_resolve_addr(context->id, -#if BTL_OPENIB_RDMACM_IB_ADDR - rdma_addr->ai_src_addr, - rdma_addr->ai_dst_addr, -#else - (struct sockaddr *) &src_in, - (struct sockaddr *) &dest_in, -#endif - rdmacm_resolve_timeout); - if (0 != rc) { - BTL_ERROR(("Failed to resolve the remote address with %d", rc)); -#if BTL_OPENIB_RDMACM_IB_ADDR - rdma_freeaddrinfo(rdma_addr); -#endif - goto out1; - } -#if BTL_OPENIB_RDMACM_IB_ADDR - rdma_freeaddrinfo(rdma_addr); -#endif - - return OPAL_SUCCESS; - -out1: - OBJ_RELEASE(context); -out: - return OPAL_ERROR; -} - -/* - * Invoked by main thread - * - * Connect method called by the upper layers to connect the local - * endpoint to the remote endpoint by creating QP(s) to connect the two. - * Already holding endpoint lock when this function is called. - */ -static int rdmacm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc, - mca_btl_base_endpoint_t *endpoint) -{ - rdmacm_contents_t *contents; - modex_message_t *message, *local_message; - int rc, qp; - opal_list_item_t *item; -#if !BTL_OPENIB_RDMACM_IB_ADDR -#if OPAL_ENABLE_DEBUG - char *a, *b; -#endif -#endif - /* Don't use the CPC to get the message, because this function is - invoked from the event_handler (to intitiate connections in the - Right direction), where we don't have the CPC, so it'll be - NULL. 
*/ - local_message = - (modex_message_t *) endpoint->endpoint_local_cpc->data.cbm_modex_message; - message = (modex_message_t *) - endpoint->endpoint_remote_cpc_data->cbm_modex_message; -#if !BTL_OPENIB_RDMACM_IB_ADDR - OPAL_OUTPUT((-1, "Connecting from IP %s:%d to remote IP %s:%d ep state = %d", - a = stringify(local_message->ipaddr), local_message->tcp_port, - b = stringify(message->ipaddr), message->tcp_port, endpoint->endpoint_state)); -#if OPAL_ENABLE_DEBUG - free(a); - free(b); -#endif - BTL_VERBOSE(("Connecting to remote ip addr = %x, port = %d ep state = %d", - message->ipaddr, message->tcp_port, endpoint->endpoint_state)); -#endif - if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state || - MCA_BTL_IB_CONNECTING == endpoint->endpoint_state || - MCA_BTL_IB_CONNECT_ACK == endpoint->endpoint_state) { - return OPAL_SUCCESS; - } - - /* Set the endpoint state to "connecting" (this function runs in - the main MPI thread; not the service thread, so we can set the - endpoint_state here). */ - endpoint->endpoint_state = MCA_BTL_IB_CONNECTING; - - contents = OBJ_NEW(rdmacm_contents_t); - if (NULL == contents) { - BTL_ERROR(("malloc of contents failed")); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto out; - } - - contents->openib_btl = endpoint->endpoint_btl; - contents->endpoint = endpoint; - contents->server = false; - /* Populate the port information with the local port the server is - * listening on instead of the ephemerial port this client is - * connecting with. This port is used to determine which endpoint - * is being connected from, in the case where there are multiple - * listeners on the local system. - */ -#if BTL_OPENIB_RDMACM_IB_ADDR - memcpy(contents->gid.raw, local_message->gid, sizeof(contents->gid)); - contents->service_id = local_message->service_id; -#else - contents->ipaddr = local_message->ipaddr; - contents->tcp_port = local_message->tcp_port; -#endif - - /* Are we the initiator? Or do we expect this connect request to - be rejected? 
*/ - endpoint->endpoint_initiator = - i_initiate( -#if BTL_OPENIB_RDMACM_IB_ADDR - contents->service_id, message->service_id, - &contents->gid, (union ibv_gid *) message->gid); -#else - contents->tcp_port, message->tcp_port, - contents->ipaddr, message->ipaddr); -#endif - OPAL_OUTPUT((-1, "MAIN Start connect; ep=%p (%p), I %s the initiator to %s", - (void*) endpoint, - (void*) endpoint->endpoint_local_cpc, - endpoint->endpoint_initiator ? "am" : "am NOT", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); - - /* If we're the initiator, then open all the QPs */ - if (contents->endpoint->endpoint_initiator) { - /* Initiator needs a CTS frag (non-initiator will have a CTS - frag allocated later) */ - if (OPAL_SUCCESS != - (rc = opal_btl_openib_connect_base_alloc_cts(contents->endpoint))) { - BTL_ERROR(("Failed to alloc CTS frag")); - goto out; - } - - for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - rc = rdmacm_client_connect_one(contents, message, qp); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("rdmacm_client_connect_one error (real QP %d)", - qp)); - goto out; - } - } - } - /* Otherwise, only open 1 QP that we expect to be rejected */ - else { - rc = rdmacm_client_connect_one(contents, message, 0); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("rdmacm_client_connect_one error (bogus QP)")); - goto out; - } - } - - return OPAL_SUCCESS; - -out: - while (NULL != (item = opal_list_remove_first (&contents->ids))) { - OBJ_RELEASE(item); - } - - return rc; -} - -#if !BTL_OPENIB_RDMACM_IB_ADDR -static void *show_help_cant_find_endpoint(void *context) -{ - char *msg; - cant_find_endpoint_context_t *c = - (cant_find_endpoint_context_t*) context; - - if (NULL != c) { - msg = stringify(c->peer_ip_addr); - opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", - "could not find matching endpoint", true, - opal_process_info.nodename, - c->device_name, - c->peer_tcp_port); - free(msg); - } else { - opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", - "could not 
find matching endpoint", true, - opal_process_info.nodename, - "", "", -1); - } - free(context); - - /* Now kill it */ - mca_btl_openib_endpoint_invoke_error(NULL); - return NULL; -} -#endif - -/* - * Invoked by service thread - * - * The server thread will handle the incoming connection requests and - * allow them or reject them based on a unidirectional connection - * method. The choonections are allowed based on the IP address and - * port values. This determination is arbitrary, but is uniform in - * allowing the connections only in 1 direction. If the connection in - * the requestion is disallowed by this rule, then the server will - * reject the connection and make its own in the proper direction. - */ -static int handle_connect_request(struct rdma_cm_event *event) -{ - id_context_t *listener_context = (id_context_t*) event->id->context; - id_context_t *new_context = NULL; - rdmacm_contents_t *contents = listener_context->contents; - mca_btl_openib_endpoint_t *endpoint; - struct rdma_conn_param conn_param; - opal_process_name_t rem_name; - modex_message_t *message; - private_data_t msg; - int rc = -1, qpnum; - uint32_t rem_index; -#if BTL_OPENIB_RDMACM_IB_ADDR - uint64_t rem_port; -#else - uint16_t rem_port; -#endif - - qpnum = ((private_data_t *)event->param.conn.private_data)->qpnum; - rem_port = ((private_data_t *)event->param.conn.private_data)->rem_port; - rem_index = ((private_data_t *)event->param.conn.private_data)->rem_index; - rem_name = ((private_data_t *)event->param.conn.private_data)->rem_name; - - /* Determine which endpoint the remote side is trying to connect - to; use the listener's context->contents to figure it out */ - endpoint = rdmacm_find_endpoint(contents, rem_name); - if (NULL == endpoint) { -#if !BTL_OPENIB_RDMACM_IB_ADDR - struct sockaddr *peeraddr = rdma_get_peer_addr(event->id); - cant_find_endpoint_context_t *c = (cant_find_endpoint_context_t *) calloc(1, sizeof(*c)); - if (NULL != c) { - snprintf(c->device_name, 
sizeof(c->device_name) - 1, - "%s:%d", - ibv_get_device_name(contents->openib_btl->device->ib_dev), - contents->openib_btl->port_num); - c->peer_ip_addr = - ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr; - c->peer_tcp_port = rdma_get_dst_port(event->id); - } - show_help_cant_find_endpoint (c); -#else - BTL_ERROR(("Cannot find endpoint.")); -#endif - goto out; - } - - message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; - endpoint->endpoint_initiator = - i_initiate( -#if BTL_OPENIB_RDMACM_IB_ADDR - contents->service_id, rem_port, - &contents->gid, (union ibv_gid *) message->gid); -#else - contents->tcp_port, rem_port, - contents->ipaddr, message->ipaddr); - BTL_VERBOSE(("ep state = %d, local ipaddr = %x, remote ipaddr = %x, local port = %d, remote port = %d", - endpoint->endpoint_state, contents->ipaddr, message->ipaddr, - contents->tcp_port, rem_port)); -#endif - OPAL_OUTPUT((-1, "SERVICE in handle_connect_request; ep=%p (%p), I still %s the initiator to %s", - (void*) endpoint, - (void*) endpoint->endpoint_local_cpc, - endpoint->endpoint_initiator ? "am" : "am NOT", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); - if (endpoint->endpoint_initiator) { - reject_reason_t reason = REJECT_WRONG_DIRECTION; - - OPAL_OUTPUT((-1, "SERVICE Received a connect request from an endpoint in the wrong direction")); - - /* This will cause a event on the remote system. By passing in - * a value in the second arg of rdma_reject, the remote side - * can check for this to know if it was an intentional reject or - * a reject based on an error. - */ - rc = rdma_reject(event->id, &reason, sizeof(reject_reason_t)); - if (0 != rc) { - BTL_ERROR(("rdma_reject failed %d", rc)); - goto out; - } - - OPAL_OUTPUT((-1, "SERVICE Starting connection in other direction")); - rdmacm_module_start_connect(NULL, endpoint); - - return OPAL_SUCCESS; - } - - /* Set the endpoint_state to "CONNECTING". 
This is running - in the service thread, so we need to do a write barrier. */ - endpoint->endpoint_state = MCA_BTL_IB_CONNECTING; - opal_atomic_wmb(); - - endpoint->rem_info.rem_index = rem_index; - - /* Setup QP for new connection */ - BTL_VERBOSE(("ACCEPTING src port = %d, dst port = %d, qpnum = %d", - rdma_get_src_port(event->id), rdma_get_dst_port(event->id), qpnum)); - - rc = rdmacm_setup_qp(contents, endpoint, event->id, qpnum); - if (0 != rc) { - BTL_ERROR(("rdmacm_setup_qp error %d", rc)); - goto out; - } - - /* Post a single receive buffer on the smallest QP for the CTS - protocol */ - if (mca_btl_openib_component.credits_qp == qpnum) { - struct ibv_recv_wr *bad_wr, *wr; - - if (OPAL_SUCCESS != - opal_btl_openib_connect_base_alloc_cts(endpoint)) { - BTL_ERROR(("Failed to alloc CTS frag")); - goto out1; - } - wr = &(endpoint->endpoint_cts_frag.rd_desc); - assert(NULL != wr); - wr->next = NULL; - - if (0 != ibv_post_recv(endpoint->qps[qpnum].qp->lcl_qp, - wr, &bad_wr)) { - BTL_ERROR(("failed to post CTS recv buffer")); - goto out1; - } - OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d", - (void*)((uintptr_t*) wr->sg_list[0].addr), - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), - qpnum, - endpoint->qps[qpnum].qp->lcl_qp->qp_num, - (void*)((uintptr_t*) wr->wr_id), - (void*)((uintptr_t*) wr->sg_list[0].addr), - wr->sg_list[0].length, - wr->sg_list[0].lkey)); - } - - /* Since the event id is already created (since we're the server), - the context that was passed to us was the listen server's - context -- which is no longer useful to us. So allocate a new - context and populate it just for this connection. 
*/ - event->id->context = new_context = OBJ_NEW(id_context_t); - if (NULL == new_context) { - BTL_ERROR(("malloc error")); - goto out1; - } - - new_context->contents = contents; - OBJ_RETAIN(contents); - new_context->qpnum = qpnum; - new_context->endpoint = endpoint; - - memset(&conn_param, 0, sizeof(conn_param)); - /* See rdma_connect(3) for a description of these 2 values. We - ensure to pass these values around via the modex so that we can - compute the values properly. */ - conn_param.responder_resources = - mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom, - message->device_max_qp_init_rd_atom); - conn_param.initiator_depth = - mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom, - message->device_max_qp_rd_atom); - conn_param.retry_count = mca_btl_openib_component.ib_retry_count; - conn_param.rnr_retry_count = BTL_OPENIB_QP_TYPE_PP(qpnum) ? 0 : - mca_btl_openib_component.ib_rnr_retry; - conn_param.srq = BTL_OPENIB_QP_TYPE_SRQ(qpnum); - conn_param.private_data = &msg; - conn_param.private_data_len = sizeof(private_data_t); - - /* Fill the private data being sent to the other side */ - msg.qpnum = qpnum; - msg.rem_index = endpoint->index; - msg.rem_name = OPAL_PROC_MY_NAME; - - /* Accepting the connection will result in a - RDMA_CM_EVENT_ESTABLISHED event on both the client and server - side. */ - rc = rdma_accept(event->id, &conn_param); - if (0 != rc) { - BTL_ERROR(("rdma_accept error %d", rc)); - goto out2; - } - - return OPAL_SUCCESS; - -out2: - OBJ_RELEASE(new_context); -out1: - ibv_destroy_qp(endpoint->qps[qpnum].qp->lcl_qp); -out: - return OPAL_ERROR; -} - -/* - * Runs in service thread - * - * We call rdma_disconnect() here in the service thread so that there - * is zero chance that the DISCONNECT event is delivered and executed - * in the service thread while rdma_disconnect() is still running in - * the main thread (which causes all manner of Bad Things to occur). 
- */ -static void *call_disconnect_callback(int fd, int flags, void *v) -{ - rdmacm_contents_t *contents = (rdmacm_contents_t *) v; -#if OPAL_ENABLE_DEBUG - void *tmp = NULL; -#endif - id_context_t *context; - opal_list_item_t *item; - - pthread_mutex_lock (&rdmacm_disconnect_lock); - while (NULL != (item = opal_list_remove_first(&contents->ids))) { - context = (id_context_t *) item; - - OPAL_OUTPUT((-1, "RDMACM Event thread calling disconnect on ID %p", - (void*) context->id)); - - if (!context->already_disconnected) { -#if OPAL_ENABLE_DEBUG - tmp = context->id; -#endif - rdma_disconnect(context->id); - context->already_disconnected = true; - } - - OBJ_RELEASE(context); - - OPAL_OUTPUT((-1, "RDMACM Event thread disconnect on ID %p done", - (void*) tmp)); - } - - /* Tell the main thread that we're done */ - pthread_cond_signal(&rdmacm_disconnect_cond); - pthread_mutex_unlock(&rdmacm_disconnect_lock); - - return NULL; -} - -/* - * Invoked by main thread - * - * Runs *while* the progress thread is running. We can't stop the - * progress thread because this function may be invoked to kill a - * specific endpoint that was the result of MPI-2 dynamics (i.e., this - * is not during MPI_FINALIZE). - */ -static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint) -{ - rdmacm_contents_t *contents = NULL, *item; - opal_event_t event; - - BTL_VERBOSE(("Start disconnecting...")); - OPAL_OUTPUT((-1, "MAIN Endpoint finalizing")); - - if (NULL == endpoint) { - BTL_ERROR(("Attempting to shutdown a NULL endpoint")); - return OPAL_SUCCESS; - } - - /* Determine which rdmacm_contents_t correlates to the endpoint - * we are shutting down. By disconnecting instead of simply - * destroying the QPs, we are shutting down in a more graceful way - * thus preventing errors on the line. - * - * Need to lock because the client_list is accessed in both the - * main thread and service thread. 
- */ - opal_mutex_lock(&client_list_lock); - OPAL_LIST_FOREACH(item, &client_list, rdmacm_contents_t) { - if (endpoint == item->endpoint) { - contents = item; - opal_list_remove_item(&client_list, (opal_list_item_t *) contents); - contents->on_client_list = false; - - /* Fun race condition: we cannot call - rdma_disconnect() in this thread, because - if we do, there is a nonzero chance that the - DISCONNECT event will be delivered and get executed - in the rdcm event thread immediately. If this all - happens before rdma_disconnect() returns, all - manner of Bad Things can/will occur. So just - invoke rdma_disconnect() in the rdmacm event thread - where we guarantee that we won't be processing an - event when it is called. */ - - opal_event_set (rdmacm_event_base, &event, -1, OPAL_EV_READ, - call_disconnect_callback, contents); - opal_event_active (&event, OPAL_EV_READ, 1); - - /* remove_item returns the item before the item removed, - meaning that the for list is still safe */ - break; - } - } - - /* Flush writes to ensure we sync across threads */ - opal_atomic_wmb(); - opal_mutex_unlock(&client_list_lock); - - if (NULL != contents) { - /* Now wait for all the disconnect callbacks to occur */ - pthread_mutex_lock(&rdmacm_disconnect_lock); - while (opal_list_get_size (&contents->ids)) { - pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock); - } - pthread_mutex_unlock(&rdmacm_disconnect_lock); - } - - OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing")); - return OPAL_SUCCESS; -} - -/* - * Callback (from main thread) when the endpoint has been connected - */ -static void *local_endpoint_cpc_complete(void *context) -{ - mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t *)context; - - OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); - OPAL_THREAD_LOCK(&endpoint->endpoint_lock); - mca_btl_openib_endpoint_cpc_complete(endpoint); - - return NULL; -} - -/* - * Runs 
in service thread - */ -static int rdmacm_connect_endpoint(id_context_t *context, - struct rdma_cm_event *event) -{ - rdmacm_contents_t *contents = context->contents; - rdmacm_endpoint_local_cpc_data_t *data; - - mca_btl_openib_endpoint_t *endpoint; -#if OPAL_ENABLE_DEBUG -#if !BTL_OPENIB_RDMACM_IB_ADDR - modex_message_t *message; -#endif -#endif - - if (contents->server) { - endpoint = context->endpoint; - OPAL_OUTPUT((-1, "SERVICE Server CPC complete to %s", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); - } else { - endpoint = contents->endpoint; - endpoint->rem_info.rem_index = - ((private_data_t *)event->param.conn.private_data)->rem_index; - - if (!contents->on_client_list) { - opal_mutex_lock(&client_list_lock); - opal_list_append(&client_list, &(contents->super)); - /* Flush writes to ensure we sync across threads */ - opal_atomic_wmb(); - opal_mutex_unlock(&client_list_lock); - contents->on_client_list = true; - } - OPAL_OUTPUT((-1, "SERVICE Client CPC complete to %s", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal))); - } - if (NULL == endpoint) { - BTL_ERROR(("Can't find endpoint")); - return OPAL_ERR_NOT_FOUND; - } - data = - (rdmacm_endpoint_local_cpc_data_t *)endpoint->endpoint_local_cpc_data; - - /* Only notify the upper layers after the last QP has been - connected */ - if (++data->rdmacm_counter < mca_btl_openib_component.num_qps) { - BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), data->rdmacm_counter)); - OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client", - opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), data->rdmacm_counter)); - return OPAL_SUCCESS; - } - -#if OPAL_ENABLE_DEBUG -#if !BTL_OPENIB_RDMACM_IB_ADDR - message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message; - BTL_VERBOSE(("%s connected!!! 
local %x remote %x state = %d", - contents->server?"server":"client", - contents->ipaddr, - message->ipaddr, - endpoint->endpoint_state)); -#endif -#endif - - /* Ensure that all the writes back to the endpoint and associated - data structures have completed */ - opal_atomic_wmb(); - mca_btl_openib_run_in_main (local_endpoint_cpc_complete, endpoint); - - return OPAL_SUCCESS; -} - -/* - * Runs in service thread - */ -static int rdmacm_disconnected(id_context_t *context) -{ - /* If this was a client thread, then it *may* still be listed in a - contents->ids list. */ - - OPAL_OUTPUT((-1, "SERVICE Releasing context because of DISCONNECT: context %p, id %p", - (void*) context, (void*) context->id)); - OBJ_RELEASE(context); - - return OPAL_SUCCESS; -} - -/* - * Runs in service thread - */ -static int rdmacm_destroy_dummy_qp(id_context_t *context) -{ - /* We need to check id pointer because of retransmitions. - Maybe the reject was already done. */ - - if (NULL != context->id) { - if (NULL != context->id->qp) { - ibv_destroy_qp(context->id->qp); - context->id->qp = NULL; - } - } - - if (NULL != context->contents->dummy_cq) { - ibv_destroy_cq(context->contents->dummy_cq); - } - /* This item was appended to the contents->ids list (the list will - only have just this one item), so remove it before RELEASEing - the item */ - opal_list_remove_first(&(context->contents->ids)); - OBJ_RELEASE(context); - - return OPAL_SUCCESS; -} - -/* - * Runs in service thread - */ -static int rdmacm_rejected(id_context_t *context, struct rdma_cm_event *event) -{ - if (NULL != event->param.conn.private_data) { - /* Why were we rejected? */ - switch (*((reject_reason_t*) event->param.conn.private_data)) { - case REJECT_WRONG_DIRECTION: - OPAL_OUTPUT((-1, "SERVICE A good reject! 
for qp %d, id 0x%p", - context->qpnum, (void*) context->id)); - rdmacm_destroy_dummy_qp(context); - break; - - default: - /* Just so compilers won't complain */ - break; - } - } - - return OPAL_SUCCESS; -} - -/* - * Runs in service thread - */ -static int resolve_route(id_context_t *context) -{ - int rc; - - /* Resolve the route to the remote system. Once established, the - * local system will get a RDMA_CM_EVENT_ROUTE_RESOLVED event. - */ - rc = rdma_resolve_route(context->id, rdmacm_resolve_timeout); - if (0 != rc) { - BTL_ERROR(("Failed to resolve the route with %d", rc)); - goto out; - } - -#if OPAL_ENABLE_DEBUG - { - char *a, *b; - OPAL_OUTPUT((-1, "Resolved route ID %p (local addr %s, remote addr %s)", - (void*) context->id, - a = stringify(((struct sockaddr_in*) rdma_get_local_addr(context->id))->sin_addr.s_addr), - b = stringify(((struct sockaddr_in*) rdma_get_peer_addr(context->id))->sin_addr.s_addr))); - free(a); - free(b); - } -#endif - - return OPAL_SUCCESS; - -out: - return OPAL_ERROR; -} - -/* - * Runs in service thread - */ -static int create_dummy_cq(rdmacm_contents_t *contents, - mca_btl_openib_module_t *openib_btl) -{ - contents->dummy_cq = - ibv_create_cq(openib_btl->device->ib_dev_context, 1, NULL, NULL, 0); - if (NULL == contents->dummy_cq) { - BTL_ERROR(("dummy_cq not created")); - goto out; - } - - return OPAL_SUCCESS; -out: - return OPAL_ERROR; -} - -/* - * Runs in service thread - */ -static int create_dummy_qp(rdmacm_contents_t *contents, - struct rdma_cm_id *id, int qpnum) -{ - struct ibv_qp_init_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.qp_type = IBV_QPT_RC; - attr.send_cq = contents->dummy_cq; - attr.recv_cq = contents->dummy_cq; - attr.cap.max_recv_wr = 1; - attr.cap.max_send_wr = 1; - attr.cap.max_send_sge = 1; - attr.cap.max_recv_sge = 1; - - { - /* JMS Temprary gross hack: we *must* use rdma_create_cp() - (vs. ibv_create_qp()) because strange things happen on IB - if we don't. 
However, rdma_create_cp() wants us to use - rdma_get_devices() (and therefore the pd that they have - allocated). In order to get v1.3 out the door, we're - bypassing this functionality - we're temporarily overriding - the device context cached on the ID with our own, so that - our pd will match. We need to fix this to properly get the - pd from the RDMA CM and use that, etc. */ - struct ibv_context *temp = id->verbs; - id->verbs = contents->openib_btl->device->ib_pd->context; - if (0 != rdma_create_qp(id, contents->openib_btl->device->ib_pd, - &attr)) { - BTL_ERROR(("Failed to create qp with %d", qpnum)); - goto out; - } - id->verbs = temp; - } - BTL_VERBOSE(("dummy qp created %d", qpnum)); - - return OPAL_SUCCESS; - -out: - return OPAL_ERROR; -} - -/* - * Runs in service thread - */ -static int finish_connect(id_context_t *context) -{ - rdmacm_contents_t *contents = context->contents; - struct rdma_conn_param conn_param; - private_data_t msg; - int rc; -#if OPAL_ENABLE_DEBUG -#if !BTL_OPENIB_RDMACM_IB_ADDR - struct sockaddr *peeraddr; - uint32_t remoteipaddr; - uint16_t remoteport; -#endif -#endif - modex_message_t *message; - -#if OPAL_ENABLE_DEBUG -#if !BTL_OPENIB_RDMACM_IB_ADDR - peeraddr = rdma_get_peer_addr(context->id); - remoteport = rdma_get_dst_port(context->id); - remoteipaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr; -#endif -#endif - - message = (modex_message_t *) - context->endpoint->endpoint_remote_cpc_data->cbm_modex_message; - - /* If we're the initiator, then setup the QP's and post the CTS - message buffer */ - if (contents->endpoint->endpoint_initiator) { - rc = rdmacm_setup_qp(contents, contents->endpoint, - context->id, context->qpnum); - if (0 != rc) { - BTL_ERROR(("rdmacm_setup_qp error %d", rc)); - goto out; - } - - if (mca_btl_openib_component.credits_qp == context->qpnum) { - /* Post a single receive buffer on the smallest QP for the CTS - protocol */ - - struct ibv_recv_wr *bad_wr, *wr; - assert(NULL != 
contents->endpoint->endpoint_cts_frag.super.super.base.super.ptr); - wr = &(contents->endpoint->endpoint_cts_frag.rd_desc); - assert(NULL != wr); - wr->next = NULL; - - if (0 != ibv_post_recv(contents->endpoint->qps[context->qpnum].qp->lcl_qp, - wr, &bad_wr)) { - BTL_ERROR(("failed to post CTS recv buffer")); - goto out1; - } - OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)", - (void*)((uintptr_t*) wr->sg_list[0].addr), - wr->sg_list[0].length, - opal_get_proc_hostname(contents->endpoint->endpoint_proc->proc_opal), - context->qpnum, - contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num)); - } - } else { - /* If we are establishing a connection in the "wrong" direction, - * setup a dummy CQ and QP and do NOT post any recvs on them. - * Otherwise this will screwup the recv accounting and will - * result in not posting recvs when you really really wanted to. - * All of the dummy cq and qps will be cleaned up on the reject - * event. - */ - rc = create_dummy_cq(contents, contents->openib_btl); - if (0 != rc) { - BTL_ERROR(("create_dummy_cq error %d", rc)); - goto out; - } - - rc = create_dummy_qp(contents, context->id, context->qpnum); - if (0 != rc) { - BTL_ERROR(("create_dummy_qp error %d", rc)); - goto out; - } - } - - memset(&conn_param, 0, sizeof(conn_param)); - /* See above comment about rdma_connect(3) and these two values. */ - conn_param.responder_resources = - mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom, - message->device_max_qp_init_rd_atom); - conn_param.initiator_depth = - mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom, - message->device_max_qp_rd_atom); - conn_param.flow_control = 0; - conn_param.retry_count = mca_btl_openib_component.ib_retry_count; - conn_param.rnr_retry_count = BTL_OPENIB_QP_TYPE_PP(context->qpnum) ? 
0 : - mca_btl_openib_component.ib_rnr_retry; - conn_param.srq = BTL_OPENIB_QP_TYPE_SRQ(context->qpnum); - conn_param.private_data = &msg; - conn_param.private_data_len = sizeof(private_data_t); - - msg.qpnum = context->qpnum; - msg.rem_index = contents->endpoint->index; - msg.rem_name = OPAL_PROC_MY_NAME; -#if BTL_OPENIB_RDMACM_IB_ADDR - memset(msg.librdmacm_header, 0, sizeof(msg.librdmacm_header)); - msg.rem_port = contents->service_id; -#else - msg.rem_port = contents->tcp_port; - if (contents->endpoint->endpoint_initiator) { -#if OPAL_ENABLE_DEBUG - char *a; -#endif - OPAL_OUTPUT((-1, "Finish connect (I am initiator): sending from %s:%d, TCP port %d, qp index %d (num %d) to IP %s:%d", - ibv_get_device_name(contents->openib_btl->device->ib_dev), - contents->openib_btl->port_num, - contents->tcp_port, - context->qpnum, - contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num, - a = stringify(remoteipaddr), remoteport)); -#if OPAL_ENABLE_DEBUG - free(a); -#endif - } -#endif - - /* Now all of the local setup has been done. The remote system - should now get a RDMA_CM_EVENT_CONNECT_REQUEST event to further - the setup of the QP. */ - OPAL_OUTPUT((-1, "SERVICE in finish_connect; ep=%p (%p), I still %s the initiator to %s", - (void*) contents->endpoint, - (void*) contents->endpoint->endpoint_local_cpc, - contents->endpoint->endpoint_initiator ? 
"am" : "am NOT", - opal_get_proc_hostname(contents->endpoint->endpoint_proc->proc_opal))); - rc = rdma_connect(context->id, &conn_param); - if (0 != rc) { - BTL_ERROR(("rdma_connect Failed with %d", rc)); - goto out1; - } - - return OPAL_SUCCESS; - -out1: - ibv_destroy_qp(context->id->qp); -out: - OBJ_RELEASE(contents); - - return OPAL_ERROR; -} - -/* - * Runs in main thread - */ -static void *show_help_rdmacm_event_error (struct rdma_cm_event *event) -{ - id_context_t *context = (id_context_t*) event->id->context; - - if (RDMA_CM_EVENT_DEVICE_REMOVAL == event->event) { - opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", - "rdma cm device removal", true, - opal_process_info.nodename, - ibv_get_device_name(event->id->verbs->device)); - } else { - const char *device = "Unknown"; - if (NULL != event->id && - NULL != event->id->verbs && - NULL != event->id->verbs->device) { - device = ibv_get_device_name(event->id->verbs->device); - } - opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt", - "rdma cm event error", true, - opal_process_info.nodename, - device, - rdma_event_str(event->event), - opal_get_proc_hostname(context->endpoint->endpoint_proc->proc_opal)); - } - - return NULL; -} - -/* - * Runs in service thread - */ -static int event_handler(struct rdma_cm_event *event) -{ - id_context_t *context = (id_context_t*) event->id->context; -#if !BTL_OPENIB_RDMACM_IB_ADDR - rdmacm_contents_t *contents; - struct sockaddr *localaddr; - uint32_t localipaddr; -#if OPAL_ENABLE_DEBUG - struct sockaddr *peeraddr; - uint32_t peeripaddr; -#endif -#endif - int rc = -1; - opal_btl_openib_ini_values_t ini; - bool found; - - if (NULL == context) { - return rc; - } - -#if !BTL_OPENIB_RDMACM_IB_ADDR - contents = context->contents; - - localaddr = rdma_get_local_addr(event->id); - localipaddr = ((struct sockaddr_in *)localaddr)->sin_addr.s_addr; -#if OPAL_ENABLE_DEBUG - peeraddr = rdma_get_peer_addr(event->id); - peeripaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr; -#endif 
- - BTL_VERBOSE(("%s event_handler -- %s, status = %d to %x", - contents->server?"server":"client", - rdma_event_str(event->event), - event->status, - peeripaddr)); -#endif - - switch (event->event) { - case RDMA_CM_EVENT_ADDR_RESOLVED: - OPAL_OUTPUT((-1, "SERVICE Got ADDR_RESOLVED: ID %p", (void*) context->id)); - rc = resolve_route(context); - break; - - case RDMA_CM_EVENT_ROUTE_RESOLVED: - OPAL_OUTPUT((-1, "SERVICE Got ROUTE_RESOLVED: ID %p", (void*) context->id)); -#if !BTL_OPENIB_RDMACM_IB_ADDR - contents->ipaddr = localipaddr; -#endif - rc = finish_connect(context); - break; - - case RDMA_CM_EVENT_CONNECT_REQUEST: - OPAL_OUTPUT((-1, "SERVICE Got CONNECT_REQUEST: ID %p, context %p", - (void*) event->id, (void*) context)); - rc = handle_connect_request(event); - break; - - case RDMA_CM_EVENT_ESTABLISHED: - OPAL_OUTPUT((-1, "SERVICE Got ESTABLISHED: %p", (void*) event->id)); - rc = rdmacm_connect_endpoint(context, event); - break; - - case RDMA_CM_EVENT_DISCONNECTED: - OPAL_OUTPUT((-1, "SERVICE Got DISCONNECTED: %p", (void*) event->id)); - rc = rdmacm_disconnected(context); - break; - - case RDMA_CM_EVENT_REJECTED: - OPAL_OUTPUT((-1, "SERVICE Got REJECTED: %p", (void*) event->id)); - rc = rdmacm_rejected(context, event); - break; - - case RDMA_CM_EVENT_CONNECT_ERROR: - /* Some adapters have broken REJECT behavior; the recipient - gets a CONNECT_ERROR event instead of the expected REJECTED - event. So if we get a CONNECT_ERROR, see if it's on a - connection that we're expecting a REJECT (i.e., we have a - dummy_cq setup). If it is, and if a) the MCA param - btl_openib_connect_rdmacm_reject_causes_connect_error is - true, or b) if rdmacm_reject_causes_connect_error set on - the device INI values, then just treat this CONNECT_ERROR - as if it were the REJECT. 
*/ - if (NULL != context->contents->dummy_cq) { - struct ibv_device_attr *attr = - &(context->endpoint->endpoint_btl->device->ib_dev_attr); - found = false; - if (OPAL_SUCCESS == opal_btl_openib_ini_query(attr->vendor_id, - attr->vendor_part_id, - &ini) && - ini.rdmacm_reject_causes_connect_error) { - found = true; - } - if (rdmacm_reject_causes_connect_error) { - found = true; - } - - if (found) { - OPAL_OUTPUT((-1, "SERVICE Got CONNECT_ERROR, but ignored: %p", (void*) event->id)); - rc = rdmacm_destroy_dummy_qp(context); - break; - } - } - - /* Otherwise, fall through and handle the error as normal */ - - case RDMA_CM_EVENT_UNREACHABLE: - case RDMA_CM_EVENT_CONNECT_RESPONSE: - case RDMA_CM_EVENT_ADDR_ERROR: - case RDMA_CM_EVENT_DEVICE_REMOVAL: - show_help_rdmacm_event_error (event); - rc = OPAL_ERROR; - break; - - case RDMA_CM_EVENT_ROUTE_ERROR: - /* Route lookup does not necessarily handle retries, and there - appear to be cases where the subnet manager node can no - longer handle incoming requests. The rdma connection - manager and lower level code doesn't handle retries, so we - have to. 
*/ - if (context->route_retry_count < rdmacm_resolve_max_retry_count) { - context->route_retry_count++; - rc = resolve_route(context); - break; - } - show_help_rdmacm_event_error (event); - rc = OPAL_ERROR; - break; - - default: - /* Unknown error */ - BTL_ERROR(("Unknown RDMA CM error event_handler: %s, status = %d", - rdma_event_str(event->event), event->status)); - rc = OPAL_ERROR; - break; - } - - return rc; -} - -/* - * Runs in event thread - */ -static inline void rdmamcm_event_error(struct rdma_cm_event *event) -{ - mca_btl_base_endpoint_t *endpoint = NULL; - - if (event->id->context) { - endpoint = ((id_context_t *)event->id->context)->contents->endpoint; - } - - mca_btl_openib_run_in_main (mca_btl_openib_endpoint_invoke_error, - endpoint); -} - -/* - * Runs in event thread - */ -static void *rdmacm_event_dispatch(int fd, int flags, void *context) -{ - struct rdma_cm_event *event, ecopy; - void *data = NULL; - int rc; - - /* blocks until next cm_event */ - rc = rdma_get_cm_event(event_channel, &event); - if (0 != rc) { - BTL_ERROR(("rdma_get_cm_event error %d", rc)); - return NULL; - } - - /* If the incoming event is not acked in a sufficient amount of - * time, there will be a timeout error and the connection will be - * torndown. Also, the act of acking the event destroys the - * included data in the event. In certain circumstances, the time - * it takes to handle a incoming event could approach or exceed - * this time. To prevent this from happening, we will copy the - * event and all of its data, ack the event, and process the copy - * of the event. 
- */ - memcpy(&ecopy, event, sizeof(struct rdma_cm_event)); - if (event->param.conn.private_data_len > 0) { - data = malloc(event->param.conn.private_data_len); - if (NULL == data) { - BTL_ERROR(("error mallocing memory")); - return NULL; - } - memcpy(data, event->param.conn.private_data, event->param.conn.private_data_len); - ecopy.param.conn.private_data = data; - } - rdma_ack_cm_event(event); - - rc = event_handler(&ecopy); - if (OPAL_SUCCESS != rc) { - rdmamcm_event_error(&ecopy); - } - - if (NULL != data) { - free(data); - } - - return NULL; -} - -/* - * Runs in main thread - * - * CPC init function - Setup all globals here - */ -static int rdmacm_init(mca_btl_openib_endpoint_t *endpoint) -{ - void *data; - - data = calloc(1, sizeof(rdmacm_endpoint_local_cpc_data_t)); - if (NULL == data) { - BTL_ERROR(("malloc failed")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - endpoint->endpoint_local_cpc_data = data; - - return OPAL_SUCCESS; -} - -#if !BTL_OPENIB_RDMACM_IB_ADDR -static int ipaddrcheck(id_context_t *context, - mca_btl_openib_module_t *openib_btl) -{ - rdmacm_contents_t *server = context->contents; - uint32_t ipaddr; - bool already_exists = false; - rdmacm_contents_t *contents; - int server_tcp_port = rdma_get_src_port(context->id); - char *str; - - /* Look up the IP address of this device/port. This call should not be - * necessary, as rdma_get_local_addr would be more correct in returning the - * IP address given the cm_id (and not necessitate having to do a list look - * up). Unfortunately, the subnet and IP address look up needs to match or - * there could be a mismatch if IP Aliases are being used. 
For more - * information on this, please read comment above - * mca_btl_openib_get_ip_subnet_id in btl_openib_ip.c - */ - ipaddr = - mca_btl_openib_rdma_get_ipv4addr(openib_btl->device->ib_dev_context, - openib_btl->port_num); - if (0 == ipaddr) { - BTL_VERBOSE(("*** Could not find IP address for %s:%d -- is there an IP address configured for this device?", - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num)); - return OPAL_ERR_NOT_FOUND; - } - str = stringify(ipaddr); - BTL_VERBOSE(("Found device %s:%d = IP address %s:%d", - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num, str, server_tcp_port)); - free(str); - - /* Ok, we found the IP address of this device/port. Have we - already see this IP address/TCP port before? */ - OPAL_LIST_FOREACH(contents, &server_listener_list, rdmacm_contents_t) { - BTL_VERBOSE(("paddr = %x, ipaddr addr = %x", - contents->ipaddr, ipaddr)); - if (contents->ipaddr == ipaddr && - contents->tcp_port == server_tcp_port) { - str = stringify(ipaddr); - BTL_VERBOSE(("server already listening on %s:%d", - str, server_tcp_port)); - free(str); - already_exists = true; - break; - } - } - - /* If we haven't seen it before, save it */ - if (!already_exists) { - str = stringify(ipaddr); - BTL_VERBOSE(("creating new server to listen on %s:%d", - str, server_tcp_port)); - free(str); - server->ipaddr = ipaddr; - server->tcp_port = server_tcp_port; - } - - return already_exists ? 
OPAL_ERROR : OPAL_SUCCESS; -} -#endif - -static int create_message(rdmacm_contents_t *server, - mca_btl_openib_module_t *openib_btl, - opal_btl_openib_connect_base_module_data_t *data) -{ - modex_message_t *message; -#if !BTL_OPENIB_RDMACM_IB_ADDR -#if OPAL_ENABLE_DEBUG - char *a; -#endif -#endif - - message = (modex_message_t *) malloc(sizeof(modex_message_t)); - if (NULL == message) { - BTL_ERROR(("malloc failed")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - message->device_max_qp_rd_atom = - openib_btl->device->ib_dev_attr.max_qp_rd_atom; - message->device_max_qp_init_rd_atom = - openib_btl->device->ib_dev_attr.max_qp_init_rd_atom; - -#if BTL_OPENIB_RDMACM_IB_ADDR - memcpy(message->gid, server->gid.raw, sizeof(server->gid)); - message->service_id = server->service_id; -#else - message->ipaddr = server->ipaddr; - message->tcp_port = server->tcp_port; - - OPAL_OUTPUT((-1, "Message IP address is %s, port %d", - a = stringify(message->ipaddr), message->tcp_port)); -#if OPAL_ENABLE_DEBUG - free(a); -#endif -#endif - data->cbm_modex_message = message; - data->cbm_modex_message_len = message_len; - - return OPAL_SUCCESS; -} - -/* - * Runs in main thread - * - * This function determines if the RDMACM is a possible cpc method and - * sets it up accordingly. 
- */ -static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, opal_btl_openib_connect_base_module_t **cpc) -{ - int rc; - - id_context_t *context; - rdmacm_contents_t *server = NULL; - -#if BTL_OPENIB_RDMACM_IB_ADDR - char rdmacm_addr_str[32]; - struct rdma_addrinfo *rdma_addr; -#else - struct sockaddr_in sin; -#endif - - /* RDMACM is not supported for MPI_THREAD_MULTIPLE */ - if (opal_using_threads()) { - BTL_VERBOSE(("rdmacm CPC is not supported with MPI_THREAD_MULTIPLE; skipped on %s:%d", - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num)); - rc = OPAL_ERR_NOT_SUPPORTED; - goto out; - } - - /* RDMACM is not supported if we have any XRC QPs */ - if (mca_btl_openib_component.num_xrc_qps > 0) { - BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d", - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num)); - rc = OPAL_ERR_NOT_SUPPORTED; - goto out; - } - if (!BTL_OPENIB_QP_TYPE_PP(0)) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "rdmacm CPC only supported when the first QP is a PP QP; skipped"); - rc = OPAL_ERR_NOT_SUPPORTED; - goto out; - } - - BTL_VERBOSE(("rdmacm_component_query")); - - *cpc = (opal_btl_openib_connect_base_module_t *) malloc(sizeof(opal_btl_openib_connect_base_module_t)); - if (NULL == *cpc) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto out; - } - - (*cpc)->data.cbm_component = &opal_btl_openib_connect_rdmacm; - (*cpc)->data.cbm_priority = rdmacm_priority; - (*cpc)->data.cbm_modex_message = NULL; - (*cpc)->data.cbm_modex_message_len = 0; - (*cpc)->cbm_endpoint_init = rdmacm_init; - (*cpc)->cbm_start_connect = rdmacm_module_start_connect; - (*cpc)->cbm_endpoint_finalize = rdmacm_endpoint_finalize; - (*cpc)->cbm_finalize = NULL; - /* Setting uses_cts=true also guarantees that we'll only be - selected if QP 0 is PP */ - (*cpc)->cbm_uses_cts = true; - - /* Start monitoring the fd associated with the cm_device */ - server 
= OBJ_NEW(rdmacm_contents_t); - if (NULL == server) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto out1; - } - server->server = true; - server->openib_btl = openib_btl; - - context = OBJ_NEW(id_context_t); - OPAL_OUTPUT((-1, "MAIN Server context: %p", (void*) context)); - if (NULL == context) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC system error (malloc failed)"); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto out3; - } - context->contents = server; - OBJ_RETAIN(context->contents); - opal_list_append(&(server->ids), &(context->super)); - context->qpnum = 0; - - rc = rdma_create_id(event_channel, &(context->id), context, RDMA_PS_TCP); - if (0 != rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC failed to create ID"); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto out4; - } -#if !BTL_OPENIB_RDMACM_IB_ADDR - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = rdmacm_addr; - sin.sin_port = (uint16_t) rdmacm_port; -#else - rc = ibv_query_gid(openib_btl->device->ib_pd->context, openib_btl->port_num, - mca_btl_openib_component.gid_index, &server->gid); - if (0 != rc) { - BTL_ERROR(("local gid query failed")); - goto out4; - } - - if (NULL == inet_ntop(AF_INET6, server->gid.raw, - rdmacm_addr_str, sizeof rdmacm_addr_str)) { - BTL_ERROR(("local gaddr string creating fail")); - goto out4; - } - - rc = get_rdma_addr(rdmacm_addr_str, NULL, &rdma_addr, 1); - if (OPAL_SUCCESS != rc) { - BTL_ERROR(("server: create rdma addr error")); - goto out4; - } -#endif - /* Bind the rdmacm server to the local IP address and an ephemerial - * port or one specified by a comand arg. 
- */ - rc = rdma_bind_addr(context->id, -#if BTL_OPENIB_RDMACM_IB_ADDR - rdma_addr->ai_src_addr); -#else - (struct sockaddr *)&sin); -#endif - if (0 != rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC unable to bind to address"); - rc = OPAL_ERR_UNREACH; -#if BTL_OPENIB_RDMACM_IB_ADDR - rdma_freeaddrinfo(rdma_addr); -#endif - goto out5; - } -#if BTL_OPENIB_RDMACM_IB_ADDR - server->service_id = ((struct sockaddr_ib *) (&context->id->route.addr.src_addr))->sib_sid; - rdma_freeaddrinfo(rdma_addr); -#else - /* Verify that the device has a valid IP address on it, or we - cannot use the cpc */ - rc = ipaddrcheck(context, openib_btl); - if (OPAL_SUCCESS != rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm IP address not found on port"); - rc = OPAL_ERR_NOT_SUPPORTED; - goto out5; - } -#endif - /* Listen on the specified address/port with the rdmacm, limit the - amount of incoming connections to 1024 */ - /* FIXME - 1024 should be (num of connectors * - mca_btl_openib_component.num_qps) */ - rc = rdma_listen(context->id, 1024); - if (0 != rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC unable to listen"); - rc = OPAL_ERR_UNREACH; - goto out5; - } - - rc = create_message(server, openib_btl, &(*cpc)->data); - if (0 != rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC unable to create message"); - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto out5; - } - - opal_list_append(&server_listener_list, &(server->super)); - - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC available for use on %s:%d", - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num); - return OPAL_SUCCESS; - -out5: - /* - * Since rdma_create_id() succeeded, we need "rdma_destroy_id(context->id)". 
- * But don't do it here since it's part of out4:OBJ_RELEASE(context), - * and we don't want to do it twice. - */ -out4: - opal_list_remove_first(&(server->ids)); - OBJ_RELEASE(context); -out3: - OBJ_RELEASE(server); -out1: - free(*cpc); -out: - if (OPAL_ERR_NOT_SUPPORTED == rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped", - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num); - } else { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rmacm CPC unavailable for use on %s:%d; fatal error %d (%s)", - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num, rc, - opal_strerror(rc)); - } - return rc; -} - -/* - * Invoked by main thread - * - * Shutting down the whole thing. - */ -static int rdmacm_component_finalize(void) -{ - opal_list_item_t *item, *item2; - - BTL_VERBOSE(("rdmacm_component_finalize")); - - /* If we're just trolling through ompi_info, don't bother doing - anything */ - if (!rdmacm_component_initialized) { - return OPAL_SUCCESS; - } - - if (rdmacm_event_base) { - opal_event_del (&rdmacm_event); - opal_progress_thread_finalize (NULL); - rdmacm_event_base = NULL; - } - - /* The event thread is no longer running; no need to lock access - to the client_list */ - OPAL_LIST_DESTRUCT(&client_list); - - /* For each of the items in the server list, there's only one item - in the "ids" list -- the server listener. So explicitly - destroy its RDMA ID context. 
*/ - while (NULL != (item = opal_list_remove_first(&server_listener_list))) { - rdmacm_contents_t *contents = (rdmacm_contents_t*) item; - item2 = opal_list_remove_first(&(contents->ids)); - OBJ_RELEASE(item2); - OBJ_RELEASE(item); - } - OBJ_DESTRUCT(&server_listener_list); - - /* Now we're all done -- destroy the event channel */ - if (NULL != event_channel) { - rdma_destroy_event_channel(event_channel); - event_channel = NULL; - } - - mca_btl_openib_free_rdma_addr_list(); - - pthread_cond_destroy (&rdmacm_disconnect_cond); - pthread_mutex_destroy (&rdmacm_disconnect_lock); - - return OPAL_SUCCESS; -} - -#if BTL_OPENIB_RDMACM_IB_ADDR -static int rdmacm_check_ibaddr_support(void) -{ - int rsock; - rsock = rsocket(AF_IB, SOCK_STREAM, 0); - if (rsock < 0) { - return OPAL_ERROR; - } - - rclose(rsock); - - return OPAL_SUCCESS; -} -#endif - -static int rdmacm_component_init(void) -{ - int rc; - - OBJ_CONSTRUCT(&server_listener_list, opal_list_t); - OBJ_CONSTRUCT(&client_list, opal_list_t); - OBJ_CONSTRUCT(&client_list_lock, opal_mutex_t); - -#if !BTL_OPENIB_RDMACM_IB_ADDR - rc = mca_btl_openib_build_rdma_addr_list(); - if (OPAL_SUCCESS != rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC unable to find any valid IP address"); - return OPAL_ERR_NOT_SUPPORTED; - } -#else - rc = rdmacm_check_ibaddr_support(); - if (OPAL_SUCCESS != rc) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "There is no IB_AF addressing support by lib rdmacm"); - return OPAL_ERR_NOT_SUPPORTED; - } -#endif - - event_channel = rdma_create_event_channel(); - if (NULL == event_channel) { - opal_output_verbose(5, opal_btl_base_framework.framework_output, - "openib BTL: rdmacm CPC failed to create channel"); - return OPAL_ERR_UNREACH; - } - - rdmacm_event_base = opal_progress_thread_init (NULL); - if (NULL == rdmacm_event_base) { - opal_output_verbose (5, opal_btl_base_framework.framework_output, - "openib BTL: could not create 
rdmacm event thread"); - return OPAL_ERR_UNREACH; - } - - opal_event_set (rdmacm_event_base, &rdmacm_event, event_channel->fd, - OPAL_EV_READ | OPAL_EV_PERSIST, rdmacm_event_dispatch, NULL); - - opal_event_add (&rdmacm_event, 0); - - pthread_cond_init (&rdmacm_disconnect_cond, NULL); - pthread_mutex_init (&rdmacm_disconnect_lock, NULL); - - rdmacm_component_initialized = true; - - return OPAL_SUCCESS; -} diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.h b/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.h deleted file mode 100644 index dccc36c223..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_OPENIB_CONNECT_RDMACM_H -#define BTL_OPENIB_CONNECT_RDMACM_H - -#include "opal_config.h" - -#include "connect/connect.h" - -extern opal_btl_openib_connect_base_component_t opal_btl_openib_connect_rdmacm; - -#endif diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_sl.c b/opal/mca/btl/openib/connect/btl_openib_connect_sl.c deleted file mode 100644 index cd3fe20d90..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_sl.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (c) 2011 Mellanox Technologies. All rights reserved. - * - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_openib.h" - -#include "opal/util/show_help.h" -#include "opal/util/sys_limits.h" -#include "opal/util/proc.h" - -#include "connect/btl_openib_connect_sl.h" -#include - -#ifdef HAVE_UNISTD_H -#include -#endif - -#define SL_NOT_PRESENT 0xFF -#define MAX_GET_SL_REC_RETRIES 20 -#define GET_SL_REC_RETRIES_TIMEOUT_MS 2000000 - -static struct mca_btl_openib_sa_qp_cache { - /* There will be a MR with the one send and receive buffer together */ - /* The send buffer is first, the receive buffer is second */ - /* The receive buffer in a UD queue pair needs room for the 40 byte GRH */ - /* The buffers are first in the structure for page alignment */ - char send_recv_buffer[MAD_BLOCK_SIZE * 2 + 40]; - struct mca_btl_openib_sa_qp_cache *next; - struct ibv_context *context; - char *device_name; - uint32_t port_num; - struct ibv_qp *qp; - struct ibv_ah *ah; - struct ibv_cq *cq; - struct ibv_mr *mr; - struct ibv_pd *pd; - struct ibv_recv_wr rwr; - struct ibv_sge rsge; - uint8_t sl_values[65536]; /* 64K */ -} *sa_qp_cache = 0; - -static int init_ud_qp( - struct ibv_context *context_arg, - struct mca_btl_openib_sa_qp_cache *cache); - -static void init_sa_mad( - struct mca_btl_openib_sa_qp_cache *cache, - ib_sa_mad_t *sa_mad, - struct ibv_send_wr *swr, - struct ibv_sge *ssge, - uint16_t lid, - uint16_t rem_lid); - -static int get_pathrecord_info( - struct mca_btl_openib_sa_qp_cache *cache, - ib_sa_mad_t *sa_mad, - ib_sa_mad_t *sar, - struct ibv_send_wr *swr, - uint16_t lid, - uint16_t rem_lid); - -static int init_device( - struct ibv_context *context_arg, - struct mca_btl_openib_sa_qp_cache *cache, - uint32_t port_num); - -/*=================================================================*/ - -static void free_sa_qp_cache(void) -{ - struct mca_btl_openib_sa_qp_cache *cache, *tmp; - - cache = sa_qp_cache; - while (NULL != cache) { - /* free cache data */ - if (cache->device_name) - 
free(cache->device_name); - if (NULL != cache->qp) - ibv_destroy_qp(cache->qp); - if (NULL != cache->ah) - ibv_destroy_ah(cache->ah); - if (NULL != cache->cq) - ibv_destroy_cq(cache->cq); - if (NULL != cache->mr) - ibv_dereg_mr(cache->mr); - if (NULL != cache->pd) - ibv_dealloc_pd(cache->pd); - tmp = cache->next; - free(cache); - cache = tmp; - } - sa_qp_cache = NULL; -} - -/*=================================================================*/ - -static int init_ud_qp(struct ibv_context *context_arg, - struct mca_btl_openib_sa_qp_cache *cache) -{ - struct ibv_qp_init_attr iattr; - struct ibv_qp_attr mattr; - int rc; - - /* create cq */ - cache->cq = ibv_create_cq(cache->context, 4, NULL, NULL, 0); - if (NULL == cache->cq) { - BTL_ERROR(("error creating cq, errno says %s", strerror(errno))); - opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q", - true, opal_process_info.nodename, - __FILE__, __LINE__, "ibv_create_cq", - strerror(errno), errno, - ibv_get_device_name(context_arg->device)); - return OPAL_ERROR; - } - - /* create qp */ - memset(&iattr, 0, sizeof(iattr)); - iattr.send_cq = cache->cq; - iattr.recv_cq = cache->cq; - iattr.cap.max_send_wr = 1; - iattr.cap.max_recv_wr = 1; - iattr.cap.max_send_sge = 1; - iattr.cap.max_recv_sge = 1; - iattr.qp_type = IBV_QPT_UD; - cache->qp = ibv_create_qp(cache->pd, &iattr); - if (NULL == cache->qp) { - BTL_ERROR(("error creating qp %s (%d)", strerror(errno), errno)); - return OPAL_ERROR; - } - - /* modify qp to IBV_QPS_INIT */ - memset(&mattr, 0, sizeof(mattr)); - mattr.qp_state = IBV_QPS_INIT; - mattr.port_num = cache->port_num; - mattr.qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY); - rc = ibv_modify_qp(cache->qp, &mattr, - IBV_QP_STATE | - IBV_QP_PKEY_INDEX | - IBV_QP_PORT | - IBV_QP_QKEY); - if (rc) { - BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]", - cache->qp->qp_num, strerror(errno), errno)); - return OPAL_ERROR; - } - - /* modify qp to IBV_QPS_RTR */ - memset(&mattr, 0, sizeof(mattr)); - 
mattr.qp_state = IBV_QPS_RTR; - rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE); - if (rc) { - BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]", - cache->qp->qp_num, strerror(errno), errno)); - return OPAL_ERROR; - } - - /* modify qp to IBV_QPS_RTS */ - mattr.qp_state = IBV_QPS_RTS; - rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE | IBV_QP_SQ_PSN); - if (rc) { - BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]", - cache->qp->qp_num, strerror(errno), errno)); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -/*=================================================================*/ - -static void init_sa_mad(struct mca_btl_openib_sa_qp_cache *cache, - ib_sa_mad_t *sa_mad, - struct ibv_send_wr *swr, - struct ibv_sge *ssge, - uint16_t lid, - uint16_t rem_lid) -{ - ib_path_rec_t *path_record = (ib_path_rec_t*)sa_mad->data; - - memset(swr, 0, sizeof(*swr)); - memset(ssge, 0, sizeof(*ssge)); - - /* Initialize the standard MAD header. */ - memset(sa_mad, 0, MAD_BLOCK_SIZE); - ib_mad_init_new((ib_mad_t *)sa_mad, /* mad header pointer */ - IB_MCLASS_SUBN_ADM, /* management class */ - (uint8_t) 2, /* version */ - IB_MAD_METHOD_GET, /* method */ - hton64((uint64_t)lid << 48 | /* transaction ID */ - (uint64_t)rem_lid << 32 | - (uint64_t)cache->qp->qp_num << 8), - IB_MAD_ATTR_PATH_RECORD, /* attribute ID */ - 0); /* attribute modifier */ - - sa_mad->comp_mask = IB_PR_COMPMASK_DLID | IB_PR_COMPMASK_SLID; - path_record->dlid = htons(rem_lid); - path_record->slid = htons(lid); - - swr->sg_list = ssge; - swr->num_sge = 1; - swr->opcode = IBV_WR_SEND; - swr->wr.ud.ah = cache->ah; - swr->wr.ud.remote_qpn = ntohl(IB_QP1); - swr->wr.ud.remote_qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY); - swr->send_flags = IBV_SEND_SIGNALED | IBV_SEND_SOLICITED; - - ssge->addr = (uint64_t)(void *)sa_mad; - ssge->length = MAD_BLOCK_SIZE; - ssge->lkey = cache->mr->lkey; -} - -/*=================================================================*/ - -static int 
get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache, - ib_sa_mad_t *req_mad, - ib_sa_mad_t *resp_mad, - struct ibv_send_wr *swr, - uint16_t lid, - uint16_t rem_lid) -{ - struct ibv_send_wr *bswr; - struct ibv_wc wc; - struct timeval get_sl_rec_last_sent, get_sl_rec_last_poll; - struct ibv_recv_wr *brwr; - int got_sl_value, get_sl_rec_retries, rc, ne, i; - ib_path_rec_t *req_path_record = ib_sa_mad_get_payload_ptr(req_mad); - ib_path_rec_t *resp_path_record = ib_sa_mad_get_payload_ptr(resp_mad); - - got_sl_value = 0; - get_sl_rec_retries = 0; - - rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr); - if (0 != rc) { - BTL_ERROR(("error posting receive on QP [0x%x] rc says: %s [%d]", - cache->qp->qp_num, strerror(rc), rc)); - return OPAL_ERROR; - } - - while (0 == got_sl_value) { - rc = ibv_post_send(cache->qp, swr, &bswr); - if (0 != rc) { - BTL_ERROR(("error posting send on QP [0x%x] rc says: %s [%d]", - cache->qp->qp_num, strerror(rc), rc)); - return OPAL_ERROR; - } - gettimeofday(&get_sl_rec_last_sent, NULL); - - while (0 == got_sl_value) { - ne = ibv_poll_cq(cache->cq, 1, &wc); - if (ne > 0 && IBV_WC_RECV == wc.opcode) { - /* We only care about the status of receive work requests. */ - /* If the status of the send work request was anything other */ - /* than success, we'll eventually retransmit, so ignore them. */ - if (0 == resp_mad->status && - req_path_record->slid == htons(lid) && - req_path_record->dlid == htons(rem_lid) && - IBV_WC_SUCCESS == wc.status && - wc.byte_len >= MAD_BLOCK_SIZE && - resp_mad->trans_id == req_mad->trans_id) { - /* Everything matches, so we have the desired SL */ - cache->sl_values[rem_lid] = ib_path_rec_sl(resp_path_record); - got_sl_value = 1; - break; - } - /* Probably bad status, unlikely bad lid match. We will */ - /* ignore response and let it time out so that we do a */ - /* retry, but after a delay. Need to repost receive WR. 
*/ - rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr); - if (0 != rc) { - BTL_ERROR(("error posing receive on QP[%x] rc says: %s [%d]", - cache->qp->qp_num, strerror(rc), rc)); - return OPAL_ERROR; - } - } else if (0 == ne) { /* poll did not find anything */ - gettimeofday(&get_sl_rec_last_poll, NULL); - i = get_sl_rec_last_poll.tv_sec - get_sl_rec_last_sent.tv_sec; - i = (i * 1000000) + - get_sl_rec_last_poll.tv_usec - get_sl_rec_last_sent.tv_usec; - if (i > GET_SL_REC_RETRIES_TIMEOUT_MS) { - get_sl_rec_retries++; - BTL_VERBOSE(("[%d/%d] retries to get PathRecord", - get_sl_rec_retries, MAX_GET_SL_REC_RETRIES)); - if (get_sl_rec_retries > MAX_GET_SL_REC_RETRIES) { - BTL_ERROR(("No response from SA after %d retries", - MAX_GET_SL_REC_RETRIES)); - return OPAL_ERROR; - } - /* Need to retransmit request. We must make a new TID */ - /* so the SM doesn't see it as the same request. */ - req_mad->trans_id += hton64(1); - break; - } - usleep(100); /* otherwise pause before polling again */ - } else if (ne < 0) { - BTL_ERROR(("error polling CQ returned %d\n", ne)); - return OPAL_ERROR; - } - } - } - return 0; -} - -/*=================================================================*/ - -static int init_device(struct ibv_context *context_arg, - struct mca_btl_openib_sa_qp_cache *cache, - uint32_t port_num) -{ - struct ibv_ah_attr aattr; - struct ibv_port_attr pattr; - int rc; - - cache->context = ibv_open_device(context_arg->device); - if (NULL == cache->context) { - BTL_ERROR(("error obtaining device context for %s errno says %s", - ibv_get_device_name(context_arg->device), strerror(errno))); - return OPAL_ERROR; - } - cache->device_name = strdup(ibv_get_device_name(cache->context->device)); - cache->port_num = port_num; - - /* init all sl_values to be SL_NOT_PRESENT */ - memset(&cache->sl_values, SL_NOT_PRESENT, sizeof(cache->sl_values)); - - cache->next = sa_qp_cache; - sa_qp_cache = cache; - - /* allocate the protection domain for the device */ - cache->pd = 
ibv_alloc_pd(cache->context); - if (NULL == cache->pd) { - BTL_ERROR(("error allocating protection domain for %s errno says %s", - ibv_get_device_name(context_arg->device), strerror(errno))); - return OPAL_ERROR; - } - - /* register memory region */ - cache->mr = ibv_reg_mr(cache->pd, cache->send_recv_buffer, - sizeof(cache->send_recv_buffer), - IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); - if (NULL == cache->mr) { - BTL_ERROR(("error registering memory region, errno says %s", strerror(errno))); - return OPAL_ERROR; - } - - /* init the ud qp */ - rc = init_ud_qp(context_arg, cache); - if (OPAL_ERROR == rc) { - return OPAL_ERROR; - } - - rc = ibv_query_port(cache->context, cache->port_num, &pattr); - if (rc) { - BTL_ERROR(("error getting port attributes for device %s " - "port number %d errno says %s", - ibv_get_device_name(context_arg->device), - cache->port_num, strerror(errno))); - return OPAL_ERROR; - } - - /* create address handle */ - memset(&aattr, 0, sizeof(aattr)); - aattr.dlid = pattr.sm_lid; - aattr.sl = pattr.sm_sl; - aattr.port_num = cache->port_num; - cache->ah = ibv_create_ah(cache->pd, &aattr); - if (NULL == cache->ah) { - BTL_ERROR(("error creating address handle: %s", strerror(errno))); - return OPAL_ERROR; - } - - memset(&(cache->rwr), 0, sizeof(cache->rwr)); - cache->rwr.num_sge = 1; - cache->rwr.sg_list = &(cache->rsge); - memset(&(cache->rsge), 0, sizeof(cache->rsge)); - cache->rsge.addr = (uint64_t)(void *) - (cache->send_recv_buffer + MAD_BLOCK_SIZE); - cache->rsge.length = MAD_BLOCK_SIZE + 40; - cache->rsge.lkey = cache->mr->lkey; - - return 0; -} - -/*=================================================================*/ - -static int get_pathrecord_sl(struct ibv_context *context_arg, - uint32_t port_num, - uint16_t lid, - uint16_t rem_lid) -{ - struct ibv_send_wr swr; - ib_sa_mad_t *req_mad, *resp_mad; - struct ibv_sge ssge; - struct mca_btl_openib_sa_qp_cache *cache; - size_t page_size = (size_t)opal_getpagesize(); - int rc; - - /* 
search for a cached item */ - for (cache = sa_qp_cache; cache; cache = cache->next) { - if (0 == strcmp(cache->device_name, - ibv_get_device_name(context_arg->device)) - && cache->port_num == port_num) { - break; - } - } - - if (NULL == cache) { - /* init new cache */ - if (posix_memalign((void **)(&cache), page_size, - sizeof(struct mca_btl_openib_sa_qp_cache))) { - BTL_ERROR(("error in posix_memalign SA cache")); - return OPAL_ERROR; - } - /* one time setup for each device/port combination */ - rc = init_device(context_arg, cache, port_num); - if (0 != rc) { - return rc; - } - } - - /* if the destination lid SL value is not in the cache, go get it */ - if (SL_NOT_PRESENT == cache->sl_values[rem_lid]) { - /* sa_mad is first buffer, where we build the SA Get request to send */ - req_mad = (ib_sa_mad_t *)(cache->send_recv_buffer); - - init_sa_mad(cache, req_mad, &swr, &ssge, lid, rem_lid); - - /* resp_mad is the receive buffer (40 byte offset is for GRH) */ - resp_mad = (ib_sa_mad_t *)(cache->send_recv_buffer + MAD_BLOCK_SIZE + 40); - - rc = get_pathrecord_info(cache, req_mad, resp_mad, &swr, lid, rem_lid); - if (0 != rc) { - return rc; - } - } - - /* now all we do is send back the value laying around */ - return cache->sl_values[rem_lid]; -} - -/*=================================================================*/ - -int btl_openib_connect_get_pathrecord_sl(struct ibv_context *context_arg, - uint32_t port_num, - uint16_t lid, - uint16_t rem_lid) -{ - int rc = get_pathrecord_sl(context_arg, port_num, lid, rem_lid); - if (OPAL_ERROR == rc) { - free_sa_qp_cache(); - } - return rc; -} - -/*=================================================================*/ - -void btl_openib_connect_sl_finalize() -{ - free_sa_qp_cache(); -} diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_sl.h b/opal/mca/btl/openib/connect/btl_openib_connect_sl.h deleted file mode 100644 index b6fbc41550..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_sl.h +++ /dev/null @@ 
-1,26 +0,0 @@ -/* - * Copyright (c) 2011 Mellanox Technologies. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_OPENIB_CONNECT_SL_H -#define BTL_OPENIB_CONNECT_SL_H - -BEGIN_C_DECLS - -int btl_openib_connect_get_pathrecord_sl( - struct ibv_context *context_arg, - uint32_t port_num, - uint16_t lid, - uint16_t rem_lid); - -void btl_openib_connect_sl_finalize(void); - -END_C_DECLS - -#endif /* BTL_OPENIB_CONNECT_SL_H */ diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c deleted file mode 100644 index ee5678120a..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ /dev/null @@ -1,3051 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014 Bull SAS. All rights reserved. - * Copyright (c) 2016 Mellanox Technologies. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/* - * The UD connection module creates and listens on a unconnected - * datagram (UD) queue pair (QP) for connections requests. - * - * There are two ways an RC connection can be established by UD: - * 1. One side starts a connection and the request is received before - * the receiving side starts a connection. (One sided) - * 2. Both sides send a request before either receives a request. - * (Simulaneous). 
- * - * The protocol for case 1 looks like: - * peer1 peer2 - * | | - * CONNECT |------>| - * | | move QPs to RTS - * | | post rc receive - * |<------| CONNECT - * move QPs to RTS | | - * post rc send | | - * |<------| COMPLETE - * COMPLETE |------>| - * - * The protocol for case 2 looks like: - * peer1 peer2 - * | | - * CONNECT |<----->| CONNECT - * move QPs to RTS | | move QPs to RTS - * post rc send | | post rc recv - * COMPLETE |<----->| COMPLETE - * - */ - -#include "opal_config.h" - -#include -#include -#include -#include -#include -#include - -#include - -#include "opal/util/show_help.h" -#include "opal/util/proc.h" -#include "opal/util/output.h" -#include "opal/util/error.h" -#include "opal/util/alfg.h" -#include "opal_stdint.h" -#include "opal/class/opal_fifo.h" - -#include "btl_openib_endpoint.h" -#include "btl_openib_proc.h" -#include "btl_openib_async.h" -#include "connect/connect.h" - -#include "opal/util/sys_limits.h" -#include "opal/align.h" - -#if (ENABLE_DYNAMIC_SL) -#include "connect/btl_openib_connect_sl.h" -#endif - -#if HAVE_XRC -#include "btl_openib_xrc.h" -#endif - -/*--------------------------------------------------------------------*/ - -/* - * Message that this CPC includes in the modex. Filed are laid out in - * order to avoid holes. 
- */ -typedef struct { - /** The qp_num we are listening on (this alone may be sufficient for - matching the endpoint) */ - uint32_t mm_qp_num; - /** The LID that we're listening on; it also identifies the source - endpoint when an UD CM request arrives */ - uint16_t mm_lid; - /** The port number of this port, also used to locate the source - endpoint when an UD CM request arrives */ - uint8_t mm_port_num; - /** Global ID (needed when routers are in use) */ - union ibv_gid mm_gid; -} modex_msg_t; - -/* - * The UD module (i.e., the base module plus more meta data required - * by this CPC) - */ -typedef struct udcm_module { - opal_btl_openib_connect_base_module_t cpc; - - /* This mutex must be held by any thread modifying - the module directly */ - opal_mutex_t cm_lock; - - /* Signal callbacks and threads that this module - is exiting */ - bool cm_exiting; - - /* UD QP this module is listening on */ - struct ibv_qp *listen_qp; - - /* Work request completion queues */ - struct ibv_cq *cm_send_cq, *cm_recv_cq; - - /* Completion channel for receive completions */ - struct ibv_comp_channel *cm_channel; - - /* Memory register for cm_buffer */ - struct ibv_mr *cm_mr; - - /* All buffers (grh + receive, send) */ - char *cm_buffer; - - /* Pointer to send buffer (near end of cm_buffer) */ - char *cm_send_buffer; - - /* Length of largest message */ - size_t msg_length; - - /* timeout thread */ - opal_mutex_t cm_timeout_lock; - - /* Messages waiting for ack */ - opal_list_t flying_messages; - - /* This mutex must be held when calling ibv_post_send - or waiting on cm_send_cq */ - opal_mutex_t cm_send_lock; - - /* Receive queue */ - opal_fifo_t cm_recv_msg_fifo; - - /* The associated BTL */ - struct mca_btl_openib_module_t *btl; - - /* This module's modex message */ - modex_msg_t modex; - - /* channel monitoring */ - - /** channel event base */ - opal_event_base_t *channel_evbase; - - /** channel monitoring event */ - opal_event_t channel_event; - - /* message processing */ - /** 
mesage event is active */ - int32_t cm_message_event_active; - - /** message event */ - opal_event_t cm_message_event; -} udcm_module_t; - -/* - * Per-endpoint UD data - */ -typedef struct { - /* Lock for IPC between threads in the ud CPC */ - opal_mutex_t udep_lock; - - struct ibv_ah *ah; - - bool sent_req, recv_req, recv_resp, recv_comp; - - /* Has this endpoint's data been initialized */ - bool udep_initialized, udep_created_qps; -} udcm_endpoint_t; - -typedef struct udcm_qp_t { - uint32_t qp_num; - uint32_t psn; -} udcm_qp_t; - -typedef enum udcm_message_type { - UDCM_MESSAGE_CONNECT = 100, - UDCM_MESSAGE_COMPLETE = 101, - UDCM_MESSAGE_REJECT = 102, -#if HAVE_XRC - UDCM_MESSAGE_XCONNECT = 103, - UDCM_MESSAGE_XRESPONSE = 104, - UDCM_MESSAGE_XCONNECT2 = 105, - UDCM_MESSAGE_XRESPONSE2 = 106, -#endif - UDCM_MESSAGE_ACK = 107 -} udcm_message_type_t; - -typedef enum { - UDCM_REJ_REMOTE_ERROR = -1, - UDCM_REJ_ALREADY_CONNECTED = -2, -#if HAVE_XRC - UDCM_REJ_NOT_READY = -3, -#endif -} udcm_reject_reason_t; - -typedef struct udcm_msg_hdr { - uint8_t type; - - /* ack context */ - uintptr_t rem_ctx; - - /* endpoint local to the sender */ - mca_btl_base_endpoint_t *rem_ep; - /* endpoint local to the receiver */ - mca_btl_base_endpoint_t *lcl_ep; - - union { - /* UDCM_MESSAGE_CONNECT */ - struct msg_connect { - opal_process_name_t rem_name; - int32_t rem_ep_index; - uint8_t rem_port_num; - } req; - /* UDCM_MESSAGE_REJECT */ - struct msg_reject { - int32_t reason; - } rej; -#if HAVE_XRC - /* UDCM_MESSAGE_XCONNECT, UDCM_MESSAGE_XCONNECT2 */ - struct msg_xrc_connect { - opal_process_name_t rem_name; - int32_t rem_ep_index; - uint8_t rem_port_num; - uint32_t rem_qp_num; - uint32_t rem_psn; - } xreq; - /* UDCM_MESSAGE_XRESPONSE */ - struct msg_xrc_response { - int32_t rem_ep_index; - uint32_t rem_qp_num; - uint32_t rem_psn; - } xres; -#endif - } data; -} udcm_msg_hdr_t; - -typedef struct udcm_msg_t { - udcm_msg_hdr_t hdr; - - /* If the message type is UDCM_MESSAGE_CONNECT, - 
UDCM_MESSAGE_XRESPONSE, or UDCM_MESSAGE_XRESPONSE2 - then queue pair/srq data will follow the header */ - udcm_qp_t qps[]; -} udcm_msg_t; - -typedef struct udcm_message_recv { - opal_list_item_t super; - - udcm_msg_hdr_t msg_hdr; -} udcm_message_recv_t; - -static OBJ_CLASS_INSTANCE(udcm_message_recv_t, opal_list_item_t, - NULL, NULL); - -typedef struct udcm_message_sent { - opal_list_item_t super; - - udcm_msg_t *data; - size_t length; - mca_btl_base_endpoint_t *endpoint; - - int tries; - opal_event_t event; - bool event_active; -} udcm_message_sent_t; - -static void udcm_sent_message_constructor (udcm_message_sent_t *); -static void udcm_sent_message_destructor (udcm_message_sent_t *); -static OBJ_CLASS_INSTANCE(udcm_message_sent_t, opal_list_item_t, - udcm_sent_message_constructor, - udcm_sent_message_destructor); - -#define UDCM_ENDPOINT_MODULE(ep) ((udcm_module_t *)(ep)->endpoint_local_cpc) -#define UDCM_ENDPOINT_DATA(ep) ((udcm_endpoint_t *)(ep)->endpoint_local_cpc_data) -#define UDCM_ENDPOINT_REM_MODEX(ep) \ - (((modex_msg_t *)(ep)->endpoint_remote_cpc_data->cbm_modex_message)) - -/*--------------------------------------------------------------------*/ - -static void udcm_component_register(void); -static int udcm_component_query(mca_btl_openib_module_t *btl, - opal_btl_openib_connect_base_module_t **cpc); -static int udcm_component_finalize(void); - -/* Module methods */ -static int udcm_endpoint_init(struct mca_btl_base_endpoint_t *lcl_ep); -static int udcm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc, - mca_btl_base_endpoint_t *lcl_ep); -static int udcm_endpoint_finalize(struct mca_btl_base_endpoint_t *lcl_ep); -static int udcm_endpoint_init_data (mca_btl_base_endpoint_t *lcl_ep); -static int udcm_rc_qp_create_all (mca_btl_base_endpoint_t *lcl_ep); -static int udcm_module_finalize(mca_btl_openib_module_t *btl, - opal_btl_openib_connect_base_module_t *cpc); - -static void *udcm_cq_event_dispatch(int fd, int flags, void *context); -static 
void *udcm_message_callback (int fd, int flags, void *context); - -static void udcm_set_message_timeout (udcm_message_sent_t *message); -static void udcm_free_message (udcm_message_sent_t *message); - -static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl); - -static int udcm_module_create_listen_qp (udcm_module_t *m); -static void udcm_module_destroy_listen_qp (udcm_module_t *m); - -static int udcm_module_allocate_buffers (udcm_module_t *m); -static void udcm_module_destroy_buffers (udcm_module_t *m); - -static int udcm_module_post_all_recvs (udcm_module_t *m); - -static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep, - mca_btl_base_endpoint_t *rem_ep); - -static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg); -static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep); -static int udcm_rc_qps_to_rts(mca_btl_openib_endpoint_t *lcl_ep); - -/* XRC support */ -#if HAVE_XRC -static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc, - mca_btl_base_endpoint_t *lcl_ep); -static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep); -static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn); -static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep); -static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num); -static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn); -static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, - uint8_t msg_type); -static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, - uint8_t msg_type); -static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); -static int udcm_xrc_handle_xresponse (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); -#endif - - 
-/*--------------------------------------------------------------------*/ - -#define UDCM_MIN_RECV_COUNT 512 -#define UDCM_MIN_TIMEOUT 500000 - -#define UDCM_SEND_CQ_SIZE 512 - -#define UDCM_WR_RECV_ID 0x20000000ll -#define UDCM_WR_SEND_ID 0x10000000ll -#define UDCM_WR_ACK_ID 0x10000000ll -#define UDCM_WR_DIR_MASK 0x30000000ll - -/* Useless 40 bytes of data that proceeds received scatter gather data. - Can we get rid of this? */ -#define UDCM_GRH_SIZE (sizeof (struct ibv_grh)) - -/* Priority of this connection module */ -static int udcm_priority; - -/* Number of receive work requests to post */ -static int udcm_recv_count; -static int udcm_max_retry; - -/* Message ACK timeout in usec */ -static int udcm_timeout; - -/* seed for rand_r. remove me when opal gets a random number generator */ -/* Uses the OPAL ALFG RNG */ -static uint32_t udcm_random_seed = 0; -static opal_rng_buff_t udcm_rand_buff; - -static struct timeval udcm_timeout_tv; - -/******************************************************************* - * Component - *******************************************************************/ - -/* mark: udcm component */ - -opal_btl_openib_connect_base_component_t opal_btl_openib_connect_udcm = { - "udcm", - udcm_component_register, - NULL, - udcm_component_query, - udcm_component_finalize -}; - -static void udcm_component_register(void) -{ - /* the priority is initialized in the declaration above */ - udcm_priority = 63; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_udcm_priority", "Priority of the udcm " - "connection method", MCA_BASE_VAR_TYPE_INT, NULL, - 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &udcm_priority); - - udcm_recv_count = UDCM_MIN_RECV_COUNT; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_udcm_recv_count", "Number of registered " - "buffers to post", MCA_BASE_VAR_TYPE_INT, NULL, - 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - 
&udcm_recv_count); - - udcm_timeout = UDCM_MIN_TIMEOUT; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_udcm_timeout", "Ack timeout for udcm " - "connection messages", MCA_BASE_VAR_TYPE_INT, NULL, - 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &udcm_timeout); - - udcm_max_retry = 25; - (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "connect_udcm_max_retry", "Maximum number of times " - "to retry sending a udcm connection message", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &udcm_max_retry); -} - -static int udcm_component_query(mca_btl_openib_module_t *btl, - opal_btl_openib_connect_base_module_t **cpc) -{ - udcm_module_t *m = NULL; - int rc = OPAL_ERR_NOT_SUPPORTED; - - do { - /* If we do not have struct ibv_device.transport_device, then - we're in an old version of OFED that is IB only (i.e., no - iWarp), so we can safely assume that we can use this CPC. */ -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) && HAVE_DECL_IBV_LINK_LAYER_ETHERNET - if (BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl)) { - BTL_VERBOSE(("UD CPC only supported on InfiniBand; skipped on %s:%d", - ibv_get_device_name(btl->device->ib_dev), - btl->port_num)); - break; - } -#endif - - /* Allocate the module struct. Use calloc so that it's safe to - finalize the module if something goes wrong. 
*/ - m = calloc(1, sizeof(*m)); - if (NULL == m) { - BTL_ERROR(("malloc failed!")); - rc = OPAL_ERR_OUT_OF_RESOURCE; - break; - } - - if (udcm_priority > 100) { - udcm_priority = 100; - } else if (udcm_priority < 0) { - udcm_priority = 0; - } - - if (UDCM_MIN_RECV_COUNT > udcm_recv_count) { - udcm_recv_count = UDCM_MIN_RECV_COUNT; - } - - if (UDCM_MIN_TIMEOUT > udcm_timeout) { - udcm_timeout = UDCM_MIN_TIMEOUT; - } - - rc = udcm_module_init (m, btl); - if (OPAL_SUCCESS != rc) { - break; - } - - /* seed the random number generator */ - udcm_random_seed = time (NULL); - opal_srand(&udcm_rand_buff,udcm_random_seed); - /* All done */ - *cpc = (opal_btl_openib_connect_base_module_t *) m; - BTL_VERBOSE(("available for use on %s:%d", - ibv_get_device_name(btl->device->ib_dev), - btl->port_num)); - - return OPAL_SUCCESS; - } while (0); - - udcm_module_finalize(btl, (opal_btl_openib_connect_base_module_t *) m); - if (OPAL_ERR_NOT_SUPPORTED == rc) { - BTL_VERBOSE(("unavailable for use on %s:%d; skipped", - ibv_get_device_name(btl->device->ib_dev), - btl->port_num)); - } else { - BTL_VERBOSE(("unavailable for use on %s:%d; fatal error %d (%s)", - ibv_get_device_name(btl->device->ib_dev), - btl->port_num, rc, - opal_strerror(rc))); - } - - return rc; -} - -static int udcm_component_finalize(void) -{ - return OPAL_SUCCESS; -} - -/*--------------------------------------------------------------------*/ - -/******************************************************************* - * Module - *******************************************************************/ - -/* mark: udcm module */ - -#if HAVE_XRC -static int udcm_endpoint_init_self_xrc (struct mca_btl_base_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - int32_t recv_qpn; - int rc; - - opal_mutex_lock (&udep->udep_lock); - do { - if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) { - BTL_VERBOSE(("error initializing loopback endpoint cpc data")); - break; - } - - rc = udcm_xrc_send_qp_create 
(lcl_ep); - if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("error creating send queue pair for loopback endpoint")); - break; - } - - lcl_ep->rem_info.rem_index = lcl_ep->index; - - rc = udcm_xrc_recv_qp_create (lcl_ep, lcl_ep->qps[0].qp->lcl_qp->qp_num, - lcl_ep->qps[0].qp->lcl_psn); - if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("error creating loopback XRC receive queue pair")); - break; - } - - for (int i = 0 ; i < mca_btl_openib_component.num_xrc_qps ; ++i) { - uint32_t srq_num; -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - if (ibv_get_srq_num(lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq, &srq_num)) { - BTL_ERROR(("BTL openib UDCM internal error: can't get srq num")); - } -#else - srq_num = lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq->xrc_srq_num; -#endif - lcl_ep->rem_info.rem_srqs[i].rem_srq_num = srq_num; - } - -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - recv_qpn = lcl_ep->xrc_recv_qp->qp_num; -#else - recv_qpn = lcl_ep->xrc_recv_qp_num; -#endif - - lcl_ep->ib_addr->remote_xrc_rcv_qp_num = recv_qpn; - lcl_ep->rem_info.rem_qps[0].rem_psn = lcl_ep->xrc_recv_psn; - lcl_ep->rem_info.rem_qps[0].rem_qp_num = recv_qpn; - - rc = udcm_xrc_send_qp_connect (lcl_ep, recv_qpn, lcl_ep->xrc_recv_psn); - if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("error connecting loopback XRC send queue pair")); - break; - } - - BTL_VERBOSE(("successfully created loopback queue pair")); - - /* need to hold the endpoint lock before calling udcm_finish_connection */ - OPAL_THREAD_LOCK(&lcl_ep->endpoint_lock); - rc = udcm_finish_connection (lcl_ep); - } while (0); - opal_mutex_unlock (&udep->udep_lock); - - return rc; -} -#endif - -static int udcm_endpoint_init_self (struct mca_btl_base_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - int rc; - - opal_mutex_lock (&udep->udep_lock); - do { - if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) { - BTL_VERBOSE(("error initializing loopback endpoint cpc data")); - break; - } - - if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) 
{ - BTL_VERBOSE(("error initializing loopback endpoint qps")); - break; - } - - /* save queue pair info */ - lcl_ep->rem_info.rem_index = lcl_ep->index; - - for (int i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) { - lcl_ep->rem_info.rem_qps[i].rem_psn = lcl_ep->qps[i].qp->lcl_psn; - lcl_ep->rem_info.rem_qps[i].rem_qp_num = lcl_ep->qps[i].qp->lcl_qp->qp_num; - } - - if (OPAL_SUCCESS != (rc = udcm_rc_qps_to_rts (lcl_ep))) { - BTL_VERBOSE(("error moving loopback endpoint qps to RTS")); - break; - } - - /* need to hold the endpoint lock before calling udcm_finish_connection */ - OPAL_THREAD_LOCK(&lcl_ep->endpoint_lock); - rc = udcm_finish_connection (lcl_ep); - - return OPAL_SUCCESS; - } while (0); - opal_mutex_unlock (&udep->udep_lock); - - return rc; -} - -static int udcm_endpoint_init (struct mca_btl_base_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = lcl_ep->endpoint_local_cpc_data = - calloc(1, sizeof(udcm_endpoint_t)); - if (NULL == udep) { - BTL_ERROR(("malloc failed!")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - OBJ_CONSTRUCT(&udep->udep_lock, opal_mutex_t); - - if (lcl_ep->endpoint_proc->proc_opal == opal_proc_local_get ()) { - /* go ahead and try to create a loopback queue pair */ -#if HAVE_XRC - if (mca_btl_openib_component.num_xrc_qps > 0) { - return udcm_endpoint_init_self_xrc (lcl_ep); - } else -#endif - return udcm_endpoint_init_self (lcl_ep); - } - - return OPAL_SUCCESS; -} - -static int udcm_endpoint_finalize(struct mca_btl_base_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - - /* Free the stuff we allocated in udcm_endpoint_init */ - if (NULL != udep) { - if (udep->ah) { - ibv_destroy_ah(udep->ah); - } - - OBJ_DESTRUCT(&udep->udep_lock); - - free(lcl_ep->endpoint_local_cpc_data); - lcl_ep->endpoint_local_cpc_data = NULL; - } - - return OPAL_SUCCESS; -} - -static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl) -{ - int rc = OPAL_ERR_NOT_SUPPORTED; - - BTL_VERBOSE(("created cpc module %p for 
btl %p", - (void*)m, (void*)btl)); - - OBJ_CONSTRUCT(&m->cm_lock, opal_mutex_t); - OBJ_CONSTRUCT(&m->cm_send_lock, opal_mutex_t); - OBJ_CONSTRUCT(&m->cm_recv_msg_fifo, opal_fifo_t); - OBJ_CONSTRUCT(&m->flying_messages, opal_list_t); - OBJ_CONSTRUCT(&m->cm_timeout_lock, opal_mutex_t); - - m->btl = btl; - - /* Create completion channel */ - m->cm_channel = ibv_create_comp_channel (btl->device->ib_dev_context); - if (NULL == m->cm_channel) { - BTL_VERBOSE(("error creating ud completion channel")); - return OPAL_ERR_NOT_SUPPORTED; - } - - /* Create completion queues */ - m->cm_recv_cq = ibv_create_cq (btl->device->ib_dev_context, - udcm_recv_count, NULL, - m->cm_channel, 0); - if (NULL == m->cm_recv_cq) { - BTL_VERBOSE(("error creating ud recv completion queue")); - return OPAL_ERR_NOT_SUPPORTED; - } - - m->cm_send_cq = ibv_create_cq (btl->device->ib_dev_context, - UDCM_SEND_CQ_SIZE, NULL, NULL, 0); - if (NULL == m->cm_send_cq) { - BTL_VERBOSE(("error creating ud send completion queue")); - return OPAL_ERR_NOT_SUPPORTED; - } - - if (0 != (rc = udcm_module_allocate_buffers (m))) { - BTL_VERBOSE(("error allocating cm buffers")); - return rc; - } - - if (0 != (rc = udcm_module_create_listen_qp (m))) { - BTL_VERBOSE(("error creating UD QP")); - return rc; - } - - if (0 != (rc = udcm_module_post_all_recvs (m))) { - BTL_VERBOSE(("error posting receives")); - return rc; - } - - /* UD CM initialized properly. So fill in the rest of the CPC - module. 
*/ - m->cpc.data.cbm_component = &opal_btl_openib_connect_udcm; - m->cpc.data.cbm_priority = udcm_priority; - m->cpc.data.cbm_modex_message = &m->modex; - - /* Initialize module modex */ - m->modex.mm_lid = btl->lid; - m->modex.mm_port_num = btl->port_num; - m->modex.mm_qp_num = m->listen_qp->qp_num; - - rc = ibv_query_gid (btl->device->ib_dev_context, btl->port_num, - mca_btl_openib_component.gid_index, &m->modex.mm_gid); - if (0 != rc) { - BTL_VERBOSE(("error querying port GID")); - return OPAL_ERROR; - } - - BTL_VERBOSE(("my modex = LID: %d, Port: %d, QPN: %d, GID: %08x %08x", - m->modex.mm_lid, m->modex.mm_port_num, m->modex.mm_qp_num, - (unsigned int)m->modex.mm_gid.global.interface_id, - (unsigned int)m->modex.mm_gid.global.subnet_prefix)); - - m->cpc.data.cbm_modex_message_len = sizeof(m->modex); - - /* Initialize module */ - m->cpc.cbm_endpoint_init = udcm_endpoint_init; - m->cpc.cbm_start_connect = udcm_module_start_connect; - m->cpc.cbm_endpoint_finalize = udcm_endpoint_finalize; - m->cpc.cbm_finalize = udcm_module_finalize; - - m->cpc.cbm_uses_cts = false; - - m->cm_exiting = false; - - /* Monitor the fd associated with the completion channel */ - m->channel_evbase = opal_progress_thread_init (NULL); - - opal_event_set (m->channel_evbase, &m->channel_event, - m->cm_channel->fd, OPAL_EV_READ | OPAL_EV_PERSIST, - udcm_cq_event_dispatch, m); - - opal_event_add (&m->channel_event, 0); - - udcm_timeout_tv.tv_sec = udcm_timeout / 1000000; - udcm_timeout_tv.tv_usec = udcm_timeout - 1000000 * - udcm_timeout_tv.tv_sec; - - m->cm_message_event_active = 0; - - /* set up the message event */ - opal_event_set (opal_sync_event_base, &m->cm_message_event, -1, - OPAL_EV_READ, udcm_message_callback, m); - - /* Finally, request CQ notification */ - if (0 != ibv_req_notify_cq (m->cm_recv_cq, 0)) { - BTL_VERBOSE(("error requesting recv completions")); - return OPAL_ERROR; - } - - /* Ready to use */ - - return OPAL_SUCCESS; -} - -static int 
-udcm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc, - mca_btl_base_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - int rc = OPAL_SUCCESS; - - BTL_VERBOSE(("endpoint %p (lid %d, ep index %d)", - (void*)lcl_ep, lcl_ep->endpoint_btl->port_info.lid, - lcl_ep->index)); - -#if HAVE_XRC - if (mca_btl_openib_component.num_xrc_qps > 0) { - return udcm_xrc_start_connect (cpc, lcl_ep); - } -#endif - - - opal_mutex_lock (&udep->udep_lock); - - if (MCA_BTL_IB_CLOSED != lcl_ep->endpoint_state) { - opal_mutex_unlock (&udep->udep_lock); - BTL_VERBOSE(("already ongoing %p. state = %d", - (void *) lcl_ep, lcl_ep->endpoint_state)); - return OPAL_SUCCESS; - } - - do { - opal_atomic_wmb (); - - lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTING; - - if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) { - BTL_VERBOSE(("error initializing endpoint cpc data")); - break; - } - - if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) { - BTL_VERBOSE(("error initializing endpoint qps")); - break; - } - - rc = udcm_send_request (lcl_ep, NULL); - } while (0); - - opal_mutex_unlock (&udep->udep_lock); - - return rc; -} - -static int udcm_module_finalize(mca_btl_openib_module_t *btl, - opal_btl_openib_connect_base_module_t *cpc) -{ - udcm_module_t *m = (udcm_module_t *) cpc; - opal_list_item_t *item; - - if (NULL == m) { - return OPAL_SUCCESS; - } - - m->cm_exiting = true; - - if (m->channel_evbase) { - opal_event_del (&m->channel_event); - opal_progress_thread_finalize (NULL); - } - - opal_mutex_lock (&m->cm_lock); - - /* clear message queue */ - while (NULL != (item = opal_fifo_pop_atomic (&m->cm_recv_msg_fifo))) { - OBJ_RELEASE(item); - } - - OBJ_DESTRUCT(&m->cm_recv_msg_fifo); - - opal_mutex_lock (&m->cm_timeout_lock); - while ((item = opal_list_remove_first(&m->flying_messages))) { - OBJ_RELEASE(item); - } - - OBJ_DESTRUCT(&m->flying_messages); - opal_mutex_unlock (&m->cm_timeout_lock); - - BTL_VERBOSE(("destroying listing 
thread")); - - /* destroy the listen queue pair. this will cause ibv_get_cq_event to - return. */ - udcm_module_destroy_listen_qp (m); - - udcm_module_destroy_buffers (m); - - if (m->cm_send_cq) { - if (0 != ibv_destroy_cq (m->cm_send_cq)) { - BTL_VERBOSE(("failed to destroy send CQ. errno = %d", - errno)); - } - } - - if (m->cm_recv_cq) { - if (0 != ibv_destroy_cq (m->cm_recv_cq)) { - BTL_VERBOSE(("failed to destroy recv CQ. errno = %d", - errno)); - } - } - - if (m->cm_channel) { - if (0 != ibv_destroy_comp_channel (m->cm_channel)) { - BTL_VERBOSE(("failed to completion channel. errno = %d", - errno)); - } - - m->cm_channel = NULL; - } - - opal_mutex_unlock (&m->cm_lock); - OBJ_DESTRUCT(&m->cm_send_lock); - OBJ_DESTRUCT(&m->cm_lock); - OBJ_DESTRUCT(&m->cm_timeout_lock); - - return OPAL_SUCCESS; -} - -/*--------------------------------------------------------------------*/ - -static int udcm_module_create_listen_qp (udcm_module_t *m) -{ - struct ibv_qp_init_attr init_attr; - struct ibv_qp_attr attr; - struct ibv_qp *qp; - - BTL_VERBOSE(("creating listen QP on port %d", m->btl->port_num)); - - /* create the UD keypair */ - memset(&init_attr, 0, sizeof(init_attr)); - - init_attr.qp_type = IBV_QPT_UD; - - init_attr.send_cq = m->cm_send_cq; - init_attr.recv_cq = m->cm_recv_cq; - - init_attr.cap.max_send_sge = 1; - init_attr.cap.max_recv_sge = 1; - - init_attr.cap.max_recv_wr = udcm_recv_count; - init_attr.cap.max_send_wr = 1; - - qp = ibv_create_qp(m->btl->device->ib_pd, &init_attr); - if (NULL == qp) { - BTL_VERBOSE(("could not create UD listen queue pair")); - return OPAL_ERROR; - } - /* end: create the UD queue pair */ - - /* move the UD QP into the INIT state */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_INIT; - attr.pkey_index = m->btl->pkey_index; - attr.port_num = m->btl->port_num; - attr.qkey = 0; - - if (0 != ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX - | IBV_QP_PORT | IBV_QP_QKEY)) { - BTL_ERROR(("error modifying qp to INIT 
errno says %s", - strerror(errno))); - return OPAL_ERROR; - } - - /* Move listen QP to RTR */ - attr.qp_state = IBV_QPS_RTR; - - if (0 != ibv_modify_qp(qp, &attr, IBV_QP_STATE)) { - BTL_ERROR(("error modifing QP to RTR errno says %s", - strerror(errno))); - return OPAL_ERROR; - } - - - /* Move listen QP to RTS */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RTS; - attr.sq_psn = 0; - - if (0 != ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { - BTL_ERROR(("error modifing QP to RTS errno says %s; errno=%d", - strerror(errno), errno)); - return OPAL_ERROR; - } - - m->listen_qp = qp; - - BTL_VERBOSE(("listening for connections on lid %d, qpn %d", - m->btl->lid, qp->qp_num)); - - return OPAL_SUCCESS; -} - -static void udcm_module_destroy_listen_qp (udcm_module_t *m) -{ - struct ibv_qp_attr attr; - struct ibv_wc wc; - - if (NULL == m->listen_qp) { - return; - } - - mca_btl_openib_async_add_qp_ignore (m->listen_qp); - - do { - /* Move listen QP into the ERR state to cancel all outstanding - work requests */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_ERR; - attr.sq_psn = 0; - - BTL_VERBOSE(("Setting qp to err state %p", (void *)m->listen_qp)); - - if (0 != ibv_modify_qp(m->listen_qp, &attr, IBV_QP_STATE)) { - BTL_VERBOSE(("error modifying qp to ERR. errno = %d", - errno)); - break; - } - - while (ibv_poll_cq (m->cm_recv_cq, 1, &wc) > 0); - - /* move the QP into the RESET state */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RESET; - - if (0 != ibv_modify_qp(m->listen_qp, &attr, IBV_QP_STATE)) { - BTL_VERBOSE(("error modifying qp to RESET. errno = %d", - errno)); - break; - } - } while (0); - - if (0 != ibv_destroy_qp (m->listen_qp)) { - BTL_VERBOSE(("error destroying listen qp. 
errno = %d", - errno)); - } - - m->listen_qp = NULL; -} - -static int udcm_module_allocate_buffers (udcm_module_t *m) -{ - size_t total_size, page_size; - - m->msg_length = sizeof (udcm_msg_hdr_t) + - mca_btl_openib_component.num_qps * sizeof (udcm_qp_t); - - total_size = (udcm_recv_count + 1) * (m->msg_length + - UDCM_GRH_SIZE); - - page_size = opal_getpagesize(); - total_size = OPAL_ALIGN(total_size, page_size, size_t); - - m->cm_buffer = NULL; - posix_memalign ((void **)&m->cm_buffer, (size_t)page_size, - total_size); - if (NULL == m->cm_buffer) { - BTL_ERROR(("malloc failed! errno = %d", errno)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* mark buffer memory as initialized for valgrind's sake */ - memset (m->cm_buffer, 0, total_size); - - m->cm_mr = ibv_reg_mr (m->btl->device->ib_pd, m->cm_buffer, - total_size, IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE); - if (NULL == m->cm_mr) { - BTL_ERROR(("failed to register memory. errno = %d", errno)); - return OPAL_ERROR; - } - - m->cm_send_buffer = m->cm_buffer + ((UDCM_GRH_SIZE + - m->msg_length) * - udcm_recv_count); - - return 0; -} - -static void udcm_module_destroy_buffers (udcm_module_t *m) -{ - if (m->cm_mr) { - if (0 != ibv_dereg_mr (m->cm_mr)) { - BTL_VERBOSE(("failed to deregister memory. 
errno = %d", - errno)); - } - m->cm_mr = NULL; - } - - if (m->cm_buffer) { - free (m->cm_buffer); - } -} - -static inline char *udcm_module_get_recv_buffer (udcm_module_t *m, - int msg_num, bool skip_grh) -{ - return m->cm_buffer + msg_num * (m->msg_length + UDCM_GRH_SIZE) + - skip_grh * UDCM_GRH_SIZE; -} - -static inline char *udcm_module_get_send_buffer (udcm_module_t *m) -{ - return m->cm_send_buffer; -} - -static int udcm_module_post_one_recv (udcm_module_t *m, int msg_num) -{ - char *recv_buffer = udcm_module_get_recv_buffer (m, msg_num, 0); - struct ibv_recv_wr wr, *bad_wr; - struct ibv_sge sge; - int rc; - - /* GRH + request data*/ - sge.addr = (uintptr_t) recv_buffer; - sge.length = UDCM_GRH_SIZE + m->msg_length; - sge.lkey = m->cm_mr->lkey; - - wr.next = NULL; - wr.wr_id = UDCM_WR_RECV_ID | (uint64_t)msg_num; - wr.sg_list = &sge; - wr.num_sge = 1; - - rc = ibv_post_recv (m->listen_qp, &wr, &bad_wr); - if (0 != rc) { - BTL_VERBOSE(("error posting receive. errno = %d", errno)); - } - - return (0 == rc) ? 
OPAL_SUCCESS : OPAL_ERROR; -} - -static int udcm_module_post_all_recvs (udcm_module_t *m) -{ - int i, rc; - - for (i = 0 ; i < udcm_recv_count ; ++i) { - if (0 != (rc = udcm_module_post_one_recv (m, i))) { - return rc; - } - } - - return 0; -} - - -/*--------------------------------------------------------------------*/ - -/* mark: helper functions */ - -/* Returns max inlne size for qp #N */ -static uint32_t max_inline_size(int qp, mca_btl_openib_device_t *device) -{ - if (mca_btl_openib_component.qp_infos[qp].size <= device->max_inline_data) { - /* If qp message size is smaller than max_inline_data, - * we should enable inline messages */ - return mca_btl_openib_component.qp_infos[qp].size; - } else if (mca_btl_openib_component.rdma_qp == qp || 0 == qp) { - /* If qp message size is bigger that max_inline_data, we - * should enable inline messages only for RDMA QP (for PUT/GET - * fin messages) and for the first qp */ - return device->max_inline_data; - } - /* Otherway it is no reason for inline */ - return 0; -} - -/* Using OPAL's Additive Lagged Fibonacci RNG */ -static inline uint32_t udcm_random (void) -{ - return opal_rand(&udcm_rand_buff); -} - -/* mark: rc helper functions */ - -static inline int udcm_rc_qp_to_init (struct ibv_qp *qp, - mca_btl_openib_module_t *btl) -{ - enum ibv_qp_attr_mask attr_mask; - struct ibv_qp_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_INIT; - attr.pkey_index = btl->pkey_index; - attr.port_num = btl->port_num; - attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; -#if HAVE_DECL_IBV_ATOMIC_HCA - attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; -#endif - attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | - IBV_QP_ACCESS_FLAGS; - - if (0 != ibv_modify_qp(qp, &attr, attr_mask)) { - BTL_ERROR(("error modifying qp to INIT errno says %s", - strerror(errno))); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -static inline int udcm_rc_qp_to_rtr (mca_btl_base_endpoint_t 
*lcl_ep, - int qp_index) -{ - struct ibv_qp *qp = lcl_ep->qps[qp_index].qp->lcl_qp; - mca_btl_openib_module_t *btl = lcl_ep->endpoint_btl; - struct ibv_qp_attr attr; - enum ibv_mtu mtu; - int rc; - - mtu = (btl->device->mtu < lcl_ep->rem_info.rem_mtu) ? - btl->device->mtu : lcl_ep->rem_info.rem_mtu; - - /* Move the QP into the RTR state */ - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RTR; - /* Setup attributes */ - attr.path_mtu = mtu; - attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; - attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; - attr.dest_qp_num = lcl_ep->rem_info.rem_qps[qp_index].rem_qp_num; - attr.rq_psn = lcl_ep->rem_info.rem_qps[qp_index].rem_psn; - - attr.ah_attr.is_global = 0; - attr.ah_attr.dlid = lcl_ep->rem_info.rem_lid; - attr.ah_attr.src_path_bits = btl->src_path_bits; - attr.ah_attr.port_num = btl->port_num; - attr.ah_attr.sl = mca_btl_openib_component.ib_service_level; - attr.ah_attr.static_rate = 0; - -#if (ENABLE_DYNAMIC_SL) - /* if user enabled dynamic SL, get it from PathRecord */ - if (0 != mca_btl_openib_component.ib_path_record_service_level) { - int rc = btl_openib_connect_get_pathrecord_sl(qp->context, - attr.ah_attr.port_num, - btl->lid, - attr.ah_attr.dlid); - if (OPAL_ERROR == rc) { - return OPAL_ERROR; - } - attr.ah_attr.sl = rc; - } -#endif - - rc = ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_PATH_MTU | - IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER | - IBV_QP_RQ_PSN | IBV_QP_AV | IBV_QP_DEST_QPN); - if (OPAL_UNLIKELY(0 != rc)) { - BTL_ERROR(("error modifing QP to RTR errno says %s", strerror(errno))); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -static inline int udcm_rc_qp_to_rts (mca_btl_base_endpoint_t *lcl_ep, - int qp_index) -{ - struct ibv_qp *qp = lcl_ep->qps[qp_index].qp->lcl_qp; - struct ibv_qp_attr attr; - int rc; - - BTL_VERBOSE(("transitioning QP %p to RTS", (void *)qp)); - - /* Move the QP into the RTS state */ - memset(&attr, 0, sizeof(attr)); 
- attr.qp_state = IBV_QPS_RTS; - attr.timeout = mca_btl_openib_component.ib_timeout; - attr.retry_cnt = mca_btl_openib_component.ib_retry_count; - /* On PP QPs we have SW flow control, no need for rnr retries. Setting - * it to zero helps to catch bugs */ - attr.rnr_retry = BTL_OPENIB_QP_TYPE_PP(qp_index) ? 0 : - mca_btl_openib_component.ib_rnr_retry; - attr.sq_psn = lcl_ep->qps[qp_index].qp->lcl_psn; - attr.max_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; - - rc = ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT | - IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC); - if (OPAL_UNLIKELY(0 != rc)) { - BTL_ERROR(("error modifing QP %p to RTS errno says %s", - (void *) qp, strerror(errno))); - return OPAL_ERROR; - } - - BTL_VERBOSE(("successfully set RTS")); - - return OPAL_SUCCESS; -} - -/*--------------------------------------------------------------------*/ - -/* - * We have received information about the remote peer's QP; move the - * local QP from INIT to RTS through RTR. - */ -static int udcm_rc_qps_to_rts(mca_btl_openib_endpoint_t *lcl_ep) -{ - int rc; - - for (int qp = 0 ; qp < mca_btl_openib_component.num_qps ; ++qp) { - if (lcl_ep->qps[qp].qp->lcl_qp->state == IBV_QPS_RTS) { - continue; - } - - rc = udcm_rc_qp_to_rtr (lcl_ep, qp); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_VERBOSE(("failed moving QP to RTR")); - return rc; - } - - rc = udcm_rc_qp_to_rts (lcl_ep, qp); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - BTL_VERBOSE(("failed moving QP to RTS")); - return rc; - } - } - - /* Ensure that all the writes back to the endpoint and associated - * data structures have completed */ - opal_atomic_wmb(); - mca_btl_openib_endpoint_post_recvs(lcl_ep); - - /* All done */ - return OPAL_SUCCESS; -} - -/* - * Create the local side of one qp. The remote side will be connected - * later. 
- */ -static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ep, - int qp, struct ibv_srq *srq, uint32_t max_recv_wr, - uint32_t max_send_wr) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); -#if HAVE_DECL_IBV_EXP_CREATE_QP - struct ibv_exp_qp_init_attr init_attr; -#else - struct ibv_qp_init_attr init_attr; -#endif - size_t req_inline; - int rc; - - memset(&init_attr, 0, sizeof(init_attr)); - - init_attr.qp_type = IBV_QPT_RC; - init_attr.send_cq = m->btl->device->ib_cq[BTL_OPENIB_LP_CQ]; - init_attr.recv_cq = m->btl->device->ib_cq[qp_cq_prio(qp)]; - init_attr.srq = srq; - init_attr.cap.max_inline_data = req_inline = - max_inline_size(qp, m->btl->device); - init_attr.cap.max_send_sge = 1; - init_attr.cap.max_recv_sge = 1; /* we do not use SG list */ - if(BTL_OPENIB_QP_TYPE_PP(qp)) { - init_attr.cap.max_recv_wr = max_recv_wr; - } else { - init_attr.cap.max_recv_wr = 0; - } - init_attr.cap.max_send_wr = max_send_wr; - -#if HAVE_DECL_IBV_EXP_CREATE_QP - /* use expanded verbs qp create to enable use of mlx5 atomics */ - init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; - init_attr.pd = m->btl->device->ib_pd; - -#if HAVE_DECL_IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG - init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG; - init_attr.max_atomic_arg = sizeof (int64_t); -#endif - -#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE - if (IBV_EXP_ATOMIC_HCA_REPLY_BE == m->btl->device->ib_exp_dev_attr.exp_atomic_cap) { - init_attr.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY; - init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; - } -#endif - - while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_exp_create_qp (m->btl->device->ib_dev_context, - &init_attr))) { - /* NTH: this process may be out of registered memory. 
try evicting an item from - the lru of this btl's mpool */ - if (false == m->btl->device->rcache->rcache_evict (m->btl->device->rcache)) { - break; - } - } - -#else - - while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_create_qp(m->btl->device->ib_pd, - &init_attr))) { - /* NTH: this process may be out of registered memory. try evicting an item from - the lru of this btl's mpool */ - if (false == m->btl->device->rcache->rcache_evict (m->btl->device->rcache)) { - break; - } - } - -#endif - - if (NULL == lcl_ep->qps[qp].qp->lcl_qp) { - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "ibv_create_qp failed", true, opal_process_info.nodename, - ibv_get_device_name(m->btl->device->ib_dev), - "Reliable connected (RC)"); - - return OPAL_ERROR; - } - - if (init_attr.cap.max_inline_data < req_inline) { - lcl_ep->qps[qp].ib_inline_max = init_attr.cap.max_inline_data; - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "inline truncated", true, opal_process_info.nodename, - ibv_get_device_name(m->btl->device->ib_dev), - m->btl->port_num, req_inline, - init_attr.cap.max_inline_data); - } else { - lcl_ep->qps[qp].ib_inline_max = req_inline; - } - - /* Setup meta data on the endpoint */ - lcl_ep->qps[qp].qp->lcl_psn = udcm_random () & 0x00ffffff; - lcl_ep->qps[qp].credit_frag = NULL; - - rc = udcm_rc_qp_to_init (lcl_ep->qps[qp].qp->lcl_qp, m->btl); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - /* If we have already received a request go ahead and move to - RTS. */ - if (udep->recv_req) { - rc = udcm_rc_qp_to_rtr (lcl_ep, qp); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - return rc; - } - - return udcm_rc_qp_to_rts (lcl_ep, qp); - } - - return OPAL_SUCCESS; -} - -/* - * Create the local side of all the qp's. The remote sides will be - * connected later. - * NTH: This code is common to (and repeated by) all non-XRC cpcs. 
- */ -static int udcm_rc_qp_create_all (mca_btl_base_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - int qp, rc, pp_qp_num = 0; - int32_t rd_rsv_total = 0; - - if (udep->udep_created_qps) - return OPAL_SUCCESS; - - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { - if (BTL_OPENIB_QP_TYPE_PP(qp)) { - rd_rsv_total += - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv; - pp_qp_num++; - } - } - - /* if there is no pp QPs we still need reserved WQE for eager rdma flow - * control */ - if (0 == pp_qp_num && true == lcl_ep->use_eager_rdma) { - pp_qp_num = 1; - } - - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { - struct ibv_srq *srq = NULL; - uint32_t max_recv_wr, max_send_wr; - int32_t rd_rsv, rd_num_credits; - - /* QP used for SW flow control need some additional recourses */ - if (qp == mca_btl_openib_component.credits_qp) { - rd_rsv = rd_rsv_total; - rd_num_credits = pp_qp_num; - } else { - rd_rsv = rd_num_credits = 0; - } - - if (BTL_OPENIB_QP_TYPE_PP(qp)) { - max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num + - rd_rsv; - max_send_wr = mca_btl_openib_component.qp_infos[qp].rd_num + - rd_num_credits; - } else { - srq = lcl_ep->endpoint_btl->qps[qp].u.srq_qp.srq; - - max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num - + rd_rsv; - max_send_wr = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max - + rd_num_credits; - } - - /* Go create the actual qp */ - rc = udcm_rc_qp_create_one (m, lcl_ep, qp, srq, max_recv_wr, max_send_wr); - if (OPAL_SUCCESS != rc) { - BTL_VERBOSE(("error creating qp %d for endpoint %p", qp, (void *) lcl_ep)); - return rc; - } - } - - /* All done! */ - udep->udep_created_qps = true; - - return OPAL_SUCCESS; -} - -/* mark: endpoint helper functions */ - -/* JMS: optimization target -- can we send something in private - data to find the proc directly instead of having to search - through *all* procs? 
*/ -static mca_btl_openib_endpoint_t *udcm_find_endpoint (struct mca_btl_openib_module_t *btl, - uint32_t qp_num, uint16_t lid, - udcm_msg_hdr_t *msg_hdr) -{ - mca_btl_base_endpoint_t *endpoint; - struct opal_proc_t *opal_proc; - - opal_proc = opal_proc_for_name (msg_hdr->data.req.rem_name); - if (NULL == opal_proc) { - BTL_ERROR(("could not get proc associated with remote peer")); - return NULL; - } - - endpoint = mca_btl_openib_get_ep (&btl->super, opal_proc); - if (NULL == endpoint) { - BTL_ERROR(("could not find endpoint with port: %d, lid: %d, msg_type: %d", - msg_hdr->data.req.rem_port_num, lid, msg_hdr->type)); - } - - return endpoint; -} - -static int udcm_endpoint_init_data (mca_btl_base_endpoint_t *lcl_ep) -{ - modex_msg_t *remote_msg = UDCM_ENDPOINT_REM_MODEX(lcl_ep); - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - struct ibv_ah_attr ah_attr; - int rc = OPAL_SUCCESS; - - do { - if (udep->udep_initialized) - break; - - /* Cache an address handle for this endpoint */ - memset(&ah_attr, 0, sizeof(ah_attr)); - - ah_attr.dlid = lcl_ep->rem_info.rem_lid; - ah_attr.port_num = remote_msg->mm_port_num; - ah_attr.sl = mca_btl_openib_component.ib_service_level; - ah_attr.src_path_bits = lcl_ep->endpoint_btl->src_path_bits; - if (0 != memcmp (&remote_msg->mm_gid, &m->modex.mm_gid, sizeof (m->modex.mm_gid))) { - ah_attr.is_global = 1; - ah_attr.grh.flow_label = 0; - ah_attr.grh.dgid = remote_msg->mm_gid; - ah_attr.grh.sgid_index = mca_btl_openib_component.gid_index; - /* NTH: probably won't need to go over more than a single router. changeme if this - * assumption is wrong. this value should never be <= 1 as it will not leave the - * the subnet. */ - ah_attr.grh.hop_limit = 2; - /* Seems reasonable to set this to 0 for connection messages. 
*/ - ah_attr.grh.traffic_class = 0; - } - - udep->ah = ibv_create_ah (lcl_ep->endpoint_btl->device->ib_pd, &ah_attr); - if (!udep->ah) { - rc = OPAL_ERROR; - break; - } - } while (0); - - if (OPAL_SUCCESS == rc) { - udep->udep_initialized = true; - } - - return rc; -} - -/* mark: ud send */ - -static inline int udcm_wait_for_send_completion (udcm_module_t *m) -{ - struct ibv_wc wc; - int rc; - - do { - rc = ibv_poll_cq (m->cm_send_cq, 1, &wc); - if (0 > rc) { - BTL_VERBOSE(("send failed")); - return OPAL_ERROR; - } else if (0 == rc) { - continue; - } else if (IBV_WC_SUCCESS != wc.status) { - BTL_ERROR(("send failed with verbs status %d", wc.status)); - return OPAL_ERROR; - } - - break; - } while (1); - - return OPAL_SUCCESS; -} - -static int udcm_post_send (mca_btl_base_endpoint_t *lcl_ep, void *data, - int length, int lkey) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - volatile static int msg_num = 0; - struct ibv_send_wr wr, *bad_wr; - struct ibv_sge sge; - int rc; - - /* NTH: need to lock here or we run into problems (slowness) */ - opal_mutex_lock(&m->cm_send_lock); - - if (0 == lkey) { - /* copy the message into the registered send buffer */ - sge.addr = (uintptr_t) udcm_module_get_send_buffer (m); - sge.length = length; - sge.lkey = m->cm_mr->lkey; - - memcpy ((uintptr_t *)sge.addr, data, length); - } else { - sge.addr = (uintptr_t) data; - sge.length = length; - sge.lkey = lkey; - } - - wr.wr_id = UDCM_WR_SEND_ID | msg_num++; - wr.next = NULL; - wr.sg_list = &sge; - wr.num_sge = 1; - wr.opcode = IBV_WR_SEND; - wr.send_flags = IBV_SEND_SOLICITED | IBV_SEND_SIGNALED; - wr.wr.ud.ah = udep->ah; - - wr.wr.ud.remote_qpn = UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_qp_num; - wr.wr.ud.remote_qkey = 0; - - rc = ibv_post_send (m->listen_qp, &wr, &bad_wr); - if (0 != rc) { - BTL_VERBOSE(("error posting send. 
errno: %d", errno)); - } else { - rc = udcm_wait_for_send_completion (m); - } - - opal_mutex_unlock (&m->cm_send_lock); - - return rc; -} - -/* mark: message allocation */ - -static int udcm_new_message (mca_btl_base_endpoint_t *lcl_ep, - mca_btl_base_endpoint_t *rem_ep, uint8_t type, - size_t length, udcm_message_sent_t **msgp) -{ - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - udcm_message_sent_t *message; - - message = OBJ_NEW(udcm_message_sent_t); - if (NULL == message) { - BTL_ERROR(("malloc failed!")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - message->data = calloc (m->msg_length, 1); - if (NULL == message->data) { - OBJ_RELEASE(message); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - message->length = length; - - message->data->hdr.rem_ep = lcl_ep; - message->data->hdr.lcl_ep = rem_ep; - message->data->hdr.type = type; - message->data->hdr.rem_ctx = (uintptr_t) message; - - message->endpoint = lcl_ep; - - udcm_set_message_timeout (message); - - opal_atomic_wmb (); - - *msgp = message; - - BTL_VERBOSE(("created message %p with type %d", (void *) message, type)); - - return OPAL_SUCCESS; -} - -/* mark: rc message functions */ - -/* - * Allocate a CM request structure and initialize some common fields - * (that are independent of the specific QP, etc.) 
- */ -static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep, - mca_btl_base_endpoint_t *rem_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - udcm_message_sent_t *msg; - int i, rc; - - BTL_VERBOSE(("sending request for endpoint %p", (void *) lcl_ep)); - - udep->sent_req = true; - - if (0 != (rc = udcm_new_message (lcl_ep, rem_ep, UDCM_MESSAGE_CONNECT, - m->msg_length, &msg))) { - return rc; - } - - msg->data->hdr.data.req.rem_ep_index = htonl(lcl_ep->index); - msg->data->hdr.data.req.rem_port_num = m->modex.mm_port_num; - msg->data->hdr.data.req.rem_name = OPAL_PROC_MY_NAME; - - for (i = 0 ; i < mca_btl_openib_component.num_qps ; ++i) { - msg->data->qps[i].psn = htonl(lcl_ep->qps[i].qp->lcl_psn); - msg->data->qps[i].qp_num = htonl(lcl_ep->qps[i].qp->lcl_qp->qp_num); - } - - if (0 != (rc = udcm_post_send (lcl_ep, msg->data, m->msg_length, 0))) { - BTL_VERBOSE(("error posting REQ")); - - udcm_free_message (msg); - - return rc; - } - - return 0; -} - -static int udcm_send_complete (mca_btl_base_endpoint_t *lcl_ep, - mca_btl_base_endpoint_t *rem_ep) -{ - udcm_message_sent_t *msg; - int rc; - - if (0 != (rc = udcm_new_message (lcl_ep, rem_ep, UDCM_MESSAGE_COMPLETE, - sizeof (udcm_msg_hdr_t), &msg))) { - return rc; - } - - rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0); - if (0 != rc) { - BTL_VERBOSE(("error posting complete")); - - udcm_free_message (msg); - - return rc; - } - - return 0; -} - -static int udcm_send_reject (mca_btl_base_endpoint_t *lcl_ep, - mca_btl_base_endpoint_t *rem_ep, - int rej_reason) -{ - udcm_message_sent_t *msg; - int rc; - - if (0 != (rc = udcm_new_message (lcl_ep, rem_ep, UDCM_MESSAGE_REJECT, - sizeof (udcm_msg_hdr_t), &msg))) { - return rc; - } - - msg->data->hdr.data.rej.reason = htonl(rej_reason); - - rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0); - if (0 != rc) { - BTL_VERBOSE(("error posting rejection")); - - 
udcm_free_message (msg); - - return rc; - } - - return 0; -} - -static int udcm_send_ack (mca_btl_base_endpoint_t *lcl_ep, uintptr_t rem_ctx) -{ - udcm_msg_hdr_t hdr; - - BTL_VERBOSE(("sending ack for message %p on ep %p", (void *) rem_ctx, (void *) lcl_ep)); - - hdr.type = UDCM_MESSAGE_ACK; - hdr.rem_ctx = rem_ctx; - - return udcm_post_send (lcl_ep, &hdr, sizeof (hdr), 0); -} - -static int udcm_handle_ack (udcm_module_t *m, const uintptr_t ctx, const uint16_t slid, - const uint32_t rem_qp) -{ - udcm_message_sent_t *msg, *next; - bool found = false; - - opal_mutex_lock (&m->cm_timeout_lock); - - BTL_VERBOSE(("got ack for message %p from slid 0x%04x qp 0x%08x", (void *) ctx, slid, - rem_qp)); - - /* verify that the message is still active */ - OPAL_LIST_FOREACH_SAFE(msg, next, &m->flying_messages, udcm_message_sent_t) { - if ((uintptr_t) msg != ctx) { - continue; - } - - BTL_VERBOSE(("found matching message")); - found = true; - - /* mark that this event is not active anymore */ - msg->event_active = false; - - /* there is a possibility this event is being handled by another thread right now. it - * should be safe to activate the event even in this case. the callback will handle - * releasing the message. this is done to avoid a race between the message handling - * thread and the thread progressing libevent. if the message handler is ever put - * in the event base then it will be safe to just release the message here but that - * is not the case atm. 
*/ - opal_event_active (&msg->event, 0, 0); - - break; - } - - if (!found) { - BTL_VERBOSE(("message %p not found in the list of flying messages", (void *) ctx)); - } - - opal_mutex_unlock (&m->cm_timeout_lock); - - return OPAL_SUCCESS; -} - -/* mark: rc message handling */ - -static int udcm_handle_connect(mca_btl_openib_endpoint_t *lcl_ep, - mca_btl_openib_endpoint_t *rem_ep) -{ - udcm_reject_reason_t rej_reason = UDCM_REJ_REMOTE_ERROR; - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - int rc = OPAL_ERROR; - - if (NULL == udep) { - return OPAL_ERROR; - } - - do { - opal_mutex_lock (&udep->udep_lock); - - if (true == udep->recv_req) { - /* this endpoint is already connected */ - BTL_VERBOSE(("already connected")); - rc = OPAL_SUCCESS; - rej_reason = UDCM_REJ_ALREADY_CONNECTED; - break; - } - - udep->recv_req = true; - - opal_atomic_wmb (); - if (MCA_BTL_IB_CLOSED == lcl_ep->endpoint_state) { - lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTING; - } - - if (OPAL_SUCCESS != (rc = udcm_rc_qp_create_all (lcl_ep))) { - BTL_VERBOSE(("error initializing endpoint qps")); - break; - } - - rc = udcm_rc_qps_to_rts (lcl_ep); - if (OPAL_SUCCESS != rc) { - break; - } - - if (false == udep->sent_req) { - rc = udcm_send_request (lcl_ep, rem_ep); - - if (OPAL_SUCCESS != rc) { - break; - } - } - - rc = udcm_send_complete (lcl_ep, rem_ep); - if (OPAL_SUCCESS != rc) { - break; - } - - if (udep->recv_comp) { - udcm_finish_connection (lcl_ep); - } - - opal_mutex_unlock (&udep->udep_lock); - - return OPAL_SUCCESS; - } while (0); - - opal_mutex_unlock (&udep->udep_lock); - - /* Reject the request */ - BTL_VERBOSE(("rejecting request for reason %d", rej_reason)); - - udcm_send_reject (lcl_ep, rem_ep, rej_reason); - - if (OPAL_SUCCESS != rc) { - /* Communicate to the upper layer that the connection on this - endpoint has failed */ - mca_btl_openib_endpoint_invoke_error (lcl_ep); - } - - return rc; -} - -static int udcm_handle_reject(mca_btl_openib_endpoint_t *lcl_ep, - udcm_msg_hdr_t 
*msg_hdr) -{ - int32_t reason = ntohl(msg_hdr->data.rej.reason); - - BTL_VERBOSE(("reject received: reason %d", reason)); - - if (UDCM_REJ_ALREADY_CONNECTED == reason) { - return OPAL_SUCCESS; - } -#if HAVE_XRC - else if (UDCM_REJ_NOT_READY == reason) { - return udcm_xrc_restart_connect (lcl_ep); - } -#endif - - /* Communicate to the upper layer that the connection on this - endpoint has failed */ - mca_btl_openib_endpoint_invoke_error (lcl_ep); - - return OPAL_ERR_NOT_FOUND; -} - -static int udcm_finish_connection (mca_btl_openib_endpoint_t *lcl_ep) -{ - BTL_VERBOSE(("finishing connection for endpoint %p.", (void *) lcl_ep)); - - /* Ensure that all the writes back to the endpoint and associated - data structures have completed */ - opal_atomic_wmb(); - - mca_btl_openib_endpoint_cpc_complete(lcl_ep); - - return OPAL_SUCCESS; -} - -static int udcm_handle_complete (mca_btl_openib_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - - udep->recv_comp = true; - if (udep->recv_req) { - udcm_finish_connection (lcl_ep); - } else { - OPAL_THREAD_UNLOCK(&lcl_ep->endpoint_lock); - } - - return OPAL_SUCCESS; -} - -/* mark: message processing */ - -static int udcm_process_messages (struct ibv_cq *event_cq, udcm_module_t *m) -{ - mca_btl_openib_endpoint_t *lcl_ep; - int msg_num, i, count; - udcm_msg_t *message = NULL; - udcm_message_recv_t *item; - struct ibv_wc wc[20]; -#if OPAL_ENABLE_DEBUG - struct ibv_grh *grh; -#endif - udcm_endpoint_t *udep; - uint64_t dir; - - memset(wc, 0, sizeof(wc)); - - count = ibv_poll_cq (event_cq, 20, wc); - if (count < 0) - return count; - - for (i = 0 ; i < count ; i++) { - dir = wc[i].wr_id & UDCM_WR_DIR_MASK; - - if (UDCM_WR_RECV_ID != dir) { - opal_output (0, "unknown packet"); - continue; - } - - msg_num = (int)(wc[i].wr_id & (~UDCM_WR_DIR_MASK)); - -#if OPAL_ENABLE_DEBUG - grh = (wc[i].wc_flags & IBV_WC_GRH) ? 
(struct ibv_grh *) udcm_module_get_recv_buffer (m, msg_num, false) : NULL; -#endif - - BTL_VERBOSE(("WC: wr_id: 0x%016" PRIu64 ", status: %d, opcode: 0x%x, byte_len: %x, imm_data: 0x%08x, " - "qp_num: 0x%08x, src_qp: 0x%08x, wc_flags: 0x%x, slid: 0x%04x grh_present: %s", - wc[i].wr_id, wc[i].status, wc[i].opcode, wc[i].byte_len, - wc[i].imm_data, wc[i].qp_num, wc[i].src_qp, wc[i].wc_flags, wc[i].slid, - grh ? "yes" : "no")); - - if (IBV_WC_SUCCESS != wc[i].status) { - BTL_ERROR(("recv work request for buffer %d failed, code = %d", - msg_num, wc[i].status)); - count = -1; - break; - } - - message = (udcm_msg_t *) udcm_module_get_recv_buffer (m, msg_num, true); - - if (UDCM_MESSAGE_ACK == message->hdr.type) { - /* ack! */ - udcm_handle_ack (m, message->hdr.rem_ctx, wc[i].slid, wc[i].src_qp); - udcm_module_post_one_recv (m, msg_num); - - continue; - } - - lcl_ep = message->hdr.lcl_ep; - - if (NULL == lcl_ep) { - lcl_ep = udcm_find_endpoint (m->btl, wc[i].src_qp, wc[i].slid, &message->hdr); - } - - if (NULL == lcl_ep ) { - /* cant find associated endpoint */ - BTL_ERROR(("could not find associated endpoint.")); - udcm_module_post_one_recv (m, msg_num); - - continue; - } - - message->hdr.lcl_ep = lcl_ep; - - BTL_VERBOSE(("received message. 
type: %u, lcl_ep = %p, rem_ep = %p, " - "src qpn = %d, length = %d, local buffer # = %d", - message->hdr.type, (void *) message->hdr.lcl_ep, (void *) message->hdr.rem_ep, - wc[i].src_qp, wc[i].byte_len, msg_num)); - - udep = UDCM_ENDPOINT_DATA(lcl_ep); - - if (NULL == udep) { - /* Endpoint was not initialized or was finalized */ - udcm_module_post_one_recv (m, msg_num); - continue; - } - - opal_mutex_lock (&udep->udep_lock); - - /* Need to ensure endpoint data is initialized before sending the ack */ - if (OPAL_SUCCESS != udcm_endpoint_init_data (lcl_ep)) { - BTL_ERROR(("could not initialize cpc data for endpoint")); - udcm_module_post_one_recv (m, msg_num); - opal_mutex_unlock (&udep->udep_lock); - continue; - } - - /* save message data in the endpoint */ - if (UDCM_MESSAGE_CONNECT == message->hdr.type) { - /* Save remote queue pair information */ - int num_qps = mca_btl_openib_component.num_qps; - - lcl_ep->rem_info.rem_index = ntohl(message->hdr.data.req.rem_ep_index); - - for (int qp_index = 0 ; qp_index < num_qps ; ++qp_index) { - /* Save these numbers on the endpoint for reference. 
*/ - lcl_ep->rem_info.rem_qps[qp_index].rem_psn = - ntohl(message->qps[qp_index].psn); - lcl_ep->rem_info.rem_qps[qp_index].rem_qp_num = - ntohl(message->qps[qp_index].qp_num); - } - } - -#if HAVE_XRC - else if (UDCM_MESSAGE_XRESPONSE == message->hdr.type || - UDCM_MESSAGE_XRESPONSE2 == message->hdr.type) { - /* save remote srq information */ - int num_srqs = mca_btl_openib_component.num_xrc_qps; - - lcl_ep->rem_info.rem_index = ntohl(message->hdr.data.xres.rem_ep_index); - - for (int i = 0 ; i < num_srqs ; ++i) { - lcl_ep->rem_info.rem_srqs[i].rem_srq_num = ntohl(message->qps[i].qp_num); - BTL_VERBOSE(("Received srq[%d] num = %d", i, lcl_ep->rem_info.rem_srqs[i].rem_srq_num)); - } - - if (UDCM_MESSAGE_XRESPONSE == message->hdr.type) { - /* swap response header data */ - message->hdr.data.xres.rem_psn = ntohl(message->hdr.data.xres.rem_psn); - message->hdr.data.xres.rem_qp_num = ntohl(message->hdr.data.xres.rem_qp_num); - - /* save remote qp information not included in the XRESPONSE2 message */ - lcl_ep->rem_info.rem_qps[0].rem_psn = message->hdr.data.xres.rem_psn; - lcl_ep->rem_info.rem_qps[0].rem_qp_num = message->hdr.data.xres.rem_qp_num; - - BTL_VERBOSE(("Received remote qp: %d, psn: %d", lcl_ep->rem_info.rem_qps[0].rem_qp_num, - lcl_ep->rem_info.rem_qps[0].rem_psn)) - - /* update ib_addr with remote qp number */ - lcl_ep->ib_addr->remote_xrc_rcv_qp_num = lcl_ep->rem_info.rem_qps[0].rem_qp_num; - } - } else if (UDCM_MESSAGE_XCONNECT == message->hdr.type || - UDCM_MESSAGE_XCONNECT2 == message->hdr.type) { - lcl_ep->rem_info.rem_index = ntohl(message->hdr.data.xreq.rem_ep_index); - - /* swap request header data */ - message->hdr.data.xreq.rem_qp_num = ntohl(message->hdr.data.xreq.rem_qp_num); - message->hdr.data.xreq.rem_psn = ntohl(message->hdr.data.xreq.rem_psn); - - if (UDCM_MESSAGE_XCONNECT2 == message->hdr.type) { - /* save the qp number for unregister */ -#if ! 
OPAL_HAVE_CONNECTX_XRC_DOMAINS - lcl_ep->xrc_recv_qp_num = message->hdr.data.xreq.rem_qp_num; -#endif - - } - } -#endif - - opal_mutex_unlock (&udep->udep_lock); - - item = OBJ_NEW(udcm_message_recv_t); - - /* Copy just the message header */ - memcpy (&item->msg_hdr, &message->hdr, sizeof (message->hdr)); - - opal_fifo_push_atomic (&m->cm_recv_msg_fifo, &item->super); - - udcm_send_ack (lcl_ep, message->hdr.rem_ctx); - - /* Repost the receive */ - udcm_module_post_one_recv (m, msg_num); - } - - opal_atomic_wmb (); - - if (0 == opal_atomic_swap_32 (&m->cm_message_event_active, 1)) { - opal_event_active (&m->cm_message_event, OPAL_EV_READ, 1); - } - - return count; -} - -static void *udcm_cq_event_dispatch(int fd, int flags, void *context) -{ - udcm_module_t *m = (udcm_module_t *) context; - struct ibv_cq *event_cq = m->cm_recv_cq; - void *event_context; - int rc; - - opal_mutex_lock (&m->cm_lock); - - do { - if (OPAL_UNLIKELY(NULL == m || NULL == m->cm_channel)) { - break; - } - - rc = ibv_get_cq_event (m->cm_channel, &event_cq, &event_context); - - if (0 != rc || NULL == event_cq) { - break; - } - - /* acknowlege the event */ - ibv_ack_cq_events (event_cq, 1); - - if (m->cm_exiting) { - break; - } - - rc = udcm_process_messages (event_cq, m); - if (rc < 0) { - BTL_VERBOSE(("error processing incomming messages")); - break; - } - - if (ibv_req_notify_cq(event_cq, 0)) { - BTL_VERBOSE(("error asking for cq notifications")); - } - } while (0); - - opal_mutex_unlock (&m->cm_lock); - - return NULL; -} - -static void *udcm_message_callback (int fd, int flags, void *context) -{ - udcm_module_t *m = (udcm_module_t *) context; - udcm_message_recv_t *item; - - BTL_VERBOSE(("running message thread")); - - /* Mark that the callback was started */ - opal_atomic_swap_32 (&m->cm_message_event_active, 0); - opal_atomic_wmb (); - - while ((item = (udcm_message_recv_t *) opal_fifo_pop_atomic (&m->cm_recv_msg_fifo))) { - mca_btl_openib_endpoint_t *lcl_ep = item->msg_hdr.lcl_ep; - - 
OPAL_THREAD_LOCK(&lcl_ep->endpoint_lock); - - switch (item->msg_hdr.type) { - case UDCM_MESSAGE_CONNECT: - udcm_handle_connect (lcl_ep, item->msg_hdr.rem_ep); - OPAL_THREAD_UNLOCK(&lcl_ep->endpoint_lock); - break; - case UDCM_MESSAGE_REJECT: - udcm_handle_reject (lcl_ep, &item->msg_hdr); - OPAL_THREAD_UNLOCK(&lcl_ep->endpoint_lock); - break; - case UDCM_MESSAGE_COMPLETE: - udcm_handle_complete (lcl_ep); - break; -#if HAVE_XRC - case UDCM_MESSAGE_XRESPONSE2: - udcm_finish_connection (lcl_ep); - break; - case UDCM_MESSAGE_XRESPONSE: - /* udcm_handle_xresponse will call mca_btl_openib_endpoint_cpc_complete - which will drop the thread lock */ - udcm_xrc_handle_xresponse (lcl_ep, &item->msg_hdr); - break; - case UDCM_MESSAGE_XCONNECT: - case UDCM_MESSAGE_XCONNECT2: - udcm_xrc_handle_xconnect (lcl_ep, &item->msg_hdr); - OPAL_THREAD_UNLOCK(&lcl_ep->endpoint_lock); - break; -#endif - default: - BTL_VERBOSE(("unknown message type")); - } - - OBJ_RELEASE (item); - } - - BTL_VERBOSE(("exiting message thread")); - - return NULL; -} - -/* mark: udcm_message_sent_t class */ - -static void udcm_sent_message_constructor (udcm_message_sent_t *message) -{ - memset ((char *)message + sizeof (message->super), 0, - sizeof (*message) - sizeof (message->super)); - opal_event_evtimer_set(opal_sync_event_base, &message->event, udcm_send_timeout, message); -} - -static void udcm_sent_message_destructor (udcm_message_sent_t *message) -{ - if (message->data) { - free (message->data); - } - - opal_event_evtimer_del (&message->event); - message->event_active = false; -} - -/* mark: message timeout code */ -/* Message timeouts */ -static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg) -{ - udcm_message_sent_t *msg = (udcm_message_sent_t *) arg; - mca_btl_base_endpoint_t *lcl_ep = msg->endpoint; - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - - opal_mutex_lock (&m->cm_timeout_lock); - opal_list_remove_item (&m->flying_messages, &msg->super); - opal_mutex_unlock 
(&m->cm_timeout_lock); - - if (m->cm_exiting || !msg->event_active) { - /* we are exiting or the event is no longer valid */ - OBJ_RELEASE(msg); - return; - } - - msg->event_active = false; - - do { - BTL_VERBOSE(("send for message to 0x%04x:0x%08x timed out", - UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_lid, - UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_qp_num)); - - /* This happens from time to time at the end of a run (probably due to a - lost ack on the completion message). */ - if (NULL == lcl_ep->endpoint_local_cpc_data || - MCA_BTL_IB_CONNECTED == lcl_ep->endpoint_state || - m->cm_exiting) { - OBJ_RELEASE (msg); - break; - } - - if (msg->tries == udcm_max_retry) { - opal_output (0, "too many retries sending message to 0x%04x:0x%08x, giving up", - UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_lid, - UDCM_ENDPOINT_REM_MODEX(lcl_ep)->mm_qp_num); - - /* We are running in the timeout thread. Invoke the error in the - * "main thread" because it may call up into the pml or another - * component that may not have threading support enabled. 
*/ - mca_btl_openib_run_in_main (mca_btl_openib_endpoint_invoke_error, lcl_ep); - break; - } - - msg->tries++; - - udcm_set_message_timeout (msg); - - if (0 != udcm_post_send (lcl_ep, msg->data, msg->length, 0)) { - BTL_VERBOSE(("error reposting message")); - mca_btl_openib_run_in_main (mca_btl_openib_endpoint_invoke_error, lcl_ep); - break; - } - } while (0); -} - -static void udcm_set_message_timeout (udcm_message_sent_t *message) -{ - udcm_module_t *m = UDCM_ENDPOINT_MODULE(message->endpoint); - - BTL_VERBOSE(("activating timeout for message %p", (void *) message)); - - opal_mutex_lock (&m->cm_timeout_lock); - - opal_list_append (&m->flying_messages, &message->super); - - /* start the event */ - opal_event_evtimer_add (&message->event, &udcm_timeout_tv); - message->event_active = true; - - opal_mutex_unlock (&m->cm_timeout_lock); -} - -static void udcm_free_message (udcm_message_sent_t *message) -{ - udcm_module_t *m = UDCM_ENDPOINT_MODULE(message->endpoint); - - BTL_VERBOSE(("releasing message %p", (void *) message)); - - opal_mutex_lock (&m->cm_timeout_lock); - - if (message->event_active) { - opal_list_remove_item (&m->flying_messages, &message->super); - message->event_active = false; - } - - opal_mutex_unlock (&m->cm_timeout_lock); - - OBJ_RELEASE(message); -} - -/* mark: xrc connection support */ - -/* XRC support functions */ -#if HAVE_XRC -static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc, - mca_btl_base_endpoint_t *lcl_ep) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - int rc = OPAL_SUCCESS; - - opal_mutex_lock (&udep->udep_lock); - opal_mutex_lock (&lcl_ep->ib_addr->addr_lock); - - if (OPAL_SUCCESS != (rc = udcm_endpoint_init_data (lcl_ep))) { - BTL_VERBOSE(("error initializing endpoint cpc data")); - opal_mutex_unlock (&udep->udep_lock); - opal_mutex_unlock (&lcl_ep->ib_addr->addr_lock); - return rc; - } - - lcl_ep->endpoint_state = MCA_BTL_IB_CONNECTING; - - BTL_VERBOSE(("The IB addr: sid %" PRIx64 " lid %d 
with status %d, " - "subscribing to this address", lcl_ep->ib_addr->subnet_id, - lcl_ep->ib_addr->status, lcl_ep->ib_addr->lid)); - - switch (lcl_ep->ib_addr->status) { - case MCA_BTL_IB_ADDR_CLOSED: - if (OPAL_SUCCESS != (rc = udcm_xrc_send_qp_create(lcl_ep))) { - break; - } - - /* Send connection info over to remote endpoint */ - lcl_ep->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTING; - if (OPAL_SUCCESS != (rc = udcm_xrc_send_request (lcl_ep, NULL, UDCM_MESSAGE_XCONNECT))) { - BTL_ERROR(("Error sending connect request, error code %d", rc)); - } - break; - case MCA_BTL_IB_ADDR_CONNECTING: - /* somebody already connectng to this machine, lets wait */ - opal_list_append(&lcl_ep->ib_addr->pending_ep, &lcl_ep->super); - break; - case MCA_BTL_IB_ADDR_CONNECTED: - /* so we have the send qp, we just need the receive site. - * Send request for SRQ numbers */ - - if (OPAL_SUCCESS != (rc = udcm_xrc_send_request (lcl_ep, NULL, UDCM_MESSAGE_XCONNECT2))) { - BTL_ERROR(("error sending xrc connect request, error code %d", rc)); - } - break; - default: - BTL_ERROR(("Invalid endpoint status %d", lcl_ep->ib_addr->status)); - } - - opal_mutex_unlock (&lcl_ep->ib_addr->addr_lock); - opal_mutex_unlock (&udep->udep_lock); - - return rc; -} - -/* In case if XRC recv qp was closed and sender still don't know about it - * we need close the qp, reset the ib_adrr status to CLOSED and start everything - * from scratch. - */ -static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep) -{ - opal_mutex_lock (&lcl_ep->ib_addr->addr_lock); - - BTL_VERBOSE(("Restart connection for IB addr: sid %" PRIx64 " lid %d, with status " - "%d, resetting and starting from scratch", lcl_ep->ib_addr->subnet_id, - lcl_ep->ib_addr->lid, lcl_ep->ib_addr->status)); - - if (MCA_BTL_IB_ADDR_CONNECTED == lcl_ep->ib_addr->status) { - /* so we have the send qp, we just need the recive site. 
- * Send request for SRQ numbers */ - /* Switching back to closed and starting from scratch */ - lcl_ep->ib_addr->status = MCA_BTL_IB_ADDR_CLOSED; - /* destroy the qp */ - /* the reciver site was alredy closed so all pending list must be clean ! */ - assert (opal_list_is_empty(&lcl_ep->qps->no_wqe_pending_frags[0])); - assert (opal_list_is_empty(&lcl_ep->qps->no_wqe_pending_frags[1])); - - if (ibv_destroy_qp (lcl_ep->qps[0].qp->lcl_qp)) - BTL_ERROR(("Failed to destroy QP. errno %d", errno)); - } - - opal_mutex_unlock (&lcl_ep->ib_addr->addr_lock); - - /* udcm_xrc_start_connect () should automaticly handle all other cases */ - return udcm_xrc_start_connect (NULL, lcl_ep); -} - -/* mark: xrc send qp */ - -/* Send qp connect */ -static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn) -{ - mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl; - struct ibv_qp_attr attr; - struct ibv_qp *qp; - uint32_t psn; - int ret; - - BTL_VERBOSE(("Connecting send qp: %p, remote qp: %d", (void *)lcl_ep->qps[0].qp->lcl_qp, - rem_qp_num)); - assert(NULL != lcl_ep->qps); - qp = lcl_ep->qps[0].qp->lcl_qp; - psn = lcl_ep->qps[0].qp->lcl_psn; - - - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ? 
- openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu; - attr.dest_qp_num = rem_qp_num; - attr.rq_psn = rem_psn; - attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; - attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; - attr.ah_attr.is_global = 0; - attr.ah_attr.dlid = lcl_ep->rem_info.rem_lid; - attr.ah_attr.src_path_bits = openib_btl->src_path_bits; - attr.ah_attr.port_num = openib_btl->port_num; - attr.ah_attr.static_rate = 0; - attr.ah_attr.sl = mca_btl_openib_component.ib_service_level; - -#if (ENABLE_DYNAMIC_SL) - /* if user enabled dynamic SL, get it from PathRecord */ - if (0 != mca_btl_openib_component.ib_path_record_service_level) { - int rc = btl_openib_connect_get_pathrecord_sl(qp->context, - attr.ah_attr.port_num, - openib_btl->lid, - attr.ah_attr.dlid); - if (OPAL_ERROR == rc) { - return OPAL_ERROR; - } - attr.ah_attr.sl = rc; - } -#endif - - if (mca_btl_openib_component.verbose) { - BTL_VERBOSE(("Set MTU to IBV value %d (%s bytes)", attr.path_mtu, - (attr.path_mtu == IBV_MTU_256) ? "256" : - (attr.path_mtu == IBV_MTU_512) ? "512" : - (attr.path_mtu == IBV_MTU_1024) ? "1024" : - (attr.path_mtu == IBV_MTU_2048) ? "2048" : - (attr.path_mtu == IBV_MTU_4096) ? 
"4096" : - "unknown (!)")); - } - ret = ibv_modify_qp(qp, &attr, - IBV_QP_STATE | - IBV_QP_AV | - IBV_QP_PATH_MTU | - IBV_QP_DEST_QPN | - IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | - IBV_QP_MIN_RNR_TIMER); - if (ret) { - BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]", - qp->qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - attr.qp_state = IBV_QPS_RTS; - attr.timeout = mca_btl_openib_component.ib_timeout; - attr.retry_cnt = mca_btl_openib_component.ib_retry_count; - attr.rnr_retry = mca_btl_openib_component.ib_rnr_retry; - attr.sq_psn = psn; - attr.max_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; - ret = ibv_modify_qp(qp, &attr, - IBV_QP_STATE | - IBV_QP_TIMEOUT | - IBV_QP_RETRY_CNT | - IBV_QP_RNR_RETRY | - IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC); - if (ret) { - BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTS errno says: %s [%d]", - qp->qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -/* Create XRC send qp */ -static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep) -{ - int prio = BTL_OPENIB_LP_CQ; /* all send completions go to low prio CQ */ - uint32_t send_wr; - struct ibv_qp **qp; - uint32_t *psn; -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - struct ibv_qp_init_attr_ex qp_init_attr; -#else - struct ibv_qp_init_attr qp_init_attr; -#endif - struct ibv_qp_attr attr; - int ret; - size_t req_inline; - - mca_btl_openib_module_t *openib_btl = - (mca_btl_openib_module_t*)lcl_ep->endpoint_btl; - - /* Prepare QP structs */ - BTL_VERBOSE(("creating xrc send qp")); - qp = &lcl_ep->qps[0].qp->lcl_qp; - psn = &lcl_ep->qps[0].qp->lcl_psn; - - /* reserve additional wr for eager rdma credit management */ - send_wr = lcl_ep->ib_addr->max_wqe + - (mca_btl_openib_component.use_eager_rdma ? 
- mca_btl_openib_component.max_eager_rdma : 0); -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr_ex)); -#else - memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); -#endif - memset(&attr, 0, sizeof(struct ibv_qp_attr)); - - qp_init_attr.send_cq = qp_init_attr.recv_cq = openib_btl->device->ib_cq[prio]; - - /* if this code is update the code in endpoint_init_qp_xrc may need to - * be updated as well */ - /* no need recv queue; receives are posted to srq */ - qp_init_attr.cap.max_recv_wr = 0; - qp_init_attr.cap.max_send_wr = send_wr; - qp_init_attr.cap.max_inline_data = req_inline = - openib_btl->device->max_inline_data; - qp_init_attr.cap.max_send_sge = 1; - /* this one is ignored by driver */ - qp_init_attr.cap.max_recv_sge = 1; /* we do not use SG list */ -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - qp_init_attr.qp_type = IBV_QPT_XRC_SEND; - qp_init_attr.comp_mask = IBV_QP_INIT_ATTR_PD; - qp_init_attr.pd = openib_btl->device->ib_pd; - *qp = ibv_create_qp_ex(openib_btl->device->ib_dev_context, &qp_init_attr); -#else - qp_init_attr.qp_type = IBV_QPT_XRC; - qp_init_attr.xrc_domain = openib_btl->device->xrc_domain; - *qp = ibv_create_qp(openib_btl->device->ib_pd, &qp_init_attr); -#endif - if (NULL == *qp) { - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "ibv_create_qp failed", true, - opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev), - "Reliable connected (XRC)"); - return OPAL_ERROR; - } - - if (qp_init_attr.cap.max_inline_data < req_inline) { - lcl_ep->qps[0].ib_inline_max = qp_init_attr.cap.max_inline_data; - opal_show_help("help-mpi-btl-openib-cpc-base.txt", - "inline truncated", opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev), - openib_btl->port_num, - req_inline, qp_init_attr.cap.max_inline_data); - } else { - lcl_ep->qps[0].ib_inline_max = req_inline; - } - - attr.qp_state = IBV_QPS_INIT; - attr.pkey_index = openib_btl->pkey_index; - attr.port_num = 
openib_btl->port_num; - attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; -#if HAVE_DECL_IBV_ATOMIC_HCA - attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; -#endif - ret = ibv_modify_qp(*qp, &attr, - IBV_QP_STATE | - IBV_QP_PKEY_INDEX | - IBV_QP_PORT | - IBV_QP_ACCESS_FLAGS ); - if (ret) { - BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]", - (*qp)->qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - /* Setup meta data on the endpoint */ - *psn = udcm_random () & 0x00ffffff; - - /* Now that all the qp's are created locally, post some receive - buffers, setup credits, etc. */ - return mca_btl_openib_endpoint_post_recvs(lcl_ep); -} - -/* mark: xrc receive qp */ - -/* Recv qp connect */ -static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num) -{ - mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl; - -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - struct ibv_qp_open_attr attr; - memset(&attr, 0, sizeof(struct ibv_qp_open_attr)); - attr.comp_mask = IBV_QP_OPEN_ATTR_NUM | IBV_QP_OPEN_ATTR_XRCD | IBV_QP_OPEN_ATTR_TYPE; - attr.qp_num = qp_num; - attr.qp_type = IBV_QPT_XRC_RECV; - attr.xrcd = openib_btl->device->xrcd; - BTL_VERBOSE(("Connecting Recv QP\n")); - lcl_ep->xrc_recv_qp = ibv_open_qp(openib_btl->device->ib_dev_context, &attr); - if (NULL == lcl_ep->xrc_recv_qp) { /* failed to regester the qp, so it is already die and we should create new one */ - /* Return NOT READY !!!*/ - BTL_VERBOSE(("Failed to register qp_num: %d, get error: %s (%d)\n. 
Replying with RNR", - qp_num, strerror(errno), errno)); - return OPAL_ERROR; - } else { - BTL_VERBOSE(("Connected to XRC Recv qp [%d]", lcl_ep->xrc_recv_qp->qp_num)); - return OPAL_SUCCESS; - } -#else - int ret; - /* silence unused variable warning */ - (void) qp_num; - - BTL_VERBOSE(("Connecting receive qp: %d", lcl_ep->xrc_recv_qp_num)); - ret = ibv_reg_xrc_rcv_qp(openib_btl->device->xrc_domain, lcl_ep->xrc_recv_qp_num); - if (ret) { /* failed to regester the qp, so it is already die and we should create new one */ - /* Return NOT READY !!!*/ - lcl_ep->xrc_recv_qp_num = 0; - BTL_VERBOSE(("Failed to register qp_num: %d , get error: %s (%d). Replying with RNR", - lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } -#endif - - return OPAL_SUCCESS; -} - -/* Recv qp create */ -static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, uint32_t rem_qp_num, uint32_t rem_psn) -{ - mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl; -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - struct ibv_qp_init_attr_ex qp_init_attr; -#else - struct ibv_qp_init_attr qp_init_attr; -#endif - struct ibv_qp_attr attr; - int ret; - - BTL_VERBOSE(("creating xrc receive qp")); - -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr_ex)); - qp_init_attr.qp_type = IBV_QPT_XRC_RECV; - qp_init_attr.comp_mask = IBV_QP_INIT_ATTR_XRCD; - qp_init_attr.xrcd = openib_btl->device->xrcd; - lcl_ep->xrc_recv_qp = ibv_create_qp_ex(openib_btl->device->ib_dev_context, - &qp_init_attr); - if (NULL == lcl_ep->xrc_recv_qp) { - BTL_ERROR(("Error creating XRC recv QP, errno says: %s [%d]", - strerror(errno), errno)); - return OPAL_ERROR; - } -#else - memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); - /* Only xrc_domain is required, all other are ignored */ - qp_init_attr.xrc_domain = openib_btl->device->xrc_domain; - ret = ibv_create_xrc_rcv_qp(&qp_init_attr, &lcl_ep->xrc_recv_qp_num); - if (ret) { - BTL_ERROR(("Error creating XRC recv 
QP[%x], errno says: %s [%d]", - lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } -#endif - - memset(&attr, 0, sizeof(struct ibv_qp_attr)); - attr.qp_state = IBV_QPS_INIT; - attr.pkey_index = openib_btl->pkey_index; - attr.port_num = openib_btl->port_num; - attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; - -#if HAVE_DECL_IBV_ATOMIC_HCA - attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; -#endif - -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - ret = ibv_modify_qp(lcl_ep->xrc_recv_qp, - &attr, - IBV_QP_STATE| - IBV_QP_PKEY_INDEX| - IBV_QP_PORT| - IBV_QP_ACCESS_FLAGS); - if (ret) { - BTL_ERROR(("Error modifying XRC recv QP to IBV_QPS_INIT, errno says: %s [%d]", - strerror(ret), ret)); - return OPAL_ERROR; - } -#else - ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain, - lcl_ep->xrc_recv_qp_num, &attr, - IBV_QP_STATE | IBV_QP_PKEY_INDEX | - IBV_QP_PORT | IBV_QP_ACCESS_FLAGS); - if (ret) { - BTL_ERROR(("Error modifying XRC recv QP[%x] to IBV_QPS_INIT, errno says: %s [%d]", - lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); - while(1); - return OPAL_ERROR; - } -#endif - - memset(&attr, 0, sizeof(struct ibv_qp_attr)); - attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = (openib_btl->device->mtu < lcl_ep->rem_info.rem_mtu) ? 
- openib_btl->device->mtu : lcl_ep->rem_info.rem_mtu; - attr.dest_qp_num = rem_qp_num; - attr.rq_psn = rem_psn; - attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops; - attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer; - attr.ah_attr.is_global = 0; - attr.ah_attr.dlid = lcl_ep->rem_info.rem_lid; - attr.ah_attr.src_path_bits = openib_btl->src_path_bits; - attr.ah_attr.port_num = openib_btl->port_num; - attr.ah_attr.static_rate = 0; - attr.ah_attr.sl = mca_btl_openib_component.ib_service_level; - -#if (ENABLE_DYNAMIC_SL) - /* if user enabled dynamic SL, get it from PathRecord */ - if (0 != mca_btl_openib_component.ib_path_record_service_level) { - int rc = btl_openib_connect_get_pathrecord_sl( -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - openib_btl->device->xrcd->context, -#else - openib_btl->device->xrc_domain->context, -#endif - attr.ah_attr.port_num, - openib_btl->lid, - attr.ah_attr.dlid); - if (OPAL_ERROR == rc) { - return OPAL_ERROR; - } - attr.ah_attr.sl = rc; - } -#endif - -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - ret = ibv_modify_qp(lcl_ep->xrc_recv_qp, - &attr, - IBV_QP_STATE| - IBV_QP_AV| - IBV_QP_PATH_MTU| - IBV_QP_DEST_QPN| - IBV_QP_RQ_PSN| - IBV_QP_MAX_DEST_RD_ATOMIC| - IBV_QP_MIN_RNR_TIMER); - if (ret) { - BTL_ERROR(("Error modifying XRC recv QP to IBV_QPS_RTR, errno says: %s [%d]", - strerror(ret), ret)); - return OPAL_ERROR; - } -#else - ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain, - lcl_ep->xrc_recv_qp_num, - &attr, - IBV_QP_STATE| - IBV_QP_AV| - IBV_QP_PATH_MTU| - IBV_QP_DEST_QPN| - IBV_QP_RQ_PSN| - IBV_QP_MAX_DEST_RD_ATOMIC| - IBV_QP_MIN_RNR_TIMER); - if (ret) { - BTL_ERROR(("Error modifying XRC recv QP[%x] to IBV_QPS_RTR, errno says: %s [%d]", - lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } -#endif - if (APM_ENABLED) { -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - mca_btl_openib_load_apm(lcl_ep->xrc_recv_qp, lcl_ep); -#else - mca_btl_openib_load_apm_xrc_rcv(lcl_ep->xrc_recv_qp_num, lcl_ep); 
-#endif - } - - return OPAL_SUCCESS; -} - -/* mark: xrc message functions */ - -static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, - uint8_t msg_type) -{ - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - udcm_message_sent_t *msg; - int rc; - - assert (UDCM_MESSAGE_XCONNECT == msg_type || UDCM_MESSAGE_XCONNECT2 == msg_type); - - BTL_VERBOSE(("sending xrc request for endpoint %p", (void *) lcl_ep)); - - if (0 != (rc = udcm_new_message (lcl_ep, rem_ep, msg_type, - sizeof (udcm_msg_hdr_t), &msg))) { - return rc; - } - - msg->data->hdr.data.xreq.rem_ep_index = htonl(lcl_ep->index); - msg->data->hdr.data.xreq.rem_port_num = m->modex.mm_port_num; - msg->data->hdr.data.xreq.rem_name = OPAL_PROC_MY_NAME; - - if (UDCM_MESSAGE_XCONNECT == msg_type) { - BTL_VERBOSE(("Sending XConnect with qp: %d, psn: %d", lcl_ep->qps[0].qp->lcl_qp->qp_num, - lcl_ep->qps[0].qp->lcl_psn)); - msg->data->hdr.data.xreq.rem_qp_num = htonl(lcl_ep->qps[0].qp->lcl_qp->qp_num); - msg->data->hdr.data.xreq.rem_psn = htonl(lcl_ep->qps[0].qp->lcl_psn); - } else { - BTL_VERBOSE(("Sending XConnect2 with qp: %d", lcl_ep->ib_addr->remote_xrc_rcv_qp_num)); - msg->data->hdr.data.xreq.rem_qp_num = htonl(lcl_ep->ib_addr->remote_xrc_rcv_qp_num); - } - - if (0 != (rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0))) { - BTL_VERBOSE(("error posting XREQ")); - - udcm_free_message (msg); - - return rc; - } - - return 0; -} - -static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, - uint8_t msg_type) -{ - udcm_module_t *m = UDCM_ENDPOINT_MODULE(lcl_ep); - udcm_message_sent_t *msg; - int rc; - - assert (UDCM_MESSAGE_XRESPONSE == msg_type || UDCM_MESSAGE_XRESPONSE2 == msg_type); - - if (0 != (rc = udcm_new_message (lcl_ep, rem_ep, msg_type, m->msg_length, &msg))) { - return rc; - } - - msg->data->hdr.data.xres.rem_ep_index = htonl(lcl_ep->index); - - if (UDCM_MESSAGE_XRESPONSE == msg_type) { -#if 
OPAL_HAVE_CONNECTX_XRC_DOMAINS - BTL_VERBOSE(("Sending qp: %d, psn: %d", lcl_ep->xrc_recv_qp->qp_num, lcl_ep->xrc_recv_psn)); - msg->data->hdr.data.xres.rem_qp_num = htonl(lcl_ep->xrc_recv_qp->qp_num); - msg->data->hdr.data.xres.rem_psn = htonl(lcl_ep->xrc_recv_psn); -#else - BTL_VERBOSE(("Sending qp: %d, psn: %d", lcl_ep->xrc_recv_qp_num, lcl_ep->xrc_recv_psn)); - msg->data->hdr.data.xres.rem_qp_num = htonl(lcl_ep->xrc_recv_qp_num); - msg->data->hdr.data.xres.rem_psn = htonl(lcl_ep->xrc_recv_psn); -#endif - } - - for (int i = 0; i < mca_btl_openib_component.num_xrc_qps; ++i) { -#if OPAL_HAVE_CONNECTX_XRC_DOMAINS - uint32_t srq_num; - if (ibv_get_srq_num(lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq, &srq_num)) { - BTL_ERROR(("BTL openib XOOB internal error: can't get srq num")); - } - BTL_VERBOSE(("Sending srq[%d] num = %d", i, srq_num)); - msg->data->qps[i].qp_num = htonl(srq_num); -#else - BTL_VERBOSE(("Sending srq[%d] num = %d", i, lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq->xrc_srq_num)); - msg->data->qps[i].qp_num = htonl(lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq->xrc_srq_num); -#endif - } - - rc = udcm_post_send (lcl_ep, msg->data, m->msg_length, 0); - if (0 != rc) { - BTL_VERBOSE(("error posting complete")); - - udcm_free_message (msg); - - return rc; - } - - return 0; -} - -/* mark: xrc message handling */ - -static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr) -{ - udcm_reject_reason_t rej_reason = UDCM_REJ_REMOTE_ERROR; - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - int response_type; - int rc = OPAL_ERROR; - - /* sanity check on message type */ - assert (UDCM_MESSAGE_XCONNECT == msg_hdr->type || UDCM_MESSAGE_XCONNECT2 == msg_hdr->type); - - do { - if (NULL == udep) { - break; - } - - if (udep->recv_req) { - /* duplicate request */ - return OPAL_SUCCESS; - } - - udep->recv_req = true; - - opal_mutex_lock (&udep->udep_lock); - - if (UDCM_MESSAGE_XCONNECT2 == msg_hdr->type) { - response_type = 
UDCM_MESSAGE_XRESPONSE2; - rc = udcm_xrc_recv_qp_connect (lcl_ep, msg_hdr->data.xreq.rem_qp_num); - if (OPAL_SUCCESS != rc) { - /* return not ready. remote side will retry */ - rej_reason = UDCM_REJ_NOT_READY; - break; - } - } - - /* prepost receives */ - rc = mca_btl_openib_endpoint_post_recvs (lcl_ep); - if (OPAL_SUCCESS != rc) { - break; - } - - /* Create local QP's and post receive resources */ - if (UDCM_MESSAGE_XCONNECT == msg_hdr->type) { - BTL_VERBOSE(("Initialized QPs, LID = %d", ((mca_btl_openib_module_t *) lcl_ep->endpoint_btl)->lid)); - - response_type = UDCM_MESSAGE_XRESPONSE; - - rc = udcm_xrc_recv_qp_create (lcl_ep, msg_hdr->data.xreq.rem_qp_num, msg_hdr->data.xreq.rem_psn); - if (OPAL_SUCCESS != rc) { - break; - } - } - - rc = udcm_xrc_send_xresponse (lcl_ep, msg_hdr->rem_ep, response_type); - if (OPAL_SUCCESS != rc) { - break; - } - - opal_mutex_unlock (&udep->udep_lock); - - return OPAL_SUCCESS; - } while (0); - - if (udep) { - opal_mutex_unlock (&udep->udep_lock); - } - - /* Reject the request */ - BTL_VERBOSE(("rejecting request for reason %d", rej_reason)); - - udcm_send_reject (lcl_ep, msg_hdr->rem_ep, rej_reason); - - if (OPAL_SUCCESS != rc) { - /* Communicate to the upper layer that the connection on this - endpoint has failed */ - mca_btl_openib_endpoint_invoke_error (lcl_ep); - } - - return rc; -} - -static int udcm_xrc_handle_xresponse (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr) -{ - udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); - int rc; - - BTL_VERBOSE(("finishing xrc connection for endpoint %p.", (void *) lcl_ep)); - - /* duplicate message */ - if (udep->recv_resp) { - return OPAL_SUCCESS; - } - - udep->recv_resp = true; - - rc = udcm_xrc_send_qp_connect (lcl_ep, msg_hdr->data.xres.rem_qp_num, msg_hdr->data.xres.rem_psn); - if (OPAL_SUCCESS != rc) { - mca_btl_openib_endpoint_invoke_error (lcl_ep); - } - - return udcm_finish_connection (lcl_ep); -} -#endif diff --git 
a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.h b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.h deleted file mode 100644 index a0fc2b062b..0000000000 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.h +++ /dev/null @@ -1,22 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2011 Los Alamos National Security, LLC. All - * right reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_OPENIB_CONNECT_UD_H -#define BTL_OPENIB_CONNECT_UD_H - -#include "opal_config.h" - -#include "connect/connect.h" - -extern opal_btl_openib_connect_base_component_t opal_btl_openib_connect_udcm; - -#endif diff --git a/opal/mca/btl/openib/connect/connect.h b/opal/mca/btl/openib/connect/connect.h deleted file mode 100644 index 134cac87a4..0000000000 --- a/opal/mca/btl/openib/connect/connect.h +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - * This interface is designed to hide the back-end details of how IB - * RC connections are made from the rest of the openib BTL. There are - * module-like instances of the implemented functionality (dlopen and - * friends are not used, but all the functionality is accessed through - * struct's of function pointers, so you can swap between multiple - * different implementations at run time, just like real components). - * Hence, these entities are referred to as "Connect - * Pseudo-Components" (CPCs). - * - * The CPCs are referenced by their names (e.g., "oob", "rdma_cm"). - * - * CPCs are split into components and modules, similar to all other - * MCA frameworks in this code base. 
- * - * Before diving into the CPC interface, let's discuss some - * terminology and mappings of data structures: - * - * - a BTL module represents a network port (in the case of the openib - * BTL, a LID) - * - a CPC module represents one way to make connections to a BTL module - * - hence, a BTL module has potentially multiple CPC modules - * associated with it - * - an endpoint represnts a connection between a local BTL module and - * a remote BTL module (in the openib BTL, because of BSRQ, an - * endpoint can contain multiple QPs) - * - when an endpoint is created, one of the CPC modules associated - * with the local BTL is selected and associated with the endpoint - * (obviously, it is a CPC module that is common between the local - * and remote BTL modules) - * - endpoints may be created and destroyed during the MPI job - * - endpoints are created lazily, during the first communication - * between two peers - * - endpoints are destroyed when two MPI processes become - * disconnected (e.g., MPI-2 dynamics or MPI_FINALIZE) - * - hence, BTL modules and CPC modules outlive endpoints. - * Specifically, BTL modules and CPC modules live from MPI_INIT to - * MPI_FINALIZE. endpoints come and go as MPI semantics demand it. - * - therefore, CPC modules need to cache information on endpoints that - * are specific to that connection. - * - * Component interface: - * - * - component_register(): The openib BTL's component_open() function - * calls the connect_base_register() function, which scans all - * compiled-in CPC's. If they have component_register() functions, - * they are called (component_register() functions are only allowed to - * register MCA parameters). - * - * NOTE: The connect_base_register() function will process the - * btl_openib_cpc_include and btl_openib_cpc_exclude MCA parameters - * and automatically include/exclude CPCs as relevant. If a CPC is - * excluded, none of its other interface functions will be invoked for - * the duration of the process. 
- * - * - component_init(): The openib BTL's component_init() function - * calls connect_base_init(), which will invoke this query function on - * each CPC to see if it wants to run at all. CPCs can gracefully - * remove themselves from consideration in this process by returning - * OPAL_ERR_NOT_SUPPORTED. - * - * - component_query(): The openib BTL's init_one_port() calls the - * connect_base_select_for_local_port() function, which, for each LID - * on that port, calls the component_query() function on every - * available CPC on that LID. This function is intended to see if a - * CPC can run on a sepcific openib BTL module (i.e., LID). If it - * can, the CPC is supposed to create a CPC module that is specific to - * that BTL/LID and return it. If it cannot, it should return - * OPAL_ERR_NOT_SUPPORTED and be gracefully skipped for this - * OpenFabrics port. - * - * component_finalize(): The openib BTL's component_close() function - * calls connect_base_finalize(), which, in turn, calls the - * component_finalize() function on all available CPCs. Note that all - * CPC modules will have been finalized by this point; the CPC - * component_finalize() function is a chance for the CPC to clean up - * any component-specific resources. - * - * Module interface: - * - * cbm_component member: A pointer pointing to the single, global - * instance of the CPC component. This member is used for creating a - * unique index representing the modules' component so that it can be - * shared with remote peer processes. - * - * cbm_priority member: An integer between 0 and 100, inclusive, - * representing the priority of this CPC. - * - * cbm_modex_message member: A pointer to a blob buffer that will be - * included in the modex message for this port for this CPC (it is - * assumed that this blob is a) only understandable by the - * corresponding CPC in the peer process, and b) contains specific - * addressing/contact information for *this* port's CPC module). 
- * - * cbm_modex_message_len member: The length of the cbm_modex_message - * blob, in bytes. - * - * cbm_endpoint_init(): Called during endpoint creation, allowing a - * CPC module to cache information on the endpoint. A pointer to the - * endpoint's CPC module is already cached on the endpoint. - * - * cbm_start_connect(): initiate a connection to a remote peer. The - * CPC is responsible for setting itself up for asyncronous operation - * for progressing the outgoing connection request. - * - * cbm_endpoint_finalize(): Called during the endpoint destrouction, - * allowing the CPC module to destroy anything that it cached on the - * endpoint. - * - * cbm_finalize(): shut down all asynchronous handling and clean up - * any state that was setup for this CPC module/BTL. Some CPCs setup - * asynchronous support on a per-HCA/NIC basis (vs. per-port/LID). It - * is the reponsibility of the CPC to figure out such issues (e.g., - * via reference counting) -- there is no notification from the - * upper-level BTL about when an entire HCA/NIC is no longer being - * used. There is only this function, which tells when a specific - * CPC/BTL module is no longer being used. - * - * cbm_uses_cts: a bool that indicates whether the CPC will use the - * CTS protocol or not. - * - if true: the CPC will post the fragment on - * endpoint->endpoint_cts_frag as a receive buffer and will *not* - * call opal_btl_openib_post_recvs(). - * - if false: the CPC will call opal_btl_openib_post_recvs() before - * calling opal_btl_openib_cpc_complete(). - * - * There are two functions in the main openib BTL that the CPC may - * call: - * - * - opal_btl_openib_post_recvs(endpoint): once a QP is locally - * connected to the remote side (but we don't know if the remote side - * is connected to us yet), this function is invoked to post buffers - * on the QP, setup credits for the endpoint, etc. This function is - * *only* invoked if the CPC's cbm_uses_cts is false. 
- * - * - opal_btl_openib_cpc_complete(endpoint): once that a CPC knows - * that a QP is connected on *both* sides, this function is invoked to - * tell the main openib BTL "ok, you can use this connection now." - * (e.g., the main openib BTL will either invoke the CTS protocol or - * start sending out fragments that were queued while the connection - * was establishing, etc.). - */ -#ifndef BTL_OPENIB_CONNECT_H -#define BTL_OPENIB_CONNECT_H - -BEGIN_C_DECLS - -#define BCF_MAX_NAME 64 - -/** - * Must forward declare these structs to avoid include file loops. - */ -struct mca_btl_openib_hca_t; -struct mca_btl_openib_module_t; -struct mca_btl_base_endpoint_t; - -/** - * This is struct is defined below - */ -struct opal_btl_openib_connect_base_module_t; - -/************************************************************************/ - -/** - * Function to register MCA params in the connect functions. It - * returns no value, so it cannot fail. - */ -typedef void (*opal_btl_openib_connect_base_component_register_fn_t)(void); - -/** - * This function is invoked once by the openib BTL component during - * startup. It is intended to have CPC component-wide startup. - * - * Return value: - * - * - OPAL_SUCCESS: this CPC component will be used in selection during - * this process. - * - * - OPAL_ERR_NOT_SUPPORTED: this CPC component will be silently - * ignored in this process. - * - * - Other OPAL_ERR_* values: the error will be propagated upwards, - * likely causing a fatal error (and/or the openib BTL component - * being ignored). - */ -typedef int (*opal_btl_openib_connect_base_component_init_fn_t)(void); - -/** - * Query the CPC to see if it wants to run on a specific port (i.e., a - * specific BTL module). If the component init function previously - * returned OPAL_SUCCESS, this function is invoked once per BTL module - * creation (i.e., for each port found by an MPI process). 
If this - * CPC wants to be used on this BTL module, it returns a CPC module - * that is specific to this BTL module. - * - * The BTL module in question is passed to the function; all of its - * attributes can be used to query to see if it's eligible for this - * CPC. - * - * If it is eligible, the CPC is responsible for creating a - * corresponding CPC module, filling in all the relevant fields on the - * modules, and for setting itself up to run (per above) and returning - * a CPC module (this is effectively the "module_init" function). - * Note that the module priority must be between 0 and 100 - * (inclusive). When multiple CPCs are eligible for a single module, - * the CPC with the highest priority will be used. - * - * Return value: - * - * - OPAL_SUCCESS if this CPC is eligible for and was able to be setup - * for this BTL module. It is assumed that the CPC is now completely - * setup to run on this openib module (per description above). - * - * - OPAL_ERR_NOT_SUPPORTED if this CPC cannot support this BTL - * module. This is not an error; it's just the CPC saying "sorry, I - * cannot support this BTL module." - * - * - Other OPAL_ERR_* code: an error occurred. - */ -typedef int (*opal_btl_openib_connect_base_func_component_query_t) - (struct mca_btl_openib_module_t *btl, - struct opal_btl_openib_connect_base_module_t **cpc); - -/** - * This function is invoked once by the openib BTL component during - * shutdown. It is intended to have CPC component-wide shutdown. - */ -typedef int (*opal_btl_openib_connect_base_component_finalize_fn_t)(void); - -/** - * CPC component struct - */ -struct opal_btl_openib_connect_base_component_t { - /** Name of this set of connection functions */ - char cbc_name[BCF_MAX_NAME]; - - /** Register function. Can be NULL. */ - opal_btl_openib_connect_base_component_register_fn_t cbc_register; - - /** CPC component init function. Can be NULL. 
*/ - opal_btl_openib_connect_base_component_init_fn_t cbc_init; - - /** Query the CPC component to get a CPC module corresponding to - an openib BTL module. Cannot be NULL. */ - opal_btl_openib_connect_base_func_component_query_t cbc_query; - - /** CPC component finalize function. Can be NULL. */ - opal_btl_openib_connect_base_component_finalize_fn_t cbc_finalize; -}; -/** - * Convenience typedef - */ -typedef struct opal_btl_openib_connect_base_component_t opal_btl_openib_connect_base_component_t; - -/************************************************************************/ - -/** - * Function called when an endpoint has been created and has been - * associated with a CPC. - */ -typedef int (*opal_btl_openib_connect_base_module_endpoint_init_fn_t) - (struct mca_btl_base_endpoint_t *endpoint); - -/** - * Function to initiate a connection to a remote process. - */ -typedef int (*opal_btl_openib_connect_base_module_start_connect_fn_t) - (struct opal_btl_openib_connect_base_module_t *cpc, - struct mca_btl_base_endpoint_t *endpoint); - -/** - * Function called when an endpoint is being destroyed. - */ -typedef int (*opal_btl_openib_connect_base_module_endpoint_finalize_fn_t) - (struct mca_btl_base_endpoint_t *endpoint); - -/** - * Function to finalize the CPC module. It is called once when the - * CPC module's corresponding openib BTL module is being finalized. - */ -typedef int (*opal_btl_openib_connect_base_module_finalize_fn_t) - (struct mca_btl_openib_module_t *btl, - struct opal_btl_openib_connect_base_module_t *cpc); - -/** - * Meta data about a CPC module. This is in a standalone struct - * because it is used in both the CPC module struct and the - * openib_btl_proc_t struct to hold information received from the - * modex. - */ -typedef struct opal_btl_openib_connect_base_module_data_t { - /** Pointer back to the component. Used by the base and openib - btl to calculate this module's index for the modex. 
*/ - opal_btl_openib_connect_base_component_t *cbm_component; - - /** Priority of the CPC module (must be >=0 and <=100) */ - uint8_t cbm_priority; - - /** Blob that the CPC wants to include in the openib modex message - for a specific port, or NULL if the CPC does not want to - include a message in the modex. */ - void *cbm_modex_message; - - /** Length of the cbm_modex_message blob (0 if - cbm_modex_message==NULL). The message is intended to be short - (because the size of the modex broadcast is a function of - sum(cbm_modex_message_len[i]) for - i=(0...total_num_ports_in_MPI_job) -- e.g., IBCM imposes its - own [very short] limits (per IBTA volume 1, chapter 12). */ - uint8_t cbm_modex_message_len; -} opal_btl_openib_connect_base_module_data_t; - -/** - * Struct for holding CPC module and associated meta data - */ -typedef struct opal_btl_openib_connect_base_module_t { - /** Meta data about the module */ - opal_btl_openib_connect_base_module_data_t data; - - /** Endpoint initialization function */ - opal_btl_openib_connect_base_module_endpoint_init_fn_t cbm_endpoint_init; - - /** Connect function */ - opal_btl_openib_connect_base_module_start_connect_fn_t cbm_start_connect; - - /** Endpoint finalization function */ - opal_btl_openib_connect_base_module_endpoint_finalize_fn_t cbm_endpoint_finalize; - - /** Finalize the cpc module */ - opal_btl_openib_connect_base_module_finalize_fn_t cbm_finalize; - - /** Whether this module will use the CTS protocol or not. This - directly states whether this module will call - mca_btl_openib_endpoint_post_recvs() or not: true = this - module will *not* call _post_recvs() and instead will post the - receive buffer provided at endpoint->endpoint_cts_frag on qp - 0. 
*/ - bool cbm_uses_cts; -} opal_btl_openib_connect_base_module_t; - -END_C_DECLS - -#endif diff --git a/opal/mca/btl/openib/connect/help-mpi-btl-openib-cpc-base.txt b/opal/mca/btl/openib/connect/help-mpi-btl-openib-cpc-base.txt deleted file mode 100644 index 18dc23cb6e..0000000000 --- a/opal/mca/btl/openib/connect/help-mpi-btl-openib-cpc-base.txt +++ /dev/null @@ -1,57 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI's OpenFabrics IB CPC -# support. -# -[no cpcs for port] -No OpenFabrics connection schemes reported that they were able to be -used on a specific port. As such, the openib BTL (OpenFabrics -support) will be disabled for this port. - - Local host: %s - Local device: %s - Local port: %d - CPCs attempted: %s -# -[cpc name not found] -An invalid CPC name was specified via the btl_openib_cpc_%s MCA -parameter. - - Local host: %s - btl_openib_cpc_%s value: %s - Invalid name: %s - All possible valid names: %s -# -[inline truncated] -WARNING: The btl_openib_max_inline_data MCA parameter was used to -specify how much inline data should be used, but a device reduced this -value. This is not an error; it simply means that your run will use -a smaller inline data value than was requested. - - Local host: %s - Local device: %s - Local port: %d - Requested value: %d - Value used by device: %d -# -[ibv_create_qp failed] -A process failed to create a queue pair. This usually means either -the device has run out of queue pairs (too many connections) or -there are insufficient resources available to allocate a queue pair -(out of memory). The latter can happen if either 1) insufficient -memory is available, or 2) no more physical memory can be registered -with the device. 
- -For more information on memory registration see the Open MPI FAQs at: -http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages - -Local host: %s -Local device: %s -Queue pair type: %s diff --git a/opal/mca/btl/openib/connect/help-mpi-btl-openib-cpc-rdmacm.txt b/opal/mca/btl/openib/connect/help-mpi-btl-openib-cpc-rdmacm.txt deleted file mode 100644 index 3c29264362..0000000000 --- a/opal/mca/btl/openib/connect/help-mpi-btl-openib-cpc-rdmacm.txt +++ /dev/null @@ -1,67 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI's OpenFabrics RDMA CM -# support (the openib BTL). -# -[could not find matching endpoint] -The OpenFabrics device in an MPI process received an RDMA CM connect -request for a peer that it could not identify as part of this MPI job. -This should not happen. Your process is likely to abort; sorry. - - Local host: %s - Local device: %s - Remote address: %s - Remote TCP port: %d -# -[illegal tcp port] -The btl_openib_connect_rdmacm_port MCA parameter was used to specify -an illegal TCP port value. TCP ports must be between 0 and 65536 -(ports below 1024 can only be used by root). - - TCP port: %d - -This value was ignored. -# -[illegal retry count] -The btl_openib_connect_rdmacm_retry_count MCA parameter was used to specify -an illegal retry count. - - Retry count: %d - -# -[illegal timeout] -The btl_openib_connect_rdmacm_resolve_timeout parameter was used to -specify an illegal timeout value. Timeout values are specified in -miliseconds and must be greater than 0. - - Timeout value: %d - -This value was ignored. -# -[rdma cm device removal] -The RDMA CM returned that the device Open MPI was trying to use has -been removed. - - Local host: %s - Local device: %s - -Your MPI job will now abort, sorry. 
-# -[rdma cm event error] -The RDMA CM returned an event error while attempting to make a -connection. This type of error usually indicates a network -configuration error. - - Local host: %s - Local device: %s - Error name: %s - Peer: %s - -Your MPI job will now abort, sorry. diff --git a/opal/mca/btl/openib/help-mpi-btl-openib.txt b/opal/mca/btl/openib/help-mpi-btl-openib.txt deleted file mode 100644 index c3bfd7f9ed..0000000000 --- a/opal/mca/btl/openib/help-mpi-btl-openib.txt +++ /dev/null @@ -1,725 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2006 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved. -# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. -# Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved. -# Copyright (c) 2018 Los Alamos National Security, LLC. All rights -# reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI's OpenFabrics support -# (the openib BTL). -# -[ini file:file not found] -The Open MPI OpenFabrics (openib) BTL component was unable to find or -read an INI file that was requested via the -btl_openib_device_param_files MCA parameter. Please check this file -and/or modify the btl_openib_evice_param_files MCA parameter: - - %s -# -[ini file:not in a section] -In parsing the OpenFabrics (openib) BTL parameter file, values were -found that were not in a valid INI section. 
These values will be -ignored. Please re-check this file: - - %s - -At line %d, near the following text: - - %s -# -[ini file:unexpected token] -In parsing the OpenFabrics (openib) BTL parameter file, unexpected -tokens were found (this may cause significant portions of the INI file -to be ignored). Please re-check this file: - - %s - -At line %d, near the following text: - - %s -# -[ini file:expected equals] -In parsing the OpenFabrics (openib) BTL parameter file, unexpected -tokens were found (this may cause significant portions of the INI file -to be ignored). An equals sign ("=") was expected but was not found. -Please re-check this file: - - %s - -At line %d, near the following text: - - %s -# -[ini file:expected newline] -In parsing the OpenFabrics (openib) BTL parameter file, unexpected -tokens were found (this may cause significant portions of the INI file -to be ignored). A newline was expected but was not found. Please -re-check this file: - - %s - -At line %d, near the following text: - - %s -# -[ini file:unknown field] -In parsing the OpenFabrics (openib) BTL parameter file, an -unrecognized field name was found. Please re-check this file: - - %s - -At line %d, the field named: - - %s - -This field, and any other unrecognized fields, will be skipped. -# -[no device params found] -WARNING: No preset parameters were found for the device that Open MPI -detected: - - Local host: %s - Device name: %s - Device vendor ID: 0x%04x - Device vendor part ID: %d - -Default device parameters will be used, which may result in lower -performance. You can edit any of the files specified by the -btl_openib_device_param_files MCA parameter to set values for your -device. - -NOTE: You can turn off this warning by setting the MCA parameter - btl_openib_warn_no_device_params_found to 0. -# -[init-fail-no-mem] -The OpenFabrics (openib) BTL failed to initialize while trying to -allocate some locked memory. This typically can indicate that the -memlock limits are set too low. 
For most HPC installations, the -memlock limits should be set to "unlimited". The failure occured -here: - - Local host: %s - OMPI source: %s:%d - Function: %s() - Device: %s - Memlock limit: %s - -You may need to consult with your system administrator to get this -problem fixed. This FAQ entry on the Open MPI web site may also be -helpful: - - http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages -# -[init-fail-create-q] -The OpenFabrics (openib) BTL failed to initialize while trying to -create an internal queue. This typically indicates a failed -OpenFabrics installation, faulty hardware, or that Open MPI is -attempting to use a feature that is not supported on your hardware -(i.e., is a shared receive queue specified in the -btl_openib_receive_queues MCA parameter with a device that does not -support it?). The failure occured here: - - Local host: %s - OMPI source: %s:%d - Function: %s() - Error: %s (errno=%d) - Device: %s - -You may need to consult with your system administrator to get this -problem fixed. -# -[pp rnr retry exceeded] -The OpenFabrics "receiver not ready" retry count on a per-peer -connection between two MPI processes has been exceeded. In general, -this should not happen because Open MPI uses flow control on per-peer -connections to ensure that receivers are always ready when data is -sent. - -This error usually means one of two things: - -1. There is something awry within the network fabric itself. -2. A bug in Open MPI has caused flow control to malfunction. - -#1 is usually more likely. You should note the hosts on which this -error has occurred; it has been observed that rebooting or removing a -particular host from the job can sometimes resolve this issue. - -Below is some information about the host that raised the error and the -peer to which it was connected: - - Local host: %s - Local device: %s - Peer host: %s - -You may need to consult with your system administrator to get this -problem fixed. 
-# -[srq rnr retry exceeded] -The OpenFabrics "receiver not ready" retry count on a shared receive -queue or XRC receive queue has been exceeded. This error can occur if -the mca_btl_openib_ib_rnr_retry is set to a value less than 7 (where 7 -the default value and effectively means "infinite retry"). If your -rnr_retry value is 7, there might be something awry within the network -fabric itself. In this case, you should note the hosts on which this -error has occurred; it has been observed that rebooting or removing a -particular host from the job can sometimes resolve this issue. - -Below is some information about the host that raised the error and the -peer to which it was connected: - - Local host: %s - Local device: %s - Peer host: %s - -You may need to consult with your system administrator to get this -problem fixed. -# -[pp retry exceeded] -The InfiniBand retry count between two MPI processes has been -exceeded. "Retry count" is defined in the InfiniBand spec 1.2 -(section 12.7.38): - - The total number of times that the sender wishes the receiver to - retry timeout, packet sequence, etc. errors before posting a - completion error. - -This error typically means that there is something awry within the -InfiniBand fabric itself. You should note the hosts on which this -error has occurred; it has been observed that rebooting or removing a -particular host from the job can sometimes resolve this issue. - -Two MCA parameters can be used to control Open MPI's behavior with -respect to the retry count: - -* btl_openib_ib_retry_count - The number of times the sender will - attempt to retry (defaulted to 7, the maximum value). -* btl_openib_ib_timeout - The local ACK timeout parameter (defaulted - to 20). The actual timeout value used is calculated as: - - 4.096 microseconds * (2^btl_openib_ib_timeout) - - See the InfiniBand spec 1.2 (section 12.7.34) for more details. 
- -Below is some information about the host that raised the error and the -peer to which it was connected: - - Local host: %s - Local device: %s - Peer host: %s - -You may need to consult with your system administrator to get this -problem fixed. -# -[no active ports found] -WARNING: There is at least non-excluded one OpenFabrics device found, -but there are no active ports detected (or Open MPI was unable to use -them). This is most certainly not what you wanted. Check your -cables, subnet manager configuration, etc. The openib BTL will be -ignored for this job. - - Local host: %s -# -[error in device init] -WARNING: There was an error initializing an OpenFabrics device. - - Local host: %s - Local device: %s -# -[no devices right type] -WARNING: No OpenFabrics devices of the right type were found within -the requested bus distance. The OpenFabrics BTL will be ignored for -this run. - - Local host: %s - Requested type: %s - -If the "requested type" is "", this usually means that *no* -OpenFabrics devices were found within the requested bus distance. - -Note starting with Open MPI 4.0, only iWarp and RoCE devices are considered -for selection by default. Set the btl_openib_allow_ib MCA -parameter to "true" to allow use of Infiniband devices. -# -[default subnet prefix] -WARNING: There are more than one active ports on host '%s', but the -default subnet GID prefix was detected on more than one of these -ports. If these ports are connected to different physical IB -networks, this configuration will fail in Open MPI. This version of -Open MPI requires that every physically separate IB subnet that is -used between connected MPI processes must have different subnet ID -values. - -Please see this FAQ entry for more details: - - http://www.open-mpi.org/faq/?category=openfabrics#ofa-default-subnet-gid - -NOTE: You can turn off this warning by setting the MCA parameter - btl_openib_warn_default_gid_prefix to 0. 
-# -[ibv_fork requested but not supported] -WARNING: fork() support was requested for the OpenFabrics (openib) -BTL, but it is not supported on the host %s. Deactivating the -OpenFabrics BTL. -# -[ibv_fork_init fail] -WARNING: fork() support was requested for the OpenFabrics (openib) -BTL, but the library call ibv_fork_init() failed on the host %s. -Deactivating the OpenFabrics BTL. -# -[wrong buffer alignment] -Wrong buffer alignment %d configured on host '%s'. Should be bigger -than zero and power of two. Use default %d instead. -# -[of error event] -The OpenFabrics stack has reported a network error event. Open MPI -will try to continue, but your job may end up failing. - - Local host: %s - MPI process PID: %d - Error number: %d (%s) - -This error may indicate connectivity problems within the fabric; -please contact your system administrator. -# -[of unknown event] -The OpenFabrics stack has reported an unknown network error event. -Open MPI will try to continue, but the job may end up failing. - - Local host: %s - MPI process PID: %d - Error number: %d - -This error may indicate that you are using an OpenFabrics library -version that is not currently supported by Open MPI. You might try -recompiling Open MPI against your OpenFabrics library installation to -get more information. -# -[specified include and exclude] -ERROR: You have specified more than one of the btl_openib_if_include, -btl_openib_if_exclude, btl_openib_ipaddr_include, or btl_openib_ipaddr_exclude -MCA parameters. These four parameters are mutually exclusive; you can only -specify one. - -For reference, the values that you specified are: - - btl_openib_if_include: %s - btl_openib_if_exclude: %s - btl_openib_ipaddr_include: %s - btl_openib_ipaddr_exclude: %s -# -[nonexistent port] -WARNING: One or more nonexistent OpenFabrics devices/ports were -specified: - - Host: %s - MCA parameter: mca_btl_if_%sclude - Nonexistent entities: %s - -These entities will be ignored. 
You can disable this warning by -setting the btl_openib_warn_nonexistent_if MCA parameter to 0. -# -[invalid mca param value] -WARNING: An invalid MCA parameter value was found for the OpenFabrics -(openib) BTL. - - Problem: %s - Resolution: %s -# -[no qps in receive_queues] -WARNING: No queue pairs were defined in the btl_openib_receive_queues -MCA parameter. At least one queue pair must be defined. The -OpenFabrics (openib) BTL will therefore be deactivated for this run. - - Local host: %s -# -[invalid qp type in receive_queues] -WARNING: An invalid queue pair type was specified in the -btl_openib_receive_queues MCA parameter. The OpenFabrics (openib) BTL -will be deactivated for this run. - -Valid queue pair types are "P" for per-peer and "S" for shared receive -queue. - - Local host: %s - btl_openib_receive_queues: %s - Bad specification: %s -# -[invalid pp qp specification] -WARNING: An invalid per-peer receive queue specification was detected -as part of the btl_openib_receive_queues MCA parameter. The -OpenFabrics (openib) BTL will therefore be deactivated for this run. - -Per-peer receive queues require between 2 and 5 parameters: - - 1. Buffer size in bytes (mandatory) - 2. Number of buffers (mandatory) - 3. Low buffer count watermark (optional; defaults to (num_buffers / 2)) - 4. Credit window size (optional; defaults to (low_watermark / 2), - must be > 0) - 5. 
Number of buffers reserved for credit messages (optional; - defaults to (num_buffers*2-1)/credit_window) - - Example: P,128,256,128,16 - - 128 byte buffers - - 256 buffers to receive incoming MPI messages - - When the number of available buffers reaches 128, re-post 128 more - buffers to reach a total of 256 - - If the number of available credits reaches 16, send an explicit - credit message to the sender - - Defaulting to ((256 * 2) - 1) / 16 = 31; this many buffers are - reserved for explicit credit messages - - Local host: %s - Bad queue specification: %s -# -[invalid srq specification] -WARNING: An invalid shared receive queue specification was detected as -part of the btl_openib_receive_queues MCA parameter. The OpenFabrics -(openib) BTL will therefore be deactivated for this run. - -Shared receive queues can take between 2 and 6 parameters: - - 1. Buffer size in bytes (mandatory) - 2. Number of buffers (mandatory) - 3. Low buffer count watermark (optional; defaults to (num_buffers / 2)) - 4. Maximum number of outstanding sends a sender can have (optional; - defaults to (low_watermark / 4) - 5. Start value of number of receive buffers that will be pre-posted (optional; defaults to (num_buffers / 4)) - 6. Event limit buffer count watermark (optional; defaults to (3/16 of start value of buffers number)) - - Example: S,1024,256,128,32,32,8 - - 1024 byte buffers - - 256 buffers to receive incoming MPI messages - - When the number of available buffers reaches 128, re-post 128 more - buffers to reach a total of 256 - - A sender will not send to a peer unless it has less than 32 - outstanding sends to that peer. - - 32 receive buffers will be preposted. - - When the number of unused shared receive buffers reaches 8, more - buffers (32 in this case) will be posted. 
- - Local host: %s - Bad queue specification: %s -# -[rd_num must be > rd_low] -WARNING: The number of buffers for a queue pair specified via the -btl_openib_receive_queues MCA parameter must be greater than the low -buffer count watermark. The OpenFabrics (openib) BTL will therefore -be deactivated for this run. - - Local host: %s - Bad queue specification: %s -# -[rd_num must be >= rd_init] -WARNING: The number of buffers for a queue pair specified via the -btl_openib_receive_queues MCA parameter (parameter #2) must be -greater or equal to the initial SRQ size (parameter #5). -The OpenFabrics (openib) BTL will therefore be deactivated for this run. - - Local host: %s - Bad queue specification: %s -# -[srq_limit must be > rd_num] -WARNING: The number of buffers for a queue pair specified via the -btl_openib_receive_queues MCA parameter (parameter #2) must be greater than the limit -buffer count (parameter #6). The OpenFabrics (openib) BTL will therefore -be deactivated for this run. - - Local host: %s - Bad queue specification: %s -# -[biggest qp size is too small] -WARNING: The largest queue pair buffer size specified in the -btl_openib_receive_queues MCA parameter is smaller than the maximum -send size (i.e., the btl_openib_max_send_size MCA parameter), meaning -that no queue is large enough to receive the largest possible incoming -message fragment. The OpenFabrics (openib) BTL will therefore be -deactivated for this run. - - Local host: %s - Largest buffer size: %d - Maximum send fragment size: %d -# -[biggest qp size is too big] -WARNING: The largest queue pair buffer size specified in the -btl_openib_receive_queues MCA parameter is larger than the maximum -send size (i.e., the btl_openib_max_send_size MCA parameter). This -means that memory will be wasted because the largest possible incoming -message fragment will not fill a buffer allocated for incoming -fragments. 
- - Local host: %s - Largest buffer size: %d - Maximum send fragment size: %d -# -[freelist too small] -WARNING: The maximum freelist size that was specified was too small -for the requested receive queue sizes. The maximum freelist size must -be at least equal to the sum of the largest number of buffers posted -to a single queue plus the corresponding number of reserved/credit -buffers for that queue. It is suggested that the maximum be quite a -bit larger than this for performance reasons. - - Local host: %s - Specified freelist size: %d - Minimum required freelist size: %d -# -[XRC with PP or SRQ] -WARNING: An invalid queue pair type was specified in the -btl_openib_receive_queues MCA parameter. The OpenFabrics (openib) BTL -will be deactivated for this run. - -Note that XRC ("X") queue pairs cannot be used with per-peer ("P") and -SRQ ("S") queue pairs. This restriction may be removed in future -versions of Open MPI. - - Local host: %s - btl_openib_receive_queues: %s -# -[XRC with BTLs per LID] -WARNING: An invalid queue pair type was specified in the -btl_openib_receive_queues MCA parameter. The OpenFabrics (openib) BTL -will be deactivated for this run. - -XRC ("X") queue pairs can not be used when (btls_per_lid > 1). This -restriction may be removed in future versions of Open MPI. - - Local host: %s - btl_openib_receive_queues: %s - btls_per_lid: %d -# -[XRC on device without XRC support] -WARNING: You configured the OpenFabrics (openib) BTL to run with %d -XRC queues. The device %s does not have XRC capabilities; the -OpenFabrics btl will ignore this device. If no devices are found with -XRC capabilities, the OpenFabrics BTL will be disabled. - - Local host: %s -# -[No XRC support] -WARNING: The Open MPI build was compiled without XRC support, but XRC -("X") queues were specified in the btl_openib_receive_queues MCA -parameter. The OpenFabrics (openib) BTL will therefore be deactivated -for this run. 
- - Local host: %s - btl_openib_receive_queues: %s -# -[non optimal rd_win] -WARNING: rd_win specification is non optimal. For maximum performance it is -advisable to configure rd_win bigger than (rd_num - rd_low), but currently -rd_win = %d and (rd_num - rd_low) = %d. -# -[apm without lmc] -WARNING: You can't enable APM support with LMC bit configured to 0. -APM support will be disabled. -# -[apm with wrong lmc] -Can not provide %d alternative paths with LMC bit configured to %d. -# -[apm not enough ports] -WARNING: For APM over ports ompi require at least 2 active ports and -only single active port was found. Disabling APM over ports -# -[locally conflicting receive_queues] -Open MPI detected two devices on a single server that have different -"receive_queues" parameter values (in the openib BTL). Open MPI -currently only supports one OpenFabrics receive_queues value in an MPI -job, even if you have different types of OpenFabrics adapters on the -same host. - -Device 2 (in the details shown below) will be ignored for the duration -of this MPI job. - -You can fix this issue by one or more of the following: - - 1. Set the MCA parameter btl_openib_receive_queues to a value that - is usable by all the OpenFabrics devices that you will use. - 2. Use the btl_openib_if_include or btl_openib_if_exclue MCA - parameters to select exactly which OpenFabrics devices to use in - your MPI job. - -Finally, note that the "receive_queues" values may have been set by -the Open MPI device default settings file. 
You may want to look in -this file and see if your devices are getting receive_queues values -from this file: - - %s/mca-btl-openib-device-params.ini - -Here is more detailed information about the recieive_queus value -conflict: - - Local host: %s - Device 1: %s (vendor 0x%x, part ID %d) - Receive queues: %s - Device 2: %s (vendor 0x%x, part ID %d) - Receive queues: %s -# -[eager RDMA and progress threads] -WARNING: The openib BTL was directed to use "eager RDMA" for short -messages, but the openib BTL was compiled with progress threads -support. Short eager RDMA is not yet supported with progress threads; -its use has been disabled in this job. - -This is a warning only; you job will attempt to continue. -# -[ptmalloc2 with no threads] -WARNING: It appears that ptmalloc2 was compiled into this process via --lopenmpi-malloc, but there is no thread support. This combination is -known to cause memory corruption in the openib BTL. Open MPI is -therefore disabling the use of the openib BTL in this process for this -run. - - Local host: %s -# -[cannot raise btl error] -The OpenFabrics driver in Open MPI tried to raise a fatal error, but -failed. Hopefully there was an error message before this one that -gave some more detailed information. - - Local host: %s - Source file: %s - Source line: %d - -Your job is now going to abort, sorry. -# -[no iwarp support] -Open MPI does not support iWARP devices with this version of OFED. -You need to upgrade to a later version of OFED (1.3 or later) for Open -MPI to support iWARP devices. - -(This message is being displayed because you told Open MPI to use -iWARP devices via the btl_openib_device_type MCA parameter) -# -[invalid ipaddr_inexclude] -WARNING: An invalid value was given for btl_openib_ipaddr_%s. This -value will be ignored. 
- - Local host: %s - Value: %s - Message: %s -# -[unsupported queues configuration] -The Open MPI receive queue configuration for the OpenFabrics devices -on two nodes are incompatible, meaning that MPI processes on two -specific nodes were unable to communicate with each other. This -generally happens when you are using OpenFabrics devices from -different vendors on the same network. You should be able to use the -mca_btl_openib_receive_queues MCA parameter to set a uniform receive -queue configuration for all the devices in the MPI job, and therefore -be able to run successfully. - - Local host: %s - Local adapter: %s (vendor 0x%x, part ID %d) - Local queues: %s - - Remote host: %s - Remote adapter: (vendor 0x%x, part ID %d) - Remote queues: %s -# -[conflicting transport types] -Open MPI detected two different OpenFabrics transport types in the same Infiniband network. -Such mixed network trasport configuration is not supported by Open MPI. - - Local host: %s - Local adapter: %s (vendor 0x%x, part ID %d) - Local transport type: %s - - Remote host: %s - Remote Adapter: (vendor 0x%x, part ID %d) - Remote transport type: %s -# -[gid index too large] -Open MPI tried to use a GID index that was too large for an -OpenFabrics device (i.e., the GID index does not exist on this -device). - - Local host: %s - Local adapter: %s - Local port: %d - - Requested GID index: %d (specified by the btl_openib_gid_index MCA param) - Max allowable GID index: %d - -Use "ibv_devinfo -v" on the local host to see the GID table of this -device. -[reg mem limit low] -WARNING: It appears that your OpenFabrics subsystem is configured to only -allow registering part of your physical memory. This can cause MPI jobs to -run with erratic performance, hang, and/or crash. - -This may be caused by your OpenFabrics vendor limiting the amount of -physical memory that can be registered. 
You should investigate the -relevant Linux kernel module parameters that control how much physical -memory can be registered, and increase them to allow registering all -physical memory on your machine. - -See this Open MPI FAQ item for more information on these Linux kernel module -parameters: - - http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages - - Local host: %s - Registerable memory: %lu MiB - Total memory: %lu MiB - -%s -[CUDA_no_gdr_support] -You requested to run with CUDA GPU Direct RDMA support but the Open MPI -library was not built with that support. The Open MPI library must be -configured with CUDA 6.0 or later. - - Local host: %s -[driver_no_gdr_support] -You requested to run with CUDA GPU Direct RDMA support but this OFED -installation does not have that support. Contact Mellanox to figure -out how to get an OFED stack with that support. - - Local host: %s -[no_fork_with_gdr] -You cannot have fork support and CUDA GPU Direct RDMA support enabled at the -same time. Please disable one of them. Deactivating the openib BTL. - - Local host: %s -# -[CUDA_gdr_and_nopinned] -You requested to run with CUDA GPU Direct RDMA support but also with -"leave pinned" turned off. This will result in very poor performance -with CUDA GPU Direct RDMA. Either disable GPU Direct RDMA support or -enable "leave pinned" support. Deactivating the openib BTL. - - Local host: %s -# -[do_not_set_openib_value] -Open MPI has detected that you have attempted to set the btl_openib_cuda_max_send_size -value. This is not supported. Setting back to default value of 0. - - Local host: %s -[ib port not selected] -For Open MPI 4.0 and later, InfiniBand ports on a device -are not used by default. The intent is to use UCX for these devices. -You can override this policy by setting the btl_openib_allow_ib MCA parameter -to true.
- - Local host: %s - Local adapter: %s - Local port: %d -# - diff --git a/opal/mca/btl/openib/mca-btl-openib-device-params.ini b/opal/mca/btl/openib/mca-btl-openib-device-params.ini deleted file mode 100644 index 4a0a62467d..0000000000 --- a/opal/mca/btl/openib/mca-btl-openib-device-params.ini +++ /dev/null @@ -1,351 +0,0 @@ -# -# Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2006-2011 Mellanox Technologies. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# - -# This is the default NIC/HCA parameters file for Open MPI's OpenIB -# BTL. If NIC/HCA vendors wish to add their respective values into -# this file (that is distributed with Open MPI), please contact the -# Open MPI development team. See http://www.open-mpi.org/ for -# details. - -# This file is in the "ini" style, meaning that it has sections -# identified by section names enclosed in square brackets (e.g., -# "[Section name]") followed by "key = value" pairs indicating values -# for a specific NIC/HCA vendor and model. NICs/HCAs are identified -# by their vendor ID and vendor part ID, which can be obtained by -# running the diagnostic utility command "ibv_devinfo". The fields -# "vendor_id" and "vendor_part_id" are the vendor ID and vendor part -# ID, respectively. - -# The sections in this file only accept a few fields: - -# vendor_id: a comma-delimited list of integers of NIC/HCA vendor IDs, -# expressed either in decimal or hexadecimal (e.g., "13" or "0xd"). -# Individual values can be taken directly from the output of -# "ibv_devinfo". NIC/HCA vendor ID's correspond to IEEE OUI's, for -# which you can find the canonical list here: -# http://standards.ieee.org/regauth/oui/. Example: -# -# vendor_id = 0x05ad -# -# Note: Several vendors resell Mellanox hardware and put their own firmware -# on the cards, therefore overriding the default Mellanox vendor ID.
-# -# Mellanox 0x02c9 -# Cisco 0x05ad -# Silverstorm 0x066a -# Voltaire 0x08f1 -# HP 0x1708 -# Sun 0x03ba -# Bull 0x119f - -# vendor_part_id: a comma-delimited list of integers of different -# NIC/HCA models from a single vendor, expressed in either decimal or -# hexadecimal (e.g., "13" or "0xd"). Individual values can be -# obtained from the output of "ibv_devinfo". Example: -# -# vendor_part_id = 25208,25218 - -# mtu: an integer indicating the maximum transfer unit (MTU) to be -# used with this NIC/HCA. The effective MTU will be the minimum of an -# NIC's/HCA's MTU value and its peer NIC's/HCA's MTU value. Valid -# values are 256, 512, 1024, 2048, and 4096. Example: -# -# mtu = 1024 - -# use_eager_rdma: an integer indicating whether RDMA should be used -# for eager messages. 0 values indicate "no" (false); non-zero values -# indicate "yes" (true). This flag should only be enabled for -# NICs/HCAs that can provide guarantees about ordering of data in -# memory -- that the last byte of an incoming RDMA write will always -# be written last. Certain cards cannot provide this guarantee, while -# others can. - -# use_eager_rdma = 1 - -# receive_queues: a list of "bucket shared receive queues" (BSRQ) that -# are opened between MPI process peer pairs for point-to-point -# communications of messages shorter than the total length required -# for RDMA transfer. The use of multiple RQs, each with different -# sized posted receive buffers can allow [much] better registered -# memory utilization -- MPI messages are sent on the QP with the -# smallest buffer size that will fit the message. Note that flow -# control messages are always sent across the QP with the smallest -# buffer size. Also note that the buffers *must* be listed in -# increasing buffer size. This parameter matches the -# mca_btl_openib_receive_queues MCA parameter; see the ompi_info help -# message and FAQ for a description of its values.
BSRQ -# specifications are found in this precedence: - -# highest: specifying the mca_btl_openib_receive_queues MCA param -# next: finding a value in this file -# lowest: using the default mca_btl_openib_receive_queues MCA param value - -# receive_queues = P,128,256,192,128:S,65536,256,192,128 - -# max_inline_data: an integer specifying the maximum inline data (in -# bytes) supported by the device. -1 means to use a run-time probe to -# figure out the maximum value supported by the device. - -# max_inline_data = 1024 - -# rdmacm_reject_causes_connect_error: a boolean indicating whether -# when an RDMA CM REJECT is issued on the device, instead of getting -# the expected REJECT event back, you might get a CONNECT_ERROR event. -# Open MPI uses RDMA CM REJECT messages in its normal wireup -# procedure; some connections are *expected* to be rejected. However, -# with some older drivers, if process A issues a REJECT, process B -# will receive a CONNECT_ERROR event instead of a REJECT event. So if -# this flag is set to true and we receive a CONNECT_ERROR event on a -# connection where we are expecting a REJECT, then just treat the -# CONNECT_ERROR exactly as we would have treated the REJECT. Setting -# this flag to true allows Open MPI to work around the behavior -# described above. It is [mostly] safe to set this flag to true even -# after a driver has been fixed; the scope of where this flag is used -# is small enough that it *shouldn't* mask real CONNECT_ERROR events. - -# rdmacm_reject_causes_connect_error = 1 - -############################################################################ - -[default] -# These are the default values, identified by the vendor and part ID -# numbers of 0 and 0. If queried NIC/HCA does not return vendor and -# part ID numbers that match any of the sections in this file, the -# values in this section are used. Vendor IDs and part IDs can be hex -# or decimal. 
-vendor_id = 0 -vendor_part_id = 0 -use_eager_rdma = 0 -mtu = 1024 -max_inline_data = 128 - -############################################################################ - -[Mellanox Tavor Infinihost] -vendor_id = 0x2c9,0x5ad,0x66a,0x8f1,0x1708,0x03ba,0x15b3 -vendor_part_id = 23108 -use_eager_rdma = 1 -mtu = 1024 -max_inline_data = 128 - -############################################################################ - -[Mellanox Arbel InfiniHost III MemFree/Tavor] -vendor_id = 0x2c9,0x5ad,0x66a,0x8f1,0x1708,0x03ba,0x15b3 -vendor_part_id = 25208,25218 -use_eager_rdma = 1 -mtu = 1024 -max_inline_data = 128 - -############################################################################ - -[Mellanox Sinai Infinihost III] -vendor_id = 0x2c9,0x5ad,0x66a,0x8f1,0x1708,0x03ba,0x15b3 -vendor_part_id = 25204,24204 -use_eager_rdma = 1 -mtu = 2048 -max_inline_data = 128 - -############################################################################ - -# A.k.a. ConnectX -[Mellanox Hermon] -vendor_id = 0x2c9,0x5ad,0x66a,0x8f1,0x1708,0x03ba,0x15b3,0x119f -vendor_part_id = 25408,25418,25428,25448,26418,26428,26438,26448,26468,26478,26488,4099,4103,4100 -use_eager_rdma = 1 -mtu = 2048 -max_inline_data = 128 - -############################################################################ - -[Mellanox ConnectIB] -vendor_id = 0x2c9,0x5ad,0x66a,0x8f1,0x1708,0x03ba,0x15b3,0x119f -vendor_part_id = 4113 -use_eager_rdma = 1 -mtu = 4096 -max_inline_data = 256 - -############################################################################ - -[Mellanox ConnectX4] -vendor_id = 0x2c9,0x5ad,0x66a,0x8f1,0x1708,0x03ba,0x15b3,0x119f -vendor_part_id = 4115,4117 -use_eager_rdma = 1 -mtu = 4096 -max_inline_data = 256 - -############################################################################ - -[Mellanox ConnectX5] -vendor_id = 0x2c9,0x5ad,0x66a,0x8f1,0x1708,0x03ba,0x15b3,0x119f -vendor_part_id = 4119,4121 -use_eager_rdma = 1 -mtu = 4096 -max_inline_data = 256 - 
-############################################################################ - -[IBM eHCA 4x and 12x] -vendor_id = 0x5076 -vendor_part_id = 0 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,128,256,192,128:P,65536,256,192,128 -max_inline_data = 0 - -############################################################################ - -[IBM eHCA-2 4x and 12x] -vendor_id = 0x5076 -vendor_part_id = 1 -use_eager_rdma = 1 -mtu = 4096 -receive_queues = P,128,256,192,128:P,65536,256,192,128 -max_inline_data = 0 - -############################################################################ - -# See http://lists.openfabrics.org/pipermail/general/2008-June/051920.html -# 0x1fc1 and 0x1077 are PCI ID's; at least one of QL's OUIs is 0x1175 - -[QLogic InfiniPath 1] -vendor_id = 0x1fc1,0x1077,0x1175 -vendor_part_id = 13 -use_eager_rdma = 1 -mtu = 2048 -max_inline_data = 0 - -[QLogic InfiniPath 2] -vendor_id = 0x1fc1,0x1077,0x1175 -vendor_part_id = 16,29216 -use_eager_rdma = 1 -mtu = 4096 -max_inline_data = 0 - -[QLogic InfiniPath 3] -vendor_id = 0x1fc1,0x1077,0x1175 -vendor_part_id = 16,29474 -use_eager_rdma = 1 -mtu = 4096 -max_inline_data = 0 - -[QLogic FastLinQ QL41000] -vendor_id = 0x1077 -vendor_part_id = 32880 -receive_queues = P,65536,64 - -############################################################################ - -# Chelsio's OUI is 0x0743. 0x1425 is the PCI ID. 
- -[Chelsio T3] -vendor_id = 0x1425 -vendor_part_id = 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0030,0x0031,0x0032,0x0035,0x0036 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,65536,256,192,128 -max_inline_data = 64 - -[Chelsio T4] -vendor_id = 0x1425 -vendor_part_id = 0xa000,0x4400,0x4401,0x4402,0x4403,0x4404,0x4405,0x4406,0x4407,0x4408,0x4409,0x440a,0x440b,0x440c,0x440d,0x440e,0x4480,0x4481 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,65536,64 -max_inline_data = 280 - -[Chelsio T5] -vendor_id = 0x1425 -vendor_part_id = 0xb000,0xb001,0x5400,0x5401,0x5402,0x5403,0x5404,0x5405,0x5406,0x5407,0x5408,0x5409,0x540a,0x540b,0x540c,0x540d,0x540e,0x540f,0x5410,0x5411,0x5412,0x5413 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,65536,64 -max_inline_data = 280 - -[Chelsio T6] -vendor_id = 0x1425 -vendor_part_id = 0x6400,0x6401,0x6402,0x6403,0x6404,0x6405,0x6406,0x6407,0x6408,0x6409,0x640d,0x6410,0x6411,0x6414,0x6415 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,65536,64 -max_inline_data = 280 - -############################################################################ - -# I'm *assuming* that 0x4040 is the PCI ID... - -[NetXen] -vendor_id = 0x4040 -vendor_part_id = 0x0001,0x0002,0x0003,0x0004,0x0005,0x0024,0x0025,0x0100 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,65536,248,192,128 -max_inline_data = 64 - -############################################################################ - -# NetEffect's OUI is 0x1255. 0x1678 is the PCI ID. ...but then -# NetEffect was bought by Intel. Intel's OUI is 0x1b21. - -[NetEffect/Intel NE020] -vendor_id = 0x1678,0x1255,0x1b21 -vendor_part_id = 0x0100,0x0110 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,65536,256,192,128 -max_inline_data = 64 - -[Intel HFI1] -vendor_id = 0x1175 -vendor_part_id = 9456,9457 -use_eager_rdma = 1 -mtu = 4096 -max_inline_data = 0 - -############################################################################ - -# Intel has several OUI's, including 0x8086. Amusing. 
:-) Intel has -# advised us (June, 2013) to ignore the Intel Phi OpenFabrics -# device... at least for now. - -[Intel Xeon Phi] -vendor_id = 0x8086 -vendor_part_id = 0 -ignore_device = 1 - -############################################################################ - -# IBM Soft iWARP device. - -[IBM Soft iWARP] -vendor_id = 0x626d74 -vendor_part_id = 0 -use_eager_rdma = 1 -mtu = 2048 -receive_queues = P,65536,64 -max_inline_data = 72 - -############################################################################ - -# Broadcom NetXtreme-E RDMA Ethernet Controller - -[Broadcom BCM57XXX] -vendor_id = 0x14e4 -vendor_part_id = 0x1605,0x1606,0x1614,0x16c0,0x16c1,0x16ce,0x16cf,0x16d6,0x16d7,0x16d8,0x16d9,0x16df,0x16e2,0x16e3,0x16e5,0x16eb,0x16ed,0x16ef,0x16f0,0x16f1 -use_eager_rdma = 1 -mtu = 1024 -receive_queues = P,65536,256,192,128 -max_inline_data = 96 - -[Broadcom BCM58XXX] -vendor_id = 0x14e4 -vendor_part_id = 0xd800,0xd802,0xd804 -use_eager_rdma = 1 -mtu = 1024 -receive_queues = P,65536,256,192,128 -max_inline_data = 96 diff --git a/opal/mca/btl/openib/owner.txt b/opal/mca/btl/openib/owner.txt deleted file mode 100644 index 92eb51d94b..0000000000 --- a/opal/mca/btl/openib/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner:Chelsio -status:maintenance