From 77038b65a8e339074866c1dc54f5f379ddd68915 Mon Sep 17 00:00:00 2001 From: Andrew Friedley Date: Thu, 5 Jul 2007 23:42:54 +0000 Subject: [PATCH] Bring the UD BTL over to the trunk, named 'ofud'. This commit was SVN r15298. --- ompi/mca/btl/ofud/Makefile.am | 68 +++ ompi/mca/btl/ofud/btl_ofud.c | 691 +++++++++++++++++++++++++ ompi/mca/btl/ofud/btl_ofud.h | 374 +++++++++++++ ompi/mca/btl/ofud/btl_ofud_component.c | 537 +++++++++++++++++++ ompi/mca/btl/ofud/btl_ofud_endpoint.c | 137 +++++ ompi/mca/btl/ofud/btl_ofud_endpoint.h | 82 +++ ompi/mca/btl/ofud/btl_ofud_frag.c | 112 ++++ ompi/mca/btl/ofud/btl_ofud_frag.h | 148 ++++++ ompi/mca/btl/ofud/btl_ofud_proc.c | 206 ++++++++ ompi/mca/btl/ofud/btl_ofud_proc.h | 79 +++ ompi/mca/btl/ofud/configure.m4 | 43 ++ ompi/mca/btl/ofud/configure.params | 26 + 12 files changed, 2503 insertions(+) create mode 100644 ompi/mca/btl/ofud/Makefile.am create mode 100644 ompi/mca/btl/ofud/btl_ofud.c create mode 100644 ompi/mca/btl/ofud/btl_ofud.h create mode 100644 ompi/mca/btl/ofud/btl_ofud_component.c create mode 100644 ompi/mca/btl/ofud/btl_ofud_endpoint.c create mode 100644 ompi/mca/btl/ofud/btl_ofud_endpoint.h create mode 100644 ompi/mca/btl/ofud/btl_ofud_frag.c create mode 100644 ompi/mca/btl/ofud/btl_ofud_frag.h create mode 100644 ompi/mca/btl/ofud/btl_ofud_proc.c create mode 100644 ompi/mca/btl/ofud/btl_ofud_proc.h create mode 100644 ompi/mca/btl/ofud/configure.m4 create mode 100644 ompi/mca/btl/ofud/configure.params diff --git a/ompi/mca/btl/ofud/Makefile.am b/ompi/mca/btl/ofud/Makefile.am new file mode 100644 index 0000000000..c05d0cc1d9 --- /dev/null +++ b/ompi/mca/btl/ofud/Makefile.am @@ -0,0 +1,68 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. 
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Sandia National Laboratories. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Use the top-level Makefile.options + + + +AM_CPPFLAGS=$(btl_ofud_CPPFLAGS) + +sources = \ + btl_ofud.c \ + btl_ofud.h \ + btl_ofud_component.c \ + btl_ofud_endpoint.c \ + btl_ofud_endpoint.h \ + btl_ofud_frag.c \ + btl_ofud_frag.h \ + btl_ofud_proc.c \ + btl_ofud_proc.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if OMPI_BUILD_btl_ofud_DSO +lib = +lib_sources = +component = mca_btl_ofud.la +component_sources = $(sources) +else +lib = libmca_btl_ofud.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component) +mca_btl_ofud_la_SOURCES = $(component_sources) +mca_btl_ofud_la_LDFLAGS = -module -avoid-version $(btl_ofud_LDFLAGS) +mca_btl_ofud_la_LIBADD = \ + $(btl_ofud_LIBS) \ + $(top_ompi_builddir)/ompi/libmpi.la \ + $(top_ompi_builddir)/orte/libopen-rte.la \ + $(top_ompi_builddir)/opal/libopen-pal.la + + +noinst_LTLIBRARIES = $(lib) +libmca_btl_ofud_la_SOURCES = $(lib_sources) +libmca_btl_ofud_la_LDFLAGS= -module -avoid-version $(btl_ofud_LDFLAGS) +libmca_btl_ofud_la_LIBADD=$(btl_ofud_LIBS) diff --git a/ompi/mca/btl/ofud/btl_ofud.c b/ompi/mca/btl/ofud/btl_ofud.c new file mode 100644 index 0000000000..799835ec92 --- /dev/null +++ b/ompi/mca/btl/ofud/btl_ofud.c @@ -0,0 +1,691 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include + +#include "ompi_config.h" +#include "opal/prefetch.h" +#include "opal/util/output.h" +#include "ompi/datatype/convertor.h" +#include "ompi/datatype/datatype.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/btl_base_error.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" + +#include "btl_ofud.h" +#include "btl_ofud_frag.h" +#include "btl_ofud_proc.h" +#include "btl_ofud_endpoint.h" + + +mca_btl_ud_module_t mca_btl_ofud_module = { + { + &mca_btl_ofud_component.super, + 0, /* eager_limit */ + 0, /* min_send_size */ + 0, /* max_send_size */ + 0, /* rdma_pipeline_send_length */ + 0, /* rdma_pipeline_frag_size */ + 0, /* min_rdma_pipeline_size */ + 0, /* exclusivity */ + 0, /* latency */ + 0, /* bandwidth */ + MCA_BTL_FLAGS_SEND, + mca_btl_ud_add_procs, + mca_btl_ud_del_procs, + mca_btl_ud_register, + mca_btl_ud_finalize, + mca_btl_ud_alloc, + mca_btl_ud_free, + mca_btl_ud_prepare_src, + NULL, /*mca_btl_ud_prepare_dst */ + mca_btl_ud_send, + NULL, /*mca_btl_ud_put */ + NULL, /*mca_btl_ud_get */ + mca_btl_base_dump, + NULL, /* mpool */ + NULL, /* register error */ + mca_btl_udapl_ft_event + } +}; + + + +/* + * Add procs to this BTL module, receiving endpoint information from the modex. 
+ */ + +int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **ompi_procs, + struct mca_btl_base_endpoint_t** peers, + ompi_bitmap_t* reachable) +{ + mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl; + struct ibv_ah_attr ah_attr; + int i, rc; + + /* Set up the endpoint lookup table if it hasn't been already */ + /* We do this here so we can initialize the table to a reasonable size + based on nprocs */ +#if 0 + if(NULL == ud_btl->ep_lookup) { + ud_btl->ep_lookup = malloc(sizeof(opal_hash_table_t)); + OBJ_CONSTRUCT(ud_btl->ep_lookup, opal_hash_table_t); + opal_hash_table_init(ud_btl->ep_lookup, nprocs); + } +#endif + + for(i = 0; i < (int)nprocs; i++) { + struct ompi_proc_t* ompi_proc = ompi_procs[i]; + mca_btl_ud_proc_t* ib_proc; + mca_btl_base_endpoint_t* ib_peer; + + if(NULL == (ib_proc = mca_btl_ud_proc_create(ompi_proc))) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + + /* The btl_proc datastructure is shared by all IB BTL instances that are + * trying to reach this destination. Cache the peer instance on the + * btl_proc. 
+ */ + ib_peer = OBJ_NEW(mca_btl_ud_endpoint_t); + if(NULL == ib_peer) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + OPAL_THREAD_LOCK(&ib_proc->proc_lock); + rc = mca_btl_ud_proc_insert(ib_proc, ib_peer); + if(rc != OMPI_SUCCESS) { + OBJ_RELEASE(ib_peer); + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + continue; + } + + BTL_VERBOSE(("modex_recv QP num %d, LID = %d", + ib_peer->rem_addr.qp_num, ib_peer->rem_addr.lid)); + + /* Set up IB address handles for the endpoint */ + ah_attr.is_global = 0; + ah_attr.dlid = ib_peer->rem_addr.lid; + ah_attr.sl = mca_btl_ofud_component.ib_service_level; + ah_attr.src_path_bits = mca_btl_ofud_component.ib_src_path_bits; + ah_attr.port_num = ud_btl->ib_port_num; + + ib_peer->rmt_ah = ibv_create_ah(ud_btl->ib_pd, &ah_attr); + if(NULL == ib_peer->rmt_ah) { + BTL_ERROR(("error creating address handle: %s\n", strerror(errno))); + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + continue; + } + + /* Insert a pointer to the endpoint in the BTL lookup table */ +#if 0 + opal_hash_table_set_value_uint64(ud_btl->ep_lookup, + ((uint64_t)ib_peer->rem_addr.lid << 32) | + ib_peer->rem_addr.qp_num, + ib_peer); +#endif + + ompi_bitmap_set_bit(reachable, i); + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + peers[i] = ib_peer; + } + + return OMPI_SUCCESS; +} + + +/* + * Delete the proc as reachable from this btl module + */ + +int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t** procs, + struct mca_btl_base_endpoint_t** peers) +{ + size_t i; + + for(i = 0; i < nprocs; i++) { + mca_btl_ud_endpoint_t* endpoint = (mca_btl_ud_endpoint_t*)peers[i]; + mca_btl_ud_proc_t* proc = mca_btl_ud_proc_lookup_ompi(procs[i]); +#if 0 + opal_hash_table_remove_value_uint64(ud_btl->ep_lookup, + ((uint64_t)endpoint->rem_addr.lid << 32) | + endpoint->rem_addr.qp_num); +#endif + if(NULL != proc) { + mca_btl_ud_proc_remove(proc, endpoint); + } + + OBJ_RELEASE(endpoint); + } + + return OMPI_SUCCESS; +} + + +/* + * Register callback function 
to support send/recv semantics + */ + +int mca_btl_ud_register(struct mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_module_recv_cb_fn_t cbfunc, + void* cbdata) +{ + + mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl; + + OPAL_THREAD_LOCK(&ud_btl->ud_lock); + ud_btl->ib_reg[tag].cbfunc = cbfunc; + ud_btl->ib_reg[tag].cbdata = cbdata; + OPAL_THREAD_UNLOCK(&ud_btl->ud_lock); + return OMPI_SUCCESS; +} + + +/** + * Allocate a segment. + * + * @param btl (IN) BTL module + * @param size (IN) Request segment size. + * + * When allocating a segment we pull a pre-alllocated segment + * from one of two free lists, an eager list and a max list + */ + +mca_btl_base_descriptor_t* mca_btl_ud_alloc(struct mca_btl_base_module_t* btl, + uint8_t order, + size_t size) +{ + mca_btl_ud_frag_t* frag = NULL; + int rc; + + if(OPAL_LIKELY(size <= mca_btl_ofud_module.super.btl_eager_limit)) { + MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc); + } + + if(NULL == frag) { + return NULL; + } + + frag->base.order = MCA_BTL_NO_ORDER; + frag->segment.seg_len = size; + return (mca_btl_base_descriptor_t*)frag; +} + + +/** + * Return a segment + * + * Return the segment to the appropriate + * preallocated segment list + */ + +int mca_btl_ud_free(struct mca_btl_base_module_t* btl, + mca_btl_base_descriptor_t* des) +{ + mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)des; + + if(OPAL_LIKELY(frag->type == MCA_BTL_UD_FRAG_SEND)) { + MCA_BTL_UD_RETURN_FRAG(btl, frag); + } else if(frag->type == MCA_BTL_UD_FRAG_USER && frag->ud_reg != NULL) { + btl->btl_mpool->mpool_deregister(btl->btl_mpool, + (mca_mpool_base_registration_t*)frag->ud_reg); + MCA_BTL_UD_RETURN_USER_FRAG(btl, frag); + } + + return OMPI_SUCCESS; +} + + +/** + * register user buffer or pack + * data into pre-registered buffer and return a + * descriptor that can be + * used for send/put. 
+ * + * @param btl (IN) BTL module + * @param peer (IN) BTL peer addressing + * + * prepare source's behavior depends on the following: + * Has a valid memory registration been passed to prepare_src? + * if so we attempt to use the pre-registred user-buffer, if the memory + * registration is to small (only a portion of the user buffer) then we must + * reregister the user buffer + * Has the user requested the memory to be left pinned? + * if so we insert the memory registration into a memory tree for later + * lookup, we may also remove a previous registration if a MRU (most recently + * used) list of registions is full, this prevents resources from being + * exhausted. + * Is the requested size larger than the btl's max send size? + * if so and we aren't asked to leave the registration pinned then we + * register the memory if the user's buffer is contiguous. + * Otherwise we choose from two free lists of pre-registered memory in which + * to pack the data into. + * + */ + +mca_btl_base_descriptor_t* mca_btl_ud_prepare_src( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + mca_mpool_base_registration_t* registration, + struct ompi_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size) +{ + mca_btl_ud_frag_t* frag = NULL; + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data = *size; + int rc; + + if(ompi_convertor_need_buffers(convertor) == 0 && reserve == 0 && + (registration != NULL || max_data > btl->btl_max_send_size)) { + /* The user buffer is contigous and we are asked to send more than + the max send size. 
*/ + + MCA_BTL_UD_ALLOC_USER_FRAG(btl, frag, rc); + if(NULL == frag) { + return NULL; + } + + iov.iov_len = max_data; + iov.iov_base = NULL; + + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); + + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + frag->base.des_flags = 0; + frag->base.order = MCA_BTL_NO_ORDER; + + if(NULL == registration) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, iov.iov_base, + max_data, 0, ®istration); + if(OMPI_SUCCESS != rc || NULL == registration) { + MCA_BTL_UD_RETURN_USER_FRAG(btl, frag); + } + return NULL; + } + + frag->ud_reg = (mca_btl_ud_reg_t*)registration; + + frag->sg_entry.lkey = frag->ud_reg->mr->lkey; + frag->sg_entry.addr = (unsigned long)iov.iov_base; + + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + *size = max_data; + return &frag->base; + } + + if(max_data + reserve > btl->btl_eager_limit) { + max_data = btl->btl_eager_limit - reserve; + } + + MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc); + if(OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + + iov.iov_len = max_data; + iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; + + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); + if(OPAL_UNLIKELY(rc < 0)) { + MCA_BTL_UD_RETURN_FRAG(btl, frag); + return NULL; + } + + frag->segment.seg_len = max_data + reserve; + frag->sg_entry.length = + max_data + reserve + sizeof(mca_btl_ud_header_t); + + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + frag->base.order = MCA_BTL_NO_ORDER; + *size = max_data; + + return &frag->base; +} + + +int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl) +{ + mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl; + int32_t i; + + for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) { + ibv_destroy_qp(ud_btl->ib_qp[i]); + } + + 
ibv_dealloc_pd(ud_btl->ib_pd); + + OBJ_DESTRUCT(&ud_btl->ud_lock); + OBJ_DESTRUCT(&ud_btl->pending_frags); + OBJ_DESTRUCT(&ud_btl->send_frags); + OBJ_DESTRUCT(&ud_btl->user_frags); + OBJ_DESTRUCT(&ud_btl->recv_frags); + mca_mpool_base_module_destroy(ud_btl->super.btl_mpool); + return OMPI_SUCCESS; +} + + +/* + * Initiate a send. + */ + +int mca_btl_ud_send(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_btl_base_descriptor_t* descriptor, + mca_btl_base_tag_t tag) +{ + int rc; + + mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)descriptor; + MCA_BTL_UD_START_TIME(post_send); + frag->endpoint = endpoint; + frag->hdr->tag = tag; + + rc = mca_btl_ud_endpoint_post_send((mca_btl_ud_module_t*)btl, frag); + + MCA_BTL_UD_END_TIME(post_send); + return rc; +} + + +/* + * RDMA Memory Pool (de)register callbacks + */ + +static int mca_btl_ud_reg_mr(void* reg_data, void* base, size_t size, + mca_mpool_base_registration_t* reg) +{ + mca_btl_ud_module_t* mod = (mca_btl_ud_module_t*)reg_data; + mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg; + + ud_reg->mr = ibv_reg_mr(mod->ib_pd, base, size, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); + + if(NULL == ud_reg->mr) + return OMPI_ERR_OUT_OF_RESOURCE; + + return OMPI_SUCCESS; +} + + +static int mca_btl_ud_dereg_mr(void* reg_data, + mca_mpool_base_registration_t* reg) +{ + mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg; + + if(ud_reg->mr != NULL) { + if(ibv_dereg_mr(ud_reg->mr)) { + opal_output(0, "%s: error unpinning UD memory: %s\n", + __func__, strerror(errno)); + return OMPI_ERROR; + } + } + + ud_reg->mr = NULL; + return OMPI_SUCCESS; +} + + +/* + * Create a single UD queue pair. Since UD is connectionless, the QP is + * useable immediately. 
+ */ + +/* TODO - can remove cq/psn args now with only one type of frag */ +static int mca_btl_ud_init_qp(mca_btl_ud_module_t* ud_btl, + struct ibv_cq* cq, + struct ibv_qp** qp, + uint32_t lcl_psn) +{ + struct ibv_qp_attr qp_attr; + struct ibv_qp_init_attr qp_init_attr; + + memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); + + qp_init_attr.send_cq = cq; + qp_init_attr.recv_cq = cq; + qp_init_attr.cap.max_send_wr = mca_btl_ofud_component.sd_num; + qp_init_attr.cap.max_recv_wr = mca_btl_ofud_component.rd_num; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + /* TODO - find the best value for max_inline_data */ + qp_init_attr.cap.max_inline_data = 200; + qp_init_attr.qp_type = IBV_QPT_UD; + + *qp = ibv_create_qp(ud_btl->ib_pd, &qp_init_attr); + if(NULL == *qp) { + BTL_ERROR(("error creating QP: %s\n", strerror(errno))); + return OMPI_ERROR; + } + + if(0 == (ud_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) { + BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data")); + } + + BTL_VERBOSE((0, "ib_inline_max %d\n", ud_btl->ib_inline_max)); + + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.pkey_index = mca_btl_ofud_component.ib_pkey_ix; + qp_attr.qkey = mca_btl_ofud_component.ib_qkey; + qp_attr.port_num = ud_btl->ib_port_num; + + if(ibv_modify_qp(*qp, &qp_attr, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | + IBV_QP_PORT | IBV_QP_QKEY)) { + BTL_ERROR(("error modifying QP to INIT: %s", strerror(errno))); + goto destroy_qp; + } + + qp_attr.qp_state = IBV_QPS_RTR; + if(ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE)) { + BTL_ERROR(("error modifing QP to RTR: %s", strerror(errno))); + goto destroy_qp; + } + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = lcl_psn; + if (ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { + BTL_ERROR(("error modifying QP to RTS: %s", strerror(errno))); + goto destroy_qp; + } + + return OMPI_SUCCESS; + +destroy_qp: + ibv_destroy_qp(*qp); + *qp = NULL; + return OMPI_ERROR; +} + + +/* + * 
Initialize the btl module by allocating a protection domain, + * memory pool, completion queue, and free lists + */ + +int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl) +{ + struct mca_mpool_base_resources_t mpool_resources; + struct ibv_context *ctx = ud_btl->ib_dev_context; + struct ibv_recv_wr* bad_wr; + mca_btl_ud_frag_t* frag; + ompi_free_list_item_t* item; + uint32_t length; + int32_t rc, i; + + ud_btl->sd_wqe = mca_btl_ofud_component.sd_num; + + ud_btl->ib_pd = ibv_alloc_pd(ctx); + if(NULL == ud_btl->ib_pd) { + BTL_ERROR(("error allocating PD for %s: %s\n", + ibv_get_device_name(ud_btl->ib_dev), strerror(errno))); + return OMPI_ERROR; + } + + mpool_resources.reg_data = (void*)ud_btl; + mpool_resources.sizeof_reg = sizeof(mca_btl_ud_reg_t); + mpool_resources.register_mem = mca_btl_ud_reg_mr; + mpool_resources.deregister_mem = mca_btl_ud_dereg_mr; + ud_btl->super.btl_mpool = + mca_mpool_base_module_create(mca_btl_ofud_component.ud_mpool_name, + &ud_btl->super, &mpool_resources); + + if(NULL == ud_btl->super.btl_mpool) { + BTL_ERROR(("error creating IB mpool for %s: %s\n", + ibv_get_device_name(ud_btl->ib_dev), strerror(errno))); + goto dealloc_pd; + } + + /* Create the completion queue */ + length = mca_btl_ofud_component.rd_num + mca_btl_ofud_component.sd_num; + + ud_btl->ib_cq = ibv_create_cq(ctx, length, NULL, NULL, 0); + if(NULL == ud_btl->ib_cq) { + BTL_ERROR(("error creating CQ for %s: %s\n", + ibv_get_device_name(ud_btl->ib_dev), strerror(errno))); + goto mpool_destroy; + } + + /* Set up our packet sequence numbers */ + ud_btl->addr.psn = lrand48() & 0xffffff; + + /* Set up the QPs for this BTL */ + for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) { + if(OMPI_SUCCESS != mca_btl_ud_init_qp(ud_btl, + ud_btl->ib_cq, &ud_btl->ib_qp[i], ud_btl->addr.psn)) { + goto qp_destroy; + } + } + + /* Place our QP numbers in our local address information */ + ud_btl->addr.qp_num = ud_btl->ib_qp[0]->qp_num; + ud_btl->ib_qp_next = 0; + + /*ud_btl->rd_posted = 
mca_btl_ofud_component.rd_num_init;*/ + + /* Initialize pool of receive fragments first, since an error may occur */ + /* TODO - no need for a free list with a static buffer count */ + OBJ_CONSTRUCT(&ud_btl->recv_frags, ompi_free_list_t); + length = sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_header_t) + + ud_btl->super.btl_eager_limit + 2 * MCA_BTL_IB_FRAG_ALIGN; + + ompi_free_list_init(&ud_btl->recv_frags, + length + sizeof(mca_btl_ud_ib_header_t), + OBJ_CLASS(mca_btl_ud_recv_frag_t), + mca_btl_ofud_component.rd_num, + mca_btl_ofud_component.rd_num, + mca_btl_ofud_component.rd_num, + ud_btl->super.btl_mpool); +#if 0 + ompi_free_list_init(&ud_btl->recv_frags, + length + sizeof(mca_btl_ud_ib_header_t), + OBJ_CLASS(mca_btl_ud_recv_frag_t), + mca_btl_ofud_component.rd_num_init, + mca_btl_ofud_component.rd_num_max, + mca_btl_ofud_component.rd_num_inc, + ud_btl->super.btl_mpool); +#endif + + /* Post receive descriptors */ + for(i = 0; i < mca_btl_ofud_component.rd_num; i++) { + OMPI_FREE_LIST_GET(&ud_btl->recv_frags, item, rc); + frag = (mca_btl_ud_frag_t*)item; + + if(NULL == frag) { + BTL_ERROR(("error getting receive buffer from free list\n")); + goto obj_destruct; + } + + frag->type = MCA_BTL_UD_FRAG_RECV; + frag->sg_entry.length = mca_btl_ofud_module.super.btl_eager_limit + + sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t); + if(ibv_post_recv(ud_btl->ib_qp[0], + &frag->wr_desc.rd_desc, &bad_wr)) { + BTL_ERROR(("error posting recv, errno %s\n", strerror(errno))); + goto obj_destruct; + } + } + + /* No more errors anticipated - initialize everything else */ + OBJ_CONSTRUCT(&ud_btl->ud_lock, opal_mutex_t); + OBJ_CONSTRUCT(&ud_btl->pending_frags, opal_list_t); + OBJ_CONSTRUCT(&ud_btl->send_frags, ompi_free_list_t); + OBJ_CONSTRUCT(&ud_btl->user_frags, ompi_free_list_t); + + ompi_free_list_init(&ud_btl->send_frags, + length, + OBJ_CLASS(mca_btl_ud_send_frag_t), + mca_btl_ofud_component.sd_num >> 1, + mca_btl_ofud_component.sd_num << 2, + 
mca_btl_ofud_component.sd_num >> 3, + ud_btl->super.btl_mpool); + + /* Initialize pool of user fragments */ + length = sizeof(mca_btl_ud_frag_t) + + sizeof(mca_btl_ud_header_t) + 2 * MCA_BTL_IB_FRAG_ALIGN; + + ompi_free_list_init(&ud_btl->user_frags, + length, + OBJ_CLASS(mca_btl_ud_user_frag_t), + mca_btl_ofud_component.sd_num >> 1, + mca_btl_ofud_component.sd_num << 2, + mca_btl_ofud_component.sd_num >> 3, + ud_btl->super.btl_mpool); + + return OMPI_SUCCESS; + +obj_destruct: + OBJ_DESTRUCT(&ud_btl->recv_frags); +qp_destroy: + for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) { + ibv_destroy_qp(ud_btl->ib_qp[i]); + } +mpool_destroy: + mca_mpool_base_module_destroy(ud_btl->super.btl_mpool); +dealloc_pd: + ibv_dealloc_pd(ud_btl->ib_pd); + return OMPI_ERROR; +} + + +int mca_btl_udapl_ft_event(int state) { + return OMPI_SUCCESS; +} + + diff --git a/ompi/mca/btl/ofud/btl_ofud.h b/ompi/mca/btl/ofud/btl_ofud.h new file mode 100644 index 0000000000..97e118222a --- /dev/null +++ b/ompi/mca/btl/ofud/btl_ofud.h @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_BTL_UD_H +#define MCA_BTL_UD_H + +/* Number of QP's to stripe sends over - keep this as power of 2 */ +/* AWF - This is intentionally NOT an MCA parameter so that I can do fast + modular arithmetic with it. 
*/ +#define MCA_BTL_UD_NUM_QP 4 + +/* Standard system includes */ +#include +#include + +/* Open MPI includes */ +#include "opal/class/opal_hash_table.h" +#include "opal/util/output.h" +#include "opal/sys/timer.h" +#include "ompi/class/ompi_free_list.h" +#include "ompi/class/ompi_bitmap.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/btl_base_error.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/mpool/mpool.h" + +/* TODO - If I want this to go away, addr_t has to come over here */ +#include "btl_ofud_endpoint.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + + +/** + * UD Infiniband (IB) BTL component. + */ + +struct mca_btl_ud_component_t { + mca_btl_base_component_1_0_1_t super; /**< base BTL component */ + + uint32_t max_btls; /**< Maximum number of BTL modules */ + uint32_t num_btls; /**< Number of available/initialized BTL modules */ + + struct mca_btl_ud_module_t* ud_btls; /**< array of available BTLs */ + + opal_list_t ud_procs; /**< list of ib proc structures */ + opal_mutex_t ud_lock; /**< lock for accessing component state */ + + char* ud_mpool_name; /**< name of memory pool */ + + int32_t sd_num; /**< max send descriptors to post per BTL */ + int32_t sd_num_peer; /**< max send descriptors to post per endpoint */ + + int32_t rd_num; /**< number of receive descriptors per BTL */ +#if 0 + int32_t rd_num_init; /**< initial recv descriptors to post per BTL */ + int32_t rd_num_max; + int32_t rd_num_inc; +#endif + + uint32_t ib_pkey_ix; + uint32_t ib_qkey; + uint32_t ib_service_level; + uint32_t ib_src_path_bits; + +}; typedef struct mca_btl_ud_component_t mca_btl_ud_component_t; + +OMPI_MODULE_DECLSPEC extern mca_btl_ud_component_t mca_btl_ofud_component; + +typedef mca_btl_base_recv_reg_t mca_btl_ud_recv_reg_t; + + +/** + * Profiling variables + */ + +#if OMPI_ENABLE_DEBUG +#define MCA_BTL_UD_ENABLE_PROFILE 0 +#else +#define MCA_BTL_UD_ENABLE_PROFILE 0 +#endif + +#if MCA_BTL_UD_ENABLE_PROFILE + +#define 
MCA_BTL_UD_PROFILE_VAR(var) \ + opal_timer_t avg_ ## var; \ + opal_timer_t cnt_ ## var; \ + opal_timer_t tmp_ ## var + +struct mca_btl_ud_profile_t { + MCA_BTL_UD_PROFILE_VAR(post_send); + MCA_BTL_UD_PROFILE_VAR(ibv_post_send); +}; + +typedef struct mca_btl_ud_profile_t mca_btl_ud_profile_t; +extern mca_btl_ud_profile_t mca_btl_ud_profile; + +#endif + + +/** + * UD/IB BTL Interface + */ + +struct mca_btl_ud_module_t { + mca_btl_base_module_t super; + mca_btl_ud_recv_reg_t ib_reg[256]; /* protected by ib_lock */ + + uint8_t ib_port_num; + struct ibv_device* ib_dev; + struct ibv_context* ib_dev_context; + struct ibv_pd* ib_pd; + struct ibv_cq* ib_cq; + + struct mca_btl_ud_addr_t addr; /**< local address information */ + + ompi_free_list_t send_frags; /**< send fragments & buffers */ + ompi_free_list_t user_frags; /**< user data fragments */ + ompi_free_list_t recv_frags; /**< receive fragments & buffers */ + + opal_list_t pending_frags; /**< list of pending send frags */ + + opal_mutex_t ud_lock; /**< lock for ib_reg and pending_frags */ + + size_t ib_inline_max; /**< max size of IB inline send */ + + /*int32_t rd_posted;*/ /**< number of receives currently posted */ + + int32_t sd_wqe; /**< available send WQ entries */ + /* No lock needed, these are incremented/decremented atomically */ + + /*opal_hash_table_t* ep_lookup;*/ + /**< hash table for fast lookup of endpoint structures in recv path */ + /* lid:qpnum is key, value is mca_btl_ud_endpoint_t* */ + + struct ibv_qp* ib_qp[MCA_BTL_UD_NUM_QP]; + uint32_t ib_qp_next; + /**< Local QPs and stripe counters */ + /* No lock needed - counters only ever increase by 1 */ +}; typedef struct mca_btl_ud_module_t mca_btl_ud_module_t; + +struct mca_btl_ud_frag_t; +extern mca_btl_ud_module_t mca_btl_ofud_module; + + +/** + * Open the component; register UD/IB parameters with the MCA framework + */ + +extern int mca_btl_ud_component_open(void); + + +/** + * Any final cleanup before being unloaded. 
+ */
+
+extern int mca_btl_ud_component_close(void);
+
+
+/**
+ * IB component initialization.
+ *
+ * @param num_btl_modules (OUT)
+ *   Number of BTLs returned in BTL array.
+ * @param allow_multi_user_threads (IN)
+ *   Flag indicating whether BTL supports user threads (TRUE)
+ * @param have_hidden_threads (IN)
+ *   Flag indicating whether BTL uses threads (TRUE)
+ * @return
+ *   Array of BTL module pointers
+ *
+ *  (1) read interface list from verbs and compare against component parameters
+ *      then create a BTL instance for selected interfaces
+ *  (2) publish BTL addressing info
+ */
+
+extern mca_btl_base_module_t** mca_btl_ud_component_init(
+    int *num_btl_modules,
+    bool allow_multi_user_threads,
+    bool have_hidden_threads);
+
+
+/**
+ * UD/IB component progress.
+ */
+extern int mca_btl_ud_component_progress(void);
+
+
+/**
+ * Register a callback function that is called on receipt
+ * of a fragment.
+ *
+ * @param btl (IN)     BTL module
+ * @param tag (IN)     Tag the callback is registered for
+ * @param cbfunc (IN)  Receive callback
+ * @param cbdata (IN)  Opaque data stored alongside the callback
+ * @return             Status indicating if registration was successful
+ */
+
+int mca_btl_ud_register(struct mca_btl_base_module_t* btl,
+                        mca_btl_base_tag_t tag,
+                        mca_btl_base_module_recv_cb_fn_t cbfunc,
+                        void* cbdata);
+
+
+/**
+ * Cleanup any resources held by the BTL.
+ *
+ * @param btl  BTL instance.
+ * @return     OMPI_SUCCESS or error status on failure.
+ */
+
+extern int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl);
+
+
+/**
+ * PML->BTL notification of change in the process list.
+ *
+ * @param btl (IN)            BTL instance
+ * @param nprocs (IN)         Number of processes
+ * @param procs (IN)          Set of processes
+ * @param peers (OUT)         Set of (optional) peer addressing info.
+ * @param reachable (IN/OUT)  Set of processes that are reachable via this BTL.
+ * @return     OMPI_SUCCESS or error status on failure.
+ */
+
+extern int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl,
+                                size_t nprocs,
+                                struct ompi_proc_t **procs,
+                                struct mca_btl_base_endpoint_t** peers,
+                                ompi_bitmap_t* reachable);
+
+
+/**
+ * PML->BTL notification of change in the process list.
+ *
+ * @param btl (IN)     BTL instance
+ * @param nprocs (IN)  Number of processes.
+ * @param procs (IN)   Set of processes.
+ * @param peers (IN)   Set of peer data structures.
+ * @return             Status indicating if cleanup was successful
+ *
+ */
+
+extern int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl,
+                                size_t nprocs,
+                                struct ompi_proc_t **procs,
+                                struct mca_btl_base_endpoint_t** peers);
+
+
+/**
+ * PML->BTL Initiate a send of the specified descriptor.
+ *
+ * @param btl (IN)
+ *   BTL instance
+ * @param btl_peer (IN)
+ *   BTL peer addressing
+ * @param descriptor (IN)
+ *   Descriptor to send (obtained from this BTL's alloc / prepare_src)
+ * @param tag (IN)
+ *   Tag passed to the peer via the message header.
+ * @return
+ *   OMPI_SUCCESS if the BTL was able to queue the fragment
+ */
+
+extern int mca_btl_ud_send(struct mca_btl_base_module_t* btl,
+                           struct mca_btl_base_endpoint_t* btl_peer,
+                           struct mca_btl_base_descriptor_t* descriptor,
+                           mca_btl_base_tag_t tag);
+
+
+/**
+ * Allocate a descriptor.
+ *
+ * @param btl (IN)      BTL module
+ * @param order (IN)    Requested ordering
+ * @param size (IN)     Requested descriptor size.
+ */
+
+extern mca_btl_base_descriptor_t* mca_btl_ud_alloc(
+    struct mca_btl_base_module_t* btl,
+    uint8_t order,
+    size_t size);
+
+
+/**
+ * Return a segment allocated by this BTL.
+ *
+ * @param btl (IN)  BTL module
+ * @param des (IN)  Allocated descriptor.
+ */
+
+extern int mca_btl_ud_free(struct mca_btl_base_module_t* btl,
+                           mca_btl_base_descriptor_t* des);
+
+
+/**
+ * Pack data and return a descriptor that can be
+ * used for send/put.
+ *
+ * @param btl (IN)           BTL module
+ * @param peer (IN)          BTL peer addressing
+ * @param registration (IN)  Existing memory registration, or NULL
+ * @param convertor (IN)     Convertor describing the user data
+ * @param order (IN)         Requested ordering
+ * @param reserve (IN)       Bytes to reserve ahead of the data
+ * @param size (IN/OUT)      Bytes requested / bytes actually described
+ */
+
+mca_btl_base_descriptor_t* mca_btl_ud_prepare_src(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* peer,
+    mca_mpool_base_registration_t* registration,
+    struct ompi_convertor_t* convertor,
+    uint8_t order,
+    size_t reserve,
+    size_t* size);
+
+
+
+/**
+ * Initialize a BTL module's verbs resources and fragment lists.
+ *
+ * @param ud_btl (IN/OUT)  Module to initialize
+ * @return                 OMPI_SUCCESS or error status on failure
+ */
+int mca_btl_ud_module_init(mca_btl_ud_module_t* ud_btl);
+
+/**
+ * Fault Tolerance Event Notification Function
+ * @param state Checkpoint State
+ * @return OMPI_SUCCESS or failure status
+ */
+
+extern int mca_btl_udapl_ft_event(int state);
+
+
+
+/*
+ * Profiling stuff
+ *
+ * START_TIME stores the current cycle count for 'var', END_TIME adds the
+ * cycles elapsed since then to the accumulator and bumps the sample count,
+ * and SHOW_TIME prints the average and the count.  All three expand to
+ * nothing when MCA_BTL_UD_ENABLE_PROFILE is 0.
+ */
+
+#if MCA_BTL_UD_ENABLE_PROFILE
+
+#define MCA_BTL_UD_START_TIME(var) \
+    ((mca_btl_ud_profile.tmp_ ## var) = opal_sys_timer_get_cycles())
+
+#define MCA_BTL_UD_END_TIME(var) \
+do { \
+    mca_btl_ud_profile.avg_ ## var += \
+        opal_sys_timer_get_cycles() - mca_btl_ud_profile.tmp_ ## var; \
+    mca_btl_ud_profile.cnt_ ## var++; \
+} while(0)
+
+#define MCA_BTL_UD_SHOW_TIME(var) \
+    OPAL_OUTPUT((0, " " #var " avg %lu cnt %lu", \
+        (mca_btl_ud_profile.avg_ ## var) / (mca_btl_ud_profile.cnt_ ## var), \
+        mca_btl_ud_profile.cnt_ ## var));
+
+#else
+#define MCA_BTL_UD_START_TIME(var)
+#define MCA_BTL_UD_END_TIME(var)
+#define MCA_BTL_UD_SHOW_TIME(var)
+#endif
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/ompi/mca/btl/ofud/btl_ofud_component.c b/ompi/mca/btl/ofud/btl_ofud_component.c
new file mode 100644
index 0000000000..2c54aa8141
--- /dev/null
+++ b/ompi/mca/btl/ofud/btl_ofud_component.c
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include + +#include "ompi_config.h" +#include "ompi/constants.h" +#include "opal/prefetch.h" +#include "opal/util/output.h" +#include "ompi/mca/btl/btl.h" +#include "opal/sys/timer.h" +#include "opal/mca/base/mca_base_param.h" +#include "orte/mca/errmgr/errmgr.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" +#include "ompi/mca/pml/base/pml_base_module_exchange.h" + +#include "btl_ofud.h" +#include "btl_ofud_frag.h" +#include "btl_ofud_endpoint.h" + + +mca_btl_ud_component_t mca_btl_ofud_component = { + { + /* First, the mca_base_component_t struct containing meta information + about the component itself */ + { + /* Indicate that we are a pml v1.0.0 component (which also implies a + specific MCA version) */ + MCA_BTL_BASE_VERSION_1_0_1, + + "ofud", /* MCA component name */ + OMPI_MAJOR_VERSION, /* MCA component major version */ + OMPI_MINOR_VERSION, /* MCA component minor version */ + OMPI_RELEASE_VERSION, /* MCA component release version */ + mca_btl_ud_component_open, /* component open */ + mca_btl_ud_component_close /* component close */ + }, + + /* Next the MCA v1.0.0 component meta data */ + { + /* Whether the component is checkpointable or not */ + false + }, + + mca_btl_ud_component_init, + mca_btl_ud_component_progress, + } +}; + + +/* + * Profiling information + */ + +#if MCA_BTL_UD_ENABLE_PROFILE +mca_btl_ud_profile_t mca_btl_ud_profile = {0}; +#endif + + +/* + * utility routines for parameter registration + */ + +static inline void mca_btl_ud_param_reg_string(const char* param_name, + const char* param_desc, + const char* default_value, + char** out_value) +{ + mca_base_param_reg_string(&mca_btl_ofud_component.super.btl_version, + param_name, 
param_desc, false, false, + default_value, out_value); +} + +static inline void mca_btl_ud_param_reg_int(const char* param_name, + const char* param_desc, + int default_value, + int* out_value) +{ + mca_base_param_reg_int(&mca_btl_ofud_component.super.btl_version, + param_name, param_desc, false, false, + default_value, out_value); +} + + +/* + * Called by MCA framework to open the component, registers + * component parameters. + */ + +int mca_btl_ud_component_open(void) +{ + int val; + + /* initialize state */ + mca_btl_ofud_component.num_btls = 0; + mca_btl_ofud_component.ud_btls = NULL; + + /* initialize objects */ + OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_lock, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_procs, opal_list_t); + + /* register IB component parameters */ + mca_btl_ud_param_reg_int("max_btls", + "Maximum number of HCAs/ports to use", + 4, (int*)&mca_btl_ofud_component.max_btls); + + mca_btl_ud_param_reg_string("mpool", "Name of the memory pool to be used", + "rdma", &mca_btl_ofud_component.ud_mpool_name); + + mca_btl_ud_param_reg_int("ib_pkey_index", "IB pkey index", + 0, (int*)&mca_btl_ofud_component.ib_pkey_ix); + mca_btl_ud_param_reg_int("ib_qkey", "IB qkey", + 0x01330133, (int*)&mca_btl_ofud_component.ib_qkey); + mca_btl_ud_param_reg_int("ib_service_level", "IB service level", + 0, (int*)&mca_btl_ofud_component.ib_service_level); + mca_btl_ud_param_reg_int("ib_src_path_bits", "IB source path bits", + 0, (int*)&mca_btl_ofud_component.ib_src_path_bits); + + mca_btl_ud_param_reg_int("sd_num", "maximum send descriptors to post", + 128, (int*)&mca_btl_ofud_component.sd_num); + mca_btl_ud_param_reg_int("sd_num_peer", + "maximum send descriptors to post to one peer", + 8, (int*)&mca_btl_ofud_component.sd_num_peer); + + mca_btl_ud_param_reg_int("rd_num_init", "number of receive buffers", + 6000, (int*)&mca_btl_ofud_component.rd_num); +#if 0 + mca_btl_ud_param_reg_int("rd_num_init", "initial receive buffers", + 3000, 
(int*)&mca_btl_ofud_component.rd_num_init); + mca_btl_ud_param_reg_int("rd_num_max", "maximum receive buffers", + 4500, (int*)&mca_btl_ofud_component.rd_num_max); + mca_btl_ud_param_reg_int("rd_num_inc", + "number of buffers to post when rate is high", + 25, (int*)&mca_btl_ofud_component.rd_num_inc); +#endif + + /* TODO - this assumes a 2k UD MTU - query/do something more intelligent */ + /*mca_btl_ud_param_reg_int("eager_limit", "eager send limit", + 2048, &val); */ + mca_btl_ud_param_reg_int("min_send_size", "minimum send size", + 2048, &val); + mca_btl_ofud_module.super.btl_min_send_size = val; + mca_btl_ud_param_reg_int("max_send_size", "maximum send size", + 2048, &val); + mca_btl_ofud_module.super.btl_eager_limit = val; + mca_btl_ofud_module.super.btl_max_send_size = val; + + mca_btl_ud_param_reg_int("exclusivity", "BTL exclusivity", + MCA_BTL_EXCLUSIVITY_DEFAULT, + (int*)&mca_btl_ofud_module.super.btl_exclusivity); + mca_btl_ud_param_reg_int("bandwidth", + "Approximate maximum bandwidth of interconnect", + 800, (int*)&mca_btl_ofud_module.super.btl_bandwidth); + + mca_btl_ofud_module.super.btl_eager_limit -= sizeof(mca_btl_ud_header_t); + mca_btl_ofud_module.super.btl_max_send_size -= sizeof(mca_btl_ud_header_t); + + return OMPI_SUCCESS; +} + + +/* + * Component cleanup + */ + +int mca_btl_ud_component_close(void) +{ + OBJ_DESTRUCT(&mca_btl_ofud_component.ud_lock); + OBJ_DESTRUCT(&mca_btl_ofud_component.ud_procs); + + /* Calculate and print profiling numbers */ + MCA_BTL_UD_SHOW_TIME(post_send); + MCA_BTL_UD_SHOW_TIME(ibv_post_send); + + return OMPI_SUCCESS; +} + + +/* + * Register UD address information. The MCA framework + * will make this available to all peers. 
+ */
+
+static int mca_btl_ud_modex_send(void)
+{
+    int rc;
+    size_t i;
+    size_t size;
+    mca_btl_ud_addr_t* addrs = NULL;
+
+    /* One address record per initialized BTL module; with zero modules we
+       still publish an empty (NULL, 0) record below. */
+    size = mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_addr_t);
+    if(size != 0) {
+        addrs = (mca_btl_ud_addr_t*)malloc(size);
+        if(NULL == addrs) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+
+        for(i = 0; i < mca_btl_ofud_component.num_btls; i++) {
+            mca_btl_ud_module_t* btl = &mca_btl_ofud_component.ud_btls[i];
+            addrs[i] = btl->addr;
+
+            BTL_VERBOSE((0, "modex_send QP num %x, LID = %x",
+                    addrs[i].qp_num, addrs[i].lid));
+        }
+    }
+
+    rc = mca_pml_base_modex_send(
+            &mca_btl_ofud_component.super.btl_version, addrs, size);
+    free(addrs);
+    return rc;
+}
+
+
+/*
+ * Undo a partially completed mca_btl_ud_component_init() and return NULL.
+ *
+ * Frees every module still staged on btl_list (and its list item), destructs
+ * the list, drops the component's module array, resets the module count and
+ * releases the verbs device list.
+ *
+ * NOTE(review): device contexts obtained with ibv_open_device() are not
+ * closed here (the original code did not close them either); one context is
+ * shared by all ports of a device, so closing them needs extra bookkeeping.
+ */
+
+static mca_btl_base_module_t** mca_btl_ud_component_init_abort(
+        opal_list_t* btl_list, struct ibv_device** ib_devs)
+{
+    opal_list_item_t* item;
+
+    while(NULL != (item = opal_list_remove_first(btl_list))) {
+        mca_btl_base_selected_module_t* staged =
+            (mca_btl_base_selected_module_t*)item;
+
+        free(staged->btl_module);
+        OBJ_RELEASE(staged);
+    }
+    OBJ_DESTRUCT(btl_list);
+
+    free(mca_btl_ofud_component.ud_btls);
+    mca_btl_ofud_component.ud_btls = NULL;
+    mca_btl_ofud_component.num_btls = 0;
+
+    ibv_free_device_list(ib_devs);
+    return NULL;
+}
+
+
+/*
+ *  UD component initialization:
+ *  (1) bail out unless "ofud" is explicitly listed in the "btl" MCA parameter
+ *  (2) walk the verbs device list and stage one BTL module per active port,
+ *      up to the max_btls component parameter
+ *  (3) copy the staged modules into the component's module array and
+ *      initialize each one; modules that fail to initialize are dropped
+ *  (4) publish the resulting addresses through the modex
+ *
+ *  @param num_btl_modules (OUT)  Number of modules in the returned array
+ *  @param enable_progress_threads (IN)  Not used by this component
+ *  @param enable_mpi_threads (IN)  Not used by this component
+ *  @return  malloc'ed array of module pointers, or NULL if the component was
+ *           not selected, no usable port was found, or an error occurred
+ */
+
+mca_btl_base_module_t** mca_btl_ud_component_init(int* num_btl_modules,
+                                                  bool enable_progress_threads,
+                                                  bool enable_mpi_threads)
+{
+    struct ibv_device **ib_devs;
+    struct ibv_device* ib_dev;
+    int32_t num_devs;
+    mca_btl_base_module_t** btls;
+    mca_btl_base_module_t** shrunk;
+    uint32_t i, j;
+    opal_list_t btl_list;
+    mca_btl_ud_module_t* ud_btl;
+    mca_btl_base_selected_module_t* ib_selected;
+    opal_list_item_t* item;
+    unsigned short seedv[3];
+    char* btl_str = NULL;
+    char* tok;
+    bool selected = false;
+
+    /* First, check if the UD BTL was specifically selected.
+       If not, then short out right away. */
+    mca_base_param_lookup_string(
+            mca_base_param_find("btl", NULL, NULL), &btl_str);
+    if(NULL == btl_str) {
+        /* Can't specify UD with out any string at all.. bail out */
+        return NULL;
+    }
+
+    /* Try to find an 'ofud' token.  strtok() must be re-invoked with NULL to
+       advance; without that the loop never terminates when the first token
+       is some other BTL. */
+    tok = strtok(btl_str, ",");
+    while(NULL != tok) {
+        if(!strcasecmp("ofud", tok)) {
+            selected = true;
+            break;
+        }
+        tok = strtok(NULL, ",");
+    }
+
+    /* The looked-up string is a copy owned by the caller */
+    free(btl_str);
+
+    if(!selected) {
+        /* No valid 'ofud' token found; bail out */
+        return NULL;
+    }
+
+    /* initialization */
+    *num_btl_modules = 0;
+    num_devs = 0;
+
+    seedv[0] = orte_process_info.my_name->vpid;
+    seedv[1] = opal_sys_timer_get_cycles();
+    seedv[2] = opal_sys_timer_get_cycles();
+    seed48(seedv);
+
+    ib_devs = ibv_get_device_list(&num_devs);
+
+    if(NULL == ib_devs || 0 == num_devs) {
+        if(NULL != ib_devs) {
+            ibv_free_device_list(ib_devs);
+        }
+        mca_btl_base_error_no_nics("OpenFabrics UD", "HCA");
+        mca_btl_ud_modex_send();
+        return NULL;
+    }
+
+    /** We must loop through all the hca id's, get their handles and
+        for each hca we query the number of ports on the hca and set up
+        a distinct btl module for each hca port */
+
+    OBJ_CONSTRUCT(&btl_list, opal_list_t);
+
+    for(i = 0; (int32_t)i < num_devs &&
+            mca_btl_ofud_component.num_btls < mca_btl_ofud_component.max_btls;
+            i++) {
+        struct ibv_device_attr ib_dev_attr;
+        struct ibv_context* ib_dev_context;
+
+        ib_dev = ib_devs[i];
+
+        ib_dev_context = ibv_open_device(ib_dev);
+        if(!ib_dev_context) {
+            BTL_ERROR(("error obtaining device context for %s: %s\n",
+                    ibv_get_device_name(ib_dev), strerror(errno)));
+            return mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+        }
+
+        if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
+            BTL_ERROR(("error obtaining device attributes for %s: %s\n",
+                    ibv_get_device_name(ib_dev), strerror(errno)));
+            return mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+        }
+
+
+        /* Note ports are 1 based hence j = 1 */
+        for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++) {
+            struct ibv_port_attr ib_port_attr;
+
+            if(ibv_query_port(ib_dev_context, (uint8_t)j, &ib_port_attr)) {
+                BTL_ERROR(("error getting port attributes for device %s port %d: %s",
+                        ibv_get_device_name(ib_dev), j, strerror(errno)));
+                return mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+            }
+
+            if(IBV_PORT_ACTIVE == ib_port_attr.state) {
+                /* Stage a private copy of the module template for this port */
+                ud_btl =
+                    (mca_btl_ud_module_t*)malloc(sizeof(mca_btl_ud_module_t));
+                if(NULL == ud_btl) {
+                    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+                    return mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+                }
+                memcpy(ud_btl, &mca_btl_ofud_module, sizeof(mca_btl_ud_module_t));
+
+                ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
+                if(NULL == ib_selected) {
+                    free(ud_btl);
+                    ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+                    return mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+                }
+                ib_selected->btl_module = (mca_btl_base_module_t*)ud_btl;
+
+                ud_btl->ib_dev = ib_dev;
+                ud_btl->ib_dev_context = ib_dev_context;
+                ud_btl->ib_port_num = (uint8_t)j;
+                ud_btl->addr.subnet = ib_port_attr.sm_lid;
+                ud_btl->addr.lid = ib_port_attr.lid;
+
+                opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
+                if(++mca_btl_ofud_component.num_btls >=
+                        mca_btl_ofud_component.max_btls) {
+                    break;
+                }
+            }
+        }
+    }
+
+    if(0 == mca_btl_ofud_component.num_btls) {
+        /* No active port anywhere: publish an empty address record (as the
+           no-device path above does) instead of relying on malloc(0). */
+        mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+        mca_btl_ud_modex_send();
+        return NULL;
+    }
+
+    /* Allocate space for btl modules */
+    mca_btl_ofud_component.ud_btls = (mca_btl_ud_module_t*)
+        malloc(sizeof(mca_btl_ud_module_t) * mca_btl_ofud_component.num_btls);
+    if(NULL == mca_btl_ofud_component.ud_btls) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+    }
+
+    btls = (struct mca_btl_base_module_t**)
+        malloc(mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_module_t*));
+    if(NULL == btls) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return mca_btl_ud_component_init_abort(&btl_list, ib_devs);
+    }
+
+
+    /* Move each staged module into slot i of the component array.  When a
+       module fails to initialize, the count shrinks and slot i is reused for
+       the next staged module, so every list entry is consumed exactly once. */
+    i = 0;
+    while(i < mca_btl_ofud_component.num_btls) {
+        item = opal_list_remove_first(&btl_list);
+        ib_selected = (mca_btl_base_selected_module_t*)item;
+        ud_btl = (mca_btl_ud_module_t*)ib_selected->btl_module;
+
+        memcpy(&(mca_btl_ofud_component.ud_btls[i]),
+                ud_btl, sizeof(mca_btl_ud_module_t));
+        OBJ_RELEASE(ib_selected);
+        free(ud_btl);
+
+        ud_btl = &mca_btl_ofud_component.ud_btls[i];
+
+        /* Initialize module state */
+        if(mca_btl_ud_module_init(ud_btl) != OMPI_SUCCESS) {
+            mca_btl_ofud_component.num_btls--;
+            continue;
+        }
+
+        btls[i] = &ud_btl->super;
+        i++;
+    }
+
+    OBJ_DESTRUCT(&btl_list);
+    mca_btl_ud_modex_send();
+
+    /* Not all modules may have initialized successfully, so trim the pointer
+       array.  The module array itself is deliberately NOT realloc'ed:
+       btls[] already holds pointers into it, and realloc is allowed to move
+       the block even when shrinking. */
+    if(0 == mca_btl_ofud_component.num_btls) {
+        free(btls);
+        btls = NULL;
+    } else {
+        shrunk = (struct mca_btl_base_module_t**)realloc(btls,
+                mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_module_t*));
+        if(NULL != shrunk) {
+            btls = shrunk;
+        }
+    }
+
+    *num_btl_modules = mca_btl_ofud_component.num_btls;
+
+    ibv_free_device_list(ib_devs);
+    return btls;
+}
+
+
+/*
+ *  IB component progress.
+ *
+ *  Polls the completion queue of every BTL module once (up to
+ *  MCA_BTL_UD_NUM_WC completions per module per call), runs the send or
+ *  receive callback for each completion, posts queued sends as send credits
+ *  are returned, and reposts all consumed receive buffers in one chain.
+ *
+ *  @return  number of completions handled, or OMPI_ERROR on a verbs failure
+ */
+
+#define MCA_BTL_UD_NUM_WC 500
+
+int mca_btl_ud_component_progress(void)
+{
+    uint32_t i;
+    int count = 0, ne, j;
+    mca_btl_ud_frag_t* frag;
+    struct ibv_recv_wr* bad_wr;
+    struct ibv_recv_wr* head_wr;
+    mca_btl_ud_module_t* ud_btl;
+    mca_btl_base_recv_reg_t* reg;
+    struct ibv_wc* cwc;
+    struct ibv_wc wc[MCA_BTL_UD_NUM_WC];
+
+    /* Poll for completions */
+    for(i = 0; i < mca_btl_ofud_component.num_btls; i++) {
+        ud_btl = &mca_btl_ofud_component.ud_btls[i];
+
+        ne = ibv_poll_cq(ud_btl->ib_cq, MCA_BTL_UD_NUM_WC, wc);
+        if(OPAL_UNLIKELY(ne < 0)) {
+            BTL_ERROR(("error polling CQ with %d: %s\n",
+                    ne, strerror(errno)));
+            return OMPI_ERROR;
+        }
+
+        head_wr = NULL;
+
+        for(j = 0; j < ne; j++) {
+            cwc = &wc[j];
+            if(OPAL_UNLIKELY(cwc->status != IBV_WC_SUCCESS)) {
+                /* wr_id is a uint64_t; cast so it matches %llu everywhere */
+                BTL_ERROR(("error polling CQ with status %d for wr_id %llu opcode %d\n",
+                        cwc->status, (unsigned long long)cwc->wr_id,
+                        cwc->opcode));
+                return OMPI_ERROR;
+            }
+
+            /* wr_id carries the fragment pointer (set in the frag
+               constructors) */
+            frag = (mca_btl_ud_frag_t*)(unsigned long)cwc->wr_id;
+
+            /* Handle work completions */
+            switch(frag->type) {
+            case MCA_BTL_UD_FRAG_SEND:
+            case MCA_BTL_UD_FRAG_USER:
+            {
+                /* Saved up front: frag is reused below for queued sends */
+                mca_btl_ud_endpoint_t* endpoint = frag->endpoint;
+                assert(cwc->opcode == IBV_WC_SEND);
+
+                frag->base.des_cbfunc(&ud_btl->super,
+                        frag->endpoint, &frag->base, OMPI_SUCCESS);
+
+                /* Increment send counter, post if any sends are queued */
+                OPAL_THREAD_ADD32(&endpoint->sd_wqe, 1);
+                if(OPAL_UNLIKELY(
+                        !opal_list_is_empty(&endpoint->pending_frags))) {
+                    OPAL_THREAD_LOCK(&endpoint->pending_frags_lock);
+                    frag = (mca_btl_ud_frag_t*)
+                        opal_list_remove_first(&endpoint->pending_frags);
+                    OPAL_THREAD_UNLOCK(&endpoint->pending_frags_lock);
+
+                    if(OPAL_LIKELY(NULL != frag)) {
+                        mca_btl_ud_endpoint_post_send(ud_btl, frag);
+                    }
+                }
+
+                OPAL_THREAD_ADD32(&ud_btl->sd_wqe, 1);
+                if(OPAL_UNLIKELY(
+                        !opal_list_is_empty(&ud_btl->pending_frags))) {
+                    OPAL_THREAD_LOCK(&ud_btl->ud_lock);
+                    frag = (mca_btl_ud_frag_t*)
+                        opal_list_remove_first(&ud_btl->pending_frags);
+                    OPAL_THREAD_UNLOCK(&ud_btl->ud_lock);
+
+                    if(OPAL_LIKELY(NULL != frag)) {
+                        mca_btl_ud_endpoint_post_send(ud_btl, frag);
+                    }
+                }
+
+                continue;
+            }
+            case MCA_BTL_UD_FRAG_RECV:
+                assert(cwc->opcode == IBV_WC_RECV);
+                reg = &ud_btl->ib_reg[frag->hdr->tag];
+
+                /* Payload follows the BTL header; byte_len also counts the
+                   IB header and the BTL header, so strip both */
+                frag->segment.seg_addr.pval = frag->hdr + 1;
+                frag->segment.seg_len = cwc->byte_len -
+                        sizeof(mca_btl_ud_header_t) -
+                        sizeof(mca_btl_ud_ib_header_t);
+
+                reg->cbfunc(&ud_btl->super,
+                        frag->hdr->tag, &frag->base, reg->cbdata);
+
+                /* Add recv to linked list for reposting */
+                frag->wr_desc.rd_desc.next = head_wr;
+                head_wr = &frag->wr_desc.rd_desc;
+                continue;
+            default:
+                BTL_ERROR(("Unhandled completion opcode %d frag type %d",
+                        cwc->opcode, frag->type));
+                break;
+            }
+        }
+
+        count += ne;
+
+        /* Repost any recv buffers all at once */
+        if(OPAL_LIKELY(head_wr)) {
+            if(OPAL_UNLIKELY(ibv_post_recv(
+                    ud_btl->ib_qp[0], head_wr, &bad_wr))) {
+                BTL_ERROR(("error posting recv: %s\n", strerror(errno)));
+                return OMPI_ERROR;
+            }
+
+            head_wr = NULL;
+        }
+    }
+
+    return count;
+}
+
diff --git a/ompi/mca/btl/ofud/btl_ofud_endpoint.c b/ompi/mca/btl/ofud/btl_ofud_endpoint.c
new file mode 100644
index 0000000000..08be4c4444
--- /dev/null
+++ b/ompi/mca/btl/ofud/btl_ofud_endpoint.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2005 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006      Sandia National Laboratories. All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+/* NOTE(review): the two #include directives below carry no header name in
+   this copy of the patch; restore them from the original commit. */
+#include
+#include
+
+#include "ompi_config.h"
+#include "opal/prefetch.h"
+#include "ompi/types.h"
+#include "ompi/class/ompi_free_list.h"
+
+#include "btl_ofud.h"
+#include "btl_ofud_endpoint.h"
+#include "btl_ofud_frag.h"
+
+
+static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
+static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
+
+
+/* Take one send credit from the downcounter sd_wqe.  If none is available,
+   give the credit back, append the frag to the given queue under the given
+   lock and RETURN OMPI_SUCCESS FROM THE CALLING FUNCTION; the frag is posted
+   later from the progress function.
+
+   It is applied twice in a row: first to the endpoint's counter and queue,
+   then to the BTL's.  Only if both have credits is the send posted.
+
+   No trailing semicolon after while(0): the call site supplies it, which
+   keeps the macro usable as a single statement. */
+#define CHECK_FRAG_QUEUES(sd_wqe, lock, queue, frag)                    \
+do {                                                                    \
+    if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&(sd_wqe), -1) < 0)) {           \
+        OPAL_THREAD_ADD32(&(sd_wqe), 1);                                \
+        OPAL_THREAD_LOCK(&(lock));                                      \
+        opal_list_append(&(queue),                                      \
+                (opal_list_item_t*)(frag));                             \
+        OPAL_THREAD_UNLOCK(&(lock));                                    \
+        return OMPI_SUCCESS;                                            \
+    }                                                                   \
+} while(0)
+
+
+/*
+ * Post a send to the work queue
+ *
+ * @param ud_btl (IN)  BTL module whose queue pairs are used
+ * @param frag (IN)    Send fragment; frag->endpoint selects the destination
+ * @return OMPI_SUCCESS, both when the send was posted and when the frag was
+ *         queued for lack of send credits
+ *
+ * NOTE(review): a frag queued on the BTL list has already taken an endpoint
+ * credit, and the progress function re-enters this function for it, which
+ * takes a second endpoint credit while the completion returns only one.
+ * This looks like a slow endpoint credit leak; confirm before relying on
+ * sd_num_peer.
+ */
+
+int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
+                                  mca_btl_ud_frag_t* frag)
+{
+    struct ibv_qp* ib_qp;
+    struct ibv_send_wr* bad_wr;
+    struct ibv_send_wr* wr = &frag->wr_desc.sr_desc;
+    mca_btl_ud_endpoint_t* endpoint = frag->endpoint;
+    int ret;
+
+    /* Have to be careful here - UD adds a 40 byte header, but it is not
+       included on the sending side. */
+    frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t);
+    wr->send_flags = IBV_SEND_SIGNALED;
+
+    /* Either of these may queue the frag and return from this function */
+    CHECK_FRAG_QUEUES(endpoint->sd_wqe,
+            endpoint->pending_frags_lock, endpoint->pending_frags, frag);
+    CHECK_FRAG_QUEUES(ud_btl->sd_wqe,
+            ud_btl->ud_lock, ud_btl->pending_frags, frag);
+
+    /* We avoid locking here by allowing our stripe counter to count
+       until it wraps around uint32_t.  This keeps the mod operation
+       out of the critical section, allowing us to use OPAL_THREAD_ADD32
+       instead of a full mutex. */
+    ib_qp = ud_btl->ib_qp[ud_btl->ib_qp_next % MCA_BTL_UD_NUM_QP];
+    OPAL_THREAD_ADD32(&ud_btl->ib_qp_next, 1);
+
+    wr->wr.ud.ah = endpoint->rmt_ah;
+    wr->wr.ud.remote_qpn = endpoint->rem_addr.qp_num;
+
+    /* Small messages are sent inline */
+    if(frag->sg_entry.length <= ud_btl->ib_inline_max) {
+        wr->send_flags =
+            IBV_SEND_SIGNALED|IBV_SEND_INLINE;
+    }
+
+    /*frag->hdr->src_qpnum = ud_btl->addr.qp_num;*/
+
+    MCA_BTL_UD_START_TIME(ibv_post_send);
+    if(OPAL_UNLIKELY((ret = ibv_post_send(ib_qp, wr, &bad_wr)))) {
+        /* NOTE(review): the failure is only logged; the function still
+           returns OMPI_SUCCESS and the two credits taken above are not
+           given back.  Callers are outside this file, so the return value
+           is left unchanged here. */
+        opal_output(0, "ep->sd_wqe %d btl->sd_wqe %d len %d ib_qp_next %d",
+                endpoint->sd_wqe, ud_btl->sd_wqe,
+                frag->sg_entry.length, ud_btl->ib_qp_next);
+        BTL_ERROR(("error posting send request: %d %s\n", ret, strerror(ret)));
+
+    }
+    MCA_BTL_UD_END_TIME(ibv_post_send);
+
+    return OMPI_SUCCESS;
+}
+
+
+OBJ_CLASS_INSTANCE(mca_btl_ud_endpoint_t,
+        opal_list_item_t, mca_btl_ud_endpoint_construct,
+        mca_btl_ud_endpoint_destruct);
+
+/*
+ * Construct/destruct an endpoint structure.
+ */
+
+static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
+{
+#if OMPI_ENABLE_DEBUG
+    /* Debug builds start from a zeroed remote address */
+    memset(&endpoint->rem_addr, 0, sizeof(struct mca_btl_ud_addr_t));
+#endif
+
+    OBJ_CONSTRUCT(&endpoint->pending_frags, opal_list_t);
+    OBJ_CONSTRUCT(&endpoint->pending_frags_lock, opal_mutex_t);
+
+    /* Per-peer send credits start at the sd_num_peer component parameter */
+    endpoint->sd_wqe = mca_btl_ofud_component.sd_num_peer;
+}
+
+static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
+{
+    /* TODO - what about any pending frags? */
+    OBJ_DESTRUCT(&endpoint->pending_frags);
+    OBJ_DESTRUCT(&endpoint->pending_frags_lock);
+}
+
diff --git a/ompi/mca/btl/ofud/btl_ofud_endpoint.h b/ompi/mca/btl/ofud/btl_ofud_endpoint.h
new file mode 100644
index 0000000000..d2086201aa
--- /dev/null
+++ b/ompi/mca/btl/ofud/btl_ofud_endpoint.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_IB_ENDPOINT_H +#define MCA_BTL_IB_ENDPOINT_H + +#include + +#include "opal/class/opal_list.h" +#include "opal/event/event.h" + +#include "btl_ofud.h" +#include "btl_ofud_frag.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +struct mca_btl_ud_addr_t { + uint32_t qp_num; + uint32_t psn; + uint16_t lid; + uint16_t subnet; +}; +typedef struct mca_btl_ud_addr_t mca_btl_ud_addr_t; + + +/** + * An abstraction that represents a connection to a endpoint process. + * An instance of mca_btl_base_endpoint_t is associated w/ each process + * and BTL pair and address information is exchanged at startup. + * The UD BTL is connectionless, so no connection is ever established. 
+ */ + +struct mca_btl_base_endpoint_t { + opal_list_item_t super; + + mca_btl_ud_addr_t rem_addr; + /**< Remote address information */ + /* No lock needed, read-only past initialization */ + + struct ibv_ah* rmt_ah; + /**< Remote address handle */ + /* No lock needed, verbs are thread-safe */ + + opal_list_t pending_frags; + opal_mutex_t pending_frags_lock; + /**< list of pending frags and lock */ + + int32_t sd_wqe; + /**< number of available send wqe entries */ + /* No lock needed, OPAL_THREAD_ADD32 is used */ +}; + +typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; +typedef mca_btl_base_endpoint_t mca_btl_ud_endpoint_t; +OBJ_CLASS_DECLARATION(mca_btl_ud_endpoint_t); + +int mca_btl_ud_endpoint_post_send(struct mca_btl_ud_module_t* ud_btl, + struct mca_btl_ud_frag_t * frag); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/ofud/btl_ofud_frag.c b/ompi/mca/btl/ofud/btl_ofud_frag.c new file mode 100644 index 0000000000..6fcd6f7515 --- /dev/null +++ b/ompi/mca/btl/ofud/btl_ofud_frag.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ofud.h" +#include "btl_ofud_frag.h" +#include "ompi/mca/mpool/rdma/mpool_rdma.h" + + +static inline void mca_btl_ud_frag_common_constructor(mca_btl_ud_frag_t* frag) +{ + frag->ud_reg = (mca_btl_ud_reg_t*)frag->base.super.registration; + frag->sg_entry.lkey = frag->ud_reg->mr->lkey; + frag->base.des_flags = 0; + frag->base.order = MCA_BTL_NO_ORDER; +} + + +static void mca_btl_ud_send_frag_constructor(mca_btl_ud_frag_t* frag) +{ + frag->type = MCA_BTL_UD_FRAG_SEND; + mca_btl_ud_frag_common_constructor(frag); + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + + /* We do not include the mca_btl_ud_ib_header_t data when sending */ + frag->hdr = frag->base.super.ptr; + frag->segment.seg_addr.pval = frag->hdr + 1; + + frag->sg_entry.addr = (unsigned long)frag->hdr; + + frag->wr_desc.sr_desc.wr_id = (unsigned long)frag; + frag->wr_desc.sr_desc.sg_list = &frag->sg_entry; + frag->wr_desc.sr_desc.num_sge = 1; + frag->wr_desc.sr_desc.opcode = IBV_WR_SEND; + frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; + frag->wr_desc.sr_desc.next = NULL; + frag->wr_desc.sr_desc.wr.ud.remote_qkey = mca_btl_ofud_component.ib_qkey; +} + + +static void mca_btl_ud_user_frag_constructor(mca_btl_ud_frag_t* frag) +{ + mca_btl_ud_send_frag_constructor(frag); + frag->type = MCA_BTL_UD_FRAG_USER; +} + + +static void mca_btl_ud_recv_frag_constructor(mca_btl_ud_frag_t* frag) +{ + frag->type = MCA_BTL_UD_FRAG_RECV; + mca_btl_ud_frag_common_constructor(frag); + frag->base.des_dst = &frag->segment; + frag->base.des_dst_cnt = 1; + frag->base.des_src = NULL; + frag->base.des_src_cnt = 0; + + /* Receive frag headers start 40 bytes later */ + frag->hdr = (mca_btl_ud_header_t*)((uintptr_t)frag->base.super.ptr + + sizeof(mca_btl_ud_ib_header_t)); + frag->segment.seg_addr.pval = frag->hdr + 1; + + frag->sg_entry.addr = 
(uintptr_t)frag->base.super.ptr; + frag->segment.seg_len = mca_btl_ofud_module.super.btl_eager_limit; + frag->sg_entry.length = mca_btl_ofud_module.super.btl_eager_limit + + sizeof(mca_btl_ud_ib_header_t) + sizeof(mca_btl_ud_header_t); + + frag->wr_desc.rd_desc.wr_id = (unsigned long)frag; + frag->wr_desc.rd_desc.sg_list = &frag->sg_entry; + frag->wr_desc.rd_desc.num_sge = 1; + frag->wr_desc.rd_desc.next = NULL; +} + + +OBJ_CLASS_INSTANCE(mca_btl_ud_frag_t, + mca_btl_base_descriptor_t, + NULL, + NULL); + +OBJ_CLASS_INSTANCE(mca_btl_ud_send_frag_t, + mca_btl_base_descriptor_t, + mca_btl_ud_send_frag_constructor, + NULL); + +OBJ_CLASS_INSTANCE(mca_btl_ud_user_frag_t, + mca_btl_base_descriptor_t, + mca_btl_ud_user_frag_constructor, + NULL); + +OBJ_CLASS_INSTANCE(mca_btl_ud_recv_frag_t, + mca_btl_base_descriptor_t, + mca_btl_ud_recv_frag_constructor, + NULL); + diff --git a/ompi/mca/btl/ofud/btl_ofud_frag.h b/ompi/mca/btl/ofud/btl_ofud_frag.h new file mode 100644 index 0000000000..3fb76e6f24 --- /dev/null +++ b/ompi/mca/btl/ofud/btl_ofud_frag.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_UD_FRAG_H +#define MCA_BTL_UD_FRAG_H + +#define MCA_BTL_IB_FRAG_ALIGN (8) + +#include + +#include "ompi/mca/mpool/rdma/mpool_rdma.h" + +#include "btl_ofud.h" + + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + + +/** + * Fragment types + */ +typedef enum { + MCA_BTL_UD_FRAG_SEND, + MCA_BTL_UD_FRAG_USER, + MCA_BTL_UD_FRAG_RECV +} mca_btl_ud_frag_type_t; + + +struct mca_btl_ud_reg_t { + mca_mpool_base_registration_t base; + struct ibv_mr* mr; +}; +typedef struct mca_btl_ud_reg_t mca_btl_ud_reg_t; + + +/* UD adds a 40 byte global routing header */ +/* This works in strange ways - the sending side does not need to explicitly + include this data in sg lists. Then, on the receiving side, the extra 40 + bytes magically appear. */ +struct mca_btl_ud_ib_header_t { + uint8_t ib_grh[40]; +}; +typedef struct mca_btl_ud_ib_header_t mca_btl_ud_ib_header_t; + +struct mca_btl_ud_header_t { + /*uint32_t src_qpnum;*/ + mca_btl_base_tag_t tag; +}; +typedef struct mca_btl_ud_header_t mca_btl_ud_header_t; + + +/** + * IB send fragment derived type. + */ + +struct mca_btl_ud_frag_t { + mca_btl_base_descriptor_t base; + mca_btl_base_segment_t segment; + + struct mca_btl_base_endpoint_t* endpoint; + + mca_btl_ud_frag_type_t type; + + union{ + struct ibv_recv_wr rd_desc; + struct ibv_send_wr sr_desc; + } wr_desc; + struct ibv_sge sg_entry; + + /* When this is a send frag, hdr points right after this, as expected. + But when this is a receive frag, we have an extra 40 bytes provided + by IB, so this points 40 bytes past the end of the frag. 
*/ + mca_btl_ud_header_t* hdr; + + mca_btl_ud_reg_t* ud_reg; +}; +typedef struct mca_btl_ud_frag_t mca_btl_ud_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_ud_frag_t); + +typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_t); + +typedef struct mca_btl_ud_frag_t mca_btl_ud_user_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_ud_user_frag_t); + +typedef struct mca_btl_ud_frag_t mca_btl_ud_recv_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_ud_recv_frag_t); + + +/* + * Allocate/return a UD/IB send/user fragment + */ + +#define MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc) \ +{ \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_GET(&((mca_btl_ud_module_t*)btl)->send_frags, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ +} + +#define MCA_BTL_UD_RETURN_FRAG(btl, frag) \ +{ \ + OMPI_FREE_LIST_RETURN( \ + &((mca_btl_ud_module_t*)btl)->send_frags, \ + (ompi_free_list_item_t*)(frag)); \ +} + + +#define MCA_BTL_UD_ALLOC_USER_FRAG(btl, frag, rc) \ +{ \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_GET(&((mca_btl_ud_module_t*)btl)->user_frags, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ +} + +#define MCA_BTL_UD_RETURN_USER_FRAG(btl, frag) \ +{ \ + OMPI_FREE_LIST_RETURN( \ + &((mca_btl_ud_module_t*)btl)->user_frags, \ + (ompi_free_list_item_t*)(frag)); \ +} + + +struct mca_btl_ud_module_t; + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/ofud/btl_ofud_proc.c b/ompi/mca/btl/ofud/btl_ofud_proc.c new file mode 100644 index 0000000000..1e5d334855 --- /dev/null +++ b/ompi/mca/btl/ofud/btl_ofud_proc.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi/mca/pml/base/pml_base_module_exchange.h" + +#include "btl_ofud.h" +#include "btl_ofud_proc.h" + + +static void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc); +static void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc); + +OBJ_CLASS_INSTANCE(mca_btl_ud_proc_t, + opal_list_item_t, mca_btl_ud_proc_construct, + mca_btl_ud_proc_destruct); + +void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc) +{ + proc->proc_ompi = 0; + proc->proc_addr_count = 0; + proc->proc_endpoints = 0; + proc->proc_endpoint_count = 0; + OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t); + + /* add to list of all proc instance */ + OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock); + opal_list_append(&mca_btl_ofud_component.ud_procs, &proc->super); + OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock); +} + +void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc) +{ + /* remove from list of all proc instances */ + OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock); + opal_list_remove_item(&mca_btl_ofud_component.ud_procs, &proc->super); + OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock); + + /* release resources */ + if(NULL != proc->proc_endpoints) { + free(proc->proc_endpoints); + } +} + + +/* + * Look for an existing IB process instance based on the associated + * ompi_proc_t instance. 
+ */
+
+mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc)
+{
+    mca_btl_ud_proc_t* found = NULL;
+    opal_list_item_t* item;
+
+    /* the component lock protects the list of all proc instances */
+    OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock);
+
+    for(item = opal_list_get_first(&mca_btl_ofud_component.ud_procs);
+        item != opal_list_get_end(&mca_btl_ofud_component.ud_procs);
+        item = opal_list_get_next(item)) {
+        mca_btl_ud_proc_t* candidate = (mca_btl_ud_proc_t*)item;
+
+        if(candidate->proc_ompi == ompi_proc) {
+            found = candidate;
+            break;
+        }
+    }
+
+    OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock);
+    return found;
+}
+
+
+/*
+ * Create a IB process structure. There is a one-to-one correspondence
+ * between a ompi_proc_t and a mca_btl_ud_proc_t instance. We cache
+ * additional data (specifically the list of mca_btl_ud_endpoint_t instances,
+ * and published addresses) associated w/ a given destination on this
+ * datastructure.
+ *
+ * Returns the already existing instance when there is one.  Returns NULL
+ * when the instance cannot be allocated, when the peer address info
+ * cannot be retrieved, when its size is not a multiple of
+ * sizeof(mca_btl_ud_addr_t), when the peer published no address at all,
+ * or when the endpoint array cannot be allocated.
+ *
+ * NOTE(review): the lookup drops the component lock before the new
+ * instance is put on the list (by its constructor), so lookup + create
+ * is not one atomic step.
+ */
+
+mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
+{
+    mca_btl_ud_proc_t* module_proc;
+    size_t size = 0;
+    int rc;
+
+    /* Reuse the IB proc structure if one was already created
+     * for this ompi process */
+    module_proc = mca_btl_ud_proc_lookup_ompi(ompi_proc);
+    if(NULL != module_proc) {
+        return module_proc;
+    }
+
+    /* First time we see this peer: create a new IB proc for it */
+    module_proc = OBJ_NEW(mca_btl_ud_proc_t);
+    if(NULL == module_proc) {
+        /* allocation failed, nothing to release */
+        return NULL;
+    }
+
+    /* Initialize number of peer */
+    module_proc->proc_endpoint_count = 0;
+    module_proc->proc_ompi = ompi_proc;
+
+    /* build a unique identifier (of arbitrary size) to represent the proc */
+    module_proc->proc_guid = ompi_proc->proc_name;
+
+    /* query for the peer address info */
+    rc = mca_pml_base_modex_recv(&mca_btl_ofud_component.super.btl_version,
+                                 ompi_proc, (void*)&module_proc->proc_addrs,
+                                 &size);
+
+    if(OMPI_SUCCESS != rc) {
+        opal_output(0,
+                "[%s:%d] mca_pml_base_modex_recv failed for peer [%ld,%ld,%ld]",
+                __FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
+        OBJ_RELEASE(module_proc);
+        return NULL;
+    }
+
+    /* the payload must be a whole number of address records */
+    if((size % sizeof(mca_btl_ud_addr_t)) != 0) {
+        opal_output(0, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
+                __FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
+        OBJ_RELEASE(module_proc);
+        return NULL;
+    }
+
+    module_proc->proc_addr_count = size / sizeof(mca_btl_ud_addr_t);
+
+    /* a peer that published no address cannot be reached through this BTL */
+    if(0 == module_proc->proc_addr_count) {
+        module_proc->proc_endpoints = NULL;
+        OBJ_RELEASE(module_proc);
+        return NULL;
+    }
+
+    /* one endpoint slot per published address */
+    module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
+        malloc(module_proc->proc_addr_count *
+               sizeof(mca_btl_base_endpoint_t*));
+    if(NULL == module_proc->proc_endpoints) {
+        OBJ_RELEASE(module_proc);
+        return NULL;
+    }
+
+    return module_proc;
+}
+
+
+/*
+ * Insert an endpoint into the proc array and assign it an address.
+ *
+ * The endpoint takes the next free slot and receives a copy of the
+ * published address with the same index.  Both arrays hold
+ * proc_addr_count entries, so once every published address has been
+ * handed out the call fails with OMPI_ERR_UNREACH instead of running
+ * past the end of the arrays.
+ *
+ * MUST be called with the proc lock held!
+ */
+
+int mca_btl_ud_proc_insert(mca_btl_ud_proc_t* module_proc,
+        mca_btl_base_endpoint_t* module_endpoint)
+{
+    size_t slot = module_proc->proc_endpoint_count;
+
+    if(slot >= module_proc->proc_addr_count) {
+        /* no published address left for this endpoint */
+        return OMPI_ERR_UNREACH;
+    }
+
+    module_endpoint->rem_addr = module_proc->proc_addrs[slot];
+    module_proc->proc_endpoints[slot] = module_endpoint;
+    module_proc->proc_endpoint_count = slot + 1;
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * Remove an endpoint from the proc array.
+ *
+ * Takes the proc lock, looks for the endpoint and, when it is found,
+ * shifts the following entries down by one.  The proc itself is released
+ * once its last endpoint has been removed.  Always returns OMPI_SUCCESS,
+ * also when the endpoint is not part of the array.
+ */
+
+int mca_btl_ud_proc_remove(mca_btl_ud_proc_t* proc,
+        mca_btl_base_endpoint_t* endpoint)
+{
+    size_t idx;
+    size_t tail;
+
+    OPAL_THREAD_LOCK(&proc->proc_lock);
+
+    /* find the slot that holds this endpoint, if any */
+    idx = 0;
+    while(idx < proc->proc_endpoint_count &&
+          proc->proc_endpoints[idx] != endpoint) {
+        idx++;
+    }
+
+    if(idx == proc->proc_endpoint_count) {
+        /* not one of ours: nothing to do */
+        OPAL_THREAD_UNLOCK(&proc->proc_lock);
+        return OMPI_SUCCESS;
+    }
+
+    /* close the gap left by the removed entry */
+    tail = proc->proc_endpoint_count - idx - 1;
+    memmove(&proc->proc_endpoints[idx], &proc->proc_endpoints[idx + 1],
+            tail * sizeof(mca_btl_base_endpoint_t*));
+    proc->proc_endpoint_count--;
+
+    if(0 == proc->proc_endpoint_count) {
+        /* last endpoint is gone: drop the lock first, then the proc */
+        OPAL_THREAD_UNLOCK(&proc->proc_lock);
+        OBJ_RELEASE(proc);
+        return OMPI_SUCCESS;
+    }
+
+    OPAL_THREAD_UNLOCK(&proc->proc_lock);
+    return OMPI_SUCCESS;
+}
+
diff --git a/ompi/mca/btl/ofud/btl_ofud_proc.h b/ompi/mca/btl/ofud/btl_ofud_proc.h
new file mode 100644
index 0000000000..c9994f8afa
--- /dev/null
+++ b/ompi/mca/btl/ofud/btl_ofud_proc.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2006      Sandia National Laboratories. All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_BTL_UD_PROC_H
+#define MCA_BTL_UD_PROC_H
+
+#include "opal/class/opal_object.h"
+#include "orte/mca/ns/ns.h"
+#include "ompi/proc/proc.h"
+
+#include "btl_ofud.h"
+#include "btl_ofud_endpoint.h"
+
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * Represents the state of a remote process and the set of addresses
+ * that it exports. Also cache an instance of mca_btl_base_endpoint_t for
+ * each BTL instance that attempts to open a connection to the process.
+ */
+
+struct mca_btl_ud_proc_t {
+    opal_list_item_t super;
+    /**< allow proc to be placed on a list */
+
+    ompi_proc_t *proc_ompi;
+    /**< pointer to corresponding ompi_proc_t */
+
+    orte_process_name_t proc_guid;
+    /**< globally unique identifier for the process */
+
+    struct mca_btl_ud_addr_t* proc_addrs; /**< peer address array (modex) */
+    size_t proc_addr_count;
+    /**< number of addresses published by endpoint (entries in proc_addrs) */
+
+    struct mca_btl_base_endpoint_t **proc_endpoints;
+    /**< array of endpoints that have been created to access this proc */
+
+    size_t proc_endpoint_count;
+    /**< number of endpoints currently stored in proc_endpoints */
+
+    opal_mutex_t proc_lock;
+    /**< lock to protect against concurrent access to proc state */
+};
+typedef struct mca_btl_ud_proc_t mca_btl_ud_proc_t;
+OBJ_CLASS_DECLARATION(mca_btl_ud_proc_t);
+
+/* Return the proc already created for ompi_proc, or NULL if none. */
+mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc);
+/* Return the existing proc for ompi_proc or create one; NULL on failure. */
+mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc);
+/* Store the endpoint and assign its address; proc_lock must be held. */
+int mca_btl_ud_proc_insert(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*);
+/* Drop the endpoint; the proc is released with its last endpoint. */
+int mca_btl_ud_proc_remove(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*);
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/ompi/mca/btl/ofud/configure.m4 b/ompi/mca/btl/ofud/configure.m4
new file mode 100644
index 0000000000..4d96a69a36
--- /dev/null
+++ b/ompi/mca/btl/ofud/configure.m4
@@ -0,0 +1,43 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2006      Sandia National Laboratories. All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+
+# MCA_btl_ofud_CONFIG([action-if-can-compile],
+#                      [action-if-cant-compile])
+# ------------------------------------------------
+# Invokes the shared OpenIB check macro with the btl_ofud prefix and
+# records the outcome in btl_ofud_happy.  On "yes" the component's
+# LDFLAGS and LIBS are copied into the matching WRAPPER_EXTRA variables
+# and action-if-can-compile is expanded; otherwise action-if-cant-compile
+# is expanded.  The four btl_ofud build variables are substituted in
+# both cases.
+AC_DEFUN([MCA_btl_ofud_CONFIG],[
+    OMPI_CHECK_OPENIB([btl_ofud],
+                        [btl_ofud_happy="yes"],
+                        [btl_ofud_happy="no"])
+
+    AS_IF([test "$btl_ofud_happy" = "yes"],
+          [btl_ofud_WRAPPER_EXTRA_LDFLAGS="$btl_ofud_LDFLAGS"
+           btl_ofud_WRAPPER_EXTRA_LIBS="$btl_ofud_LIBS"
+           $1],
+          [$2])
+
+
+    # substitute in the things needed to build OFUD
+    AC_SUBST([btl_ofud_CFLAGS])
+    AC_SUBST([btl_ofud_CPPFLAGS])
+    AC_SUBST([btl_ofud_LDFLAGS])
+    AC_SUBST([btl_ofud_LIBS])
+])dnl
diff --git a/ompi/mca/btl/ofud/configure.params b/ompi/mca/btl/ofud/configure.params
new file mode 100644
index 0000000000..b0b9b60057
--- /dev/null
+++ b/ompi/mca/btl/ofud/configure.params
@@ -0,0 +1,26 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2006      Sandia National Laboratories. All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Specific to this module
+
+PARAM_INIT_FILE=btl_ofud.c
+PARAM_CONFIG_HEADER_FILE="ofud_config.h"
+PARAM_CONFIG_FILES="Makefile"