diff --git a/ompi/mca/btl/ud/Makefile.am b/ompi/mca/btl/ud/Makefile.am new file mode 100644 index 0000000000..59946b0042 --- /dev/null +++ b/ompi/mca/btl/ud/Makefile.am @@ -0,0 +1,68 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Sandia National Laboratories. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Use the top-level Makefile.options + + + +AM_CPPFLAGS=$(btl_ud_CPPFLAGS) + +sources = \ + btl_ud.c \ + btl_ud.h \ + btl_ud_component.c \ + btl_ud_endpoint.c \ + btl_ud_endpoint.h \ + btl_ud_frag.c \ + btl_ud_frag.h \ + btl_ud_proc.c \ + btl_ud_proc.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if OMPI_BUILD_btl_ud_DSO +lib = +lib_sources = +component = mca_btl_ud.la +component_sources = $(sources) +else +lib = libmca_btl_ud.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(libdir)/openmpi +mcacomponent_LTLIBRARIES = $(component) +mca_btl_ud_la_SOURCES = $(component_sources) +mca_btl_ud_la_LDFLAGS = -module -avoid-version $(btl_ud_LDFLAGS) +mca_btl_ud_la_LIBADD = \ + $(btl_ud_LIBS) \ + $(top_ompi_builddir)/ompi/libmpi.la \ + $(top_ompi_builddir)/orte/liborte.la \ + $(top_ompi_builddir)/opal/libopal.la + + +noinst_LTLIBRARIES = $(lib) +libmca_btl_ud_la_SOURCES = $(lib_sources) +libmca_btl_ud_la_LDFLAGS= -module -avoid-version $(btl_ud_LDFLAGS) +libmca_btl_ud_la_LIBADD=$(btl_ud_LIBS) diff --git a/ompi/mca/btl/ud/btl_ud.c b/ompi/mca/btl/ud/btl_ud.c new file mode 100644 index 0000000000..cf4e280a34 --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud.c @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include +#include +#include "opal/util/output.h" +#include "opal/util/if.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/btl_base_error.h" +#include "btl_ud.h" +#include "btl_ud_frag.h" +#include "btl_ud_proc.h" +#include "btl_ud_endpoint.h" +#include "ompi/datatype/convertor.h" +#include "ompi/datatype/datatype.h" +#include "ompi/mca/mpool/base/base.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/mpool/openib/mpool_openib.h" +#include +#include +#include + + +mca_btl_ud_module_t mca_btl_ud_module = { + { + &mca_btl_ud_component.super, + 0, /* max size of first fragment */ + 0, /* min send fragment size */ + 0, /* max send fragment size */ + 0, /* min rdma fragment size */ + 0, /* max rdma fragment size */ + 0, /* exclusivity */ + 0, /* latency */ + 0, /* bandwidth */ + MCA_BTL_FLAGS_SEND, + mca_btl_ud_add_procs, + mca_btl_ud_del_procs, + mca_btl_ud_register, + mca_btl_ud_finalize, + /* we need alloc free, pack */ + mca_btl_ud_alloc, + mca_btl_ud_free, + mca_btl_ud_prepare_src, + NULL, /*mca_btl_ud_prepare_dst */ + mca_btl_ud_send, + NULL, /*mca_btl_ud_put */ + NULL, /*mca_btl_ud_get */ + mca_btl_ud_dump + } +}; + + + +/* + * add a proc to this btl module + * creates an endpoint that is setup on the + * first send to the endpoint + */ +int mca_btl_ud_add_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **ompi_procs, + struct mca_btl_base_endpoint_t** peers, + ompi_bitmap_t* reachable) +{ + mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl; + int i, rc; + + for(i = 0; i < (int) nprocs; i++) { + + struct ompi_proc_t* ompi_proc = ompi_procs[i]; + mca_btl_ud_proc_t* ib_proc; + mca_btl_base_endpoint_t* ib_peer; + + if(NULL == (ib_proc = mca_btl_ud_proc_create(ompi_proc))) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Check to make sure that the peer has at least 
as many interface + * addresses exported as we are trying to use. If not, then + * don't bind this PTL instance to the proc. + */ + + OPAL_THREAD_LOCK(&ib_proc->proc_lock); + + /* The btl_proc datastructure is shared by all IB PTL + * instances that are trying to reach this destination. + * Cache the peer instance on the btl_proc. + */ + ib_peer = OBJ_NEW(mca_btl_ud_endpoint_t); + if(NULL == ib_peer) { + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ib_peer->endpoint_btl = ud_btl; + ib_peer->subnet = ud_btl->port_info.subnet; + rc = mca_btl_ud_proc_insert(ib_proc, ib_peer); + if(rc != OMPI_SUCCESS) { + OBJ_RELEASE(ib_peer); + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + continue; + } + + ompi_bitmap_set_bit(reachable, i); + OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); + peers[i] = ib_peer; + } +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + if(mca_btl_ud_component.use_srq) { + ud_btl->rd_num = mca_btl_ud_component.rd_num + log2(nprocs) * mca_btl_ud_component.srq_rd_per_peer; + if(ud_btl->rd_num > mca_btl_ud_component.srq_rd_max) + ud_btl->rd_num = mca_btl_ud_component.srq_rd_max; + ud_btl->rd_low = ud_btl->rd_num - 1; + } +#endif + return OMPI_SUCCESS; +} + + +/* + * delete the proc as reachable from this btl module + */ +int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t ** peers) +{ + BTL_DEBUG(("TODO\n")); + return OMPI_SUCCESS; +} + +/* + *Register callback function to support send/recv semantics + */ +int mca_btl_ud_register( + struct mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_module_recv_cb_fn_t cbfunc, + void* cbdata) +{ + + mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*) btl; + + OPAL_THREAD_LOCK(&ud_btl->ib_lock); + ud_btl->ib_reg[tag].cbfunc = cbfunc; + ud_btl->ib_reg[tag].cbdata = cbdata; + OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); + return OMPI_SUCCESS; +} + + +/** + * Allocate a segment. 
+ * + * @param btl (IN) BTL module + * @param size (IN) Request segment size. + * + * When allocating a segment we pull a pre-alllocated segment + * from one of two free lists, an eager list and a max list + */ +mca_btl_base_descriptor_t* mca_btl_ud_alloc( + struct mca_btl_base_module_t* btl, + size_t size) +{ + mca_btl_ud_frag_t* frag; + int rc; + + if(size <= mca_btl_ud_component.eager_limit){ + MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc); + frag->segment.seg_len = size; + } else if(size <= mca_btl_ud_component.max_send_size) { + MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc); + frag->segment.seg_len = size; + } else { + return NULL; + } + + frag->base.des_flags = 0; + return (mca_btl_base_descriptor_t*)frag; +} + + +/** + * Return a segment + * + * Return the segment to the appropriate + * preallocated segment list + */ +int mca_btl_ud_free(struct mca_btl_base_module_t* btl, + mca_btl_base_descriptor_t* des) +{ + mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)des; + + if(frag->size == 0) { + btl->btl_mpool->mpool_release(btl->btl_mpool, + (mca_mpool_base_registration_t*) + frag->ud_reg); + MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag); + + } + else if(frag->size == mca_btl_ud_component.max_send_size){ + MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag); + } else if(frag->size == mca_btl_ud_component.eager_limit){ + MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag); + } else { + BTL_ERROR(("invalid descriptor")); + } + + return OMPI_SUCCESS; +} + + +/** + * register user buffer or pack + * data into pre-registered buffer and return a + * descriptor that can be + * used for send/put. + * + * @param btl (IN) BTL module + * @param peer (IN) BTL peer addressing + * + * prepare source's behavior depends on the following: + * Has a valid memory registration been passed to prepare_src? 
+ * if so we attempt to use the pre-registred user-buffer, if the memory registration + * is to small (only a portion of the user buffer) then we must reregister the user buffer + * Has the user requested the memory to be left pinned? + * if so we insert the memory registration into a memory tree for later lookup, we + * may also remove a previous registration if a MRU (most recently used) list of + * registions is full, this prevents resources from being exhausted. + * Is the requested size larger than the btl's max send size? + * if so and we aren't asked to leave the registration pinned than we register the memory if + * the users buffer is contiguous + * Otherwise we choose from two free lists of pre-registered memory in which to pack the data into. + * + */ +mca_btl_base_descriptor_t* mca_btl_ud_prepare_src( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + mca_mpool_base_registration_t* registration, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size +) +{ + mca_btl_ud_module_t* ud_btl; + mca_btl_ud_frag_t* frag; + mca_mpool_openib_registration_t * ud_reg; + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data = *size; + int32_t free_after; + int rc; + + ud_btl = (mca_btl_ud_module_t*) btl; + ud_reg = (mca_mpool_openib_registration_t*) registration; + + if(NULL != ud_reg && 0 == ompi_convertor_need_buffers(convertor)){ + size_t reg_len; + + /* the memory is already pinned and we have contiguous user data */ + + MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); + if(NULL == frag){ + return NULL; + } + + iov.iov_len = max_data; + iov.iov_base = NULL; + + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); + + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + + reg_len = (unsigned char*)ud_reg->base_reg.bound - (unsigned char*)iov.iov_base + 1; + + frag->sg_entry.length = max_data; + frag->sg_entry.lkey = ud_reg->mr->lkey; + + frag->sg_entry.addr = 
(unsigned long) iov.iov_base; + + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + frag->ud_reg = ud_reg; + btl->btl_mpool->mpool_retain(btl->btl_mpool, (mca_mpool_base_registration_t*) ud_reg); + return &frag->base; + + } else if( max_data > btl->btl_max_send_size && + ompi_convertor_need_buffers(convertor) == 0 && + reserve == 0) { + /* The user buffer is contigous and we are asked to send more than the max send size. */ + + MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); + if(NULL == frag){ + return NULL; + } + + iov.iov_len = max_data; + iov.iov_base = NULL; + + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); + + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + frag->base.des_flags = 0; + + + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, + iov.iov_base, + max_data, + 0, + (mca_mpool_base_registration_t**) &ud_reg); + if(OMPI_SUCCESS != rc || NULL == ud_reg) { + BTL_ERROR(("mpool_register(%p,%lu) failed", iov.iov_base, max_data)); + MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag); + return NULL; + } + + + frag->sg_entry.length = max_data; + frag->sg_entry.lkey = ud_reg->mr->lkey; + + frag->sg_entry.addr = (unsigned long) iov.iov_base; + + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->ud_reg = ud_reg; + + return &frag->base; + + } else + if (max_data+reserve <= btl->btl_eager_limit) { + /* the data is small enough to fit in the eager frag and + either we received no prepinned memory or leave pinned is + not set + */ + MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc); + if(NULL == frag) { + return NULL; + } + + iov.iov_len = max_data; + iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; + + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); + *size = max_data; + if( rc < 0 ) { + 
MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag); + return NULL; + } + + frag->segment.seg_len = max_data + reserve; + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + + return &frag->base; + + } else { + MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc); + if(NULL == frag) { + return NULL; + } + if(max_data + reserve > btl->btl_max_send_size){ + max_data = btl->btl_max_send_size - reserve; + } + iov.iov_len = max_data; + iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; + + rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); + *size = max_data; + + if( rc < 0 ) { + MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag); + return NULL; + } + + frag->segment.seg_len = max_data + reserve; + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags=0; + + return &frag->base; + } + return NULL; +} + + +int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl) +{ + mca_btl_ud_module_t* ud_btl; + ud_btl = (mca_btl_ud_module_t*) btl; + + return OMPI_SUCCESS; +} + +/* + * Initiate a send. 
+ */
+
+int mca_btl_ud_send(
+    struct mca_btl_base_module_t* btl,
+    struct mca_btl_base_endpoint_t* endpoint,
+    struct mca_btl_base_descriptor_t* descriptor,
+    mca_btl_base_tag_t tag)
+
+{
+    int rc;
+
+    mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)descriptor;
+    MCA_BTL_UD_START_TIME(post_send);
+    frag->endpoint = endpoint;
+    frag->hdr->tag = tag;
+    rc = mca_btl_ud_endpoint_send(endpoint, frag);
+    MCA_BTL_UD_END_TIME(post_send);
+    return rc;
+}
+
+
+/*
+ * Initialize the btl module by allocating a protection domain
+ * and creating both the high and low priority completion queues
+ */
+int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
+{
+    /* Allocate Protection Domain */
+    struct ibv_context *ctx;
+
+    ctx = ud_btl->ib_dev_context;
+
+    ud_btl->ib_pd = ibv_alloc_pd(ctx);
+    if(NULL == ud_btl->ib_pd) {
+        BTL_ERROR(("error allocating pd for %s errno says %s\n",
+                   ibv_get_device_name(ud_btl->ib_dev),
+                   strerror(errno)));
+        return OMPI_ERROR;
+    }
+
+#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
+    if(mca_btl_ud_component.use_srq) {
+        struct ibv_srq_init_attr attr;
+        attr.attr.max_wr = mca_btl_ud_component.srq_rd_max;
+        attr.attr.max_sge = mca_btl_ud_component.ib_sg_list_size;
+
+        ud_btl->srd_posted_hp = 0;
+        ud_btl->srd_posted_lp = 0;
+
+        ud_btl->srq_hp = ibv_create_srq(ud_btl->ib_pd, &attr);
+        if(NULL == ud_btl->srq_hp) {
+            BTL_ERROR(("error in ibv_create_srq\n"));
+            return OMPI_ERROR;
+        }
+
+        ud_btl->srq_lp = ibv_create_srq(ud_btl->ib_pd, &attr);
+        /* check the low-priority SRQ that was just created, not srq_hp again */
+        if(NULL == ud_btl->srq_lp) {
+            BTL_ERROR(("error in ibv_create_srq\n"));
+            return OMPI_ERROR;
+        }
+
+    } else {
+        ud_btl->srq_hp = NULL;
+        ud_btl->srq_lp = NULL;
+    }
+#endif
+
+    /* Create the low and high priority completion queues */
+#if OMPI_MCA_BTL_OPENIB_IBV_CREATE_CQ_ARGS == 3
+    ud_btl->ib_cq_lp =
+        ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size, NULL);
+#else
+    ud_btl->ib_cq_lp =
+        ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size,
+                      NULL, NULL, 0);
+#endif
+
+    if(NULL == ud_btl->ib_cq_lp) {
+        BTL_ERROR(("error 
creating low priority cq for %s errno says %s\n", + ibv_get_device_name(ud_btl->ib_dev), + strerror(errno))); + return OMPI_ERROR; + } + +#if OMPI_MCA_BTL_OPENIB_IBV_CREATE_CQ_ARGS == 3 + ud_btl->ib_cq_hp = + ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size, NULL); +#else + ud_btl->ib_cq_hp = + ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size, + NULL, NULL, 0); +#endif + + if(NULL == ud_btl->ib_cq_hp) { + BTL_ERROR(("error creating high priority cq for %s errno says %s\n", + ibv_get_device_name(ud_btl->ib_dev), + strerror(errno))); + return OMPI_ERROR; + } + + /* Set up the QPs for this BTL */ + if(OMPI_SUCCESS != mca_btl_ud_endpoint_init_qp(&ud_btl->super, + ud_btl->ib_cq_hp, +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + ud_btl->srq_hp, +#endif + &ud_btl->qp_hp, + ud_btl->psn_hp)) { + return OMPI_ERROR; + } + + if(OMPI_SUCCESS != mca_btl_ud_endpoint_init_qp(&ud_btl->super, + ud_btl->ib_cq_lp, +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + ud_btl->srq_lp, +#endif + &ud_btl->qp_lp, + ud_btl->psn_lp)) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +/* + * Dump profiling information + */ +void mca_btl_ud_dump( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose) +{ + + mca_btl_base_dump(btl, endpoint, verbose); +} + diff --git a/ompi/mca/btl/ud/btl_ud.h b/ompi/mca/btl/ud/btl_ud.h new file mode 100644 index 0000000000..5704dca7bd --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud.h @@ -0,0 +1,597 @@ + +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_PTL_UD_H +#define MCA_PTL_UD_H + +/* Standard system includes */ +#include +#include + +/* Open MPI includes */ +#include "ompi/class/ompi_free_list.h" +#include "ompi/class/ompi_bitmap.h" +#include "orte/class/orte_pointer_array.h" +#include "opal/class/opal_value_array.h" +#include "opal/event/event.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "opal/util/output.h" +#include "opal/sys/timer.h" +#include "ompi/mca/mpool/mpool.h" +#include "ompi/mca/btl/base/btl_base_error.h" + +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/base.h" +#include "btl_ud_endpoint.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +#define MCA_BTL_IB_LEAVE_PINNED 1 + +/** + * UD Infiniband (IB) BTL component. + */ + +struct mca_btl_ud_component_t { + mca_btl_base_component_1_0_0_t super; /**< base BTL component */ + + uint32_t ib_max_btls; + /**< maximum number of hcas available to the IB component */ + + uint32_t ib_num_btls; + /**< number of hcas available to the IB component */ + + struct mca_btl_ud_module_t *ud_btls; + /**< array of available PTLs */ + + int ib_free_list_num; + /**< initial size of free lists */ + + int ib_free_list_max; + /**< maximum size of free lists */ + + int ib_free_list_inc; + /**< number of elements to alloc when growing free lists */ + + opal_list_t ib_procs; + /**< list of ib proc structures */ + + opal_mutex_t ib_lock; + /**< lock for accessing module state */ + + char* ib_mpool_name; + /**< name of ib memory pool */ + + int32_t sd_num; /**< maximum number of send descriptors to post to a QP */ + int32_t rd_num; /**< number of receive descriptors to post to each QP */ + int32_t rd_low; /**< low water mark to reach before re-posting receive descriptors */ + + int32_t srq_rd_max; /* maximum number of 
receive descriptors posted */ + int32_t srq_rd_per_peer; /* number of receive descriptors to post per log2(peers) in SRQ mode */ + int32_t srq_sd_max; /* maximum number of send descriptors posted */ + + size_t eager_limit; + size_t max_send_size; + uint32_t reg_mru_len; + uint32_t use_srq; + + uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */ + uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ*/ + uint32_t ib_pkey_ix; + uint32_t ib_qkey; + uint32_t ib_psn; + uint32_t ib_service_level; + uint32_t ib_src_path_bits; + +}; typedef struct mca_btl_ud_component_t mca_btl_ud_component_t; + +extern mca_btl_ud_component_t mca_btl_ud_component; + +typedef mca_btl_base_recv_reg_t mca_btl_ud_recv_reg_t; + + +/** + * Profiling variables + */ + +#if OMPI_ENABLE_DEBUG +#define MCA_BTL_UD_ENABLE_PROFILE 1 +#else +#define MCA_BTL_UD_ENABLE_PROFILE 0 +#endif + +#if MCA_BTL_UD_ENABLE_PROFILE + +#define MCA_BTL_UD_PROFILE_VAR(var) \ + opal_timer_t avg_ ## var; \ + opal_timer_t cnt_ ## var; \ + opal_timer_t tmp_ ## var + +struct mca_btl_ud_profile_t +{ + MCA_BTL_UD_PROFILE_VAR(post_send); + MCA_BTL_UD_PROFILE_VAR(endpoint_send_conn); + MCA_BTL_UD_PROFILE_VAR(ibv_post_send); + MCA_BTL_UD_PROFILE_VAR(full_send); +}; + +typedef struct mca_btl_ud_profile_t mca_btl_ud_profile_t; +extern mca_btl_ud_profile_t mca_btl_ud_profile; + +#endif + + +/** + * IB PTL Interface + */ +struct mca_btl_ud_module_t { + mca_btl_base_module_t super; /**< base PTL interface */ + mca_btl_ud_recv_reg_t ib_reg[256]; + mca_btl_ud_port_info_t port_info; /* contains only the subnet right now */ + uint8_t port_num; /**< ID of the PORT */ + struct ibv_device *ib_dev; /* the ib device */ + struct ibv_context *ib_dev_context; + struct ibv_pd *ib_pd; + struct ibv_cq *ib_cq_hp; + struct ibv_cq *ib_cq_lp; + struct ibv_port_attr* ib_port_attr; + + ompi_free_list_t send_free_eager; /**< free list of eager buffer descriptors */ + ompi_free_list_t send_free_max; /**< free list of max buffer 
descriptors */ + ompi_free_list_t send_free_frag; /**< free list of frags only... used for pining memory */ + + ompi_free_list_t recv_free_eager; /**< High priority free list of buffer descriptors */ + ompi_free_list_t recv_free_max; /**< Low priority free list of buffer descriptors */ + + opal_list_t pending_frags_hp; + /**< list of pending high priority frags */ + + opal_list_t pending_frags_lp; + /**< list of pending low priority frags */ + + opal_mutex_t ib_lock; /**< module level lock */ + + size_t ib_inline_max; /**< max size of inline send*/ + +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + struct ibv_srq *srq_hp; + struct ibv_srq *srq_lp; + int32_t srd_posted_hp; + int32_t srd_posted_lp; +#endif + + int32_t rd_num; + int32_t rd_low; + + int32_t rd_posted_hp; /**< number of high priority descriptors posted */ + int32_t rd_posted_lp; /**< number of low priority descriptors posted */ + + int32_t sd_wqe_hp; /**< number of available send wqe entries */ + int32_t sd_wqe_lp; /**< number of available send wqe entries */ + + uint32_t psn_hp; + uint32_t psn_lp; + /* Local processes port sequence number (Low and High) */ + + struct ibv_qp* qp_hp; + struct ibv_qp* qp_lp; + /* Local QP (Low and High) */ +}; typedef struct mca_btl_ud_module_t mca_btl_ud_module_t; + +struct mca_btl_ud_frag_t; +extern mca_btl_ud_module_t mca_btl_ud_module; + +/** + * Register IB component parameters with the MCA framework + */ +extern int mca_btl_ud_component_open(void); + +/** + * Any final cleanup before being unloaded. + */ +extern int mca_btl_ud_component_close(void); + +/** + * IB component initialization. + * + * @param num_btl_modules (OUT) Number of BTLs returned in BTL array. 
+ * @param allow_multi_user_threads (OUT) Flag indicating wether BTL supports user threads (TRUE) + * @param have_hidden_threads (OUT) Flag indicating wether BTL uses threads (TRUE) + * + * (1) read interface list from kernel and compare against component parameters + * then create a BTL instance for selected interfaces + * (2) setup IB listen socket for incoming connection attempts + * (3) publish BTL addressing info + * + */ +extern mca_btl_base_module_t** mca_btl_ud_component_init( + int *num_btl_modules, + bool allow_multi_user_threads, + bool have_hidden_threads +); + + +/** + * IB component progress. + */ +extern int mca_btl_ud_component_progress(void); + + +/** + * Register a callback function that is called on receipt + * of a fragment. + * + * @param btl (IN) BTL module + * @return Status indicating if cleanup was successful + * + * When the process list changes, the PML notifies the BTL of the + * change, to provide the opportunity to cleanup or release any + * resources associated with the peer. + */ + +int mca_btl_ud_register( + struct mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_module_recv_cb_fn_t cbfunc, + void* cbdata +); + + +/** + * Cleanup any resources held by the BTL. + * + * @param btl BTL instance. + * @return OMPI_SUCCESS or error status on failure. + */ + +extern int mca_btl_ud_finalize( + struct mca_btl_base_module_t* btl +); + + +/** + * PML->BTL notification of change in the process list. + * + * @param btl (IN) + * @param nprocs (IN) Number of processes + * @param procs (IN) Set of processes + * @param peers (OUT) Set of (optional) peer addressing info. + * @param peers (IN/OUT) Set of processes that are reachable via this BTL. + * @return OMPI_SUCCESS or error status on failure. 
+ * + */ + +extern int mca_btl_ud_add_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t** peers, + ompi_bitmap_t* reachable +); + +/** + * PML->BTL notification of change in the process list. + * + * @param btl (IN) BTL instance + * @param nproc (IN) Number of processes. + * @param procs (IN) Set of processes. + * @param peers (IN) Set of peer data structures. + * @return Status indicating if cleanup was successful + * + */ +extern int mca_btl_ud_del_procs( + struct mca_btl_base_module_t* btl, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_btl_base_endpoint_t** peers +); + + +/** + * PML->BTL Initiate a send of the specified size. + * + * @param btl (IN) BTL instance + * @param btl_base_peer (IN) BTL peer addressing + * @param send_request (IN/OUT) Send request (allocated by PML via mca_btl_base_request_alloc_fn_t) + * @param size (IN) Number of bytes PML is requesting BTL to deliver + * @param flags (IN) Flags that should be passed to the peer via the message header. + * @param request (OUT) OMPI_SUCCESS if the BTL was able to queue one or more fragments + */ +extern int mca_btl_ud_send( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* descriptor, + mca_btl_base_tag_t tag +); + + +/** + * Allocate a descriptor. + * + * @param btl (IN) BTL module + * @param size (IN) Requested descriptor size. + */ +extern mca_btl_base_descriptor_t* mca_btl_ud_alloc( + struct mca_btl_base_module_t* btl, + size_t size); + + +/** + * Return a segment allocated by this BTL. + * + * @param btl (IN) BTL module + * @param descriptor (IN) Allocated descriptor. + */ +extern int mca_btl_ud_free( + struct mca_btl_base_module_t* btl, + mca_btl_base_descriptor_t* des); + + +/** + * Pack data and return a descriptor that can be + * used for send/put. 
+ * + * @param btl (IN) BTL module + * @param peer (IN) BTL peer addressing + */ +mca_btl_base_descriptor_t* mca_btl_ud_prepare_src( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + mca_mpool_base_registration_t* registration, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size + ); + + +/** + * Return a send fragment to the modules free list. + * + * @param btl (IN) BTL instance + * @param frag (IN) IB send fragment + * + */ +extern void mca_btl_ud_send_frag_return( + struct mca_btl_base_module_t* btl, + struct mca_btl_ud_frag_t* + ); + + +int mca_btl_ud_module_init(mca_btl_ud_module_t* ud_btl); + +void mca_btl_ud_dump( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose); + + +/* + * Profiling stuff + */ + +#if MCA_BTL_UD_ENABLE_PROFILE + +#define MCA_BTL_UD_START_TIME(var) \ + ((mca_btl_ud_profile.tmp_ ## var) = opal_sys_timer_get_cycles()) +#define MCA_BTL_UD_END_TIME(var) \ +do { \ + mca_btl_ud_profile.avg_ ## var += \ + opal_sys_timer_get_cycles() - mca_btl_ud_profile.tmp_ ## var; \ + mca_btl_ud_profile.cnt_ ## var++; \ +} while(0) + +#define MCA_BTL_UD_SHOW_TIME(var) \ + BTL_VERBOSE((" " #var " avg %lu cnt %lu", \ + (mca_btl_ud_profile.avg_ ## var) / (mca_btl_ud_profile.cnt_ ## var), \ + mca_btl_ud_profile.cnt_ ## var)); + +#else +#define MCA_BTL_UD_START_TIME(var) +#define MCA_BTL_UD_END_TIME(var) +#define MCA_BTL_UD_SHOW_TIME(var) +#endif + + +/* + * Post non-SRQ receive buffers + */ + +#define MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, \ + additional) \ +{ \ + do { \ + OPAL_THREAD_LOCK(&ud_btl->ib_lock); \ + if(ud_btl->rd_posted_hp <= mca_btl_ud_component.rd_low+additional && \ + ud_btl->rd_posted_hp < ud_btl->rd_num) { \ + MCA_BTL_UD_ENDPOINT_POST_RR_SUB(ud_btl->rd_num - \ + ud_btl->rd_posted_hp, \ + &ud_btl->recv_free_eager, \ + ud_btl->rd_posted_hp, \ + ud_btl->qp_hp); \ + } \ + OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \ + } while(0); \ +} + +#define 
MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, \ + additional) { \ + do { \ + OPAL_THREAD_LOCK(&ud_btl->ib_lock); \ + if(ud_btl->rd_posted_lp <= mca_btl_ud_component.rd_low+additional && \ + ud_btl->rd_posted_lp < ud_btl->rd_num){ \ + MCA_BTL_UD_ENDPOINT_POST_RR_SUB(ud_btl->rd_num - \ + ud_btl->rd_posted_lp, \ + &ud_btl->recv_free_max, \ + ud_btl->rd_posted_lp, \ + ud_btl->qp_lp \ + ); } \ + OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \ + } while(0); \ +} + +#define MCA_BTL_UD_ENDPOINT_POST_RR_SUB(cnt, \ + frag_list, \ + rd_posted, \ + qp ) \ +do { \ + int32_t i; \ + int rc; \ + int32_t num_post = cnt; \ + struct ibv_recv_wr* bad_wr; \ + for(i = 0; i < num_post; i++) { \ + opal_list_item_t* item; \ + mca_btl_ud_frag_t* frag; \ + OMPI_FREE_LIST_WAIT(frag_list, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ + frag->sg_entry.length = frag->size + sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t); \ + if(ibv_post_recv(qp, \ + &frag->wr_desc.rd_desc, \ + &bad_wr)) { \ + BTL_ERROR(("error posting receive errno says %s\n", strerror(errno))); \ + return OMPI_ERROR; \ + }\ + }\ + OPAL_THREAD_ADD32(&(rd_posted), num_post); \ +} while(0); + +#if 0 +#define MCA_BTL_UD_ENDPOINT_POST_RR_SUB(cnt, \ + frag_list, \ + rd_posted, \ + qp ) \ +do { \ + int32_t i; \ + int rc; \ + int32_t num_post = cnt; \ + struct ibv_recv_wr* head_wr; \ + struct ibv_recv_wr* prev_wr; \ + opal_list_item_t* item; \ + mca_btl_ud_frag_t* frag; \ + OMPI_FREE_LIST_WAIT(frag_list, item, rc); \ + frag = (mca_btl_ud_frag_t*)item; \ + head_wr = &frag->wr_desc.rd_desc; \ + prev_wr = head_wr; \ + OPAL_OUTPUT((0, "posting %d recvs\n", num_post)); \ + for(i = 1; i < num_post; i++) { \ + OMPI_FREE_LIST_WAIT(frag_list, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ + prev_wr->next = &frag->wr_desc.rd_desc; \ + prev_wr = prev_wr->next; \ + }\ + prev_wr->next = NULL; \ + if(ibv_post_recv(qp, head_wr, &prev_wr)) { \ + BTL_ERROR(("error posting receive errno says %s\n", strerror(errno))); \ + return OMPI_ERROR; \ + 
}\ + OPAL_THREAD_ADD32(&(rd_posted), num_post); \ +} while(0); +#endif + +#define BTL_OPENIB_INSERT_PENDING(frag, frag_list, tokens, lock) \ +do{ \ + OPAL_THREAD_LOCK(&lock); \ + opal_list_append(&frag_list, (opal_list_item_t *)frag); \ + OPAL_THREAD_UNLOCK(&lock); \ + OPAL_THREAD_ADD32(&tokens, 1); \ + } while(0); + + +/* + * Post SRQ receive buffers + */ + +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + +#define MCA_BTL_UD_POST_SRR_HIGH(ud_btl, additional) \ +{ \ + do{ \ + OPAL_THREAD_LOCK(&ud_btl->ib_lock); \ + if(ud_btl->srd_posted_hp <= ud_btl->rd_low+additional && \ + ud_btl->srd_posted_hp < ud_btl->rd_num){ \ + MCA_BTL_UD_POST_SRR_SUB(ud_btl->rd_num - \ + ud_btl->srd_posted_hp, \ + ud_btl, \ + &ud_btl->recv_free_eager, \ + &ud_btl->srd_posted_hp, \ + ud_btl->srq_hp); \ + } \ + OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \ + } while(0); \ +} + +#define MCA_BTL_UD_POST_SRR_LOW(ud_btl, additional) \ +{ \ + do { \ + OPAL_THREAD_LOCK(&ud_btl->ib_lock); \ + if(ud_btl->srd_posted_lp <= ud_btl->rd_low+additional && \ + ud_btl->srd_posted_lp < ud_btl->rd_num){ \ + MCA_BTL_UD_POST_SRR_SUB(ud_btl->rd_num - \ + ud_btl->srd_posted_lp, \ + ud_btl, \ + &ud_btl->recv_free_max, \ + &ud_btl->srd_posted_lp, \ + ud_btl->srq_lp); \ + } \ + OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \ + } while(0); \ +} + + +#define MCA_BTL_UD_POST_SRR_SUB(cnt, \ + ud_btl, \ + frag_list, \ + srd_posted, \ + srq) \ +{\ + do { \ + int32_t i; \ + int32_t num_post = cnt; \ + opal_list_item_t* item = NULL; \ + mca_btl_ud_frag_t* frag = NULL; \ + struct ibv_recv_wr *bad_wr; \ + int32_t rc; \ + for(i = 0; i < num_post; i++) { \ + OMPI_FREE_LIST_WAIT(frag_list, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ + frag->sg_entry.length = frag->size + \ + ((unsigned char*) frag->segment.seg_addr.pval- \ + (unsigned char*) frag->hdr); \ + if(ibv_post_srq_recv(srq, &frag->wr_desc.rd_desc, &bad_wr)) { \ + BTL_ERROR(("error posting receive descriptors to shared receive queue: %s",\ + strerror(errno))); \ + return OMPI_ERROR; \ + 
}\ + }\ + OPAL_THREAD_ADD32(srd_posted, num_post); \ + } while(0);\ +} + +#endif + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/ud/btl_ud_component.c b/ompi/mca/btl/ud/btl_ud_component.c new file mode 100644 index 0000000000..754ab36291 --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud_component.c @@ -0,0 +1,768 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "ompi_config.h" +#include "ompi/constants.h" +#include "opal/event/event.h" +#include "opal/util/if.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "opal/sys/timer.h" + +#include "opal/mca/base/mca_base_param.h" +#include "orte/mca/errmgr/errmgr.h" +#include "ompi/mca/mpool/base/base.h" +#include "btl_ud.h" +#include "btl_ud_frag.h" +#include "btl_ud_endpoint.h" +#include "ompi/mca/btl/base/base.h" + + +#include "ompi/datatype/convertor.h" +#include "ompi/mca/mpool/mpool.h" +#include +#include +#include +#include /* for strerror()*/ + +#include "ompi/mca/pml/base/pml_base_module_exchange.h" + +mca_btl_ud_component_t mca_btl_ud_component = { + { + /* First, the mca_base_component_t struct containing meta information + about the component itself */ + + { + /* Indicate that we are a pml v1.0.0 component (which also implies a + specific MCA version) */ + + 
MCA_BTL_BASE_VERSION_1_0_0, + + "ud", /* MCA component name */ + OMPI_MAJOR_VERSION, /* MCA component major version */ + OMPI_MINOR_VERSION, /* MCA component minor version */ + OMPI_RELEASE_VERSION, /* MCA component release version */ + mca_btl_ud_component_open, /* component open */ + mca_btl_ud_component_close /* component close */ + }, + + /* Next the MCA v1.0.0 component meta data */ + + { + /* Whether the component is checkpointable or not */ + + false + }, + + mca_btl_ud_component_init, + mca_btl_ud_component_progress, + } +}; + + +/* + * Profiling information + */ + +#if MCA_BTL_UD_ENABLE_PROFILE +mca_btl_ud_profile_t mca_btl_ud_profile = {0}; +#endif + + +/* + * utility routines for parameter registration + */ + +static inline void mca_btl_ud_param_register_string( + const char* param_name, + const char* param_desc, + const char* default_value, + char** out_value) +{ + mca_base_param_reg_string(&mca_btl_ud_component.super.btl_version, + param_name, + param_desc, + false, + false, + default_value, + out_value); +} + +static inline void mca_btl_ud_param_register_int( + const char* param_name, + const char* param_desc, + int default_value, + int* out_value) +{ + mca_base_param_reg_int(&mca_btl_ud_component.super.btl_version, + param_name, + param_desc, + false, + false, + default_value, + out_value); +} + +/* + * Called by MCA framework to open the component, registers + * component parameters. 
+ */ + +int mca_btl_ud_component_open(void) +{ + /* initialize state */ + mca_btl_ud_component.ib_num_btls=0; + mca_btl_ud_component.ud_btls=NULL; + + /* initialize objects */ + OBJ_CONSTRUCT(&mca_btl_ud_component.ib_procs, opal_list_t); + + /* register IB component parameters */ + mca_btl_ud_param_register_int ("max_btls", "maximum number of HCAs/ports to use", + 4, (int*)&mca_btl_ud_component.ib_max_btls); + mca_btl_ud_param_register_int ("free_list_num", "intial size of free lists", + 8, &mca_btl_ud_component.ib_free_list_num); + mca_btl_ud_param_register_int ("free_list_max", "maximum size of free lists", + -1, &mca_btl_ud_component.ib_free_list_max); + mca_btl_ud_param_register_int ("free_list_inc", "increment size of free lists", + 32, &mca_btl_ud_component.ib_free_list_inc); + mca_btl_ud_param_register_string("mpool", "name of the memory pool to be used", + "openib", &mca_btl_ud_component.ib_mpool_name); + mca_btl_ud_param_register_int("reg_mru_len", "length of the registration cache most recently used list", + 16, (int*) &mca_btl_ud_component.reg_mru_len); + mca_btl_ud_param_register_int("use_srq", "if 1 use the IB shared receive queue to post receive descriptors", + 0, (int*) &mca_btl_ud_component.use_srq); + mca_btl_ud_param_register_int("ib_cq_size", "size of the IB completion queue", + 2000, (int*) &mca_btl_ud_component.ib_cq_size); + mca_btl_ud_param_register_int("ib_sg_list_size", "size of IB segment list", + 4, (int*) &mca_btl_ud_component.ib_sg_list_size); + mca_btl_ud_param_register_int("ib_pkey_ix", "IB pkey index", + 0, (int*) &mca_btl_ud_component.ib_pkey_ix); + mca_btl_ud_param_register_int("ib_qkey", "IB qkey", + 0x01330133, (int*) &mca_btl_ud_component.ib_qkey); + mca_btl_ud_param_register_int("ib_psn", "IB Packet sequence starting number", + 0, (int*) &mca_btl_ud_component.ib_psn); + mca_btl_ud_param_register_int("ib_service_level", "IB service level", + 0, (int*) &mca_btl_ud_component.ib_service_level); + 
mca_btl_ud_param_register_int("ib_src_path_bits", "IB source path bits", + 0, (int*) &mca_btl_ud_component.ib_src_path_bits); + mca_btl_ud_param_register_int ("exclusivity", "BTL exclusivity", + MCA_BTL_EXCLUSIVITY_DEFAULT, (int*) &mca_btl_ud_module.super.btl_exclusivity); + mca_btl_ud_param_register_int("sd_num", "maximum descriptors to post to a QP", + 16, (int*) &mca_btl_ud_component.sd_num); + mca_btl_ud_param_register_int("rd_num", "number of receive descriptors to post to a QP", + 500, (int*) &mca_btl_ud_component.rd_num); + mca_btl_ud_param_register_int("rd_low", "low water mark before reposting occurs", + 300, (int*) &mca_btl_ud_component.rd_low); + mca_btl_ud_param_register_int("srq_rd_max", "Max number of receive descriptors posted per SRQ.", + 1000, (int*) &mca_btl_ud_component.srq_rd_max); + mca_btl_ud_param_register_int("srq_rd_per_peer", "Number of receive descriptors posted per peer. (SRQ)", + 16, (int*) &mca_btl_ud_component.srq_rd_per_peer); + mca_btl_ud_param_register_int("srq_sd_max", "Maximum number of send descriptors posted. 
(SRQ)", + 8, &mca_btl_ud_component.srq_sd_max); + + /* TODO - this assumes a 2k UD MTU - should query/do something more intelligent */ + mca_btl_ud_param_register_int ("eager_limit", "eager send limit", + 2047, (int*)&mca_btl_ud_module.super.btl_eager_limit); + mca_btl_ud_param_register_int ("min_send_size", "minimum send size", + 2048, (int*)&mca_btl_ud_module.super.btl_min_send_size); + mca_btl_ud_param_register_int ("max_send_size", "maximum send size", + 2048, (int*) &mca_btl_ud_module.super.btl_max_send_size); + mca_btl_ud_param_register_int("bandwidth", "Approximate maximum bandwidth of interconnect", + 800, (int*) &mca_btl_ud_module.super.btl_bandwidth); + + mca_btl_ud_module.super.btl_eager_limit -= sizeof(mca_btl_ud_header_t); + mca_btl_ud_module.super.btl_max_send_size -= sizeof(mca_btl_ud_header_t); + mca_btl_ud_component.max_send_size = mca_btl_ud_module.super.btl_max_send_size; + mca_btl_ud_component.eager_limit = mca_btl_ud_module.super.btl_eager_limit; + + return OMPI_SUCCESS; +} + +/* + * component cleanup - sanity checking of queue lengths + */ + +int mca_btl_ud_component_close(void) +{ + /* Calculate and print profiling numbers */ + MCA_BTL_UD_SHOW_TIME(post_send); + MCA_BTL_UD_SHOW_TIME(endpoint_send_conn); + MCA_BTL_UD_SHOW_TIME(ibv_post_send); + MCA_BTL_UD_SHOW_TIME(full_send); + + return OMPI_SUCCESS; +} + + +/* + * Register UD port information. The MCA framework + * will make this available to all peers. 
+ */ + +static int +mca_btl_ud_modex_send(void) +{ + int rc; + size_t i; + size_t size; + mca_btl_ud_port_info_t *ports = NULL; + + size = mca_btl_ud_component.ib_num_btls * sizeof (mca_btl_ud_port_info_t); + if (size != 0) { + ports = (mca_btl_ud_port_info_t *)malloc (size); + if (NULL == ports) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + for (i = 0; i < mca_btl_ud_component.ib_num_btls; i++) { + mca_btl_ud_module_t *btl = &mca_btl_ud_component.ud_btls[i]; + ports[i] = btl->port_info; + } + } + rc = mca_pml_base_modex_send (&mca_btl_ud_component.super.btl_version, ports, size); + if (NULL != ports) { + free (ports); + } + return rc; +} + +/* + * UD component initialization: + * (1) read interface list from kernel and compare against component parameters + * then create a BTL instance for selected interfaces + * (2) post OOB receive for incoming connection attempts + * (3) register BTL parameters with the MCA + */ + +mca_btl_base_module_t** mca_btl_ud_component_init(int *num_btl_modules, + bool enable_progress_threads, + bool enable_mpi_threads) +{ + struct ibv_device **ib_devs; + struct ibv_device* ib_dev; + int32_t num_devs; + mca_btl_base_module_t** btls; + uint32_t i,j, length; + struct mca_mpool_base_resources_t mpool_resources; + opal_list_t btl_list; + mca_btl_ud_module_t* ud_btl; + mca_btl_base_selected_module_t* ib_selected; + opal_list_item_t* item; + unsigned short seedv[3]; + + /* initialization */ + *num_btl_modules = 0; + num_devs = 0; + + seedv[0] = orte_process_info.my_name->vpid; + seedv[1] = opal_sys_timer_get_cycles(); + seedv[2] = opal_sys_timer_get_cycles(); + seed48(seedv); + +#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST + ib_devs = ibv_get_device_list(&num_devs); +#else + /* Determine the number of hca's available on the host */ + dev_list = ibv_get_devices(); + if (NULL == dev_list) { + mca_btl_base_error_no_nics("OpenIB", "HCA"); + mca_btl_ud_component.ib_num_btls = 0; + mca_btl_ud_modex_send(); + return NULL; + } + dlist_start(dev_list); + + 
dlist_for_each_data(dev_list, ib_dev, struct ibv_device) + num_devs++; +#endif + + if(0 == num_devs) { + mca_btl_base_error_no_nics("OpenIB", "HCA"); + mca_btl_ud_modex_send(); + return NULL; + } + +#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST == 0 + /* Allocate space for the ib devices */ + ib_devs = (struct ibv_device**) malloc(num_devs * sizeof(struct ibv_dev*)); + if(NULL == ib_devs) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } + + dlist_start(dev_list); + + i = 0; + dlist_for_each_data(dev_list, ib_dev, struct ibv_device) + ib_devs[i++] = ib_dev; +#endif + + /** We must loop through all the hca id's, get there handles and + for each hca we query the number of ports on the hca and set up + a distinct btl module for each hca port */ + + OBJ_CONSTRUCT(&btl_list, opal_list_t); + OBJ_CONSTRUCT(&mca_btl_ud_component.ib_lock, opal_mutex_t); + + + for(i = 0; (int32_t)i < num_devs + && mca_btl_ud_component.ib_num_btls < mca_btl_ud_component.ib_max_btls; i++){ + struct ibv_device_attr ib_dev_attr; + struct ibv_context* ib_dev_context; + + ib_dev = ib_devs[i]; + + ib_dev_context = ibv_open_device(ib_dev); + if(!ib_dev_context) { + BTL_ERROR((" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno))); + return NULL; + } + + if(ibv_query_device(ib_dev_context, &ib_dev_attr)){ + BTL_ERROR(("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno))); + return NULL; + } + + + /* Note ports are 1 based hence j = 1 */ + + for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++){ + struct ibv_port_attr* ib_port_attr; + ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr)); + if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){ + BTL_ERROR(("error getting port attributes for device %s port number %d errno says %s", + ibv_get_device_name(ib_dev), j, strerror(errno))); + return NULL; + } + + if( IBV_PORT_ACTIVE == ib_port_attr->state ){ + ud_btl = 
(mca_btl_ud_module_t*) malloc(sizeof(mca_btl_ud_module_t)); + memcpy(ud_btl, &mca_btl_ud_module, sizeof(mca_btl_ud_module)); + + ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); + ib_selected->btl_module = (mca_btl_base_module_t*) ud_btl; + ud_btl->ib_dev = ib_dev; + ud_btl->ib_dev_context = ib_dev_context; + ud_btl->port_num = (uint8_t) j; + ud_btl->ib_port_attr = ib_port_attr; + ud_btl->port_info.subnet = ib_port_attr->sm_lid; /* store the sm_lid for multi-nic support */ + + opal_list_append(&btl_list, (opal_list_item_t*) ib_selected); + if(++mca_btl_ud_component.ib_num_btls >= mca_btl_ud_component.ib_max_btls) + break; + } + else{ + free(ib_port_attr); + } + } + } + + + /* Allocate space for btl modules */ + mca_btl_ud_component.ud_btls = (mca_btl_ud_module_t*) + malloc(sizeof(mca_btl_ud_module_t) * mca_btl_ud_component.ib_num_btls); + + if(NULL == mca_btl_ud_component.ud_btls) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } + btls = (struct mca_btl_base_module_t**) + malloc(mca_btl_ud_component.ib_num_btls * sizeof(mca_btl_ud_module_t*)); + if(NULL == btls) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } + + + for(i = 0; i < mca_btl_ud_component.ib_num_btls; i++){ + item = opal_list_remove_first(&btl_list); + ib_selected = (mca_btl_base_selected_module_t*)item; + ud_btl = (mca_btl_ud_module_t*) ib_selected->btl_module; + memcpy(&(mca_btl_ud_component.ud_btls[i]), ud_btl , sizeof(mca_btl_ud_module_t)); + free(ib_selected); + free(ud_btl); + + ud_btl = &mca_btl_ud_component.ud_btls[i]; + ud_btl->rd_num = mca_btl_ud_component.rd_num; + ud_btl->rd_low = mca_btl_ud_component.rd_low; + + ud_btl->sd_wqe_lp = mca_btl_ud_component.sd_num; + ud_btl->sd_wqe_hp = mca_btl_ud_component.sd_num; + + /* Initialize module state */ + OBJ_CONSTRUCT(&ud_btl->ib_lock, opal_mutex_t); + OBJ_CONSTRUCT(&ud_btl->send_free_eager, ompi_free_list_t); + OBJ_CONSTRUCT(&ud_btl->send_free_max, ompi_free_list_t); + OBJ_CONSTRUCT(&ud_btl->send_free_frag, 
ompi_free_list_t); + + OBJ_CONSTRUCT(&ud_btl->recv_free_eager, ompi_free_list_t); + OBJ_CONSTRUCT(&ud_btl->recv_free_max, ompi_free_list_t); + + OBJ_CONSTRUCT(&ud_btl->pending_frags_hp, opal_list_t); + OBJ_CONSTRUCT(&ud_btl->pending_frags_lp, opal_list_t); + + if(mca_btl_ud_module_init(ud_btl) != OMPI_SUCCESS) { +#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST + ibv_free_device_list(ib_devs); +#else + free(ib_devs); +#endif + return NULL; + } + + mpool_resources.ib_pd = ud_btl->ib_pd; + + /* initialize the memory pool using the hca */ + ud_btl->super.btl_mpool = + mca_mpool_base_module_create(mca_btl_ud_component.ib_mpool_name, + &ud_btl->super, + &mpool_resources); + + if(NULL == ud_btl->super.btl_mpool) { + BTL_ERROR(("error creating openib memory pool! aborting ud btl initialization")); + return NULL; + } + + /* Initialize pool of send fragments */ + length = sizeof(mca_btl_ud_frag_t) + + sizeof(mca_btl_ud_header_t) + + ud_btl->super.btl_eager_limit + + 2*MCA_BTL_IB_FRAG_ALIGN; + + ompi_free_list_init(&ud_btl->send_free_eager, + length, + OBJ_CLASS(mca_btl_ud_send_frag_eager_t), + mca_btl_ud_component.ib_free_list_num, + mca_btl_ud_component.ib_free_list_max, + mca_btl_ud_component.ib_free_list_inc, + ud_btl->super.btl_mpool); + + ompi_free_list_init(&ud_btl->recv_free_eager, + length + sizeof(mca_btl_ud_ib_header_t), + OBJ_CLASS(mca_btl_ud_recv_frag_eager_t), + mca_btl_ud_component.ib_free_list_num, + mca_btl_ud_component.ib_free_list_max, + mca_btl_ud_component.ib_free_list_inc, + ud_btl->super.btl_mpool); + + length = sizeof(mca_btl_ud_frag_t) + + sizeof(mca_btl_ud_header_t) + + ud_btl->super.btl_max_send_size + + 2*MCA_BTL_IB_FRAG_ALIGN; + + ompi_free_list_init(&ud_btl->send_free_max, + length, + OBJ_CLASS(mca_btl_ud_send_frag_max_t), + mca_btl_ud_component.ib_free_list_num, + mca_btl_ud_component.ib_free_list_max, + mca_btl_ud_component.ib_free_list_inc, + ud_btl->super.btl_mpool); + + /* Initialize pool of receive fragments */ + ompi_free_list_init 
(&ud_btl->recv_free_max, + length + sizeof(mca_btl_ud_ib_header_t), + OBJ_CLASS (mca_btl_ud_recv_frag_max_t), + mca_btl_ud_component.ib_free_list_num, + mca_btl_ud_component.ib_free_list_max, + mca_btl_ud_component.ib_free_list_inc, + ud_btl->super.btl_mpool); + + length = sizeof(mca_btl_ud_frag_t) + + sizeof(mca_btl_ud_header_t)+ + 2*MCA_BTL_IB_FRAG_ALIGN; + + ompi_free_list_init(&ud_btl->send_free_frag, + length, + OBJ_CLASS(mca_btl_ud_send_frag_frag_t), + mca_btl_ud_component.ib_free_list_num, + mca_btl_ud_component.ib_free_list_max, + mca_btl_ud_component.ib_free_list_inc, + ud_btl->super.btl_mpool); + + /* Post receive descriptors */ + do { + int32_t i; + int rc; + struct ibv_recv_wr* bad_wr; + + for(i = 0; i < ud_btl->rd_num; i++) { + mca_btl_ud_frag_t* frag; + OMPI_FREE_LIST_WAIT(&ud_btl->recv_free_eager, frag, rc); + frag->sg_entry.length = frag->size + + sizeof(mca_btl_ud_header_t) + + sizeof(mca_btl_ud_ib_header_t); + if(ibv_post_recv(ud_btl->qp_hp, + &frag->wr_desc.rd_desc, &bad_wr)) { + BTL_ERROR(("error posting recv, errno %s\n", + strerror(errno))); + return NULL; + } + + OMPI_FREE_LIST_WAIT(&ud_btl->recv_free_max, frag, rc); + frag->sg_entry.length = frag->size + + sizeof(mca_btl_ud_header_t) + + sizeof(mca_btl_ud_ib_header_t); + if(ibv_post_recv(ud_btl->qp_lp, + &frag->wr_desc.rd_desc, &bad_wr)) { + BTL_ERROR(("error posting recv, errno %s\n", + strerror(errno))); + return NULL; + } + } + } while(0); + + + /* TODO - Put this somewhere else or clean up our macros */ +#if 0 +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + if(mca_btl_ud_component.use_srq) { + MCA_BTL_UD_POST_SRR_HIGH(ud_btl, 1); + MCA_BTL_UD_POST_SRR_LOW(ud_btl, 1); + } else { +#endif + MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, 0); + MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, 0); +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + } +#endif +#endif + btls[i] = &ud_btl->super; + } + + /* Post OOB receive to support dynamic connection setup */ + mca_btl_ud_post_recv(); + mca_btl_ud_modex_send(); + + 
*num_btl_modules = mca_btl_ud_component.ib_num_btls; +#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST + ibv_free_device_list(ib_devs); +#else + free(ib_devs); +#endif + return btls; +} + + +static inline int mca_btl_ud_handle_incoming_hp(mca_btl_ud_module_t *, + mca_btl_ud_frag_t *, size_t); + +static inline int mca_btl_ud_handle_incoming_hp( + mca_btl_ud_module_t *ud_btl, + mca_btl_ud_frag_t *frag, + size_t byte_len) +{ + struct ibv_recv_wr* bad_wr; + + /* advance the segment address past the header and adjust the length..*/ + frag->segment.seg_addr.pval = frag->hdr + 1; + frag->segment.seg_len = byte_len - + sizeof(mca_btl_ud_header_t) - sizeof(mca_btl_ud_ib_header_t); + + /* call registered callback */ + ud_btl->ib_reg[frag->hdr->tag].cbfunc(&ud_btl->super, + frag->hdr->tag, &frag->base, + ud_btl->ib_reg[frag->hdr->tag].cbdata); + + /* TODO - not resetting segment values here.. problem? */ + if(ibv_post_recv(ud_btl->qp_hp, &frag->wr_desc.rd_desc, &bad_wr)) { + BTL_ERROR(("error posting recv, errno %s\n", strerror(errno))); + return OMPI_ERROR; + } + +#if 0 + OMPI_FREE_LIST_RETURN(&(ud_btl->recv_free_eager), + (opal_list_item_t*) frag); + + /* repost receive descriptors */ +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + if(mca_btl_ud_component.use_srq) { + OPAL_THREAD_ADD32((int32_t*) &ud_btl->srd_posted_hp, -1); + MCA_BTL_UD_POST_SRR_HIGH(ud_btl, 0); + } else { +#endif + OPAL_THREAD_ADD32((int32_t*) &ud_btl->rd_posted_hp, -1); + MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, 0); + +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + } +#endif +#endif + return OMPI_SUCCESS; +} + +/* + * IB component progress. 
+ */ + +int mca_btl_ud_component_progress() +{ + uint32_t i; + int count = 0,ne = 0, ret; + mca_btl_ud_frag_t* frag; + struct ibv_recv_wr* bad_wr; + + /* Poll for completions */ + for(i = 0; i < mca_btl_ud_component.ib_num_btls; i++) { + struct ibv_wc wc; + mca_btl_ud_module_t* ud_btl = &mca_btl_ud_component.ud_btls[i]; + + ne=ibv_poll_cq(ud_btl->ib_cq_hp, 1, &wc); + if(ne < 0 ) { + BTL_ERROR(("error polling HP CQ with %d errno says %s\n", + ne, strerror(errno))); + return OMPI_ERROR; + } + else if(1 == ne) { + if(wc.status != IBV_WC_SUCCESS) { + BTL_ERROR(("error polling HP CQ with status %d for wr_id %llu opcode %d\n", + wc.status, wc.wr_id, wc.opcode)); + return OMPI_ERROR; + } + + /* Handle work completions */ + switch(wc.opcode) { + case IBV_WC_SEND : + frag = (mca_btl_ud_frag_t*)(unsigned long)wc.wr_id; + +#if MCA_BTL_UD_ENABLE_PROFILE + mca_btl_ud_profile.avg_full_send += + opal_sys_timer_get_cycles() - frag->tm; + mca_btl_ud_profile.cnt_full_send++; +#endif + + /* Process a completed send */ + frag->base.des_cbfunc(&ud_btl->super, + frag->endpoint, &frag->base, OMPI_SUCCESS); + + OPAL_THREAD_ADD32(&ud_btl->sd_wqe_hp, 1); + if(!opal_list_is_empty(&ud_btl->pending_frags_hp)) { + frag = (mca_btl_ud_frag_t*) + opal_list_remove_first(&ud_btl->pending_frags_hp); + mca_btl_ud_endpoint_post_send(ud_btl, frag->endpoint, frag); + } + + count++; + break; + + case IBV_WC_RECV: + /* Process a RECV */ + frag = (mca_btl_ud_frag_t*)(unsigned long) wc.wr_id; + ret = mca_btl_ud_handle_incoming_hp(ud_btl, frag, wc.byte_len); + + if (ret != OMPI_SUCCESS) + return ret; + count++; + break; + + default: + BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode)); + break; + } + } + + ne=ibv_poll_cq(ud_btl->ib_cq_lp, 1, &wc ); + if(ne < 0){ + BTL_ERROR(("error polling LP CQ with %d errno says %s", + ne, strerror(errno))); + return OMPI_ERROR; + } + else if(1 == ne) { + if(wc.status != IBV_WC_SUCCESS) { + BTL_ERROR(("error polling LP CQ with status %d for wr_id %llu opcode 
%d", + wc.status, wc.wr_id, wc.opcode)); + return OMPI_ERROR; + } + + /* Handle n/w completions */ + switch(wc.opcode) { + case IBV_WC_SEND: + frag = (mca_btl_ud_frag_t*) (unsigned long) wc.wr_id; + + /* Process a completed send - receiver must return tokens */ + frag->base.des_cbfunc(&ud_btl->super, + frag->endpoint, &frag->base, OMPI_SUCCESS); + + OPAL_THREAD_ADD32(&ud_btl->sd_wqe_lp, 1); + if(!opal_list_is_empty(&ud_btl->pending_frags_lp)) { + frag = (mca_btl_ud_frag_t*) + opal_list_remove_first(&ud_btl->pending_frags_lp); + mca_btl_ud_endpoint_post_send(ud_btl, frag->endpoint, frag); + } + + count++; + break; + + case IBV_WC_RECV: + /* Process a RECV */ + frag = (mca_btl_ud_frag_t*) (unsigned long) wc.wr_id; + + frag->segment.seg_addr.pval = frag->hdr + 1; + frag->segment.seg_len = + wc.byte_len - sizeof(mca_btl_ud_header_t) - + sizeof(mca_btl_ud_ib_header_t); + + /* call registered callback */ + ud_btl->ib_reg[frag->hdr->tag].cbfunc(&ud_btl->super, + frag->hdr->tag, &frag->base, + ud_btl->ib_reg[frag->hdr->tag].cbdata); + + if(ibv_post_recv(ud_btl->qp_lp, + &frag->wr_desc.rd_desc, &bad_wr)) { + BTL_ERROR(("error posting recv, errno %s\n", + strerror(errno))); + return OMPI_ERROR; + } +#if 0 + OMPI_FREE_LIST_RETURN( + &(ud_btl->recv_free_max), (opal_list_item_t*) frag); + +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + if(mca_btl_ud_component.use_srq) { + /* repost receive descriptors */ + OPAL_THREAD_ADD32((int32_t*) &ud_btl->srd_posted_lp, -1); + MCA_BTL_UD_POST_SRR_LOW(ud_btl, 0); + } else { +#endif + /* repost receive descriptors */ + OPAL_THREAD_ADD32((int32_t*) &ud_btl->rd_posted_lp, -1); + MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, 0); + +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + } +#endif +#endif + count++; + break; + + default: + BTL_ERROR(("Unhandled work completion opcode %d", wc.opcode)); + break; + } + } + + } + + return count; +} + diff --git a/ompi/mca/btl/ud/btl_ud_endpoint.c b/ompi/mca/btl/ud/btl_ud_endpoint.c new file mode 100644 index 0000000000..e3b54f7fe6 
--- /dev/null +++ b/ompi/mca/btl/ud/btl_ud_endpoint.c @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "ompi_config.h" +#include +#include +#include "ompi/types.h" +#include "ompi/mca/pml/base/pml_base_sendreq.h" +#include "orte/mca/ns/base/base.h" +#include "orte/mca/oob/base/base.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/dss/dss.h" +#include "btl_ud.h" +#include "btl_ud_endpoint.h" +#include "btl_ud_proc.h" +#include "btl_ud_frag.h" +#include "ompi/class/ompi_free_list.h" +#include +#include + + +static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint); +static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); + + +/* + * post a send to the work queue + */ +inline int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl, + mca_btl_ud_endpoint_t * endpoint, + mca_btl_ud_frag_t * frag) +{ + struct ibv_qp* ib_qp; + struct ibv_send_wr* bad_wr; + + /* Have to be careful here - UD adds a 40 byte header, but it is not + included on the sending side. 
*/ + frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t); + frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; + + if(frag->size == ud_btl->super.btl_eager_limit) { + if(OPAL_THREAD_ADD32(&ud_btl->sd_wqe_hp, -1) < 0) { + OPAL_THREAD_ADD32(&ud_btl->sd_wqe_hp, 1); + opal_list_append(&ud_btl->pending_frags_hp, + (opal_list_item_t*)frag); + return OMPI_SUCCESS; + } + + ib_qp = ud_btl->qp_hp; + frag->wr_desc.sr_desc.wr.ud.ah = endpoint->rmt_ah_hp; + frag->wr_desc.sr_desc.wr.ud.remote_qpn = + endpoint->rem_info.rem_qp_num_hp; + + if(frag->sg_entry.length <= ud_btl->ib_inline_max) { + frag->wr_desc.sr_desc.send_flags = + IBV_SEND_SIGNALED|IBV_SEND_INLINE; + } + } else { + if(OPAL_THREAD_ADD32(&ud_btl->sd_wqe_lp, -1) < 0) { + OPAL_THREAD_ADD32(&ud_btl->sd_wqe_lp, 1); + opal_list_append(&ud_btl->pending_frags_lp, + (opal_list_item_t*)frag); + return OMPI_SUCCESS; + } + + ib_qp = ud_btl->qp_lp; + frag->wr_desc.sr_desc.wr.ud.ah = endpoint->rmt_ah_lp; + frag->wr_desc.sr_desc.wr.ud.remote_qpn = + endpoint->rem_info.rem_qp_num_lp; + } + + /*BTL_VERBOSE(("Send to : %d, len : %d %d %d, frag : %p", + endpoint->endpoint_proc->proc_guid.vpid, + frag->sg_entry.length, frag->segment.seg_len, + ud_btl->ib_inline_max, frag)); */ + +#if MCA_BTL_UD_ENABLE_PROFILE + frag->tm = opal_sys_timer_get_cycles(); +#endif + + MCA_BTL_UD_START_TIME(ibv_post_send); + if(ibv_post_send(ib_qp, &frag->wr_desc.sr_desc, &bad_wr)) { + BTL_ERROR(("error posting send request errno says %d %s\n", + errno, strerror(errno))); + return OMPI_ERROR; + } + MCA_BTL_UD_END_TIME(ibv_post_send); + +#if 0 +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + if(mca_btl_ud_component.use_srq) { + MCA_BTL_UD_POST_SRR_HIGH(ud_btl, 1); + MCA_BTL_UD_POST_SRR_LOW(ud_btl, 1); + } else { +#endif + MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, 1); + MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, 1); +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + } +#endif +#endif + return OMPI_SUCCESS; +} + + + +OBJ_CLASS_INSTANCE(mca_btl_ud_endpoint_t, + 
opal_list_item_t, mca_btl_ud_endpoint_construct, + mca_btl_ud_endpoint_destruct); + +/* + * Initialize state of the endpoint instance. + * + */ + +static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint) +{ + endpoint->endpoint_btl = 0; + endpoint->endpoint_proc = 0; + endpoint->endpoint_state = MCA_BTL_IB_CLOSED; + OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t); + OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t); + + memset(&endpoint->rem_info, 0, sizeof(struct mca_btl_ud_rem_info_t)); +} + +/* + * Destroy a endpoint + * + */ + +static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) +{ +} + +/* + * Send connection information to remote endpoint using OOB + * + */ + +static void mca_btl_ud_endpoint_send_cb(int status, orte_process_name_t* endpoint, + orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) +{ + OBJ_RELEASE(buffer); +} + + +static int mca_btl_ud_endpoint_send_connect_data(mca_btl_ud_endpoint_t* endpoint) +{ + mca_btl_ud_module_t* ud_btl = endpoint->endpoint_btl; + orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t); + int rc; + + if(NULL == buffer) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + + /* pack the info in the send buffer */ + rc = orte_dss.pack(buffer, &ud_btl->qp_hp->qp_num, 1, ORTE_UINT32); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = orte_dss.pack(buffer, &ud_btl->qp_lp->qp_num, 1, ORTE_UINT32); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = orte_dss.pack(buffer, &ud_btl->psn_hp, 1, ORTE_UINT32); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = orte_dss.pack(buffer, &ud_btl->psn_lp, 1, ORTE_UINT32); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = orte_dss.pack(buffer, &ud_btl->ib_port_attr->lid, 1, ORTE_UINT16); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = orte_dss.pack(buffer, + 
&((mca_btl_ud_endpoint_t*)endpoint)->subnet, 1, ORTE_UINT16); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* send to endpoint */ + rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid, + buffer, ORTE_RML_TAG_DYNAMIC-1, 0, mca_btl_ud_endpoint_send_cb, NULL); + + BTL_VERBOSE(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", + ud_btl->qp_hp->qp_num, + ud_btl->qp_lp->qp_num, + endpoint->endpoint_btl->ib_port_attr->lid)); + + if(rc < 0) { + ORTE_ERROR_LOG(rc); + return rc; + } + return OMPI_SUCCESS; +} + + +/* + * Non blocking OOB recv callback. + * Read incoming QP and other info, and if this endpoint + * is trying to connect, reply with our QP info, + * otherwise try to modify QP's and establish + * reliable connection + * + */ + +static void mca_btl_ud_endpoint_recv( + int status, + orte_process_name_t* endpoint, + orte_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + struct ibv_ah_attr ah_attr; + mca_btl_ud_proc_t *ib_proc; + mca_btl_ud_endpoint_t *ib_endpoint = NULL; + mca_btl_ud_rem_info_t rem_info; + mca_btl_ud_module_t* ud_btl; + int rc; + uint32_t i; + size_t cnt = 1; + + /* start by unpacking data first so we know who is knocking at + our door */ + + rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_hp, &cnt, ORTE_UINT32); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return; + } + rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_lp, &cnt, ORTE_UINT32); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return; + } + rc = orte_dss.unpack(buffer, &rem_info.rem_psn_hp, &cnt, ORTE_UINT32); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return; + } + rc = orte_dss.unpack(buffer, &rem_info.rem_psn_lp, &cnt, ORTE_UINT32); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return; + } + rc = orte_dss.unpack(buffer, &rem_info.rem_lid, &cnt, ORTE_UINT16); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return; + } + rc = orte_dss.unpack(buffer, &rem_info.rem_subnet, &cnt, 
ORTE_UINT16); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return; + } + + /*BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d", + rem_info.rem_qp_num_hp, + rem_info.rem_qp_num_lp, + rem_info.rem_lid));*/ + + for(ib_proc = (mca_btl_ud_proc_t*) + opal_list_get_first(&mca_btl_ud_component.ib_procs); + ib_proc != (mca_btl_ud_proc_t*) + opal_list_get_end(&mca_btl_ud_component.ib_procs); + ib_proc = (mca_btl_ud_proc_t*)opal_list_get_next(ib_proc)) { + + if(orte_ns.compare(ORTE_NS_CMP_ALL, + &ib_proc->proc_guid, endpoint) == 0) { + bool found = false; + + /* Try to get the endpoint instance of this proc */ + for(i = 0; i < ib_proc->proc_endpoint_count; i++) { + ib_endpoint = ib_proc->proc_endpoints[i]; + if(ib_endpoint->rem_info.rem_lid && + ib_endpoint->rem_info.rem_lid == rem_info.rem_lid) { + /* we've seen them before! */ + found = true; + break; + } + } + /* If we haven't seen this remote lid before then try to match on + endpoint */ + for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) { + ib_endpoint = ib_proc->proc_endpoints[i]; + if(!ib_endpoint->rem_info.rem_lid && + ib_endpoint->subnet == rem_info.rem_subnet) { + /* found a match based on subnet! 
*/ + found = true; + break; + } + } + /* try finding an open port, even if subnets don't match */ + for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) { + ib_endpoint = ib_proc->proc_endpoints[i]; + if(!ib_endpoint->rem_info.rem_lid) { + /* found an unused end-point */ + found = true; + break; + } + } + + if(!found) { + BTL_ERROR(("can't find suitable endpoint for this peer\n")); + return; + } + + OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock); + + /* Update status */ + if(ib_endpoint->endpoint_state == MCA_BTL_IB_CLOSED) { + if(OMPI_SUCCESS != + mca_btl_ud_endpoint_send_connect_data(ib_endpoint)) { + BTL_ERROR(("error sending connect request, error code %d", rc)); + ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED; + OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock); + return; + } + } + + /* Always 'CONNECTED' at this point */ + ud_btl = ib_endpoint->endpoint_btl; + memcpy(&ib_endpoint->rem_info, + &rem_info, sizeof(mca_btl_ud_rem_info_t)); + + ah_attr.is_global = 0; + ah_attr.dlid = rem_info.rem_lid; + ah_attr.sl = mca_btl_ud_component.ib_service_level; + ah_attr.src_path_bits = mca_btl_ud_component.ib_src_path_bits; + ah_attr.port_num = ud_btl->port_num; + + ib_endpoint->rmt_ah_hp = ibv_create_ah(ud_btl->ib_pd, &ah_attr); + if(NULL == ib_endpoint->rmt_ah_hp) { + BTL_ERROR(("error creating address handle errno says %s\n", + strerror(errno))); + ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED; + OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock); + return; + } + + ib_endpoint->rmt_ah_lp = ibv_create_ah(ud_btl->ib_pd, &ah_attr); + if(NULL == ib_endpoint->rmt_ah_lp) { + BTL_ERROR(("error creating address handle errno says %s\n", + strerror(errno))); + ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED; + OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock); + return; + } + + ib_endpoint->endpoint_state = MCA_BTL_IB_CONNECTED; + + /*BTL_VERBOSE(("connected! 
QP num = %d, Low Priority QP num %d, LID = %d", + ib_endpoint->rem_info.rem_qp_num_hp, + ib_endpoint->rem_info.rem_qp_num_lp, + ib_endpoint->rem_info.rem_lid));*/ + + /* Post our queued sends */ + while(!opal_list_is_empty(&(ib_endpoint->pending_send_frags))) { + mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*) + opal_list_remove_first(&(ib_endpoint->pending_send_frags)); + if(OMPI_SUCCESS != mca_btl_ud_endpoint_post_send( + ud_btl, ib_endpoint, frag)) { + BTL_ERROR(("ERROR posting send")); + ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED; + break; + } + } + + OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock); + break; + } + } +} + +/* + * Post the OOB recv (for receiving the peers information) + */ +void mca_btl_ud_post_recv() +{ + orte_rml.recv_buffer_nb( + ORTE_RML_NAME_ANY, + ORTE_RML_TAG_DYNAMIC-1, + ORTE_RML_PERSISTENT, + mca_btl_ud_endpoint_recv, + NULL); +} + +/* + * Attempt to send a fragment using a given endpoint. If the endpoint is not + * connected, queue the fragment and start the connection as required. 
+ */ + + int mca_btl_ud_endpoint_send(mca_btl_base_endpoint_t* endpoint, + mca_btl_ud_frag_t* frag) + { + int rc = OMPI_SUCCESS; + bool call_progress = false; + + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); + switch(endpoint->endpoint_state) { + case MCA_BTL_IB_CONNECTED: + { + MCA_BTL_UD_START_TIME(endpoint_send_conn); + rc = mca_btl_ud_endpoint_post_send( + endpoint->endpoint_btl, endpoint, frag); + MCA_BTL_UD_END_TIME(endpoint_send_conn); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + return rc; + } + case MCA_BTL_IB_CLOSED: + /* Send connection info over to remote endpoint */ + endpoint->endpoint_state = MCA_BTL_IB_CONNECTING; + rc = mca_btl_ud_endpoint_send_connect_data(endpoint); + if(OMPI_SUCCESS != rc) { + BTL_ERROR(("error sending connect request, error code %d", rc)); + endpoint->endpoint_state = MCA_BTL_IB_FAILED; + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return rc; + } + + /** + * As long as we expect a message from the peer (in order to setup + * the connection) let the event engine poll the OOB events. Note: + * we increment it once per active connection. + */ + opal_progress_event_increment(); + call_progress = true; + + /* No break here - fall through */ + case MCA_BTL_IB_CONNECTING: + opal_list_append(&endpoint->pending_send_frags, + (opal_list_item_t *)frag); + break; + case MCA_BTL_IB_FAILED: + BTL_ERROR(("endpoint FAILED")); + default: + rc = OMPI_ERR_UNREACH; + } + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + if(call_progress) opal_progress(); + return rc; + } + + + /* + * Create the queue pair note that this is just the initial + * queue pair creation and we need to get the remote queue pair + * info from the peer before the qp is usable, + */ + /* TODO - maybe start to push this off into its own file? 
*/ + +int mca_btl_ud_endpoint_init_qp( + mca_btl_base_module_t* btl, + struct ibv_cq* cq, +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + struct ibv_srq* srq, +#endif + struct ibv_qp** qp, + uint32_t lcl_psn + ) +{ + mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl; + struct ibv_qp* my_qp; + struct ibv_qp_attr qp_attr; + struct ibv_qp_init_attr qp_init_attr; + + memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); + + qp_init_attr.send_cq = cq; + qp_init_attr.recv_cq = cq; + qp_init_attr.cap.max_send_wr = mca_btl_ud_component.rd_num; + qp_init_attr.cap.max_recv_wr = mca_btl_ud_component.rd_num; + qp_init_attr.cap.max_send_sge = mca_btl_ud_component.ib_sg_list_size; + qp_init_attr.cap.max_recv_sge = mca_btl_ud_component.ib_sg_list_size; + qp_init_attr.qp_type = IBV_QPT_UD; +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + if(mca_btl_ud_component.use_srq) { + qp_init_attr.srq = srq; + } +#endif + my_qp = ibv_create_qp(ud_btl->ib_pd, &qp_init_attr); + + if(NULL == my_qp) { + BTL_ERROR(("error creating qp errno says %s", strerror(errno))); + return OMPI_ERROR; + } + + (*qp) = my_qp; + if(0 == (ud_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) { + BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data")); + } + + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.pkey_index = mca_btl_ud_component.ib_pkey_ix; + qp_attr.qkey = mca_btl_ud_component.ib_qkey; + qp_attr.port_num = ud_btl->port_num; + + if(ibv_modify_qp(*qp, &qp_attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_QKEY)) { + BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno))); + return OMPI_ERROR; + } + + qp_attr.qp_state = IBV_QPS_RTR; + if(ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE)) { + BTL_ERROR(("error modifing QP to RTR errno says %s", strerror(errno))); + return OMPI_ERROR; + } + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = lcl_psn; + if (ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { + BTL_ERROR(("error modifying QP to RTS errno says %s", 
strerror(errno))); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/btl/ud/btl_ud_endpoint.h b/ompi/mca/btl/ud/btl_ud_endpoint.h new file mode 100644 index 0000000000..f018758155 --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud_endpoint.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_IB_ENDPOINT_H +#define MCA_BTL_IB_ENDPOINT_H + +#include "opal/class/opal_list.h" +#include "opal/event/event.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/btl/btl.h" +#include "btl_ud_frag.h" +#include "btl_ud.h" +#include +#include +#include +#include "ompi/mca/btl/base/btl_base_error.h" +#include "ompi/mca/mpool/openib/mpool_openib.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OBJ_CLASS_DECLARATION(mca_btl_ud_endpoint_t); + + +struct mca_btl_ud_frag_t; + +struct mca_btl_ud_port_info_t { + uint16_t subnet; +}; +typedef struct mca_btl_ud_port_info_t mca_btl_ud_port_info_t; + + +/** + * State of IB endpoint connection. + */ + +typedef enum { + /* Defines the state in which this BTL instance + * has started the process of connection */ + MCA_BTL_IB_CONNECTING, + + /* Connected ... 
both sender & receiver have + * buffers associated with this connection */ + MCA_BTL_IB_CONNECTED, + + /* Connection is closed, there are no resources + * associated with this */ + MCA_BTL_IB_CLOSED, + + /* Maximum number of retries have been used. + * Report failure on send to upper layer */ + MCA_BTL_IB_FAILED +} mca_btl_ud_endpoint_state_t; + +struct mca_btl_ud_rem_info_t { + + uint32_t rem_qp_num_hp; + uint32_t rem_qp_num_lp; + /* Remote QP number (Low and High priority) */ + + uint16_t rem_lid; + /* Local identifier of the remote process */ + + + uint32_t rem_psn_hp; + uint32_t rem_psn_lp; + /* Remote processes port sequence number (Low and High) */ + + uint16_t rem_subnet; + /* subnet of remote process */ +}; +typedef struct mca_btl_ud_rem_info_t mca_btl_ud_rem_info_t; + + + +/** + * An abstraction that represents a connection to a endpoint process. + * An instance of mca_btl_base_endpoint_t is associated w/ each process + * and BTL pair at startup. Normally connections are established as-needed. + * The UD BTL is connectionless, so no connection is ever established. 
+ */ + +struct mca_btl_base_endpoint_t { + opal_list_item_t super; + + struct mca_btl_ud_module_t* endpoint_btl; + /**< BTL instance that created this connection */ + + struct mca_btl_ud_proc_t* endpoint_proc; + /**< proc structure corresponding to endpoint */ + + mca_btl_ud_endpoint_state_t endpoint_state; + /**< current state of the connection */ + + opal_mutex_t endpoint_lock; + /**< lock for concurrent access to endpoint state */ + + opal_list_t pending_send_frags; + /**< list of pending send frags for this endpoint */ + + mca_btl_ud_rem_info_t rem_info; + + struct ibv_ah* rmt_ah_hp; + struct ibv_ah* rmt_ah_lp; + /* Local Address Handle (Low and High) */ + + uint16_t subnet; /**< subnet of this endpoint*/ +}; + +typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; +typedef mca_btl_base_endpoint_t mca_btl_ud_endpoint_t; + +int mca_btl_ud_endpoint_send(mca_btl_base_endpoint_t* endpoint, struct mca_btl_ud_frag_t* frag); +inline int mca_btl_ud_endpoint_post_send(struct mca_btl_ud_module_t* ud_btl, + mca_btl_ud_endpoint_t * endpoint, + struct mca_btl_ud_frag_t * frag); +int mca_btl_ud_endpoint_connect(mca_btl_base_endpoint_t*); +void mca_btl_ud_post_recv(void); +int mca_btl_ud_endpoint_init_qp( + mca_btl_base_module_t* btl, + struct ibv_cq* cq, +#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ + struct ibv_srq* srq, +#endif + struct ibv_qp** qp, + uint32_t lcl_psn); + + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/ud/btl_ud_frag.c b/ompi/mca/btl/ud/btl_ud_frag.c new file mode 100644 index 0000000000..1911920255 --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud_frag.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_ud_frag.h" +#include "ompi/mca/mpool/openib/mpool_openib.h" + + + +static inline void mca_btl_ud_frag_common_constructor( mca_btl_ud_frag_t* frag) +{ + frag->ud_reg = (mca_mpool_openib_registration_t*)frag->base.super.user_data; + frag->sg_entry.lkey = frag->ud_reg->mr->lkey; + frag->base.des_flags = 0; +} + + +static void mca_btl_ud_send_frag_common_constructor(mca_btl_ud_frag_t* frag) +{ + + mca_btl_ud_frag_common_constructor(frag); + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + + /* We do not include the mca_btl_ud_ib_header_t data when sending */ + frag->hdr = (mca_btl_ud_header_t*)(frag+1); + frag->segment.seg_addr.pval = frag->hdr + 1; + + frag->sg_entry.addr = (unsigned long)frag->hdr; + + frag->wr_desc.sr_desc.wr_id = (unsigned long) frag; + frag->wr_desc.sr_desc.sg_list = &frag->sg_entry; + frag->wr_desc.sr_desc.num_sge = 1; + frag->wr_desc.sr_desc.opcode = IBV_WR_SEND; + frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; + frag->wr_desc.sr_desc.next = NULL; + frag->wr_desc.sr_desc.wr.ud.remote_qkey = mca_btl_ud_component.ib_qkey; +} + +static void mca_btl_ud_recv_frag_common_constructor(mca_btl_ud_frag_t* frag) +{ + mca_btl_ud_frag_common_constructor(frag); + frag->base.des_dst = &frag->segment; + frag->base.des_dst_cnt = 1; + frag->base.des_src = NULL; + frag->base.des_src_cnt = 0; + + /* Receive frag headers start 40 bytes later */ + frag->hdr = (mca_btl_ud_header_t*)((unsigned char*)frag) + + sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_ib_header_t); + frag->segment.seg_addr.pval 
= frag->hdr + 1; + + frag->sg_entry.addr = (unsigned long)(frag + 1); + frag->segment.seg_len = frag->size; + frag->sg_entry.length = frag->size + + sizeof(mca_btl_ud_ib_header_t) + sizeof(mca_btl_ud_header_t); + + frag->wr_desc.rd_desc.wr_id = (unsigned long) frag; + frag->wr_desc.rd_desc.sg_list = &frag->sg_entry; + frag->wr_desc.rd_desc.num_sge = 1; + frag->wr_desc.rd_desc.next = NULL; +} + +static void mca_btl_ud_send_frag_eager_constructor(mca_btl_ud_frag_t* frag) +{ + frag->size = mca_btl_ud_component.eager_limit; + mca_btl_ud_send_frag_common_constructor(frag); +} + + +static void mca_btl_ud_send_frag_max_constructor(mca_btl_ud_frag_t* frag) +{ + frag->size = mca_btl_ud_component.max_send_size; + mca_btl_ud_send_frag_common_constructor(frag); +} + +static void mca_btl_ud_recv_frag_max_constructor(mca_btl_ud_frag_t* frag) +{ + frag->size = mca_btl_ud_component.max_send_size; + mca_btl_ud_recv_frag_common_constructor(frag); +} + + +static void mca_btl_ud_recv_frag_eager_constructor(mca_btl_ud_frag_t* frag) +{ + frag->size = mca_btl_ud_component.eager_limit; + mca_btl_ud_recv_frag_common_constructor(frag); +} + +static void mca_btl_ud_send_frag_frag_constructor(mca_btl_ud_frag_t* frag) +{ + frag->size = 0; + mca_btl_ud_send_frag_common_constructor(frag); +} + + +OBJ_CLASS_INSTANCE( + mca_btl_ud_frag_t, + mca_btl_base_descriptor_t, + NULL, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_ud_send_frag_eager_t, + mca_btl_base_descriptor_t, + mca_btl_ud_send_frag_eager_constructor, + NULL); + + +OBJ_CLASS_INSTANCE( + mca_btl_ud_send_frag_max_t, + mca_btl_base_descriptor_t, + mca_btl_ud_send_frag_max_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_ud_send_frag_frag_t, + mca_btl_base_descriptor_t, + mca_btl_ud_send_frag_frag_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_ud_recv_frag_eager_t, + mca_btl_base_descriptor_t, + mca_btl_ud_recv_frag_eager_constructor, + NULL); + + +OBJ_CLASS_INSTANCE( + mca_btl_ud_recv_frag_max_t, + mca_btl_base_descriptor_t, + 
mca_btl_ud_recv_frag_max_constructor, + NULL); + + diff --git a/ompi/mca/btl/ud/btl_ud_frag.h b/ompi/mca/btl/ud/btl_ud_frag.h new file mode 100644 index 0000000000..1623cdf04e --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud_frag.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_UD_FRAG_H +#define MCA_BTL_UD_FRAG_H + + +#define MCA_BTL_IB_FRAG_ALIGN (8) +#include "ompi_config.h" +#include "btl_ud.h" + +#include +#include "ompi/mca/mpool/openib/mpool_openib.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_ud_frag_t); + +/* UD adds a 40 byte global routing header */ +/* This works in strange ways - the sending side does not need to explicitly + include this data in sg lists. Then, on the receiving side, the extra 40 + bytes magically appear. */ +struct mca_btl_ud_ib_header_t { + uint8_t ib_grh[40]; +}; +typedef struct mca_btl_ud_ib_header_t mca_btl_ud_ib_header_t; + +struct mca_btl_ud_header_t { + mca_btl_base_tag_t tag; +}; +typedef struct mca_btl_ud_header_t mca_btl_ud_header_t; + +/** + * IB send fragment derived type. 
+ */ +struct mca_btl_ud_frag_t { + mca_btl_base_descriptor_t base; + mca_btl_base_segment_t segment; + struct mca_btl_base_endpoint_t *endpoint; + size_t size; + union{ + struct ibv_recv_wr rd_desc; + struct ibv_send_wr sr_desc; + } wr_desc; + struct ibv_sge sg_entry; + + /* When this is a send frag, hdr points right after this, as expected. + But when this is a receive frag, we have an extra 40 bytes provided + by IB, so this points 40 bytes past the end of the frag. */ + mca_btl_ud_header_t *hdr; + + mca_mpool_openib_registration_t* ud_reg; + opal_timer_t tm; +}; +typedef struct mca_btl_ud_frag_t mca_btl_ud_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_ud_frag_t); + + +typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_eager_t; + +OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_eager_t); + +typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_max_t; + +OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_max_t); + +typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_frag_t; + +OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_frag_t); + +typedef struct mca_btl_ud_frag_t mca_btl_ud_recv_frag_eager_t; + +OBJ_CLASS_DECLARATION(mca_btl_ud_recv_frag_eager_t); + +typedef struct mca_btl_ud_frag_t mca_btl_ud_recv_frag_max_t; + +OBJ_CLASS_DECLARATION(mca_btl_ud_recv_frag_max_t); + + +/* + * Allocate an IB send descriptor + * + */ + +#define MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc) \ +{ \ + \ + opal_list_item_t *item; \ + OMPI_FREE_LIST_WAIT(&((mca_btl_ud_module_t*)btl)->send_free_eager, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ +} + +#define MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag) \ +{ \ + OMPI_FREE_LIST_RETURN(&((mca_btl_ud_module_t*)btl)->send_free_eager, (opal_list_item_t*)(frag)); \ +} + + +#define MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc) \ +{ \ + \ + opal_list_item_t *item; \ + OMPI_FREE_LIST_WAIT(&((mca_btl_ud_module_t*)btl)->send_free_max, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ +} + +#define MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag) \ +{ \ + 
OMPI_FREE_LIST_RETURN(&((mca_btl_ud_module_t*)btl)->send_free_max, (opal_list_item_t*)(frag)); \ +} + + +#define MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc) \ +{ \ + \ + opal_list_item_t *item; \ + OMPI_FREE_LIST_WAIT(&((mca_btl_ud_module_t*)btl)->send_free_frag, item, rc); \ + frag = (mca_btl_ud_frag_t*) item; \ +} + +#define MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag) \ +{ \ + OMPI_FREE_LIST_RETURN(&((mca_btl_ud_module_t*)btl)->send_free_frag, (opal_list_item_t*)(frag)); \ +} + + +struct mca_btl_ud_module_t; + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/ud/btl_ud_proc.c b/ompi/mca/btl/ud/btl_ud_proc.c new file mode 100644 index 0000000000..73b78d8b47 --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud_proc.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "opal/class/opal_hash_table.h" +#include "ompi/mca/pml/base/pml_base_module_exchange.h" + +#include "btl_ud.h" +#include "btl_ud_proc.h" + +static void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc); +static void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc); + +OBJ_CLASS_INSTANCE(mca_btl_ud_proc_t, + opal_list_item_t, mca_btl_ud_proc_construct, + mca_btl_ud_proc_destruct); + +void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc) +{ + proc->proc_ompi = 0; + proc->proc_port_count = 0; + proc->proc_endpoints = 0; + proc->proc_endpoint_count = 0; + OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t); + /* add to list of all proc instance */ + OPAL_THREAD_LOCK(&mca_btl_ud_component.ib_lock); + opal_list_append(&mca_btl_ud_component.ib_procs, &proc->super); + OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock); +} + +/* + * Cleanup ib proc instance + */ + +void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc) +{ + /* remove from list of all proc instances */ + OPAL_THREAD_LOCK(&mca_btl_ud_component.ib_lock); + opal_list_remove_item(&mca_btl_ud_component.ib_procs, &proc->super); + OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock); + + /* release resources */ + if(NULL != proc->proc_endpoints) { + free(proc->proc_endpoints); + } +} + + +/* + * Look for an existing IB process instances based on the associated + * ompi_proc_t instance. 
+ */ +static mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc) +{ + mca_btl_ud_proc_t* ib_proc; + + OPAL_THREAD_LOCK(&mca_btl_ud_component.ib_lock); + + for(ib_proc = (mca_btl_ud_proc_t*) + opal_list_get_first(&mca_btl_ud_component.ib_procs); + ib_proc != (mca_btl_ud_proc_t*) + opal_list_get_end(&mca_btl_ud_component.ib_procs); + ib_proc = (mca_btl_ud_proc_t*)opal_list_get_next(ib_proc)) { + if(ib_proc->proc_ompi == ompi_proc) { + OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock); + return ib_proc; + } + } + OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock); + return NULL; +} + +/* + * Create a IB process structure. There is a one-to-one correspondence + * between a ompi_proc_t and a mca_btl_ud_proc_t instance. We cache + * additional data (specifically the list of mca_btl_ud_endpoint_t instances, + * and published addresses) associated w/ a given destination on this + * datastructure. + */ + +mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc) +{ + mca_btl_ud_proc_t* module_proc = NULL; + size_t size; + int rc; + + /* Check if we have already created a IB proc + * structure for this ompi process */ + module_proc = mca_btl_ud_proc_lookup_ompi(ompi_proc); + + if(module_proc != NULL) { + /* Gotcha! */ + return module_proc; + } + + /* Oops! First time, gotta create a new IB proc + * out of the ompi_proc ... 
*/ + module_proc = OBJ_NEW(mca_btl_ud_proc_t); + /* Initialize number of peer */ + module_proc->proc_endpoint_count = 0; + module_proc->proc_ompi = ompi_proc; + + /* build a unique identifier (of arbitrary + * size) to represent the proc */ + module_proc->proc_guid = ompi_proc->proc_name; + + + /* query for the peer address info */ + rc = mca_pml_base_modex_recv( + &mca_btl_ud_component.super.btl_version, + ompi_proc, + (void*)&module_proc->proc_ports, + &size + ); + + + + if(OMPI_SUCCESS != rc) { + opal_output(0, "[%s:%d] mca_pml_base_modex_recv failed for peer [%d,%d,%d]", + __FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name)); + OBJ_RELEASE(module_proc); + return NULL; + } + + if((size % sizeof(mca_btl_ud_port_info_t)) != 0) { + opal_output(0, "[%s:%d] invalid module address for peer [%d,%d,%d]", + __FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name)); + OBJ_RELEASE(module_proc); + return NULL; + } + + + module_proc->proc_port_count = size/sizeof(mca_btl_ud_port_info_t); + + + if (0 == module_proc->proc_port_count) { + module_proc->proc_endpoints = NULL; + } else { + module_proc->proc_endpoints = (mca_btl_base_endpoint_t**) + malloc(module_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*)); + } + + if(NULL == module_proc->proc_endpoints) { + OBJ_RELEASE(module_proc); + return NULL; + } + return module_proc; +} + + +/* + * Note that this routine must be called with the lock on the process + * already held. Insert a btl instance into the proc array and assign + * it an address. 
+ */ +int mca_btl_ud_proc_insert(mca_btl_ud_proc_t* module_proc, + mca_btl_base_endpoint_t* module_endpoint) +{ + /* insert into endpoint array */ + module_endpoint->endpoint_proc = module_proc; + module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint; + return OMPI_SUCCESS; +} diff --git a/ompi/mca/btl/ud/btl_ud_proc.h b/ompi/mca/btl/ud/btl_ud_proc.h new file mode 100644 index 0000000000..569a960f9a --- /dev/null +++ b/ompi/mca/btl/ud/btl_ud_proc.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_UD_PROC_H +#define MCA_BTL_UD_PROC_H + +#include "orte/mca/ns/ns.h" +#include "opal/class/opal_object.h" +#include "ompi/proc/proc.h" +#include "btl_ud.h" +#include "btl_ud_endpoint.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OBJ_CLASS_DECLARATION(mca_btl_ud_proc_t); + +/** + * Represents the state of a remote process and the set of addresses + * that it exports. Also cache an instance of mca_btl_base_endpoint_t for + * each + * BTL instance that attempts to open a connection to the process. 
+ */ +struct mca_btl_ud_proc_t { + opal_list_item_t super; + /**< allow proc to be placed on a list */ + + ompi_proc_t *proc_ompi; + /**< pointer to corresponding ompi_proc_t */ + + orte_process_name_t proc_guid; + /**< globally unique identifier for the process */ + + struct mca_btl_ud_port_info_t* proc_ports; + size_t proc_port_count; + /**< number of ports published by endpoint */ + + struct mca_btl_base_endpoint_t **proc_endpoints; + /**< array of endpoints that have been created to access this proc */ + + size_t proc_endpoint_count; + /**< number of endpoints */ + + opal_mutex_t proc_lock; + /**< lock to protect against concurrent access to proc state */ +}; +typedef struct mca_btl_ud_proc_t mca_btl_ud_proc_t; + +mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc); +int mca_btl_ud_proc_insert(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/ud/configure.m4 b/ompi/mca/btl/ud/configure.m4 new file mode 100644 index 0000000000..d05c04d4e1 --- /dev/null +++ b/ompi/mca/btl/ud/configure.m4 @@ -0,0 +1,43 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Sandia National Laboratories. All rights +# reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +# MCA_btl_ud_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_btl_ud_CONFIG],[ + OMPI_CHECK_OPENIB([btl_ud], + [btl_ud_happy="yes"], + [btl_ud_happy="no"]) + + AS_IF([test "$btl_ud_happy" = "yes"], + [btl_ud_WRAPPER_EXTRA_LDFLAGS="$btl_ud_LDFLAGS" + btl_ud_WRAPPER_EXTRA_LIBS="$btl_ud_LIBS" + $1], + [$2]) + + + # substitute in the things needed to build ud + AC_SUBST([btl_ud_CFLAGS]) + AC_SUBST([btl_ud_CPPFLAGS]) + AC_SUBST([btl_ud_LDFLAGS]) + AC_SUBST([btl_ud_LIBS]) +])dnl diff --git a/ompi/mca/btl/ud/configure.params b/ompi/mca/btl/ud/configure.params new file mode 100644 index 0000000000..005492e473 --- /dev/null +++ b/ompi/mca/btl/ud/configure.params @@ -0,0 +1,26 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Sandia National Laboratories. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module + +PARAM_INIT_FILE=btl_ud.c +PARAM_CONFIG_HEADER_FILE="ud_config.h" +PARAM_CONFIG_FILES="Makefile"