diff --git a/ompi/mca/btl/ofud/Makefile.am b/ompi/mca/btl/ofud/Makefile.am deleted file mode 100644 index 323c2721f6..0000000000 --- a/ompi/mca/btl/ofud/Makefile.am +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Sandia National Laboratories. All rights -# reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS=$(btl_ofud_CPPFLAGS) - -sources = \ - btl_ofud.c \ - btl_ofud.h \ - btl_ofud_component.c \ - btl_ofud_endpoint.c \ - btl_ofud_endpoint.h \ - btl_ofud_frag.c \ - btl_ofud_frag.h \ - btl_ofud_proc.c \ - btl_ofud_proc.h - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_ompi_btl_ofud_DSO -lib = -lib_sources = -component = mca_btl_ofud.la -component_sources = $(sources) -else -lib = libmca_btl_ofud.la -lib_sources = $(sources) -component = -component_sources = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component) -mca_btl_ofud_la_SOURCES = $(component_sources) -mca_btl_ofud_la_LDFLAGS = -module -avoid-version $(btl_ofud_LDFLAGS) -mca_btl_ofud_la_LIBADD = $(btl_ofud_LIBS) - -noinst_LTLIBRARIES = $(lib) -libmca_btl_ofud_la_SOURCES = $(lib_sources) -libmca_btl_ofud_la_LDFLAGS= -module -avoid-version $(btl_ofud_LDFLAGS) -libmca_btl_ofud_la_LIBADD = $(btl_ofud_LIBS) diff --git a/ompi/mca/btl/ofud/btl_ofud.c b/ompi/mca/btl/ofud/btl_ofud.c deleted file mode 100644 index a098dfa4c7..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2008 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include -#include - -#include "ompi_config.h" -#include "opal/class/opal_bitmap.h" -#include "opal/prefetch.h" -#include "opal/util/output.h" -#include "opal/datatype/opal_convertor.h" -#include "ompi/mca/btl/btl.h" -#include "ompi/mca/btl/base/btl_base_error.h" -#include "ompi/mca/mpool/base/base.h" -#include "ompi/mca/mpool/mpool.h" - -#include "btl_ofud.h" -#include "btl_ofud_frag.h" -#include "btl_ofud_proc.h" -#include "btl_ofud_endpoint.h" - - -mca_btl_ud_module_t mca_btl_ofud_module = { - { - &mca_btl_ofud_component.super, - 0, /* eager_limit */ - 0, /* min_send_size */ - 0, /* max_send_size */ - 0, /* rdma_pipeline_send_length */ - 0, /* rdma_pipeline_frag_size */ - 0, /* min_rdma_pipeline_size */ - 0, /* exclusivity */ - 0, /* latency */ - 0, /* bandwidth */ - MCA_BTL_FLAGS_SEND, - 0, /* segment length */ - mca_btl_ud_add_procs, - mca_btl_ud_del_procs, - NULL, - mca_btl_ud_finalize, - mca_btl_ud_alloc, - mca_btl_ud_free, - mca_btl_ud_prepare_src, - NULL, /*mca_btl_ud_prepare_dst */ - mca_btl_ud_send, - NULL, /* send immediate */ - NULL, /*mca_btl_ud_put */ - NULL, /*mca_btl_ud_get */ - mca_btl_base_dump, - NULL, /* mpool */ - NULL, /* register error */ - mca_btl_ud_ft_event - } -}; - - - -/* - * Add procs to this BTL module, receiving endpoint information from the modex. - */ - -int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl, - size_t nprocs, - struct ompi_proc_t **ompi_procs, - struct mca_btl_base_endpoint_t** peers, - opal_bitmap_t* reachable) -{ - mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl; - struct ibv_ah_attr ah_attr; - int i, rc; - - /* Set up the endpoint lookup table if it hasn't been already */ - /* We do this here so we can initialize the table to a reasonable size - based on nprocs */ -#if 0 - if(NULL == ud_btl->ep_lookup) { - ud_btl->ep_lookup = malloc(sizeof(opal_hash_table_t)); - OBJ_CONSTRUCT(ud_btl->ep_lookup, opal_hash_table_t); - opal_hash_table_init(ud_btl->ep_lookup, nprocs); - } -#endif - - for(i = 0; i < (int)nprocs; i++) { - struct ompi_proc_t* ompi_proc = ompi_procs[i]; - mca_btl_ud_proc_t* ib_proc; - mca_btl_base_endpoint_t* ib_peer; - - if(NULL == (ib_proc = mca_btl_ud_proc_create(ompi_proc))) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - - /* The btl_proc datastructure is shared by all IB BTL instances that are - * trying to reach this destination. Cache the peer instance on the - * btl_proc. - */ - ib_peer = OBJ_NEW(mca_btl_ud_endpoint_t); - if(NULL == ib_peer) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - OPAL_THREAD_LOCK(&ib_proc->proc_lock); - rc = mca_btl_ud_proc_insert(ib_proc, ib_peer); - if(rc != OMPI_SUCCESS) { - OBJ_RELEASE(ib_peer); - OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); - continue; - } - - BTL_VERBOSE(("modex_recv QP num %d, LID = %d", - ib_peer->rem_addr.qp_num, ib_peer->rem_addr.lid)); - - /* Set up IB address handles for the endpoint */ - ah_attr.is_global = 0; - ah_attr.dlid = ib_peer->rem_addr.lid; - ah_attr.sl = mca_btl_ofud_component.ib_service_level; - ah_attr.src_path_bits = mca_btl_ofud_component.ib_src_path_bits; - ah_attr.port_num = ud_btl->ib_port_num; - - ib_peer->rmt_ah = ibv_create_ah(ud_btl->ib_pd, &ah_attr); - if(NULL == ib_peer->rmt_ah) { - BTL_ERROR(("error creating address handle: %s\n", strerror(errno))); - OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); - continue; - } - - /* Insert a pointer to the endpoint in the BTL lookup table */ -#if 0 - opal_hash_table_set_value_uint64(ud_btl->ep_lookup, - ((uint64_t)ib_peer->rem_addr.lid << 32) | - ib_peer->rem_addr.qp_num, - ib_peer); -#endif - - opal_bitmap_set_bit(reachable, i); - OPAL_THREAD_UNLOCK(&ib_proc->proc_lock); - peers[i] = ib_peer; - } - - return OMPI_SUCCESS; -} - - -/* - * Delete the proc as reachable from this btl module - */ - -int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl, - size_t nprocs, - struct ompi_proc_t** procs, - struct mca_btl_base_endpoint_t** peers) -{ - size_t i; - - for(i = 0; i < nprocs; i++) { - mca_btl_ud_endpoint_t* endpoint = (mca_btl_ud_endpoint_t*)peers[i]; - mca_btl_ud_proc_t* proc = mca_btl_ud_proc_lookup_ompi(procs[i]); -#if 0 - opal_hash_table_remove_value_uint64(ud_btl->ep_lookup, - ((uint64_t)endpoint->rem_addr.lid << 32) | - endpoint->rem_addr.qp_num); -#endif - if(NULL != proc) { - mca_btl_ud_proc_remove(proc, endpoint); - } - - OBJ_RELEASE(endpoint); - } - - return OMPI_SUCCESS; -} - - -/** - * Allocate a segment. - * - * @param btl (IN) BTL module - * @param size (IN) Request segment size. - * - * When allocating a segment we pull a pre-alllocated segment - * from one of two free lists, an eager list and a max list - */ - -mca_btl_base_descriptor_t* mca_btl_ud_alloc(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - uint8_t order, - size_t size, - uint32_t flags) -{ - mca_btl_ud_frag_t* frag = NULL; - int rc; - - if(OPAL_LIKELY(size <= mca_btl_ofud_module.super.btl_eager_limit)) { - MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc); - } - - if(NULL == frag) { - return NULL; - } - - frag->base.order = MCA_BTL_NO_ORDER; - frag->base.des_flags = flags; - frag->segment.seg_len = size; - return (mca_btl_base_descriptor_t*)frag; -} - - -/** - * Return a segment - * - * Return the segment to the appropriate - * preallocated segment list - */ - -int mca_btl_ud_free(struct mca_btl_base_module_t* btl, - mca_btl_base_descriptor_t* des) -{ - mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)des; - - if(OPAL_LIKELY(frag->type == MCA_BTL_UD_FRAG_SEND)) { - MCA_BTL_UD_RETURN_FRAG(btl, frag); - } else if(frag->type == MCA_BTL_UD_FRAG_USER && frag->ud_reg != NULL) { - btl->btl_mpool->mpool_deregister(btl->btl_mpool, - (mca_mpool_base_registration_t*)frag->ud_reg); - MCA_BTL_UD_RETURN_USER_FRAG(btl, frag); - } - - return OMPI_SUCCESS; -} - - -/** - * register user buffer or pack - * data into pre-registered buffer and return a - * descriptor that can be - * used for send/put. - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - * - * prepare source's behavior depends on the following: - * Has a valid memory registration been passed to prepare_src? - * if so we attempt to use the pre-registred user-buffer, if the memory - * registration is to small (only a portion of the user buffer) then we must - * reregister the user buffer - * Has the user requested the memory to be left pinned? - * if so we insert the memory registration into a memory tree for later - * lookup, we may also remove a previous registration if a MRU (most recently - * used) list of registions is full, this prevents resources from being - * exhausted. - * Is the requested size larger than the btl's max send size? - * if so and we aren't asked to leave the registration pinned then we - * register the memory if the user's buffer is contiguous. - * Otherwise we choose from two free lists of pre-registered memory in which - * to pack the data into. - * - */ - -mca_btl_base_descriptor_t* mca_btl_ud_prepare_src( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - mca_btl_ud_frag_t* frag = NULL; - struct iovec iov; - uint32_t iov_count = 1; - size_t max_data = *size; - int rc; - - if(opal_convertor_need_buffers(convertor) == 0 && reserve == 0 && - (registration != NULL || max_data > btl->btl_max_send_size)) { - /* The user buffer is contigous and we are asked to send more than - the max send size. */ - - MCA_BTL_UD_ALLOC_USER_FRAG(btl, frag, rc); - if(NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = NULL; - - opal_convertor_pack(convertor, &iov, &iov_count, &max_data); - - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - frag->base.des_flags = flags; - frag->base.order = MCA_BTL_NO_ORDER; - - if(NULL == registration) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, iov.iov_base, - max_data, 0, ®istration); - if(OMPI_SUCCESS != rc || NULL == registration) { - MCA_BTL_UD_RETURN_USER_FRAG(btl, frag); - } - return NULL; - } - - frag->ud_reg = (mca_btl_ud_reg_t*)registration; - - frag->sg_entry.lkey = frag->ud_reg->mr->lkey; - frag->sg_entry.addr = (unsigned long)iov.iov_base; - - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - *size = max_data; - return &frag->base; - } - - if(max_data + reserve > btl->btl_eager_limit) { - max_data = btl->btl_eager_limit - reserve; - } - - MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc); - if(OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; - - rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data); - if(OPAL_UNLIKELY(rc < 0)) { - MCA_BTL_UD_RETURN_FRAG(btl, frag); - return NULL; - } - - frag->segment.seg_len = max_data + reserve; - frag->sg_entry.length = - max_data + reserve + sizeof(mca_btl_ud_header_t); - - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags = flags; - frag->base.order = MCA_BTL_NO_ORDER; - *size = max_data; - - return &frag->base; -} - - -int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl) -{ - mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl; - int32_t i; - - for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) { - ibv_destroy_qp(ud_btl->ib_qp[i]); - } - - ibv_dealloc_pd(ud_btl->ib_pd); - - OBJ_DESTRUCT(&ud_btl->ud_lock); - OBJ_DESTRUCT(&ud_btl->pending_frags); - OBJ_DESTRUCT(&ud_btl->send_frags); - OBJ_DESTRUCT(&ud_btl->user_frags); - OBJ_DESTRUCT(&ud_btl->recv_frags); - mca_mpool_base_module_destroy(ud_btl->super.btl_mpool); - return OMPI_SUCCESS; -} - - -/* - * Initiate a send. - */ - -int mca_btl_ud_send(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* descriptor, - mca_btl_base_tag_t tag) -{ - int rc; - - mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)descriptor; - MCA_BTL_UD_START_TIME(post_send); - frag->endpoint = endpoint; - frag->hdr->tag = tag; - - rc = mca_btl_ud_endpoint_post_send((mca_btl_ud_module_t*)btl, frag); - - MCA_BTL_UD_END_TIME(post_send); - return rc; -} - - -/* - * RDMA Memory Pool (de)register callbacks - */ - -static int mca_btl_ud_reg_mr(void* reg_data, void* base, size_t size, - mca_mpool_base_registration_t* reg) -{ - mca_btl_ud_module_t* mod = (mca_btl_ud_module_t*)reg_data; - mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg; - - ud_reg->mr = ibv_reg_mr(mod->ib_pd, base, size, IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); - - if(NULL == ud_reg->mr) - return OMPI_ERR_OUT_OF_RESOURCE; - - return OMPI_SUCCESS; -} - - -static int mca_btl_ud_dereg_mr(void* reg_data, - mca_mpool_base_registration_t* reg) -{ - mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg; - - if(ud_reg->mr != NULL) { - if(ibv_dereg_mr(ud_reg->mr)) { - opal_output(0, "%s: error unpinning UD memory: %s\n", - __func__, strerror(errno)); - return OMPI_ERROR; - } - } - - ud_reg->mr = NULL; - return OMPI_SUCCESS; -} - - -/* - * Create a single UD queue pair. Since UD is connectionless, the QP is - * useable immediately. - */ - -/* TODO - can remove cq/psn args now with only one type of frag */ -static int mca_btl_ud_init_qp(mca_btl_ud_module_t* ud_btl, - struct ibv_cq* cq, - struct ibv_qp** qp, - uint32_t lcl_psn) -{ - struct ibv_qp_attr qp_attr; - struct ibv_qp_init_attr qp_init_attr; - - memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); - - qp_init_attr.send_cq = cq; - qp_init_attr.recv_cq = cq; - qp_init_attr.cap.max_send_wr = mca_btl_ofud_component.sd_num; - qp_init_attr.cap.max_recv_wr = mca_btl_ofud_component.rd_num; - qp_init_attr.cap.max_send_sge = 1; - qp_init_attr.cap.max_recv_sge = 1; - /* TODO - find the best value for max_inline_data */ - qp_init_attr.cap.max_inline_data = 200; - qp_init_attr.qp_type = IBV_QPT_UD; - - *qp = ibv_create_qp(ud_btl->ib_pd, &qp_init_attr); - if(NULL == *qp) { - BTL_ERROR(("error creating QP: %s\n", strerror(errno))); - return OMPI_ERROR; - } - - if(0 == (ud_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) { - BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data")); - } - - BTL_VERBOSE(("ib_inline_max %lu\n", (unsigned long) ud_btl->ib_inline_max)); - - qp_attr.qp_state = IBV_QPS_INIT; - qp_attr.pkey_index = mca_btl_ofud_component.ib_pkey_ix; - qp_attr.qkey = mca_btl_ofud_component.ib_qkey; - qp_attr.port_num = ud_btl->ib_port_num; - - if(ibv_modify_qp(*qp, &qp_attr, - IBV_QP_STATE | IBV_QP_PKEY_INDEX | - IBV_QP_PORT | IBV_QP_QKEY)) { - BTL_ERROR(("error modifying QP to INIT: %s", strerror(errno))); - goto destroy_qp; - } - - qp_attr.qp_state = IBV_QPS_RTR; - if(ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE)) { - BTL_ERROR(("error modifing QP to RTR: %s", strerror(errno))); - goto destroy_qp; - } - - qp_attr.qp_state = IBV_QPS_RTS; - qp_attr.sq_psn = lcl_psn; - if (ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { - BTL_ERROR(("error modifying QP to RTS: %s", strerror(errno))); - goto destroy_qp; - } - - return OMPI_SUCCESS; - -destroy_qp: - ibv_destroy_qp(*qp); - *qp = NULL; - return OMPI_ERROR; -} - - -/* - * Initialize the btl module by allocating a protection domain, - * memory pool, completion queue, and free lists - */ - -int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl) -{ - struct mca_mpool_base_resources_t mpool_resources; - struct ibv_context *ctx = ud_btl->ib_dev_context; - struct ibv_recv_wr* bad_wr; - mca_btl_ud_frag_t* frag; - ompi_free_list_item_t* item; - uint32_t length,length_payload; - int32_t rc, i; - - ud_btl->sd_wqe = mca_btl_ofud_component.sd_num; - - ud_btl->ib_pd = ibv_alloc_pd(ctx); - if(NULL == ud_btl->ib_pd) { - BTL_ERROR(("error allocating PD for %s: %s\n", - ibv_get_device_name(ud_btl->ib_dev), strerror(errno))); - return OMPI_ERROR; - } - - mpool_resources.pool_name = "ofud"; - mpool_resources.reg_data = (void*)ud_btl; - mpool_resources.sizeof_reg = sizeof(mca_btl_ud_reg_t); - mpool_resources.register_mem = mca_btl_ud_reg_mr; - mpool_resources.deregister_mem = mca_btl_ud_dereg_mr; - ud_btl->super.btl_mpool = - mca_mpool_base_module_create(mca_btl_ofud_component.ud_mpool_name, - &ud_btl->super, &mpool_resources); - - if(NULL == ud_btl->super.btl_mpool) { - BTL_ERROR(("error creating IB mpool for %s: %s\n", - ibv_get_device_name(ud_btl->ib_dev), strerror(errno))); - goto dealloc_pd; - } - - /* Create the completion queue */ - length = mca_btl_ofud_component.rd_num + mca_btl_ofud_component.sd_num; - - ud_btl->ib_cq = ibv_create_cq(ctx, length, NULL, NULL, 0); - if(NULL == ud_btl->ib_cq) { - BTL_ERROR(("error creating CQ for %s: %s\n", - ibv_get_device_name(ud_btl->ib_dev), strerror(errno))); - goto mpool_destroy; - } - - /* Set up our packet sequence numbers */ - ud_btl->addr.psn = lrand48() & 0xffffff; - - /* Set up the QPs for this BTL */ - for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) { - if(OMPI_SUCCESS != mca_btl_ud_init_qp(ud_btl, - ud_btl->ib_cq, &ud_btl->ib_qp[i], ud_btl->addr.psn)) { - goto qp_destroy; - } - } - - /* Place our QP numbers in our local address information */ - ud_btl->addr.qp_num = ud_btl->ib_qp[0]->qp_num; - ud_btl->ib_qp_next = 0; - - /*ud_btl->rd_posted = mca_btl_ofud_component.rd_num_init;*/ - - /* Initialize pool of receive fragments first, since an error may occur */ - /* TODO - no need for a free list with a static buffer count */ - OBJ_CONSTRUCT(&ud_btl->recv_frags, ompi_free_list_t); - length = sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_header_t) + - ud_btl->super.btl_eager_limit + 2 * MCA_BTL_IB_FRAG_ALIGN; - - length_payload=sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_header_t) + - ud_btl->super.btl_eager_limit + 2 * MCA_BTL_IB_FRAG_ALIGN - - sizeof(mca_btl_ud_recv_frag_t); - - ompi_free_list_init_new(&ud_btl->recv_frags, - length + sizeof(mca_btl_ud_ib_header_t), - opal_cache_line_size, - OBJ_CLASS(mca_btl_ud_recv_frag_t), - length_payload,opal_cache_line_size, - mca_btl_ofud_component.rd_num, - mca_btl_ofud_component.rd_num, - mca_btl_ofud_component.rd_num, - ud_btl->super.btl_mpool); -#if 0 - ompi_free_list_init_new(&ud_btl->recv_frags, - length + sizeof(mca_btl_ud_ib_header_t), - opal_cache_line_size, - OBJ_CLASS(mca_btl_ud_recv_frag_t), - length_payload,opal_cache_line_size, - mca_btl_ofud_component.rd_num_init, - mca_btl_ofud_component.rd_num_max, - mca_btl_ofud_component.rd_num_inc, - ud_btl->super.btl_mpool); -#endif - - /* Post receive descriptors */ - for(i = 0; i < mca_btl_ofud_component.rd_num; i++) { - OMPI_FREE_LIST_GET(&ud_btl->recv_frags, item, rc); - frag = (mca_btl_ud_frag_t*)item; - - if(NULL == frag) { - BTL_ERROR(("error getting receive buffer from free list\n")); - goto obj_destruct; - } - - frag->type = MCA_BTL_UD_FRAG_RECV; - frag->sg_entry.length = mca_btl_ofud_module.super.btl_eager_limit + - sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t); - if(ibv_post_recv(ud_btl->ib_qp[0], - &frag->wr_desc.rd_desc, &bad_wr)) { - BTL_ERROR(("error posting recv, errno %s\n", strerror(errno))); - goto obj_destruct; - } - } - - /* No more errors anticipated - initialize everything else */ - OBJ_CONSTRUCT(&ud_btl->ud_lock, opal_mutex_t); - OBJ_CONSTRUCT(&ud_btl->pending_frags, opal_list_t); - OBJ_CONSTRUCT(&ud_btl->send_frags, ompi_free_list_t); - OBJ_CONSTRUCT(&ud_btl->user_frags, ompi_free_list_t); - - ompi_free_list_init_new(&ud_btl->send_frags, - length, - opal_cache_line_size, - OBJ_CLASS(mca_btl_ud_send_frag_t), - length_payload,opal_cache_line_size, - mca_btl_ofud_component.sd_num >> 1, - -1, - mca_btl_ofud_component.sd_num << 2, - ud_btl->super.btl_mpool); - - /* Initialize pool of user fragments */ - length = sizeof(mca_btl_ud_frag_t) + - sizeof(mca_btl_ud_header_t) + 2 * MCA_BTL_IB_FRAG_ALIGN; - - length_payload = sizeof(mca_btl_ud_frag_t) + - sizeof(mca_btl_ud_header_t) + 2 * MCA_BTL_IB_FRAG_ALIGN- - sizeof(mca_btl_ud_user_frag_t); - - ompi_free_list_init_new(&ud_btl->user_frags, - length, - opal_cache_line_size, - OBJ_CLASS(mca_btl_ud_user_frag_t), - length_payload,opal_cache_line_size, - mca_btl_ofud_component.sd_num >> 1, - -1, - mca_btl_ofud_component.sd_num << 2, - ud_btl->super.btl_mpool); - - return OMPI_SUCCESS; - -obj_destruct: - OBJ_DESTRUCT(&ud_btl->recv_frags); -qp_destroy: - for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) { - ibv_destroy_qp(ud_btl->ib_qp[i]); - } -mpool_destroy: - mca_mpool_base_module_destroy(ud_btl->super.btl_mpool); -dealloc_pd: - ibv_dealloc_pd(ud_btl->ib_pd); - return OMPI_ERROR; -} - - -int mca_btl_ud_ft_event(int state) { - return OMPI_SUCCESS; -} - - diff --git a/ompi/mca/btl/ofud/btl_ofud.h b/ompi/mca/btl/ofud/btl_ofud.h deleted file mode 100644 index 147ad5f69d..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud.h +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_BTL_UD_H -#define MCA_BTL_UD_H - -/* Number of QP's to stripe sends over - keep this as power of 2 */ -/* AWF - This is intentionally NOT an MCA parameter so that I can do fast - modular arithmetic with it. */ -#define MCA_BTL_UD_NUM_QP 4 - -#include "ompi_config.h" -#include -#include - -/* Open MPI includes */ -#include "opal/class/opal_hash_table.h" -#include "ompi/class/ompi_free_list.h" -#include "ompi/mca/btl/btl.h" -#include "ompi/mca/btl/base/btl_base_error.h" -#include "ompi/mca/btl/base/base.h" -#include "ompi/mca/mpool/grdma/mpool_grdma.h" - -/* TODO - If I want this to go away, addr_t has to come over here */ -#include "btl_ofud_endpoint.h" - -BEGIN_C_DECLS - - -/** - * UD Infiniband (IB) BTL component. - */ - -struct mca_btl_ud_component_t { - mca_btl_base_component_2_0_0_t super; /**< base BTL component */ - - uint32_t max_btls; /**< Maximum number of BTL modules */ - uint32_t num_btls; /**< Number of available/initialized BTL modules */ - - char* if_include; - char** if_include_list; - char* if_exclude; - char** if_exclude_list; - char** if_list; - - struct mca_btl_ud_module_t* ud_btls; /**< array of available BTLs */ - - opal_list_t ud_procs; /**< list of ib proc structures */ - opal_mutex_t ud_lock; /**< lock for accessing component state */ - - char* ud_mpool_name; /**< name of memory pool */ - - int32_t sd_num; /**< max send descriptors to post per BTL */ - - int32_t rd_num; /**< number of receive descriptors per BTL */ -#if 0 - int32_t rd_num_init; /**< initial recv descriptors to post per BTL */ - int32_t rd_num_max; - int32_t rd_num_inc; -#endif - - uint32_t ib_pkey_ix; - uint32_t ib_qkey; - uint32_t ib_service_level; - uint32_t ib_src_path_bits; - -}; typedef struct mca_btl_ud_component_t mca_btl_ud_component_t; - -OMPI_MODULE_DECLSPEC extern mca_btl_ud_component_t mca_btl_ofud_component; - -typedef mca_btl_base_recv_reg_t mca_btl_ud_recv_reg_t; - - -/** - * Profiling variables - */ - -#if OPAL_ENABLE_DEBUG -#define MCA_BTL_UD_ENABLE_PROFILE 0 -#else -#define MCA_BTL_UD_ENABLE_PROFILE 0 -#endif - -#if MCA_BTL_UD_ENABLE_PROFILE - -#define MCA_BTL_UD_PROFILE_VAR(var) \ - opal_timer_t avg_ ## var; \ - opal_timer_t cnt_ ## var; \ - opal_timer_t tmp_ ## var - -struct mca_btl_ud_profile_t { - MCA_BTL_UD_PROFILE_VAR(post_send); - MCA_BTL_UD_PROFILE_VAR(ibv_post_send); -}; - -typedef struct mca_btl_ud_profile_t mca_btl_ud_profile_t; -extern mca_btl_ud_profile_t mca_btl_ud_profile; - -#endif - - -/** - * UD/IB BTL Interface - */ - -struct mca_btl_ud_module_t { - mca_btl_base_module_t super; - - uint8_t ib_port_num; - struct ibv_device* ib_dev; - struct ibv_context* ib_dev_context; - struct ibv_pd* ib_pd; - struct ibv_cq* ib_cq; - - struct mca_btl_ud_addr_t addr; /**< local address information */ - - ompi_free_list_t send_frags; /**< send fragments & buffers */ - ompi_free_list_t user_frags; /**< user data fragments */ - ompi_free_list_t recv_frags; /**< receive fragments & buffers */ - - opal_list_t pending_frags; /**< list of pending send frags */ - - opal_mutex_t ud_lock; /**< lock for pending_frags */ - - size_t ib_inline_max; /**< max size of IB inline send */ - - /*int32_t rd_posted;*/ /**< number of receives currently posted */ - - int32_t sd_wqe; /**< available send WQ entries */ - /* No lock needed, these are incremented/decremented atomically */ - - /*opal_hash_table_t* ep_lookup;*/ - /**< hash table for fast lookup of endpoint structures in recv path */ - /* lid:qpnum is key, value is mca_btl_ud_endpoint_t* */ - - struct ibv_qp* ib_qp[MCA_BTL_UD_NUM_QP]; - uint32_t ib_qp_next; - /**< Local QPs and stripe counters */ - /* No lock needed - counters only ever increase by 1 */ -}; typedef struct mca_btl_ud_module_t mca_btl_ud_module_t; - -struct mca_btl_ud_frag_t; -extern mca_btl_ud_module_t mca_btl_ofud_module; - - -/** - * IB component initialization. - * - * @param num_btl_modules (OUT) - * Number of BTLs returned in BTL array. - * @param allow_multi_user_threads (OUT) - * Flag indicating wether BTL supports user threads (TRUE) - * @param have_hidden_threads (OUT) - * Flag indicating whether BTL uses threads (TRUE) - * - * (1) read interface list from verbs and compare against component parameters - * then create a BTL instance for selected interfaces - * (2) publish BTL addressing info - */ - -extern mca_btl_base_module_t** mca_btl_ud_component_init( - int *num_btl_modules, - bool allow_multi_user_threads, - bool have_hidden_threads); - - -/** - * UD/IB component progress. - */ -extern int mca_btl_ud_component_progress(void); - - -/** - * Cleanup any resources held by the BTL. - * - * @param btl BTL instance. - * @return OMPI_SUCCESS or error status on failure. - */ - -extern int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl); - - -/** - * PML->BTL notification of change in the process list. - * - * @param btl (IN) - * @param nprocs (IN) Number of processes - * @param procs (IN) Set of processes - * @param peers (OUT) Set of (optional) peer addressing info. - * @param peers (IN/OUT) Set of processes that are reachable via this BTL. - * @return OMPI_SUCCESS or error status on failure. - */ - -extern int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl, - size_t nprocs, - struct ompi_proc_t **procs, - struct mca_btl_base_endpoint_t** peers, - opal_bitmap_t* reachable); - - -/** - * PML->BTL notification of change in the process list. - * - * @param btl (IN) BTL instance - * @param nproc (IN) Number of processes. - * @param procs (IN) Set of processes. - * @param peers (IN) Set of peer data structures. - * @return Status indicating if cleanup was successful - * - */ - -extern int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl, - size_t nprocs, - struct ompi_proc_t **procs, - struct mca_btl_base_endpoint_t** peers); - - -/** - * PML->BTL Initiate a send of the specified size. - * - * @param btl (IN) - * BTL instance - * @param btl_base_peer (IN) - * BTL peer addressing - * @param send_request (IN/OUT) - * Send request (allocated by PML via mca_btl_base_request_alloc_fn_t) - * @param size (IN) - * Number of bytes PML is requesting BTL to deliver - * @param flags (IN) - * Flags that should be passed to the peer via the message header. - * @param request (OUT) - * OMPI_SUCCESS if the BTL was able to queue one or more fragments - */ - -extern int mca_btl_ud_send(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* descriptor, - mca_btl_base_tag_t tag); - - -/** - * Allocate a descriptor. - * - * @param btl (IN) BTL module - * @param size (IN) Requested descriptor size. - */ - -extern mca_btl_base_descriptor_t* mca_btl_ud_alloc( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - uint8_t order, - size_t size, - uint32_t flags); - - -/** - * Return a segment allocated by this BTL. - * - * @param btl (IN) BTL module - * @param descriptor (IN) Allocated descriptor. - */ - -extern int mca_btl_ud_free(struct mca_btl_base_module_t* btl, - mca_btl_base_descriptor_t* des); - - -/** - * Pack data and return a descriptor that can be - * used for send/put. - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - */ - -mca_btl_base_descriptor_t* mca_btl_ud_prepare_src( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - - - -int mca_btl_ud_module_init(mca_btl_ud_module_t* ud_btl); - -/** - * Fault Tolerance Event Notification Function - * @param state Checkpoint State - * @return OMPI_SUCCESS or failure status - */ - -extern int mca_btl_ud_ft_event(int state); - - - -/* - * Profiling stuff - */ - -#if MCA_BTL_UD_ENABLE_PROFILE - -#define MCA_BTL_UD_START_TIME(var) \ - ((mca_btl_ud_profile.tmp_ ## var) = opal_timer_base_get_cycles()) - -#define MCA_BTL_UD_END_TIME(var) \ -do { \ - mca_btl_ud_profile.avg_ ## var += \ - opal_timer_base_get_cycles() - mca_btl_ud_profile.tmp_ ## var; \ - mca_btl_ud_profile.cnt_ ## var++; \ -} while(0) - -#define MCA_BTL_UD_SHOW_TIME(var) \ - OPAL_OUTPUT((0, " " #var " avg %lu cnt %lu", \ - (mca_btl_ud_profile.avg_ ## var) / (mca_btl_ud_profile.cnt_ ## var), \ - mca_btl_ud_profile.cnt_ ## var)); - -#else -#define MCA_BTL_UD_START_TIME(var) -#define MCA_BTL_UD_END_TIME(var) -#define MCA_BTL_UD_SHOW_TIME(var) -#endif - -END_C_DECLS -#endif diff --git a/ompi/mca/btl/ofud/btl_ofud_component.c b/ompi/mca/btl/ofud/btl_ofud_component.c deleted file mode 100644 index 9079165ce8..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud_component.c +++ /dev/null @@ -1,731 +0,0 @@ -/* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#ifdef HAVE_STRING_H -#include -#endif -#include -#include - -#include "opal_stdint.h" -#include "ompi/constants.h" -#include "opal/prefetch.h" -#include "orte/util/show_help.h" -#include "ompi/mca/btl/btl.h" -#include "opal/mca/timer/base/base.h" -#include "opal/util/argv.h" -#include "opal/mca/base/mca_base_param.h" -#include "orte/mca/errmgr/errmgr.h" -#include "ompi/mca/btl/base/base.h" -#include "ompi/runtime/ompi_module_exchange.h" -#include "ompi/runtime/mpiruntime.h" - -#include "orte/runtime/orte_globals.h" - -#include "btl_ofud.h" -#include "btl_ofud_frag.h" -#include "btl_ofud_endpoint.h" - - -static int mca_btl_ud_component_register(void); -static int mca_btl_ud_component_open(void); -static int mca_btl_ud_component_close(void); - -mca_btl_ud_component_t mca_btl_ofud_component = { - { - /* First, the mca_base_component_t struct containing meta information - about the component itself */ - { - MCA_BTL_BASE_VERSION_2_0_0, - - "ofud", /* MCA component name */ - OMPI_MAJOR_VERSION, /* MCA component major version */ - OMPI_MINOR_VERSION, /* MCA component minor version */ - OMPI_RELEASE_VERSION, /* MCA component release version */ - mca_btl_ud_component_open, /* component open */ - mca_btl_ud_component_close, /* component close */ - NULL, /* component query */ - mca_btl_ud_component_register, /* component register */ - }, - { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE - }, - - mca_btl_ud_component_init, - mca_btl_ud_component_progress, - } -}; - - -/* - * Profiling information - */ - -#if MCA_BTL_UD_ENABLE_PROFILE -mca_btl_ud_profile_t mca_btl_ud_profile = {0}; -#endif - - -/* - * utility routines for parameter registration - */ - -static inline void mca_btl_ud_param_reg_string(const char* param_name, - const char* param_desc, - const char* default_value, - char** out_value) -{ - mca_base_param_reg_string(&mca_btl_ofud_component.super.btl_version, - param_name, param_desc, false, false, - default_value, out_value); -} - -static inline void mca_btl_ud_param_reg_int(const char* param_name, - const char* param_desc, - int default_value, - int* out_value) -{ - mca_base_param_reg_int(&mca_btl_ofud_component.super.btl_version, - param_name, param_desc, false, false, - default_value, out_value); -} - - -static int mca_btl_ud_component_register(void) -{ - int val; - - /* register IB component parameters */ - mca_btl_ud_param_reg_int("max_btls", - "Maximum number of HCAs/ports to use", - 4, (int*)&mca_btl_ofud_component.max_btls); - - mca_btl_ud_param_reg_string("if_include", "Comma-delimited list of HCAs/ports to be used; empty value means to use all HCAs/ports found", - NULL, &mca_btl_ofud_component.if_include); - - mca_btl_ud_param_reg_string("if_exclude", "Comma-delimited list of HCAs/ports to be excluded; empty value means to use all HCAs/ports found", - NULL, &mca_btl_ofud_component.if_exclude); - - mca_btl_ud_param_reg_string("mpool", "Name of the memory pool to be used", - "grdma", &mca_btl_ofud_component.ud_mpool_name); - - mca_btl_ud_param_reg_int("ib_pkey_index", "IB pkey index", - 0, (int*)&mca_btl_ofud_component.ib_pkey_ix); - mca_btl_ud_param_reg_int("ib_qkey", "IB qkey", - 0x01330133, (int*)&mca_btl_ofud_component.ib_qkey); - mca_btl_ud_param_reg_int("ib_service_level", "IB service level", - 0, (int*)&mca_btl_ofud_component.ib_service_level); - mca_btl_ud_param_reg_int("ib_src_path_bits", "IB source path bits", - 0, (int*)&mca_btl_ofud_component.ib_src_path_bits); - - mca_btl_ud_param_reg_int("sd_num", "maximum send descriptors to post", - 128, (int*)&mca_btl_ofud_component.sd_num); - - mca_btl_ud_param_reg_int("rd_num", "number of receive buffers", - 6000, (int*)&mca_btl_ofud_component.rd_num); -#if 0 - mca_btl_ud_param_reg_int("rd_num_init", "initial receive buffers", - 3000, (int*)&mca_btl_ofud_component.rd_num_init); - mca_btl_ud_param_reg_int("rd_num_max", "maximum receive buffers", - 4500, (int*)&mca_btl_ofud_component.rd_num_max); - mca_btl_ud_param_reg_int("rd_num_inc", - "number of buffers to post when rate is high", - 25, (int*)&mca_btl_ofud_component.rd_num_inc); -#endif - - /* TODO - this assumes a 2k UD MTU - query/do something more intelligent */ - /*mca_btl_ud_param_reg_int("eager_limit", "eager send limit", - 2048, &val); */ - mca_btl_ud_param_reg_int("min_send_size", "minimum send size", - 2048, &val); - mca_btl_ofud_module.super.btl_rndv_eager_limit = val; - mca_btl_ud_param_reg_int("max_send_size", "maximum send size", - 2048, &val); - mca_btl_ofud_module.super.btl_eager_limit = val; - mca_btl_ofud_module.super.btl_max_send_size = val; - - mca_btl_ud_param_reg_int("exclusivity", "BTL exclusivity", - MCA_BTL_EXCLUSIVITY_DEFAULT, - (int*)&mca_btl_ofud_module.super.btl_exclusivity); - mca_btl_ud_param_reg_int("bandwidth", - "Approximate maximum bandwidth of interconnect", - 800, (int*)&mca_btl_ofud_module.super.btl_bandwidth); - - mca_btl_ofud_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t); - - mca_btl_ofud_module.super.btl_eager_limit -= sizeof(mca_btl_ud_header_t); - mca_btl_ofud_module.super.btl_max_send_size -= sizeof(mca_btl_ud_header_t); - - return OMPI_SUCCESS; -} - - -/* - * Called by MCA framework to open the component - */ - -static int mca_btl_ud_component_open(void) -{ - /* initialize state */ - mca_btl_ofud_component.num_btls = 0; - mca_btl_ofud_component.ud_btls = NULL; - - /* initialize objects */ - OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_lock, opal_mutex_t); - OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_procs, opal_list_t); - - /* if_include and if_exclude need to be mutually exclusive */ - if (OPAL_SUCCESS != - mca_base_param_check_exclusive_string( - mca_btl_ofud_component.super.btl_version.mca_type_name, - mca_btl_ofud_component.super.btl_version.mca_component_name, - "if_include", - mca_btl_ofud_component.super.btl_version.mca_type_name, - mca_btl_ofud_component.super.btl_version.mca_component_name, - "if_exclude")) { - /* Return ERR_NOT_AVAILABLE so that a warning message about - "open" failing is not printed */ - return OMPI_ERR_NOT_AVAILABLE; - } - - return OMPI_SUCCESS; -} - - -/* - * Component cleanup - */ - -static int mca_btl_ud_component_close(void) -{ - OBJ_DESTRUCT(&mca_btl_ofud_component.ud_lock); - OBJ_DESTRUCT(&mca_btl_ofud_component.ud_procs); - - /* Calculate and print profiling numbers */ - MCA_BTL_UD_SHOW_TIME(post_send); - MCA_BTL_UD_SHOW_TIME(ibv_post_send); - - return OMPI_SUCCESS; -} - - -/* - * Register UD address information. The MCA framework - * will make this available to all peers. - */ - -static int mca_btl_ud_modex_send(void) -{ - int rc; - size_t i; - size_t size; - mca_btl_ud_addr_t* addrs = NULL; - - size = mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_addr_t); - if(size != 0) { - addrs = (mca_btl_ud_addr_t*)malloc(size); - if(NULL == addrs) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - for(i = 0; i < mca_btl_ofud_component.num_btls; i++) { - mca_btl_ud_module_t* btl = &mca_btl_ofud_component.ud_btls[i]; - addrs[i] = btl->addr; - - BTL_VERBOSE(("modex_send QP num %x, LID = %x", - addrs[i].qp_num, addrs[i].lid)); - } - } - - rc = ompi_modex_send( - &mca_btl_ofud_component.super.btl_version, addrs, size); - if(NULL != addrs) { - free(addrs); - } - return rc; -} - - -static int -get_port_list(struct ibv_device* ib_dev, int port_cnt, int *allowed_ports) -{ - int i, j, k, num_ports = 0; - const char *dev_name; - char *name; - - dev_name = ibv_get_device_name(ib_dev); - name = (char*) malloc(strlen(dev_name) + 4); - if (NULL == name) { - return 0; - } - - /* Assume that all ports are allowed. num_ports will be adjusted - below to reflect whether this is true or not. */ - for (i = 1; i <= port_cnt; ++i) { - allowed_ports[num_ports++] = i; - } - num_ports = 0; - if (NULL != mca_btl_ofud_component.if_include_list) { - /* If only the HCA name is given (eg. mthca0,mthca1) use all - ports */ - i = 0; - while (mca_btl_ofud_component.if_include_list[i]) { - if (0 == strcmp(dev_name, - mca_btl_ofud_component.if_include_list[i])) { - num_ports = port_cnt; - goto done; - } - ++i; - } - /* Include only requested ports on the HCA */ - for (i = 1; i <= port_cnt; ++i) { - sprintf(name,"%s:%d",dev_name,i); - for (j = 0; - NULL != mca_btl_ofud_component.if_include_list[j]; ++j) { - if (0 == strcmp(name, - mca_btl_ofud_component.if_include_list[j])) { - allowed_ports[num_ports++] = i; - break; - } - } - } - } else if (NULL != mca_btl_ofud_component.if_exclude_list) { - /* If only the HCA name is given (eg. mthca0,mthca1) exclude - all ports */ - i = 0; - while (mca_btl_ofud_component.if_exclude_list[i]) { - if (0 == strcmp(dev_name, - mca_btl_ofud_component.if_exclude_list[i])) { - num_ports = 0; - goto done; - } - ++i; - } - /* Exclude the specified ports on this HCA */ - for (i = 1; i <= port_cnt; ++i) { - sprintf(name,"%s:%d",dev_name,i); - for (j = 0; - NULL != mca_btl_ofud_component.if_exclude_list[j]; ++j) { - if (0 == strcmp(name, - mca_btl_ofud_component.if_exclude_list[j])) { - /* If found, set a sentinel value */ - j = -1; - break; - } - } - /* If we didn't find it, it's ok to include in the list */ - if (-1 != j) { - allowed_ports[num_ports++] = i; - } - } - } else { - num_ports = port_cnt; - } - -done: - - /* Remove the following from the error-checking if_list: - - bare device name - - device name suffixed with port number */ - if (NULL != mca_btl_ofud_component.if_list) { - for (i = 0; NULL != mca_btl_ofud_component.if_list[i]; ++i) { - - /* Look for raw device name */ - if (0 == strcmp(mca_btl_ofud_component.if_list[i], dev_name)) { - j = opal_argv_count(mca_btl_ofud_component.if_list); - opal_argv_delete(&j, &(mca_btl_ofud_component.if_list), - i, 1); - --i; - } - } - for (i = 1; i <= port_cnt; ++i) { - sprintf(name, "%s:%d", dev_name, i); - for (j = 0; NULL != mca_btl_ofud_component.if_list[j]; ++j) { - if (0 == strcmp(mca_btl_ofud_component.if_list[j], name)) { - k = opal_argv_count(mca_btl_ofud_component.if_list); - opal_argv_delete(&k, &(mca_btl_ofud_component.if_list), - j, 1); - --j; - break; - } - } - } - } - - free(name); - - return num_ports; -} - - -/* - * UD component initialization: - * (1) read interface list from kernel and compare against component parameters - * then create a BTL instance for selected interfaces - * (2) post OOB receive for incoming connection attempts - * (3) register BTL parameters with the MCA - */ - -mca_btl_base_module_t** mca_btl_ud_component_init(int* num_btl_modules, - bool enable_progress_threads, - bool enable_mpi_threads) -{ - struct ibv_device **ib_devs; - struct ibv_device* ib_dev; - int32_t num_devs; - mca_btl_base_module_t** btls; - uint32_t i, j, k; - uint32_t port_cnt; - opal_list_t btl_list; - mca_btl_ud_module_t* ud_btl; - mca_btl_base_selected_module_t* ib_selected; - opal_list_item_t* item; - unsigned short seedv[3]; - int* allowed_ports = NULL; - char* btl_str; - char* tok; - - /* Currently refuse to run if MPI_THREAD_MULTIPLE is enabled */ - if (ompi_mpi_thread_multiple && !mca_btl_base_thread_multiple_override) { - return NULL; - } - - /* First, check if the UD BTL was specifically selected. - If not, then short out right away. */ - mca_base_param_lookup_string( - mca_base_param_find("btl", NULL, NULL), &btl_str); - if(NULL == btl_str || '^' == btl_str[0]) { - /* No string at all, or an exclusion string, bail out */ - return NULL; - } - - /* Try to find a 'ofud' token */ - tok = strtok(btl_str, ","); - while(tok) { - if(!strcasecmp("ofud", tok)) { - break; - } - } - - if(NULL == tok) { - /* No valid 'ofud' token found; bail out */ - return NULL; - } - - /* initialization */ - *num_btl_modules = 0; - num_devs = 0; - - seedv[0] = ORTE_PROC_MY_NAME->vpid; - seedv[1] = opal_timer_base_get_cycles(); - seedv[2] = opal_timer_base_get_cycles(); - seed48(seedv); - - - /* Parse the include and exclude lists, checking for errors */ - mca_btl_ofud_component.if_include_list = - mca_btl_ofud_component.if_exclude_list = - mca_btl_ofud_component.if_list = NULL; - if (NULL != mca_btl_ofud_component.if_include && - NULL != mca_btl_ofud_component.if_exclude) { - orte_show_help("help-mpi-btl-openib.txt", - "specified include and exclude", true, - mca_btl_ofud_component.if_include, - mca_btl_ofud_component.if_exclude, NULL); - btls = NULL; - goto modex_send; - } else if (NULL != mca_btl_ofud_component.if_include) { - mca_btl_ofud_component.if_include_list = - opal_argv_split(mca_btl_ofud_component.if_include, ','); - mca_btl_ofud_component.if_list = - opal_argv_copy(mca_btl_ofud_component.if_include_list); - } else if (NULL != mca_btl_ofud_component.if_exclude) { - mca_btl_ofud_component.if_exclude_list = - opal_argv_split(mca_btl_ofud_component.if_exclude, ','); - mca_btl_ofud_component.if_list = - opal_argv_copy(mca_btl_ofud_component.if_exclude_list); - } - - - ib_devs = ibv_get_device_list(&num_devs); - - if(0 == num_devs) { - mca_btl_base_error_no_nics("OpenFabrics UD", "HCA"); - btls = NULL; - goto free_include_list; - } - - /** We must loop through all the hca id's, get their handles and - for each hca we query the number of ports on the hca and set up - a distinct btl module for each hca port */ - - OBJ_CONSTRUCT(&btl_list, opal_list_t); - - for(i = 0; (int32_t)i < num_devs && - mca_btl_ofud_component.num_btls < mca_btl_ofud_component.max_btls; - i++) { - struct ibv_device_attr ib_dev_attr; - struct ibv_context* ib_dev_context; - - ib_dev = ib_devs[i]; - - ib_dev_context = ibv_open_device(ib_dev); - if(!ib_dev_context) { - BTL_ERROR(("error obtaining device context for %s: %s\n", - ibv_get_device_name(ib_dev), strerror(errno))); - btls = NULL; - goto free_dev_list; - } - - if(ibv_query_device(ib_dev_context, &ib_dev_attr)){ - BTL_ERROR(("error obtaining device attributes for %s: %s\n", - ibv_get_device_name(ib_dev), strerror(errno))); - btls = NULL; - goto free_dev_list; - } - - - allowed_ports = (int*)malloc(ib_dev_attr.phys_port_cnt * sizeof(int)); - port_cnt = get_port_list(ib_dev, - ib_dev_attr.phys_port_cnt, allowed_ports); - - /* Note ports are 1 based, but j goes over the array of ports */ - for(j = 0; j < port_cnt; j++) { - struct ibv_port_attr ib_port_attr; - - k = allowed_ports[j]; - - if(ibv_query_port(ib_dev_context, (uint8_t)k, &ib_port_attr)) { - BTL_ERROR(("error getting port attributes for device %s port %d: %s", - ibv_get_device_name(ib_dev), k, strerror(errno))); - return NULL; - } - - if(IBV_PORT_ACTIVE == ib_port_attr.state) { - ud_btl = - (mca_btl_ud_module_t*)malloc(sizeof(mca_btl_ud_module_t)); - memcpy(ud_btl, &mca_btl_ofud_module, sizeof(mca_btl_ud_module_t)); - - ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); - ib_selected->btl_module = (mca_btl_base_module_t*)ud_btl; - - ud_btl->ib_dev = ib_dev; - ud_btl->ib_dev_context = ib_dev_context; - ud_btl->ib_port_num = (uint8_t)k; - ud_btl->addr.subnet = ib_port_attr.sm_lid; - ud_btl->addr.lid = ib_port_attr.lid; - - opal_list_append(&btl_list, (opal_list_item_t*) ib_selected); - if(++mca_btl_ofud_component.num_btls >= - mca_btl_ofud_component.max_btls) - break; - } - } - } - - free(allowed_ports); - - - /* Allocate space for btl modules */ - mca_btl_ofud_component.ud_btls = (mca_btl_ud_module_t*) - malloc(sizeof(mca_btl_ud_module_t) * mca_btl_ofud_component.num_btls); - if(NULL == mca_btl_ofud_component.ud_btls) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return NULL; - } - - btls = (struct mca_btl_base_module_t**) - malloc(mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_module_t*)); - if(NULL == btls) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return NULL; - } - - - for(i = 0; i < mca_btl_ofud_component.num_btls; i++){ - item = opal_list_remove_first(&btl_list); - ib_selected = (mca_btl_base_selected_module_t*)item; - ud_btl = (mca_btl_ud_module_t*)ib_selected->btl_module; - - memcpy(&(mca_btl_ofud_component.ud_btls[i]), - ud_btl, sizeof(mca_btl_ud_module_t)); - free(ib_selected); - free(ud_btl); - - ud_btl = &mca_btl_ofud_component.ud_btls[i]; - - /* Initialize module state */ - if(mca_btl_ud_module_init(ud_btl) != OMPI_SUCCESS) { - mca_btl_ofud_component.num_btls--; - i--; - - continue; - } - - btls[i] = &ud_btl->super; - } - - OBJ_DESTRUCT(&btl_list); - - /* Since not all modules may have initialized successfully, realloc - to free space from failed modules */ - mca_btl_ofud_component.ud_btls = (mca_btl_ud_module_t*) - realloc(mca_btl_ofud_component.ud_btls, - sizeof(mca_btl_ud_module_t) * mca_btl_ofud_component.num_btls); - btls = (struct mca_btl_base_module_t**)realloc(btls, - mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_module_t*)); - - *num_btl_modules = mca_btl_ofud_component.num_btls; - -free_dev_list: - ibv_free_device_list(ib_devs); - -free_include_list: - if (NULL != mca_btl_ofud_component.if_include_list) { - opal_argv_free(mca_btl_ofud_component.if_include_list); - mca_btl_ofud_component.if_include_list = NULL; - } - if (NULL != mca_btl_ofud_component.if_exclude_list) { - opal_argv_free(mca_btl_ofud_component.if_exclude_list); - mca_btl_ofud_component.if_exclude_list = NULL; - } - -modex_send: - mca_btl_ud_modex_send(); - return btls; -} - - -/* - * IB component progress. - */ - -#define MCA_BTL_UD_NUM_WC 500 - -int mca_btl_ud_component_progress(void) -{ - uint32_t i; - int count = 0, ne, j, btl_ownership; - mca_btl_ud_frag_t* frag; - struct ibv_recv_wr* bad_wr; - struct ibv_recv_wr* head_wr; - mca_btl_ud_module_t* ud_btl; - mca_btl_active_message_callback_t* reg; - struct ibv_wc* cwc; - struct ibv_wc wc[MCA_BTL_UD_NUM_WC]; - - /* Poll for completions */ - for(i = 0; i < mca_btl_ofud_component.num_btls; i++) { - ud_btl = &mca_btl_ofud_component.ud_btls[i]; - - ne = ibv_poll_cq(ud_btl->ib_cq, MCA_BTL_UD_NUM_WC, wc); - if(OPAL_UNLIKELY(ne < 0)) { - BTL_ERROR(("error polling CQ with %d: %s\n", - ne, strerror(errno))); - return OMPI_ERROR; - } - - head_wr = NULL; - - for(j = 0; j < ne; j++) { - cwc = &wc[j]; - if(OPAL_UNLIKELY(cwc->status != IBV_WC_SUCCESS)) { - BTL_ERROR(("error polling CQ with status %d for wr_id %" PRIx64 " opcode %d\n", - cwc->status, cwc->wr_id, cwc->opcode)); - return OMPI_ERROR; - } - - frag = (mca_btl_ud_frag_t*)(unsigned long)cwc->wr_id; - - /* Handle work completions */ - switch(frag->type) { - case MCA_BTL_UD_FRAG_SEND: - case MCA_BTL_UD_FRAG_USER: - { - assert(cwc->opcode == IBV_WC_SEND); - btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - frag->base.des_cbfunc(&ud_btl->super, - frag->endpoint, &frag->base, OMPI_SUCCESS); - if( btl_ownership ) { - mca_btl_ud_free( &ud_btl->super, &frag->base ); - } - - /* Increment send counter, post if any sends are queued */ - OPAL_THREAD_ADD32(&ud_btl->sd_wqe, 1); - if(OPAL_UNLIKELY( - !opal_list_is_empty(&ud_btl->pending_frags))) { - OPAL_THREAD_LOCK(&ud_btl->ud_lock); - frag = (mca_btl_ud_frag_t*) - opal_list_remove_first(&ud_btl->pending_frags); - OPAL_THREAD_UNLOCK(&ud_btl->ud_lock); - - if(OPAL_LIKELY(NULL != frag)) { - mca_btl_ud_endpoint_post_send(ud_btl, frag); - } - } - - continue; - } - case MCA_BTL_UD_FRAG_RECV: - assert(cwc->opcode == IBV_WC_RECV); - reg = mca_btl_base_active_message_trigger + frag->hdr->tag; - - frag->segment.seg_addr.pval = frag->hdr + 1; - frag->segment.seg_len = cwc->byte_len - - sizeof(mca_btl_ud_header_t) - - sizeof(mca_btl_ud_ib_header_t); - - reg->cbfunc(&ud_btl->super, - frag->hdr->tag, &frag->base, reg->cbdata); - - /* Add recv to linked list for reposting */ - frag->wr_desc.rd_desc.next = head_wr; - head_wr = &frag->wr_desc.rd_desc; - continue; - default: - BTL_ERROR(("Unhandled completion opcode %d frag type %d", - cwc->opcode, frag->type)); - break; - } - } - - count += ne; - - /* Repost any recv buffers all at once */ - if(OPAL_LIKELY(head_wr)) { - if(OPAL_UNLIKELY(ibv_post_recv( - ud_btl->ib_qp[0], head_wr, &bad_wr))) { - BTL_ERROR(("error posting recv: %s\n", strerror(errno))); - return OMPI_ERROR; - } - - head_wr = NULL; - } - } - - return count; -} - diff --git a/ompi/mca/btl/ofud/btl_ofud_endpoint.c b/ompi/mca/btl/ofud/btl_ofud_endpoint.c deleted file mode 100644 index 833281e28f..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud_endpoint.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * Copyright (c) 2007 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include -#include - -#include "ompi_config.h" -#include "opal/prefetch.h" -#include "ompi/types.h" - -#include "btl_ofud.h" -#include "btl_ofud_endpoint.h" -#include "btl_ofud_frag.h" - - -static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint); -static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); - - -/* First, we check the downcounter on the endpoint. - If it is 0, we queue this frag on the endpoint. - Otherwise, we check the BTL downcounter. - If it is 0, we queue this frag on the BTL. - Otherwise, we post the send. */ -#define CHECK_FRAG_QUEUES(sd_wqe, lock, queue, frag) \ -do { \ - if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&(sd_wqe), -1) < 0)) { \ - OPAL_THREAD_ADD32(&(sd_wqe), 1); \ - OPAL_THREAD_LOCK(&(lock)); \ - opal_list_append(&(queue), \ - (opal_list_item_t*)(frag)); \ - OPAL_THREAD_UNLOCK(&(lock)); \ - return OMPI_SUCCESS; \ - } \ -} while(0); - - -/* - * Post a send to the work queue - */ - -int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl, - mca_btl_ud_frag_t* frag) -{ - struct ibv_qp* ib_qp; - struct ibv_send_wr* bad_wr; - struct ibv_send_wr* wr = &frag->wr_desc.sr_desc; - mca_btl_ud_endpoint_t* endpoint = frag->endpoint; - int ret; - - /* Have to be careful here - UD adds a 40 byte header, but it is not - included on the sending side. */ - frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t); - wr->send_flags = IBV_SEND_SIGNALED; - - CHECK_FRAG_QUEUES(ud_btl->sd_wqe, - ud_btl->ud_lock, ud_btl->pending_frags, frag); - - /* We avoid locking here by allowing our stripe counter to count - until it wraps around uint32_t. This keeps the mod operation - out of the critical section, allowing us to use OPAL_THREAD_ADD32 - instead of a full mutex. */ - ib_qp = ud_btl->ib_qp[ud_btl->ib_qp_next % MCA_BTL_UD_NUM_QP]; - OPAL_THREAD_ADD32(((int32_t*)&ud_btl->ib_qp_next), 1); - - wr->wr.ud.ah = endpoint->rmt_ah; - wr->wr.ud.remote_qpn = endpoint->rem_addr.qp_num; - - if(frag->sg_entry.length <= ud_btl->ib_inline_max) { - wr->send_flags = - IBV_SEND_SIGNALED|IBV_SEND_INLINE; - } - - /*frag->hdr->src_qpnum = ud_btl->addr.qp_num;*/ - - MCA_BTL_UD_START_TIME(ibv_post_send); - if(OPAL_UNLIKELY((ret = ibv_post_send(ib_qp, wr, &bad_wr)))) { -#if 0 - opal_output(0, "ep->sd_wqe %d btl->sd_wqe %d len %d ib_qp_next %d", - endpoint->sd_wqe, ud_btl->sd_wqe, - frag->sg_entry.length, ud_btl->ib_qp_next); -#endif - BTL_ERROR(("error posting send request: %d %s\n", ret, strerror(ret))); - - } - MCA_BTL_UD_END_TIME(ibv_post_send); - - return OMPI_SUCCESS; -} - - -OBJ_CLASS_INSTANCE(mca_btl_ud_endpoint_t, - opal_list_item_t, mca_btl_ud_endpoint_construct, - mca_btl_ud_endpoint_destruct); - -/* - * Construct/destruct an endpoint structure. - */ - -static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint) -{ -#if OPAL_ENABLE_DEBUG - memset(&endpoint->rem_addr, 0, sizeof(struct mca_btl_ud_addr_t)); -#endif -} - -static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) -{ -} - diff --git a/ompi/mca/btl/ofud/btl_ofud_endpoint.h b/ompi/mca/btl/ofud/btl_ofud_endpoint.h deleted file mode 100644 index af970ab313..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud_endpoint.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_IB_ENDPOINT_H -#define MCA_BTL_IB_ENDPOINT_H - -#include - -#include "opal/class/opal_list.h" -#include "opal/mca/event/event.h" - -#include "btl_ofud.h" -#include "btl_ofud_frag.h" - -BEGIN_C_DECLS - -struct mca_btl_ud_addr_t { - uint32_t qp_num; - uint32_t psn; - uint16_t lid; - uint16_t subnet; -}; -typedef struct mca_btl_ud_addr_t mca_btl_ud_addr_t; - - -/** - * An abstraction that represents a connection to a endpoint process. - * An instance of mca_btl_base_endpoint_t is associated w/ each process - * and BTL pair and address information is exchanged at startup. - * The UD BTL is connectionless, so no connection is ever established. - */ - -struct mca_btl_base_endpoint_t { - opal_list_item_t super; - - mca_btl_ud_addr_t rem_addr; - /**< Remote address information */ - /* No lock needed, read-only past initialization */ - - struct ibv_ah* rmt_ah; - /**< Remote address handle */ - /* No lock needed, verbs are thread-safe */ -}; - -typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; -typedef mca_btl_base_endpoint_t mca_btl_ud_endpoint_t; -OBJ_CLASS_DECLARATION(mca_btl_ud_endpoint_t); - -int mca_btl_ud_endpoint_post_send(struct mca_btl_ud_module_t* ud_btl, - struct mca_btl_ud_frag_t * frag); - -END_C_DECLS -#endif diff --git a/ompi/mca/btl/ofud/btl_ofud_frag.c b/ompi/mca/btl/ofud/btl_ofud_frag.c deleted file mode 100644 index 5a3a039708..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud_frag.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "btl_ofud.h" -#include "btl_ofud_frag.h" - - -static inline void mca_btl_ud_frag_common_constructor(mca_btl_ud_frag_t* frag) -{ - frag->ud_reg = (mca_btl_ud_reg_t*)frag->base.super.registration; - frag->sg_entry.lkey = frag->ud_reg->mr->lkey; - frag->base.des_flags = 0; - frag->base.order = MCA_BTL_NO_ORDER; -} - - -static void mca_btl_ud_send_frag_constructor(mca_btl_ud_frag_t* frag) -{ - frag->type = MCA_BTL_UD_FRAG_SEND; - mca_btl_ud_frag_common_constructor(frag); - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - - /* We do not include the mca_btl_ud_ib_header_t data when sending */ - frag->hdr = frag->base.super.ptr; - frag->segment.seg_addr.pval = frag->hdr + 1; - - frag->sg_entry.addr = (unsigned long)frag->hdr; - - frag->wr_desc.sr_desc.wr_id = (unsigned long)frag; - frag->wr_desc.sr_desc.sg_list = &frag->sg_entry; - frag->wr_desc.sr_desc.num_sge = 1; - frag->wr_desc.sr_desc.opcode = IBV_WR_SEND; - frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; - frag->wr_desc.sr_desc.next = NULL; - frag->wr_desc.sr_desc.wr.ud.remote_qkey = mca_btl_ofud_component.ib_qkey; -} - - -static void mca_btl_ud_user_frag_constructor(mca_btl_ud_frag_t* frag) -{ - mca_btl_ud_send_frag_constructor(frag); - frag->type = MCA_BTL_UD_FRAG_USER; -} - - -static void mca_btl_ud_recv_frag_constructor(mca_btl_ud_frag_t* frag) -{ - frag->type = MCA_BTL_UD_FRAG_RECV; - mca_btl_ud_frag_common_constructor(frag); - frag->base.des_dst = &frag->segment; - frag->base.des_dst_cnt = 1; - frag->base.des_src = NULL; - frag->base.des_src_cnt = 0; - - /* Receive frag headers start 40 bytes later */ - frag->hdr = (mca_btl_ud_header_t*)((uintptr_t)frag->base.super.ptr + - sizeof(mca_btl_ud_ib_header_t)); - frag->segment.seg_addr.pval = frag->hdr + 1; - - frag->sg_entry.addr = (uintptr_t)frag->base.super.ptr; - frag->segment.seg_len = mca_btl_ofud_module.super.btl_eager_limit; - frag->sg_entry.length = mca_btl_ofud_module.super.btl_eager_limit + - sizeof(mca_btl_ud_ib_header_t) + sizeof(mca_btl_ud_header_t); - - frag->wr_desc.rd_desc.wr_id = (unsigned long)frag; - frag->wr_desc.rd_desc.sg_list = &frag->sg_entry; - frag->wr_desc.rd_desc.num_sge = 1; - frag->wr_desc.rd_desc.next = NULL; -} - - -OBJ_CLASS_INSTANCE(mca_btl_ud_frag_t, - mca_btl_base_descriptor_t, - NULL, - NULL); - -OBJ_CLASS_INSTANCE(mca_btl_ud_send_frag_t, - mca_btl_base_descriptor_t, - mca_btl_ud_send_frag_constructor, - NULL); - -OBJ_CLASS_INSTANCE(mca_btl_ud_user_frag_t, - mca_btl_base_descriptor_t, - mca_btl_ud_user_frag_constructor, - NULL); - -OBJ_CLASS_INSTANCE(mca_btl_ud_recv_frag_t, - mca_btl_base_descriptor_t, - mca_btl_ud_recv_frag_constructor, - NULL); - diff --git a/ompi/mca/btl/ofud/btl_ofud_frag.h b/ompi/mca/btl/ofud/btl_ofud_frag.h deleted file mode 100644 index e6f0bdff9f..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud_frag.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_UD_FRAG_H -#define MCA_BTL_UD_FRAG_H - -#define MCA_BTL_IB_FRAG_ALIGN (8) - -#include - -#include "btl_ofud.h" - - -BEGIN_C_DECLS - - -/** - * Fragment types - */ -typedef enum { - MCA_BTL_UD_FRAG_SEND, - MCA_BTL_UD_FRAG_USER, - MCA_BTL_UD_FRAG_RECV -} mca_btl_ud_frag_type_t; - - -struct mca_btl_ud_reg_t { - mca_mpool_base_registration_t base; - struct ibv_mr* mr; -}; -typedef struct mca_btl_ud_reg_t mca_btl_ud_reg_t; - - -/* UD adds a 40 byte global routing header */ -/* This works in strange ways - the sending side does not need to explicitly - include this data in sg lists. Then, on the receiving side, the extra 40 - bytes magically appear. */ -struct mca_btl_ud_ib_header_t { - uint8_t ib_grh[40]; -}; -typedef struct mca_btl_ud_ib_header_t mca_btl_ud_ib_header_t; - -struct mca_btl_ud_header_t { - /*uint32_t src_qpnum;*/ - mca_btl_base_tag_t tag; -}; -typedef struct mca_btl_ud_header_t mca_btl_ud_header_t; - - -/** - * IB send fragment derived type. - */ - -struct mca_btl_ud_frag_t { - mca_btl_base_descriptor_t base; - mca_btl_base_segment_t segment; - - struct mca_btl_base_endpoint_t* endpoint; - - mca_btl_ud_frag_type_t type; - - union{ - struct ibv_recv_wr rd_desc; - struct ibv_send_wr sr_desc; - } wr_desc; - struct ibv_sge sg_entry; - - /* When this is a send frag, hdr points right after this, as expected. - But when this is a receive frag, we have an extra 40 bytes provided - by IB, so this points 40 bytes past the end of the frag. */ - mca_btl_ud_header_t* hdr; - - mca_btl_ud_reg_t* ud_reg; -}; -typedef struct mca_btl_ud_frag_t mca_btl_ud_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_ud_frag_t); - -typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_t); - -typedef struct mca_btl_ud_frag_t mca_btl_ud_user_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_ud_user_frag_t); - -typedef struct mca_btl_ud_frag_t mca_btl_ud_recv_frag_t; -OBJ_CLASS_DECLARATION(mca_btl_ud_recv_frag_t); - - -/* - * Allocate/return a UD/IB send/user fragment - */ - -#define MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc) \ -{ \ - ompi_free_list_item_t *item; \ - OMPI_FREE_LIST_GET(&((mca_btl_ud_module_t*)btl)->send_frags, item, rc); \ - frag = (mca_btl_ud_frag_t*) item; \ -} - -#define MCA_BTL_UD_RETURN_FRAG(btl, frag) \ -{ \ - OMPI_FREE_LIST_RETURN( \ - &((mca_btl_ud_module_t*)btl)->send_frags, \ - (ompi_free_list_item_t*)(frag)); \ -} - - -#define MCA_BTL_UD_ALLOC_USER_FRAG(btl, frag, rc) \ -{ \ - ompi_free_list_item_t *item; \ - OMPI_FREE_LIST_GET(&((mca_btl_ud_module_t*)btl)->user_frags, item, rc); \ - frag = (mca_btl_ud_frag_t*) item; \ -} - -#define MCA_BTL_UD_RETURN_USER_FRAG(btl, frag) \ -{ \ - OMPI_FREE_LIST_RETURN( \ - &((mca_btl_ud_module_t*)btl)->user_frags, \ - (ompi_free_list_item_t*)(frag)); \ -} - - -struct mca_btl_ud_module_t; - -END_C_DECLS -#endif diff --git a/ompi/mca/btl/ofud/btl_ofud_proc.c b/ompi/mca/btl/ofud/btl_ofud_proc.c deleted file mode 100644 index 1ad3779735..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud_proc.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "ompi_config.h" - -#include "ompi/runtime/ompi_module_exchange.h" - -#include "btl_ofud.h" -#include "btl_ofud_proc.h" - - -static void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc); -static void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc); - -OBJ_CLASS_INSTANCE(mca_btl_ud_proc_t, - opal_list_item_t, mca_btl_ud_proc_construct, - mca_btl_ud_proc_destruct); - -void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* ud_proc) -{ - ud_proc->proc_ompi = 0; - ud_proc->proc_addr_count = 0; - ud_proc->proc_endpoints = 0; - ud_proc->proc_endpoint_count = 0; - OBJ_CONSTRUCT(&ud_proc->proc_lock, opal_mutex_t); - - /* add to list of all proc instance */ - OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock); - opal_list_append(&mca_btl_ofud_component.ud_procs, &ud_proc->super); - OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock); -} - -void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* ud_proc) -{ - /* remove from list of all proc instances */ - OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock); - opal_list_remove_item(&mca_btl_ofud_component.ud_procs, &ud_proc->super); - OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock); - - /* release resources */ - if(NULL != ud_proc->proc_endpoints) { - free(ud_proc->proc_endpoints); - } - OBJ_DESTRUCT(&ud_proc->proc_lock); -} - - -/* - * Look for an existing IB process instance based on the associated - * ompi_proc_t instance. - */ - -mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc) -{ - mca_btl_ud_proc_t* ib_proc; - - OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock); - - for(ib_proc = (mca_btl_ud_proc_t*) - opal_list_get_first(&mca_btl_ofud_component.ud_procs); - ib_proc != (mca_btl_ud_proc_t*) - opal_list_get_end(&mca_btl_ofud_component.ud_procs); - ib_proc = (mca_btl_ud_proc_t*)opal_list_get_next(ib_proc)) { - if(ib_proc->proc_ompi == ompi_proc) { - OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock); - return ib_proc; - } - } - OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock); - return NULL; -} - - -/* - * Create a IB process structure. There is a one-to-one correspondence - * between a ompi_proc_t and a mca_btl_ud_proc_t instance. We cache - * additional data (specifically the list of mca_btl_ud_endpoint_t instances, - * and published addresses) associated w/ a given destination on this - * datastructure. - */ - -mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc) -{ - mca_btl_ud_proc_t* module_proc = NULL; - size_t size; - int rc; - - /* Check if we have already created a IB proc - * structure for this ompi process */ - module_proc = mca_btl_ud_proc_lookup_ompi(ompi_proc); - - if(module_proc != NULL) { - /* Gotcha! */ - return module_proc; - } - - /* Oops! First time, gotta create a new IB proc out of the ompi_proc ... */ - module_proc = OBJ_NEW(mca_btl_ud_proc_t); - /* Initialize number of peer */ - module_proc->proc_endpoint_count = 0; - module_proc->proc_ompi = ompi_proc; - - /* query for the peer address info */ - rc = ompi_modex_recv(&mca_btl_ofud_component.super.btl_version, - ompi_proc, (void*)&module_proc->proc_addrs, - &size); - - if(OMPI_SUCCESS != rc) { - opal_output(0, - "[%s:%d] ompi_modex_recv failed for peer %s", - __FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name)); - OBJ_RELEASE(module_proc); - return NULL; - } - - if((size % sizeof(mca_btl_ud_addr_t)) != 0) { - opal_output(0, "[%s:%d] invalid module address for peer %s", - __FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name)); - OBJ_RELEASE(module_proc); - return NULL; - } - - - module_proc->proc_addr_count = size / sizeof(mca_btl_ud_addr_t); - - - if (0 == module_proc->proc_addr_count) { - module_proc->proc_endpoints = NULL; - } else { - module_proc->proc_endpoints = (mca_btl_base_endpoint_t**) - malloc(module_proc->proc_addr_count * - sizeof(mca_btl_base_endpoint_t*)); - } - - if(NULL == module_proc->proc_endpoints) { - OBJ_RELEASE(module_proc); - return NULL; - } - return module_proc; -} - - -/* - * Insert an endpoint into the proc array and assign it an address. - * - * MUST be called with the proc lock held! - */ - -int mca_btl_ud_proc_insert(mca_btl_ud_proc_t* module_proc, - mca_btl_base_endpoint_t* module_endpoint) -{ - module_endpoint->rem_addr = - module_proc->proc_addrs[module_proc->proc_endpoint_count]; - module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = - module_endpoint; - return OMPI_SUCCESS; -} - - -/* - * Remove an endpoint from the proc array. - */ - -int mca_btl_ud_proc_remove(mca_btl_ud_proc_t* proc, - mca_btl_base_endpoint_t* endpoint) -{ - size_t i; - - OPAL_THREAD_LOCK(&proc->proc_lock); - for(i = 0; i < proc->proc_endpoint_count; i++) { - if(proc->proc_endpoints[i] == endpoint) { - memmove(proc->proc_endpoints + i, proc->proc_endpoints + i + 1, - (proc->proc_endpoint_count -i - 1) * - sizeof(mca_btl_base_endpoint_t*)); - if(--proc->proc_endpoint_count == 0) { - OPAL_THREAD_UNLOCK(&proc->proc_lock); - OBJ_RELEASE(proc); - return OMPI_SUCCESS; - } - - break; - } - } - - OPAL_THREAD_UNLOCK(&proc->proc_lock); - return OMPI_SUCCESS; -} - diff --git a/ompi/mca/btl/ofud/btl_ofud_proc.h b/ompi/mca/btl/ofud/btl_ofud_proc.h deleted file mode 100644 index 71f2acd595..0000000000 --- a/ompi/mca/btl/ofud/btl_ofud_proc.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Sandia National Laboratories. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef MCA_BTL_UD_PROC_H -#define MCA_BTL_UD_PROC_H - -#include "opal/class/opal_object.h" -#include "ompi/proc/proc.h" - -#include "btl_ofud.h" -#include "btl_ofud_endpoint.h" - -BEGIN_C_DECLS - -/** - * Represents the state of a remote process and the set of addresses - * that it exports. Also cache an instance of mca_btl_base_endpoint_t for - * each BTL instance that attempts to open a connection to the process. - */ - -struct mca_btl_ud_proc_t { - opal_list_item_t super; - /**< allow proc to be placed on a list */ - - ompi_proc_t *proc_ompi; - /**< pointer to corresponding ompi_proc_t */ - - struct mca_btl_ud_addr_t* proc_addrs; - size_t proc_addr_count; - /**< number of addresses published by endpoint */ - - struct mca_btl_base_endpoint_t **proc_endpoints; - /**< array of endpoints that have been created to access this proc */ - - size_t proc_endpoint_count; - /**< number of endpoints */ - - opal_mutex_t proc_lock; - /**< lock to protect against concurrent access to proc state */ -}; -typedef struct mca_btl_ud_proc_t mca_btl_ud_proc_t; -OBJ_CLASS_DECLARATION(mca_btl_ud_proc_t); - - -mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc); - -mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc); - -int mca_btl_ud_proc_insert(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*); - -int mca_btl_ud_proc_remove(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*); - -END_C_DECLS -#endif diff --git a/ompi/mca/btl/ofud/configure.m4 b/ompi/mca/btl/ofud/configure.m4 deleted file mode 100644 index 5765f1e7a6..0000000000 --- a/ompi/mca/btl/ofud/configure.m4 +++ /dev/null @@ -1,46 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2006 Sandia National Laboratories. All rights -# reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - - -# MCA_btl_ofud_CONFIG([action-if-can-compile], -# [action-if-cant-compile]) -# ------------------------------------------------ -AC_DEFUN([MCA_ompi_btl_ofud_CONFIG],[ - AC_CONFIG_FILES([ompi/mca/btl/ofud/Makefile]) - - OMPI_CHECK_OPENFABRICS([btl_ofud], - [btl_ofud_happy="yes"], - [btl_ofud_happy="no"]) - - AS_IF([test "$btl_ofud_happy" = "yes"], - [btl_ofud_WRAPPER_EXTRA_LDFLAGS="$btl_ofud_LDFLAGS" - btl_ofud_WRAPPER_EXTRA_LIBS="$btl_ofud_LIBS" - $1], - [$2]) - - - # substitute in the things needed to build OFUD - AC_SUBST([btl_ofud_CFLAGS]) - AC_SUBST([btl_ofud_CPPFLAGS]) - AC_SUBST([btl_ofud_LDFLAGS]) - AC_SUBST([btl_ofud_LIBS]) -])dnl