Bring the UD BTL over to the trunk, named 'ofud'.
This commit was SVN r15298.
Этот коммит содержится в:
родитель
6e8d25fdaf
Коммит
77038b65a8
68
ompi/mca/btl/ofud/Makefile.am
Обычный файл
68
ompi/mca/btl/ofud/Makefile.am
Обычный файл
@ -0,0 +1,68 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
|
||||
|
||||
AM_CPPFLAGS = $(btl_ofud_CPPFLAGS)

# Every source and header file that belongs to the ofud component.
sources = \
    btl_ofud.c \
    btl_ofud.h \
    btl_ofud_component.c \
    btl_ofud_endpoint.c \
    btl_ofud_endpoint.h \
    btl_ofud_frag.c \
    btl_ofud_frag.h \
    btl_ofud_proc.c \
    btl_ofud_proc.h

# Build the output library in this directory.  A DSO build produces
# mca_<type>_<name>.la; a static build produces libmca_<type>_<name>.la.
# Exactly one of the two variable sets below is non-empty.

if OMPI_BUILD_btl_ofud_DSO
component         = mca_btl_ofud.la
component_sources = $(sources)
lib               =
lib_sources       =
else
component         =
component_sources =
lib               = libmca_btl_ofud.la
lib_sources       = $(sources)
endif

# DSO flavour: installed into $(pkglibdir).
mcacomponentdir          = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_btl_ofud_la_SOURCES  = $(component_sources)
mca_btl_ofud_la_LDFLAGS  = -module -avoid-version $(btl_ofud_LDFLAGS)
mca_btl_ofud_la_LIBADD   = \
    $(btl_ofud_LIBS) \
    $(top_ompi_builddir)/ompi/libmpi.la \
    $(top_ompi_builddir)/orte/libopen-rte.la \
    $(top_ompi_builddir)/opal/libopen-pal.la

# Static flavour: convenience library, not installed.
noinst_LTLIBRARIES         = $(lib)
libmca_btl_ofud_la_SOURCES = $(lib_sources)
libmca_btl_ofud_la_LDFLAGS = -module -avoid-version $(btl_ofud_LDFLAGS)
libmca_btl_ofud_la_LIBADD  = $(btl_ofud_LIBS)
|
691
ompi/mca/btl/ofud/btl_ofud.c
Обычный файл
691
ompi/mca/btl/ofud/btl_ofud.c
Обычный файл
@ -0,0 +1,691 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/prefetch.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "ompi/datatype/convertor.h"
|
||||
#include "ompi/datatype/datatype.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
||||
|
||||
#include "btl_ofud.h"
|
||||
#include "btl_ofud_frag.h"
|
||||
#include "btl_ofud_proc.h"
|
||||
#include "btl_ofud_endpoint.h"
|
||||
|
||||
|
||||
mca_btl_ud_module_t mca_btl_ofud_module = {
|
||||
{
|
||||
&mca_btl_ofud_component.super,
|
||||
0, /* eager_limit */
|
||||
0, /* min_send_size */
|
||||
0, /* max_send_size */
|
||||
0, /* rdma_pipeline_send_length */
|
||||
0, /* rdma_pipeline_frag_size */
|
||||
0, /* min_rdma_pipeline_size */
|
||||
0, /* exclusivity */
|
||||
0, /* latency */
|
||||
0, /* bandwidth */
|
||||
MCA_BTL_FLAGS_SEND,
|
||||
mca_btl_ud_add_procs,
|
||||
mca_btl_ud_del_procs,
|
||||
mca_btl_ud_register,
|
||||
mca_btl_ud_finalize,
|
||||
mca_btl_ud_alloc,
|
||||
mca_btl_ud_free,
|
||||
mca_btl_ud_prepare_src,
|
||||
NULL, /*mca_btl_ud_prepare_dst */
|
||||
mca_btl_ud_send,
|
||||
NULL, /*mca_btl_ud_put */
|
||||
NULL, /*mca_btl_ud_get */
|
||||
mca_btl_base_dump,
|
||||
NULL, /* mpool */
|
||||
NULL, /* register error */
|
||||
mca_btl_udapl_ft_event
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Add procs to this BTL module, receiving endpoint information from the modex.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **ompi_procs,
|
||||
struct mca_btl_base_endpoint_t** peers,
|
||||
ompi_bitmap_t* reachable)
|
||||
{
|
||||
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
||||
struct ibv_ah_attr ah_attr;
|
||||
int i, rc;
|
||||
|
||||
/* Set up the endpoint lookup table if it hasn't been already */
|
||||
/* We do this here so we can initialize the table to a reasonable size
|
||||
based on nprocs */
|
||||
#if 0
|
||||
if(NULL == ud_btl->ep_lookup) {
|
||||
ud_btl->ep_lookup = malloc(sizeof(opal_hash_table_t));
|
||||
OBJ_CONSTRUCT(ud_btl->ep_lookup, opal_hash_table_t);
|
||||
opal_hash_table_init(ud_btl->ep_lookup, nprocs);
|
||||
}
|
||||
#endif
|
||||
|
||||
for(i = 0; i < (int)nprocs; i++) {
|
||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||
mca_btl_ud_proc_t* ib_proc;
|
||||
mca_btl_base_endpoint_t* ib_peer;
|
||||
|
||||
if(NULL == (ib_proc = mca_btl_ud_proc_create(ompi_proc))) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
|
||||
/* The btl_proc datastructure is shared by all IB BTL instances that are
|
||||
* trying to reach this destination. Cache the peer instance on the
|
||||
* btl_proc.
|
||||
*/
|
||||
ib_peer = OBJ_NEW(mca_btl_ud_endpoint_t);
|
||||
if(NULL == ib_peer) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
|
||||
rc = mca_btl_ud_proc_insert(ib_proc, ib_peer);
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
OBJ_RELEASE(ib_peer);
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("modex_recv QP num %d, LID = %d",
|
||||
ib_peer->rem_addr.qp_num, ib_peer->rem_addr.lid));
|
||||
|
||||
/* Set up IB address handles for the endpoint */
|
||||
ah_attr.is_global = 0;
|
||||
ah_attr.dlid = ib_peer->rem_addr.lid;
|
||||
ah_attr.sl = mca_btl_ofud_component.ib_service_level;
|
||||
ah_attr.src_path_bits = mca_btl_ofud_component.ib_src_path_bits;
|
||||
ah_attr.port_num = ud_btl->ib_port_num;
|
||||
|
||||
ib_peer->rmt_ah = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
|
||||
if(NULL == ib_peer->rmt_ah) {
|
||||
BTL_ERROR(("error creating address handle: %s\n", strerror(errno)));
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Insert a pointer to the endpoint in the BTL lookup table */
|
||||
#if 0
|
||||
opal_hash_table_set_value_uint64(ud_btl->ep_lookup,
|
||||
((uint64_t)ib_peer->rem_addr.lid << 32) |
|
||||
ib_peer->rem_addr.qp_num,
|
||||
ib_peer);
|
||||
#endif
|
||||
|
||||
ompi_bitmap_set_bit(reachable, i);
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
peers[i] = ib_peer;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Delete the proc as reachable from this btl module
|
||||
*/
|
||||
|
||||
int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t** procs,
|
||||
struct mca_btl_base_endpoint_t** peers)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for(i = 0; i < nprocs; i++) {
|
||||
mca_btl_ud_endpoint_t* endpoint = (mca_btl_ud_endpoint_t*)peers[i];
|
||||
mca_btl_ud_proc_t* proc = mca_btl_ud_proc_lookup_ompi(procs[i]);
|
||||
#if 0
|
||||
opal_hash_table_remove_value_uint64(ud_btl->ep_lookup,
|
||||
((uint64_t)endpoint->rem_addr.lid << 32) |
|
||||
endpoint->rem_addr.qp_num);
|
||||
#endif
|
||||
if(NULL != proc) {
|
||||
mca_btl_ud_proc_remove(proc, endpoint);
|
||||
}
|
||||
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Register callback function to support send/recv semantics
|
||||
*/
|
||||
|
||||
int mca_btl_ud_register(struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
||||
void* cbdata)
|
||||
{
|
||||
|
||||
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
||||
|
||||
OPAL_THREAD_LOCK(&ud_btl->ud_lock);
|
||||
ud_btl->ib_reg[tag].cbfunc = cbfunc;
|
||||
ud_btl->ib_reg[tag].cbdata = cbdata;
|
||||
OPAL_THREAD_UNLOCK(&ud_btl->ud_lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Allocate a segment.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param size (IN) Request segment size.
|
||||
*
|
||||
* When allocating a segment we pull a pre-alllocated segment
|
||||
* from one of two free lists, an eager list and a max list
|
||||
*/
|
||||
|
||||
mca_btl_base_descriptor_t* mca_btl_ud_alloc(struct mca_btl_base_module_t* btl,
|
||||
uint8_t order,
|
||||
size_t size)
|
||||
{
|
||||
mca_btl_ud_frag_t* frag = NULL;
|
||||
int rc;
|
||||
|
||||
if(OPAL_LIKELY(size <= mca_btl_ofud_module.super.btl_eager_limit)) {
|
||||
MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc);
|
||||
}
|
||||
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
frag->segment.seg_len = size;
|
||||
return (mca_btl_base_descriptor_t*)frag;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a segment
|
||||
*
|
||||
* Return the segment to the appropriate
|
||||
* preallocated segment list
|
||||
*/
|
||||
|
||||
int mca_btl_ud_free(struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)des;
|
||||
|
||||
if(OPAL_LIKELY(frag->type == MCA_BTL_UD_FRAG_SEND)) {
|
||||
MCA_BTL_UD_RETURN_FRAG(btl, frag);
|
||||
} else if(frag->type == MCA_BTL_UD_FRAG_USER && frag->ud_reg != NULL) {
|
||||
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
|
||||
(mca_mpool_base_registration_t*)frag->ud_reg);
|
||||
MCA_BTL_UD_RETURN_USER_FRAG(btl, frag);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* register user buffer or pack
|
||||
* data into pre-registered buffer and return a
|
||||
* descriptor that can be
|
||||
* used for send/put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*
|
||||
* prepare source's behavior depends on the following:
|
||||
* Has a valid memory registration been passed to prepare_src?
|
||||
* if so we attempt to use the pre-registred user-buffer, if the memory
|
||||
* registration is to small (only a portion of the user buffer) then we must
|
||||
* reregister the user buffer
|
||||
* Has the user requested the memory to be left pinned?
|
||||
* if so we insert the memory registration into a memory tree for later
|
||||
* lookup, we may also remove a previous registration if a MRU (most recently
|
||||
* used) list of registions is full, this prevents resources from being
|
||||
* exhausted.
|
||||
* Is the requested size larger than the btl's max send size?
|
||||
* if so and we aren't asked to leave the registration pinned then we
|
||||
* register the memory if the user's buffer is contiguous.
|
||||
* Otherwise we choose from two free lists of pre-registered memory in which
|
||||
* to pack the data into.
|
||||
*
|
||||
*/
|
||||
|
||||
mca_btl_base_descriptor_t* mca_btl_ud_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct ompi_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size)
|
||||
{
|
||||
mca_btl_ud_frag_t* frag = NULL;
|
||||
struct iovec iov;
|
||||
uint32_t iov_count = 1;
|
||||
size_t max_data = *size;
|
||||
int rc;
|
||||
|
||||
if(ompi_convertor_need_buffers(convertor) == 0 && reserve == 0 &&
|
||||
(registration != NULL || max_data > btl->btl_max_send_size)) {
|
||||
/* The user buffer is contigous and we are asked to send more than
|
||||
the max send size. */
|
||||
|
||||
MCA_BTL_UD_ALLOC_USER_FRAG(btl, frag, rc);
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = NULL;
|
||||
|
||||
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
||||
|
||||
frag->segment.seg_len = max_data;
|
||||
frag->segment.seg_addr.pval = iov.iov_base;
|
||||
frag->base.des_flags = 0;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
|
||||
if(NULL == registration) {
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, iov.iov_base,
|
||||
max_data, 0, ®istration);
|
||||
if(OMPI_SUCCESS != rc || NULL == registration) {
|
||||
MCA_BTL_UD_RETURN_USER_FRAG(btl, frag);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->ud_reg = (mca_btl_ud_reg_t*)registration;
|
||||
|
||||
frag->sg_entry.lkey = frag->ud_reg->mr->lkey;
|
||||
frag->sg_entry.addr = (unsigned long)iov.iov_base;
|
||||
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
*size = max_data;
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
if(max_data + reserve > btl->btl_eager_limit) {
|
||||
max_data = btl->btl_eager_limit - reserve;
|
||||
}
|
||||
|
||||
MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc);
|
||||
if(OPAL_UNLIKELY(NULL == frag)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
|
||||
|
||||
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
||||
if(OPAL_UNLIKELY(rc < 0)) {
|
||||
MCA_BTL_UD_RETURN_FRAG(btl, frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segment.seg_len = max_data + reserve;
|
||||
frag->sg_entry.length =
|
||||
max_data + reserve + sizeof(mca_btl_ud_header_t);
|
||||
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
frag->base.des_flags = 0;
|
||||
frag->base.order = MCA_BTL_NO_ORDER;
|
||||
*size = max_data;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
|
||||
|
||||
int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl)
|
||||
{
|
||||
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
||||
int32_t i;
|
||||
|
||||
for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) {
|
||||
ibv_destroy_qp(ud_btl->ib_qp[i]);
|
||||
}
|
||||
|
||||
ibv_dealloc_pd(ud_btl->ib_pd);
|
||||
|
||||
OBJ_DESTRUCT(&ud_btl->ud_lock);
|
||||
OBJ_DESTRUCT(&ud_btl->pending_frags);
|
||||
OBJ_DESTRUCT(&ud_btl->send_frags);
|
||||
OBJ_DESTRUCT(&ud_btl->user_frags);
|
||||
OBJ_DESTRUCT(&ud_btl->recv_frags);
|
||||
mca_mpool_base_module_destroy(ud_btl->super.btl_mpool);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Initiate a send.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_send(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
mca_btl_base_tag_t tag)
|
||||
{
|
||||
int rc;
|
||||
|
||||
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)descriptor;
|
||||
MCA_BTL_UD_START_TIME(post_send);
|
||||
frag->endpoint = endpoint;
|
||||
frag->hdr->tag = tag;
|
||||
|
||||
rc = mca_btl_ud_endpoint_post_send((mca_btl_ud_module_t*)btl, frag);
|
||||
|
||||
MCA_BTL_UD_END_TIME(post_send);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* RDMA Memory Pool (de)register callbacks
|
||||
*/
|
||||
|
||||
static int mca_btl_ud_reg_mr(void* reg_data, void* base, size_t size,
|
||||
mca_mpool_base_registration_t* reg)
|
||||
{
|
||||
mca_btl_ud_module_t* mod = (mca_btl_ud_module_t*)reg_data;
|
||||
mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg;
|
||||
|
||||
ud_reg->mr = ibv_reg_mr(mod->ib_pd, base, size, IBV_ACCESS_LOCAL_WRITE |
|
||||
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
||||
|
||||
if(NULL == ud_reg->mr)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int mca_btl_ud_dereg_mr(void* reg_data,
|
||||
mca_mpool_base_registration_t* reg)
|
||||
{
|
||||
mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg;
|
||||
|
||||
if(ud_reg->mr != NULL) {
|
||||
if(ibv_dereg_mr(ud_reg->mr)) {
|
||||
opal_output(0, "%s: error unpinning UD memory: %s\n",
|
||||
__func__, strerror(errno));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
ud_reg->mr = NULL;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Create a single UD queue pair. Since UD is connectionless, the QP is
|
||||
* useable immediately.
|
||||
*/
|
||||
|
||||
/* TODO - can remove cq/psn args now with only one type of frag */
|
||||
static int mca_btl_ud_init_qp(mca_btl_ud_module_t* ud_btl,
|
||||
struct ibv_cq* cq,
|
||||
struct ibv_qp** qp,
|
||||
uint32_t lcl_psn)
|
||||
{
|
||||
struct ibv_qp_attr qp_attr;
|
||||
struct ibv_qp_init_attr qp_init_attr;
|
||||
|
||||
memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
|
||||
|
||||
qp_init_attr.send_cq = cq;
|
||||
qp_init_attr.recv_cq = cq;
|
||||
qp_init_attr.cap.max_send_wr = mca_btl_ofud_component.sd_num;
|
||||
qp_init_attr.cap.max_recv_wr = mca_btl_ofud_component.rd_num;
|
||||
qp_init_attr.cap.max_send_sge = 1;
|
||||
qp_init_attr.cap.max_recv_sge = 1;
|
||||
/* TODO - find the best value for max_inline_data */
|
||||
qp_init_attr.cap.max_inline_data = 200;
|
||||
qp_init_attr.qp_type = IBV_QPT_UD;
|
||||
|
||||
*qp = ibv_create_qp(ud_btl->ib_pd, &qp_init_attr);
|
||||
if(NULL == *qp) {
|
||||
BTL_ERROR(("error creating QP: %s\n", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if(0 == (ud_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) {
|
||||
BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data"));
|
||||
}
|
||||
|
||||
BTL_VERBOSE((0, "ib_inline_max %d\n", ud_btl->ib_inline_max));
|
||||
|
||||
qp_attr.qp_state = IBV_QPS_INIT;
|
||||
qp_attr.pkey_index = mca_btl_ofud_component.ib_pkey_ix;
|
||||
qp_attr.qkey = mca_btl_ofud_component.ib_qkey;
|
||||
qp_attr.port_num = ud_btl->ib_port_num;
|
||||
|
||||
if(ibv_modify_qp(*qp, &qp_attr,
|
||||
IBV_QP_STATE | IBV_QP_PKEY_INDEX |
|
||||
IBV_QP_PORT | IBV_QP_QKEY)) {
|
||||
BTL_ERROR(("error modifying QP to INIT: %s", strerror(errno)));
|
||||
goto destroy_qp;
|
||||
}
|
||||
|
||||
qp_attr.qp_state = IBV_QPS_RTR;
|
||||
if(ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE)) {
|
||||
BTL_ERROR(("error modifing QP to RTR: %s", strerror(errno)));
|
||||
goto destroy_qp;
|
||||
}
|
||||
|
||||
qp_attr.qp_state = IBV_QPS_RTS;
|
||||
qp_attr.sq_psn = lcl_psn;
|
||||
if (ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) {
|
||||
BTL_ERROR(("error modifying QP to RTS: %s", strerror(errno)));
|
||||
goto destroy_qp;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
destroy_qp:
|
||||
ibv_destroy_qp(*qp);
|
||||
*qp = NULL;
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Initialize the btl module by allocating a protection domain,
|
||||
* memory pool, completion queue, and free lists
|
||||
*/
|
||||
|
||||
int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
|
||||
{
|
||||
struct mca_mpool_base_resources_t mpool_resources;
|
||||
struct ibv_context *ctx = ud_btl->ib_dev_context;
|
||||
struct ibv_recv_wr* bad_wr;
|
||||
mca_btl_ud_frag_t* frag;
|
||||
ompi_free_list_item_t* item;
|
||||
uint32_t length;
|
||||
int32_t rc, i;
|
||||
|
||||
ud_btl->sd_wqe = mca_btl_ofud_component.sd_num;
|
||||
|
||||
ud_btl->ib_pd = ibv_alloc_pd(ctx);
|
||||
if(NULL == ud_btl->ib_pd) {
|
||||
BTL_ERROR(("error allocating PD for %s: %s\n",
|
||||
ibv_get_device_name(ud_btl->ib_dev), strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
mpool_resources.reg_data = (void*)ud_btl;
|
||||
mpool_resources.sizeof_reg = sizeof(mca_btl_ud_reg_t);
|
||||
mpool_resources.register_mem = mca_btl_ud_reg_mr;
|
||||
mpool_resources.deregister_mem = mca_btl_ud_dereg_mr;
|
||||
ud_btl->super.btl_mpool =
|
||||
mca_mpool_base_module_create(mca_btl_ofud_component.ud_mpool_name,
|
||||
&ud_btl->super, &mpool_resources);
|
||||
|
||||
if(NULL == ud_btl->super.btl_mpool) {
|
||||
BTL_ERROR(("error creating IB mpool for %s: %s\n",
|
||||
ibv_get_device_name(ud_btl->ib_dev), strerror(errno)));
|
||||
goto dealloc_pd;
|
||||
}
|
||||
|
||||
/* Create the completion queue */
|
||||
length = mca_btl_ofud_component.rd_num + mca_btl_ofud_component.sd_num;
|
||||
|
||||
ud_btl->ib_cq = ibv_create_cq(ctx, length, NULL, NULL, 0);
|
||||
if(NULL == ud_btl->ib_cq) {
|
||||
BTL_ERROR(("error creating CQ for %s: %s\n",
|
||||
ibv_get_device_name(ud_btl->ib_dev), strerror(errno)));
|
||||
goto mpool_destroy;
|
||||
}
|
||||
|
||||
/* Set up our packet sequence numbers */
|
||||
ud_btl->addr.psn = lrand48() & 0xffffff;
|
||||
|
||||
/* Set up the QPs for this BTL */
|
||||
for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) {
|
||||
if(OMPI_SUCCESS != mca_btl_ud_init_qp(ud_btl,
|
||||
ud_btl->ib_cq, &ud_btl->ib_qp[i], ud_btl->addr.psn)) {
|
||||
goto qp_destroy;
|
||||
}
|
||||
}
|
||||
|
||||
/* Place our QP numbers in our local address information */
|
||||
ud_btl->addr.qp_num = ud_btl->ib_qp[0]->qp_num;
|
||||
ud_btl->ib_qp_next = 0;
|
||||
|
||||
/*ud_btl->rd_posted = mca_btl_ofud_component.rd_num_init;*/
|
||||
|
||||
/* Initialize pool of receive fragments first, since an error may occur */
|
||||
/* TODO - no need for a free list with a static buffer count */
|
||||
OBJ_CONSTRUCT(&ud_btl->recv_frags, ompi_free_list_t);
|
||||
length = sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_header_t) +
|
||||
ud_btl->super.btl_eager_limit + 2 * MCA_BTL_IB_FRAG_ALIGN;
|
||||
|
||||
ompi_free_list_init(&ud_btl->recv_frags,
|
||||
length + sizeof(mca_btl_ud_ib_header_t),
|
||||
OBJ_CLASS(mca_btl_ud_recv_frag_t),
|
||||
mca_btl_ofud_component.rd_num,
|
||||
mca_btl_ofud_component.rd_num,
|
||||
mca_btl_ofud_component.rd_num,
|
||||
ud_btl->super.btl_mpool);
|
||||
#if 0
|
||||
ompi_free_list_init(&ud_btl->recv_frags,
|
||||
length + sizeof(mca_btl_ud_ib_header_t),
|
||||
OBJ_CLASS(mca_btl_ud_recv_frag_t),
|
||||
mca_btl_ofud_component.rd_num_init,
|
||||
mca_btl_ofud_component.rd_num_max,
|
||||
mca_btl_ofud_component.rd_num_inc,
|
||||
ud_btl->super.btl_mpool);
|
||||
#endif
|
||||
|
||||
/* Post receive descriptors */
|
||||
for(i = 0; i < mca_btl_ofud_component.rd_num; i++) {
|
||||
OMPI_FREE_LIST_GET(&ud_btl->recv_frags, item, rc);
|
||||
frag = (mca_btl_ud_frag_t*)item;
|
||||
|
||||
if(NULL == frag) {
|
||||
BTL_ERROR(("error getting receive buffer from free list\n"));
|
||||
goto obj_destruct;
|
||||
}
|
||||
|
||||
frag->type = MCA_BTL_UD_FRAG_RECV;
|
||||
frag->sg_entry.length = mca_btl_ofud_module.super.btl_eager_limit +
|
||||
sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t);
|
||||
if(ibv_post_recv(ud_btl->ib_qp[0],
|
||||
&frag->wr_desc.rd_desc, &bad_wr)) {
|
||||
BTL_ERROR(("error posting recv, errno %s\n", strerror(errno)));
|
||||
goto obj_destruct;
|
||||
}
|
||||
}
|
||||
|
||||
/* No more errors anticipated - initialize everything else */
|
||||
OBJ_CONSTRUCT(&ud_btl->ud_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->pending_frags, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->send_frags, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->user_frags, ompi_free_list_t);
|
||||
|
||||
ompi_free_list_init(&ud_btl->send_frags,
|
||||
length,
|
||||
OBJ_CLASS(mca_btl_ud_send_frag_t),
|
||||
mca_btl_ofud_component.sd_num >> 1,
|
||||
mca_btl_ofud_component.sd_num << 2,
|
||||
mca_btl_ofud_component.sd_num >> 3,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
/* Initialize pool of user fragments */
|
||||
length = sizeof(mca_btl_ud_frag_t) +
|
||||
sizeof(mca_btl_ud_header_t) + 2 * MCA_BTL_IB_FRAG_ALIGN;
|
||||
|
||||
ompi_free_list_init(&ud_btl->user_frags,
|
||||
length,
|
||||
OBJ_CLASS(mca_btl_ud_user_frag_t),
|
||||
mca_btl_ofud_component.sd_num >> 1,
|
||||
mca_btl_ofud_component.sd_num << 2,
|
||||
mca_btl_ofud_component.sd_num >> 3,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
obj_destruct:
|
||||
OBJ_DESTRUCT(&ud_btl->recv_frags);
|
||||
qp_destroy:
|
||||
for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) {
|
||||
ibv_destroy_qp(ud_btl->ib_qp[i]);
|
||||
}
|
||||
mpool_destroy:
|
||||
mca_mpool_base_module_destroy(ud_btl->super.btl_mpool);
|
||||
dealloc_pd:
|
||||
ibv_dealloc_pd(ud_btl->ib_pd);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
||||
int mca_btl_udapl_ft_event(int state) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
374
ompi/mca/btl/ofud/btl_ofud.h
Обычный файл
374
ompi/mca/btl/ofud/btl_ofud.h
Обычный файл
@ -0,0 +1,374 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_BTL_UD_H
|
||||
#define MCA_BTL_UD_H
|
||||
|
||||
/* Number of QP's to stripe sends over - keep this as power of 2 */
|
||||
/* AWF - This is intentionally NOT an MCA parameter so that I can do fast
|
||||
modular arithmetic with it. */
|
||||
#define MCA_BTL_UD_NUM_QP 4
|
||||
|
||||
/* Standard system includes */
|
||||
#include <sys/types.h>
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
/* Open MPI includes */
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/sys/timer.h"
|
||||
#include "ompi/class/ompi_free_list.h"
|
||||
#include "ompi/class/ompi_bitmap.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
|
||||
/* TODO - If I want this to go away, addr_t has to come over here */
|
||||
#include "btl_ofud_endpoint.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* UD Infiniband (IB) BTL component.
|
||||
*/
|
||||
|
||||
struct mca_btl_ud_component_t {
|
||||
mca_btl_base_component_1_0_1_t super; /**< base BTL component */
|
||||
|
||||
uint32_t max_btls; /**< Maximum number of BTL modules */
|
||||
uint32_t num_btls; /**< Number of available/initialized BTL modules */
|
||||
|
||||
struct mca_btl_ud_module_t* ud_btls; /**< array of available BTLs */
|
||||
|
||||
opal_list_t ud_procs; /**< list of ib proc structures */
|
||||
opal_mutex_t ud_lock; /**< lock for accessing component state */
|
||||
|
||||
char* ud_mpool_name; /**< name of memory pool */
|
||||
|
||||
int32_t sd_num; /**< max send descriptors to post per BTL */
|
||||
int32_t sd_num_peer; /**< max send descriptors to post per endpoint */
|
||||
|
||||
int32_t rd_num; /**< number of receive descriptors per BTL */
|
||||
#if 0
|
||||
int32_t rd_num_init; /**< initial recv descriptors to post per BTL */
|
||||
int32_t rd_num_max;
|
||||
int32_t rd_num_inc;
|
||||
#endif
|
||||
|
||||
uint32_t ib_pkey_ix;
|
||||
uint32_t ib_qkey;
|
||||
uint32_t ib_service_level;
|
||||
uint32_t ib_src_path_bits;
|
||||
|
||||
}; typedef struct mca_btl_ud_component_t mca_btl_ud_component_t;
|
||||
|
||||
OMPI_MODULE_DECLSPEC extern mca_btl_ud_component_t mca_btl_ofud_component;
|
||||
|
||||
typedef mca_btl_base_recv_reg_t mca_btl_ud_recv_reg_t;
|
||||
|
||||
|
||||
/**
 * Profiling variables.
 *
 * MCA_BTL_UD_ENABLE_PROFILE is currently 0 for both debug and non-debug
 * builds, so the declarations below are compiled out unless one of the
 * two values is changed by hand.
 */

#if OMPI_ENABLE_DEBUG
#define MCA_BTL_UD_ENABLE_PROFILE 0
#else
#define MCA_BTL_UD_ENABLE_PROFILE 0
#endif

#if MCA_BTL_UD_ENABLE_PROFILE

/* Declares the avg_/cnt_/tmp_ timer triple for one profiled section */
#define MCA_BTL_UD_PROFILE_VAR(var)     \
    opal_timer_t avg_ ## var;           \
    opal_timer_t cnt_ ## var;           \
    opal_timer_t tmp_ ## var

/* One timer triple per profiled section */
struct mca_btl_ud_profile_t {
    MCA_BTL_UD_PROFILE_VAR(post_send);
    MCA_BTL_UD_PROFILE_VAR(ibv_post_send);
};

typedef struct mca_btl_ud_profile_t mca_btl_ud_profile_t;
extern mca_btl_ud_profile_t mca_btl_ud_profile;

#endif
|
||||
|
||||
|
||||
/**
|
||||
* UD/IB BTL Interface
|
||||
*/
|
||||
|
||||
struct mca_btl_ud_module_t {
|
||||
mca_btl_base_module_t super;
|
||||
mca_btl_ud_recv_reg_t ib_reg[256]; /* protected by ib_lock */
|
||||
|
||||
uint8_t ib_port_num;
|
||||
struct ibv_device* ib_dev;
|
||||
struct ibv_context* ib_dev_context;
|
||||
struct ibv_pd* ib_pd;
|
||||
struct ibv_cq* ib_cq;
|
||||
|
||||
struct mca_btl_ud_addr_t addr; /**< local address information */
|
||||
|
||||
ompi_free_list_t send_frags; /**< send fragments & buffers */
|
||||
ompi_free_list_t user_frags; /**< user data fragments */
|
||||
ompi_free_list_t recv_frags; /**< receive fragments & buffers */
|
||||
|
||||
opal_list_t pending_frags; /**< list of pending send frags */
|
||||
|
||||
opal_mutex_t ud_lock; /**< lock for ib_reg and pending_frags */
|
||||
|
||||
size_t ib_inline_max; /**< max size of IB inline send */
|
||||
|
||||
/*int32_t rd_posted;*/ /**< number of receives currently posted */
|
||||
|
||||
int32_t sd_wqe; /**< available send WQ entries */
|
||||
/* No lock needed, these are incremented/decremented atomically */
|
||||
|
||||
/*opal_hash_table_t* ep_lookup;*/
|
||||
/**< hash table for fast lookup of endpoint structures in recv path */
|
||||
/* lid:qpnum is key, value is mca_btl_ud_endpoint_t* */
|
||||
|
||||
struct ibv_qp* ib_qp[MCA_BTL_UD_NUM_QP];
|
||||
uint32_t ib_qp_next;
|
||||
/**< Local QPs and stripe counters */
|
||||
/* No lock needed - counters only ever increase by 1 */
|
||||
}; typedef struct mca_btl_ud_module_t mca_btl_ud_module_t;
|
||||
|
||||
struct mca_btl_ud_frag_t;
|
||||
extern mca_btl_ud_module_t mca_btl_ofud_module;
|
||||
|
||||
|
||||
/**
|
||||
* Open the component; register UD/IB parameters with the MCA framework
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_component_open(void);
|
||||
|
||||
|
||||
/**
|
||||
* Any final cleanup before being unloaded.
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_component_close(void);
|
||||
|
||||
|
||||
/**
|
||||
* IB component initialization.
|
||||
*
|
||||
* @param num_btl_modules (OUT)
|
||||
* Number of BTLs returned in BTL array.
|
||||
* @param allow_multi_user_threads (IN)
|
||||
* Flag indicating whether BTL supports user threads (TRUE)
|
||||
* @param have_hidden_threads (IN)
|
||||
* Flag indicating whether BTL uses threads (TRUE)
|
||||
*
|
||||
* (1) read interface list from verbs and compare against component parameters
|
||||
* then create a BTL instance for selected interfaces
|
||||
* (2) publish BTL addressing info
|
||||
*/
|
||||
|
||||
extern mca_btl_base_module_t** mca_btl_ud_component_init(
|
||||
int *num_btl_modules,
|
||||
bool allow_multi_user_threads,
|
||||
bool have_hidden_threads);
|
||||
|
||||
|
||||
/**
|
||||
* UD/IB component progress.
|
||||
*/
|
||||
extern int mca_btl_ud_component_progress(void);
|
||||
|
||||
|
||||
/**
|
||||
* Register a callback function that is called on receipt
|
||||
* of a fragment.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
 * @return Status indicating if registration was successful
|
||||
*/
|
||||
|
||||
int mca_btl_ud_register(struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
||||
void* cbdata);
|
||||
|
||||
|
||||
/**
|
||||
* Cleanup any resources held by the BTL.
|
||||
*
|
||||
* @param btl BTL instance.
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl);
|
||||
|
||||
|
||||
/**
|
||||
* PML->BTL notification of change in the process list.
|
||||
*
|
||||
* @param btl (IN)
|
||||
* @param nprocs (IN) Number of processes
|
||||
* @param procs (IN) Set of processes
|
||||
* @param peers (OUT) Set of (optional) peer addressing info.
|
||||
 * @param reachable (IN/OUT) Set of processes that are reachable via this BTL.
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t** peers,
|
||||
ompi_bitmap_t* reachable);
|
||||
|
||||
|
||||
/**
|
||||
* PML->BTL notification of change in the process list.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param nproc (IN) Number of processes.
|
||||
* @param procs (IN) Set of processes.
|
||||
* @param peers (IN) Set of peer data structures.
|
||||
* @return Status indicating if cleanup was successful
|
||||
*
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t** peers);
|
||||
|
||||
|
||||
/**
|
||||
* PML->BTL Initiate a send of the specified size.
|
||||
*
|
||||
* @param btl (IN)
|
||||
* BTL instance
|
||||
* @param btl_base_peer (IN)
|
||||
* BTL peer addressing
|
||||
* @param send_request (IN/OUT)
|
||||
* Send request (allocated by PML via mca_btl_base_request_alloc_fn_t)
|
||||
* @param size (IN)
|
||||
* Number of bytes PML is requesting BTL to deliver
|
||||
* @param flags (IN)
|
||||
* Flags that should be passed to the peer via the message header.
|
||||
* @param request (OUT)
|
||||
* OMPI_SUCCESS if the BTL was able to queue one or more fragments
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_send(struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
mca_btl_base_tag_t tag);
|
||||
|
||||
|
||||
/**
|
||||
* Allocate a descriptor.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param size (IN) Requested descriptor size.
|
||||
*/
|
||||
|
||||
extern mca_btl_base_descriptor_t* mca_btl_ud_alloc(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
uint8_t order,
|
||||
size_t size);
|
||||
|
||||
|
||||
/**
|
||||
* Return a segment allocated by this BTL.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param descriptor (IN) Allocated descriptor.
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_free(struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des);
|
||||
|
||||
|
||||
/**
|
||||
* Pack data and return a descriptor that can be
|
||||
* used for send/put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
|
||||
mca_btl_base_descriptor_t* mca_btl_ud_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct ompi_convertor_t* convertor,
|
||||
uint8_t order,
|
||||
size_t reserve,
|
||||
size_t* size);
|
||||
|
||||
|
||||
|
||||
int mca_btl_ud_module_init(mca_btl_ud_module_t* ud_btl);
|
||||
|
||||
/**
|
||||
* Fault Tolerance Event Notification Function
|
||||
* @param state Checkpoint State
|
||||
* @return OMPI_SUCCESS or failure status
|
||||
*/
|
||||
|
||||
extern int mca_btl_udapl_ft_event(int state);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Profiling stuff
|
||||
*/
|
||||
|
||||
#if MCA_BTL_UD_ENABLE_PROFILE

/*
 * Each profiled section <var> uses three fields of mca_btl_ud_profile:
 *   tmp_<var>  cycle count captured at the start of the section
 *   avg_<var>  running sum of elapsed cycles (divided by cnt_ on output)
 *   cnt_<var>  number of completed START/END pairs
 */

/* Remember the current cycle count as the start of section <var>. */
#define MCA_BTL_UD_START_TIME(var)                                          \
    ((mca_btl_ud_profile.tmp_ ## var) = opal_sys_timer_get_cycles())

/* Accumulate the cycles elapsed since the matching START_TIME. */
#define MCA_BTL_UD_END_TIME(var)                                            \
do {                                                                        \
    mca_btl_ud_profile.avg_ ## var +=                                       \
        opal_sys_timer_get_cycles() - mca_btl_ud_profile.tmp_ ## var;       \
    mca_btl_ud_profile.cnt_ ## var++;                                       \
} while(0)

/* Print the average and sample count for section <var>.  A section that
   was never timed has cnt_ == 0; the division is skipped in that case so
   that component close cannot fault on a divide by zero.  The trailing
   semicolon is kept because existing callers rely on the macro being a
   complete statement. */
#define MCA_BTL_UD_SHOW_TIME(var)                                           \
    OPAL_OUTPUT((0, " " #var " avg %lu cnt %lu",                            \
        (0 != (mca_btl_ud_profile.cnt_ ## var)) ?                           \
            (mca_btl_ud_profile.avg_ ## var) /                              \
                (mca_btl_ud_profile.cnt_ ## var) :                          \
            (mca_btl_ud_profile.avg_ ## var),                               \
        mca_btl_ud_profile.cnt_ ## var));

#else

/* Profiling disabled: all three macros expand to nothing. */
#define MCA_BTL_UD_START_TIME(var)
#define MCA_BTL_UD_END_TIME(var)
#define MCA_BTL_UD_SHOW_TIME(var)

#endif
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
537
ompi/mca/btl/ofud/btl_ofud_component.c
Обычный файл
537
ompi/mca/btl/ofud/btl_ofud_component.c
Обычный файл
@ -0,0 +1,537 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "opal/prefetch.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "opal/sys/timer.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
||||
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
|
||||
|
||||
#include "btl_ofud.h"
|
||||
#include "btl_ofud_frag.h"
|
||||
#include "btl_ofud_endpoint.h"
|
||||
|
||||
|
||||
mca_btl_ud_component_t mca_btl_ofud_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta information
|
||||
about the component itself */
|
||||
{
|
||||
/* Indicate that we are a pml v1.0.0 component (which also implies a
|
||||
specific MCA version) */
|
||||
MCA_BTL_BASE_VERSION_1_0_1,
|
||||
|
||||
"ofud", /* MCA component name */
|
||||
OMPI_MAJOR_VERSION, /* MCA component major version */
|
||||
OMPI_MINOR_VERSION, /* MCA component minor version */
|
||||
OMPI_RELEASE_VERSION, /* MCA component release version */
|
||||
mca_btl_ud_component_open, /* component open */
|
||||
mca_btl_ud_component_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
false
|
||||
},
|
||||
|
||||
mca_btl_ud_component_init,
|
||||
mca_btl_ud_component_progress,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Profiling information
|
||||
*/
|
||||
|
||||
#if MCA_BTL_UD_ENABLE_PROFILE
|
||||
mca_btl_ud_profile_t mca_btl_ud_profile = {0};
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* utility routines for parameter registration
|
||||
*/
|
||||
|
||||
static inline void mca_btl_ud_param_reg_string(const char* param_name,
|
||||
const char* param_desc,
|
||||
const char* default_value,
|
||||
char** out_value)
|
||||
{
|
||||
mca_base_param_reg_string(&mca_btl_ofud_component.super.btl_version,
|
||||
param_name, param_desc, false, false,
|
||||
default_value, out_value);
|
||||
}
|
||||
|
||||
static inline void mca_btl_ud_param_reg_int(const char* param_name,
|
||||
const char* param_desc,
|
||||
int default_value,
|
||||
int* out_value)
|
||||
{
|
||||
mca_base_param_reg_int(&mca_btl_ofud_component.super.btl_version,
|
||||
param_name, param_desc, false, false,
|
||||
default_value, out_value);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Called by MCA framework to open the component, registers
|
||||
* component parameters.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_component_open(void)
|
||||
{
|
||||
int val;
|
||||
|
||||
/* initialize state */
|
||||
mca_btl_ofud_component.num_btls = 0;
|
||||
mca_btl_ofud_component.ud_btls = NULL;
|
||||
|
||||
/* initialize objects */
|
||||
OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_procs, opal_list_t);
|
||||
|
||||
/* register IB component parameters */
|
||||
mca_btl_ud_param_reg_int("max_btls",
|
||||
"Maximum number of HCAs/ports to use",
|
||||
4, (int*)&mca_btl_ofud_component.max_btls);
|
||||
|
||||
mca_btl_ud_param_reg_string("mpool", "Name of the memory pool to be used",
|
||||
"rdma", &mca_btl_ofud_component.ud_mpool_name);
|
||||
|
||||
mca_btl_ud_param_reg_int("ib_pkey_index", "IB pkey index",
|
||||
0, (int*)&mca_btl_ofud_component.ib_pkey_ix);
|
||||
mca_btl_ud_param_reg_int("ib_qkey", "IB qkey",
|
||||
0x01330133, (int*)&mca_btl_ofud_component.ib_qkey);
|
||||
mca_btl_ud_param_reg_int("ib_service_level", "IB service level",
|
||||
0, (int*)&mca_btl_ofud_component.ib_service_level);
|
||||
mca_btl_ud_param_reg_int("ib_src_path_bits", "IB source path bits",
|
||||
0, (int*)&mca_btl_ofud_component.ib_src_path_bits);
|
||||
|
||||
mca_btl_ud_param_reg_int("sd_num", "maximum send descriptors to post",
|
||||
128, (int*)&mca_btl_ofud_component.sd_num);
|
||||
mca_btl_ud_param_reg_int("sd_num_peer",
|
||||
"maximum send descriptors to post to one peer",
|
||||
8, (int*)&mca_btl_ofud_component.sd_num_peer);
|
||||
|
||||
mca_btl_ud_param_reg_int("rd_num_init", "number of receive buffers",
|
||||
6000, (int*)&mca_btl_ofud_component.rd_num);
|
||||
#if 0
|
||||
mca_btl_ud_param_reg_int("rd_num_init", "initial receive buffers",
|
||||
3000, (int*)&mca_btl_ofud_component.rd_num_init);
|
||||
mca_btl_ud_param_reg_int("rd_num_max", "maximum receive buffers",
|
||||
4500, (int*)&mca_btl_ofud_component.rd_num_max);
|
||||
mca_btl_ud_param_reg_int("rd_num_inc",
|
||||
"number of buffers to post when rate is high",
|
||||
25, (int*)&mca_btl_ofud_component.rd_num_inc);
|
||||
#endif
|
||||
|
||||
/* TODO - this assumes a 2k UD MTU - query/do something more intelligent */
|
||||
/*mca_btl_ud_param_reg_int("eager_limit", "eager send limit",
|
||||
2048, &val); */
|
||||
mca_btl_ud_param_reg_int("min_send_size", "minimum send size",
|
||||
2048, &val);
|
||||
mca_btl_ofud_module.super.btl_min_send_size = val;
|
||||
mca_btl_ud_param_reg_int("max_send_size", "maximum send size",
|
||||
2048, &val);
|
||||
mca_btl_ofud_module.super.btl_eager_limit = val;
|
||||
mca_btl_ofud_module.super.btl_max_send_size = val;
|
||||
|
||||
mca_btl_ud_param_reg_int("exclusivity", "BTL exclusivity",
|
||||
MCA_BTL_EXCLUSIVITY_DEFAULT,
|
||||
(int*)&mca_btl_ofud_module.super.btl_exclusivity);
|
||||
mca_btl_ud_param_reg_int("bandwidth",
|
||||
"Approximate maximum bandwidth of interconnect",
|
||||
800, (int*)&mca_btl_ofud_module.super.btl_bandwidth);
|
||||
|
||||
mca_btl_ofud_module.super.btl_eager_limit -= sizeof(mca_btl_ud_header_t);
|
||||
mca_btl_ofud_module.super.btl_max_send_size -= sizeof(mca_btl_ud_header_t);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Component cleanup
|
||||
*/
|
||||
|
||||
int mca_btl_ud_component_close(void)
|
||||
{
|
||||
OBJ_DESTRUCT(&mca_btl_ofud_component.ud_lock);
|
||||
OBJ_DESTRUCT(&mca_btl_ofud_component.ud_procs);
|
||||
|
||||
/* Calculate and print profiling numbers */
|
||||
MCA_BTL_UD_SHOW_TIME(post_send);
|
||||
MCA_BTL_UD_SHOW_TIME(ibv_post_send);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Register UD address information. The MCA framework
|
||||
* will make this available to all peers.
|
||||
*/
|
||||
|
||||
static int mca_btl_ud_modex_send(void)
|
||||
{
|
||||
int rc;
|
||||
size_t i;
|
||||
size_t size;
|
||||
mca_btl_ud_addr_t* addrs = NULL;
|
||||
|
||||
size = mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_addr_t);
|
||||
if(size != 0) {
|
||||
addrs = (mca_btl_ud_addr_t*)malloc(size);
|
||||
if(NULL == addrs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for(i = 0; i < mca_btl_ofud_component.num_btls; i++) {
|
||||
mca_btl_ud_module_t* btl = &mca_btl_ofud_component.ud_btls[i];
|
||||
addrs[i] = btl->addr;
|
||||
|
||||
BTL_VERBOSE((0, "modex_send QP num %x, LID = %x",
|
||||
addrs[i].qp_num, addrs[i].lid));
|
||||
}
|
||||
}
|
||||
|
||||
rc = mca_pml_base_modex_send(
|
||||
&mca_btl_ofud_component.super.btl_version, addrs, size);
|
||||
if(NULL != addrs) {
|
||||
free(addrs);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* UD component initialization:
|
||||
* (1) read interface list from kernel and compare against component parameters
|
||||
* then create a BTL instance for selected interfaces
|
||||
* (2) post OOB receive for incoming connection attempts
|
||||
* (3) register BTL parameters with the MCA
|
||||
*/
|
||||
|
||||
mca_btl_base_module_t** mca_btl_ud_component_init(int* num_btl_modules,
|
||||
bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
struct ibv_device **ib_devs;
|
||||
struct ibv_device* ib_dev;
|
||||
int32_t num_devs;
|
||||
mca_btl_base_module_t** btls;
|
||||
uint32_t i, j;
|
||||
opal_list_t btl_list;
|
||||
mca_btl_ud_module_t* ud_btl;
|
||||
mca_btl_base_selected_module_t* ib_selected;
|
||||
opal_list_item_t* item;
|
||||
unsigned short seedv[3];
|
||||
char* btl_str;
|
||||
char* tok;
|
||||
|
||||
/* First, check if the UD BTL was specifically selected.
|
||||
If not, then short out right away. */
|
||||
mca_base_param_lookup_string(
|
||||
mca_base_param_find("btl", NULL, NULL), &btl_str);
|
||||
if(NULL == btl_str) {
|
||||
/* Can't specify UD with out any string at all.. bail out */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Try to find a 'ud' token */
|
||||
tok = strtok(btl_str, ",");
|
||||
while(tok) {
|
||||
if(!strcasecmp("ofud", tok)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(NULL == tok) {
|
||||
/* No valid 'ud' token found; bail out */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* initialization */
|
||||
*num_btl_modules = 0;
|
||||
num_devs = 0;
|
||||
|
||||
seedv[0] = orte_process_info.my_name->vpid;
|
||||
seedv[1] = opal_sys_timer_get_cycles();
|
||||
seedv[2] = opal_sys_timer_get_cycles();
|
||||
seed48(seedv);
|
||||
|
||||
ib_devs = ibv_get_device_list(&num_devs);
|
||||
|
||||
if(0 == num_devs) {
|
||||
mca_btl_base_error_no_nics("OpenFabrics UD", "HCA");
|
||||
mca_btl_ud_modex_send();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/** We must loop through all the hca id's, get their handles and
|
||||
for each hca we query the number of ports on the hca and set up
|
||||
a distinct btl module for each hca port */
|
||||
|
||||
OBJ_CONSTRUCT(&btl_list, opal_list_t);
|
||||
|
||||
for(i = 0; (int32_t)i < num_devs &&
|
||||
mca_btl_ofud_component.num_btls < mca_btl_ofud_component.max_btls;
|
||||
i++) {
|
||||
struct ibv_device_attr ib_dev_attr;
|
||||
struct ibv_context* ib_dev_context;
|
||||
|
||||
ib_dev = ib_devs[i];
|
||||
|
||||
ib_dev_context = ibv_open_device(ib_dev);
|
||||
if(!ib_dev_context) {
|
||||
BTL_ERROR(("error obtaining device context for %s: %s\n",
|
||||
ibv_get_device_name(ib_dev), strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
|
||||
BTL_ERROR(("error obtaining device attributes for %s: %s\n",
|
||||
ibv_get_device_name(ib_dev), strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Note ports are 1 based hence j = 1 */
|
||||
for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++) {
|
||||
struct ibv_port_attr ib_port_attr;
|
||||
|
||||
if(ibv_query_port(ib_dev_context, (uint8_t)j, &ib_port_attr)) {
|
||||
BTL_ERROR(("error getting port attributes for device %s port %d: %s",
|
||||
ibv_get_device_name(ib_dev), j, strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(IBV_PORT_ACTIVE == ib_port_attr.state) {
|
||||
ud_btl =
|
||||
(mca_btl_ud_module_t*)malloc(sizeof(mca_btl_ud_module_t));
|
||||
memcpy(ud_btl, &mca_btl_ofud_module, sizeof(mca_btl_ud_module_t));
|
||||
|
||||
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
||||
ib_selected->btl_module = (mca_btl_base_module_t*)ud_btl;
|
||||
|
||||
ud_btl->ib_dev = ib_dev;
|
||||
ud_btl->ib_dev_context = ib_dev_context;
|
||||
ud_btl->ib_port_num = (uint8_t)j;
|
||||
ud_btl->addr.subnet = ib_port_attr.sm_lid;
|
||||
ud_btl->addr.lid = ib_port_attr.lid;
|
||||
|
||||
opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
|
||||
if(++mca_btl_ofud_component.num_btls >=
|
||||
mca_btl_ofud_component.max_btls)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Allocate space for btl modules */
|
||||
mca_btl_ofud_component.ud_btls = (mca_btl_ud_module_t*)
|
||||
malloc(sizeof(mca_btl_ud_module_t) * mca_btl_ofud_component.num_btls);
|
||||
if(NULL == mca_btl_ofud_component.ud_btls) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
btls = (struct mca_btl_base_module_t**)
|
||||
malloc(mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_module_t*));
|
||||
if(NULL == btls) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
for(i = 0; i < mca_btl_ofud_component.num_btls; i++){
|
||||
item = opal_list_remove_first(&btl_list);
|
||||
ib_selected = (mca_btl_base_selected_module_t*)item;
|
||||
ud_btl = (mca_btl_ud_module_t*)ib_selected->btl_module;
|
||||
|
||||
memcpy(&(mca_btl_ofud_component.ud_btls[i]),
|
||||
ud_btl, sizeof(mca_btl_ud_module_t));
|
||||
free(ib_selected);
|
||||
free(ud_btl);
|
||||
|
||||
ud_btl = &mca_btl_ofud_component.ud_btls[i];
|
||||
|
||||
/* Initialize module state */
|
||||
if(mca_btl_ud_module_init(ud_btl) != OMPI_SUCCESS) {
|
||||
mca_btl_ofud_component.num_btls--;
|
||||
i--;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
btls[i] = &ud_btl->super;
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&btl_list);
|
||||
mca_btl_ud_modex_send();
|
||||
|
||||
/* Since not all modules may have initialized successfully, realloc
|
||||
to free space from failed modules */
|
||||
mca_btl_ofud_component.ud_btls = (mca_btl_ud_module_t*)
|
||||
realloc(mca_btl_ofud_component.ud_btls,
|
||||
sizeof(mca_btl_ud_module_t) * mca_btl_ofud_component.num_btls);
|
||||
btls = (struct mca_btl_base_module_t**)realloc(btls,
|
||||
mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_module_t*));
|
||||
|
||||
*num_btl_modules = mca_btl_ofud_component.num_btls;
|
||||
|
||||
ibv_free_device_list(ib_devs);
|
||||
return btls;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* IB component progress.
|
||||
*/
|
||||
|
||||
#define MCA_BTL_UD_NUM_WC 500
|
||||
|
||||
int mca_btl_ud_component_progress(void)
|
||||
{
|
||||
uint32_t i;
|
||||
int count = 0, ne, j;
|
||||
mca_btl_ud_frag_t* frag;
|
||||
struct ibv_recv_wr* bad_wr;
|
||||
struct ibv_recv_wr* head_wr;
|
||||
mca_btl_ud_module_t* ud_btl;
|
||||
mca_btl_base_recv_reg_t* reg;
|
||||
struct ibv_wc* cwc;
|
||||
struct ibv_wc wc[MCA_BTL_UD_NUM_WC];
|
||||
|
||||
/* Poll for completions */
|
||||
for(i = 0; i < mca_btl_ofud_component.num_btls; i++) {
|
||||
ud_btl = &mca_btl_ofud_component.ud_btls[i];
|
||||
|
||||
ne = ibv_poll_cq(ud_btl->ib_cq, MCA_BTL_UD_NUM_WC, wc);
|
||||
if(OPAL_UNLIKELY(ne < 0)) {
|
||||
BTL_ERROR(("error polling CQ with %d: %s\n",
|
||||
ne, strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
head_wr = NULL;
|
||||
|
||||
for(j = 0; j < ne; j++) {
|
||||
cwc = &wc[j];
|
||||
if(OPAL_UNLIKELY(cwc->status != IBV_WC_SUCCESS)) {
|
||||
BTL_ERROR(("error polling CQ with status %d for wr_id %llu opcode %d\n",
|
||||
cwc->status, cwc->wr_id, cwc->opcode));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
frag = (mca_btl_ud_frag_t*)(unsigned long)cwc->wr_id;
|
||||
|
||||
/* Handle work completions */
|
||||
switch(frag->type) {
|
||||
case MCA_BTL_UD_FRAG_SEND:
|
||||
case MCA_BTL_UD_FRAG_USER:
|
||||
{
|
||||
mca_btl_ud_endpoint_t* endpoint = frag->endpoint;
|
||||
assert(cwc->opcode == IBV_WC_SEND);
|
||||
|
||||
frag->base.des_cbfunc(&ud_btl->super,
|
||||
frag->endpoint, &frag->base, OMPI_SUCCESS);
|
||||
|
||||
/* Increment send counter, post if any sends are queued */
|
||||
OPAL_THREAD_ADD32(&endpoint->sd_wqe, 1);
|
||||
if(OPAL_UNLIKELY(
|
||||
!opal_list_is_empty(&endpoint->pending_frags))) {
|
||||
OPAL_THREAD_LOCK(&endpoint->pending_frags_lock);
|
||||
frag = (mca_btl_ud_frag_t*)
|
||||
opal_list_remove_first(&endpoint->pending_frags);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->pending_frags_lock);
|
||||
|
||||
if(OPAL_LIKELY(NULL != frag)) {
|
||||
mca_btl_ud_endpoint_post_send(ud_btl, frag);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_ADD32(&ud_btl->sd_wqe, 1);
|
||||
if(OPAL_UNLIKELY(
|
||||
!opal_list_is_empty(&ud_btl->pending_frags))) {
|
||||
OPAL_THREAD_LOCK(&ud_btl->ud_lock);
|
||||
frag = (mca_btl_ud_frag_t*)
|
||||
opal_list_remove_first(&ud_btl->pending_frags);
|
||||
OPAL_THREAD_UNLOCK(&ud_btl->ud_lock);
|
||||
|
||||
if(OPAL_LIKELY(NULL != frag)) {
|
||||
mca_btl_ud_endpoint_post_send(ud_btl, frag);
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
case MCA_BTL_UD_FRAG_RECV:
|
||||
assert(cwc->opcode == IBV_WC_RECV);
|
||||
reg = &ud_btl->ib_reg[frag->hdr->tag];
|
||||
|
||||
frag->segment.seg_addr.pval = frag->hdr + 1;
|
||||
frag->segment.seg_len = cwc->byte_len -
|
||||
sizeof(mca_btl_ud_header_t) -
|
||||
sizeof(mca_btl_ud_ib_header_t);
|
||||
|
||||
reg->cbfunc(&ud_btl->super,
|
||||
frag->hdr->tag, &frag->base, reg->cbdata);
|
||||
|
||||
/* Add recv to linked list for reposting */
|
||||
frag->wr_desc.rd_desc.next = head_wr;
|
||||
head_wr = &frag->wr_desc.rd_desc;
|
||||
continue;
|
||||
default:
|
||||
BTL_ERROR(("Unhandled completion opcode %d frag type %d",
|
||||
cwc->opcode, frag->type));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
count += ne;
|
||||
|
||||
/* Repost any recv buffers all at once */
|
||||
if(OPAL_LIKELY(head_wr)) {
|
||||
if(OPAL_UNLIKELY(ibv_post_recv(
|
||||
ud_btl->ib_qp[0], head_wr, &bad_wr))) {
|
||||
BTL_ERROR(("error posting recv: %s\n", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
head_wr = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
137
ompi/mca/btl/ofud/btl_ofud_endpoint.c
Обычный файл
137
ompi/mca/btl/ofud/btl_ofud_endpoint.c
Обычный файл
@ -0,0 +1,137 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "opal/prefetch.h"
|
||||
#include "ompi/types.h"
|
||||
#include "ompi/class/ompi_free_list.h"
|
||||
|
||||
#include "btl_ofud.h"
|
||||
#include "btl_ofud_endpoint.h"
|
||||
#include "btl_ofud_frag.h"
|
||||
|
||||
|
||||
static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
|
||||
static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
|
||||
|
||||
|
||||
/* First, we check the downcounter on the endpoint.
|
||||
If it is 0, we queue this frag on the endpoint.
|
||||
Otherwise, we check the BTL downcounter.
|
||||
If it is 0, we queue this frag on the BTL.
|
||||
Otherwise, we post the send. */
|
||||
/*
 * Try to take one send credit from the counter 'sd_wqe'.
 *
 * If no credit is left, the decrement is undone, 'frag' is appended to
 * 'queue' under 'lock', and the ENCLOSING FUNCTION RETURNS OMPI_SUCCESS.
 * Otherwise the credit stays taken and execution continues after the
 * macro.
 *
 * The definition deliberately carries no trailing semicolon so that
 * "CHECK_FRAG_QUEUES(...);" is exactly one statement and is safe in an
 * unbraced if/else.
 */
#define CHECK_FRAG_QUEUES(sd_wqe, lock, queue, frag)                        \
do {                                                                        \
    if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&(sd_wqe), -1) < 0)) {               \
        OPAL_THREAD_ADD32(&(sd_wqe), 1);                                    \
        OPAL_THREAD_LOCK(&(lock));                                          \
        opal_list_append(&(queue),                                          \
                (opal_list_item_t*)(frag));                                 \
        OPAL_THREAD_UNLOCK(&(lock));                                        \
        return OMPI_SUCCESS;                                                \
    }                                                                       \
} while(0)
|
||||
|
||||
|
||||
/*
|
||||
* Post a send to the work queue
|
||||
*/
|
||||
|
||||
int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
|
||||
mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
struct ibv_qp* ib_qp;
|
||||
struct ibv_send_wr* bad_wr;
|
||||
struct ibv_send_wr* wr = &frag->wr_desc.sr_desc;
|
||||
mca_btl_ud_endpoint_t* endpoint = frag->endpoint;
|
||||
int ret;
|
||||
|
||||
/* Have to be careful here - UD adds a 40 byte header, but it is not
|
||||
included on the sending side. */
|
||||
frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t);
|
||||
wr->send_flags = IBV_SEND_SIGNALED;
|
||||
|
||||
CHECK_FRAG_QUEUES(endpoint->sd_wqe,
|
||||
endpoint->pending_frags_lock, endpoint->pending_frags, frag);
|
||||
CHECK_FRAG_QUEUES(ud_btl->sd_wqe,
|
||||
ud_btl->ud_lock, ud_btl->pending_frags, frag);
|
||||
|
||||
/* We avoid locking here by allowing our stripe counter to count
|
||||
until it wraps around uint32_t. This keeps the mod operation
|
||||
out of the critical section, allowing us to use OPAL_THREAD_ADD32
|
||||
instead of a full mutex. */
|
||||
ib_qp = ud_btl->ib_qp[ud_btl->ib_qp_next % MCA_BTL_UD_NUM_QP];
|
||||
OPAL_THREAD_ADD32(&ud_btl->ib_qp_next, 1);
|
||||
|
||||
wr->wr.ud.ah = endpoint->rmt_ah;
|
||||
wr->wr.ud.remote_qpn = endpoint->rem_addr.qp_num;
|
||||
|
||||
if(frag->sg_entry.length <= ud_btl->ib_inline_max) {
|
||||
wr->send_flags =
|
||||
IBV_SEND_SIGNALED|IBV_SEND_INLINE;
|
||||
}
|
||||
|
||||
/*frag->hdr->src_qpnum = ud_btl->addr.qp_num;*/
|
||||
|
||||
MCA_BTL_UD_START_TIME(ibv_post_send);
|
||||
if(OPAL_UNLIKELY((ret = ibv_post_send(ib_qp, wr, &bad_wr)))) {
|
||||
opal_output(0, "ep->sd_wqe %d btl->sd_wqe %d len %d ib_qp_next %d",
|
||||
endpoint->sd_wqe, ud_btl->sd_wqe,
|
||||
frag->sg_entry.length, ud_btl->ib_qp_next);
|
||||
BTL_ERROR(("error posting send request: %d %s\n", ret, strerror(ret)));
|
||||
|
||||
}
|
||||
MCA_BTL_UD_END_TIME(ibv_post_send);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ud_endpoint_t,
|
||||
opal_list_item_t, mca_btl_ud_endpoint_construct,
|
||||
mca_btl_ud_endpoint_destruct);
|
||||
|
||||
/*
|
||||
* Construct/destruct an endpoint structure.
|
||||
*/
|
||||
|
||||
/*
 * Endpoint constructor: start with the configured per-peer send credits
 * and an empty pending-frag list with its lock.  In debug builds the
 * remote address is zero-filled as well.
 */
static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
    /* per-peer send credits come from the sd_num_peer MCA parameter */
    endpoint->sd_wqe = mca_btl_ofud_component.sd_num_peer;

    OBJ_CONSTRUCT(&endpoint->pending_frags_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&endpoint->pending_frags, opal_list_t);

#if OMPI_ENABLE_DEBUG
    memset(&endpoint->rem_addr, 0, sizeof(endpoint->rem_addr));
#endif
}
|
||||
|
||||
/*
 * Endpoint destructor: release the pending-frag list and its lock.
 */
static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
{
    /* TODO - what about any pending frags? */
    OBJ_DESTRUCT(&endpoint->pending_frags_lock);
    OBJ_DESTRUCT(&endpoint->pending_frags);
}
|
||||
|
82
ompi/mca/btl/ofud/btl_ofud_endpoint.h
Обычный файл
82
ompi/mca/btl/ofud/btl_ofud_endpoint.h
Обычный файл
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_IB_ENDPOINT_H
|
||||
#define MCA_BTL_IB_ENDPOINT_H
|
||||
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/event/event.h"
|
||||
|
||||
#include "btl_ofud.h"
|
||||
#include "btl_ofud_frag.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
 * Addressing information of one UD BTL module; this is the record the
 * component publishes through the modex, one per module.
 */
typedef struct mca_btl_ud_addr_t {
    uint32_t qp_num;    /**< queue pair number */
    uint32_t psn;       /**< packet sequence number */
    uint16_t lid;       /**< port LID */
    uint16_t subnet;    /**< subnet identifier (filled from the port's sm_lid) */
} mca_btl_ud_addr_t;
|
||||
|
||||
|
||||
/**
|
||||
* An abstraction that represents a connection to a endpoint process.
|
||||
* An instance of mca_btl_base_endpoint_t is associated w/ each process
|
||||
* and BTL pair and address information is exchanged at startup.
|
||||
* The UD BTL is connectionless, so no connection is ever established.
|
||||
*/
|
||||
|
||||
struct mca_btl_base_endpoint_t {
|
||||
opal_list_item_t super;
|
||||
|
||||
mca_btl_ud_addr_t rem_addr;
|
||||
/**< Remote address information */
|
||||
/* No lock needed, read-only past initialization */
|
||||
|
||||
struct ibv_ah* rmt_ah;
|
||||
/**< Remote address handle */
|
||||
/* No lock needed, verbs are thread-safe */
|
||||
|
||||
opal_list_t pending_frags;
|
||||
opal_mutex_t pending_frags_lock;
|
||||
/**< list of pending frags and lock */
|
||||
|
||||
int32_t sd_wqe;
|
||||
/**< number of available send wqe entries */
|
||||
/* No lock needed, OPAL_THREAD_ADD32 is used */
|
||||
};
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||
typedef mca_btl_base_endpoint_t mca_btl_ud_endpoint_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_endpoint_t);
|
||||
|
||||
int mca_btl_ud_endpoint_post_send(struct mca_btl_ud_module_t* ud_btl,
|
||||
struct mca_btl_ud_frag_t * frag);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
112
ompi/mca/btl/ofud/btl_ofud_frag.c
Обычный файл
112
ompi/mca/btl/ofud/btl_ofud_frag.c
Обычный файл
@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ofud.h"
|
||||
#include "btl_ofud_frag.h"
|
||||
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
||||
|
||||
|
||||
/*
 * Initialization shared by all fragment constructors: cache the memory
 * registration attached to the free-list item, copy its local key into the
 * scatter/gather entry, and reset the descriptor flags and ordering.
 */
static inline void mca_btl_ud_frag_common_constructor(mca_btl_ud_frag_t* frag)
{
    mca_btl_ud_reg_t* reg = (mca_btl_ud_reg_t*)frag->base.super.registration;

    frag->ud_reg = reg;
    frag->sg_entry.lkey = reg->mr->lkey;

    frag->base.order = MCA_BTL_NO_ORDER;
    frag->base.des_flags = 0;
}
|
||||
|
||||
|
||||
/*
 * Constructor for send fragments.
 *
 * Sets up the descriptor as a single-segment source, points the header at
 * the start of the fragment buffer, and pre-fills the parts of the send work
 * request that never change (opcode, flags, sg list, qkey).
 *
 * Pointer-to-integer conversions go through uintptr_t, matching the receive
 * constructor, so the value is never truncated on platforms where
 * unsigned long is narrower than a pointer.
 */
static void mca_btl_ud_send_frag_constructor(mca_btl_ud_frag_t* frag)
{
    struct ibv_send_wr* wr = &frag->wr_desc.sr_desc;

    frag->type = MCA_BTL_UD_FRAG_SEND;
    mca_btl_ud_frag_common_constructor(frag);

    /* A send frag has exactly one source segment and no destination. */
    frag->base.des_src = &frag->segment;
    frag->base.des_src_cnt = 1;
    frag->base.des_dst = NULL;
    frag->base.des_dst_cnt = 0;

    /* We do not include the mca_btl_ud_ib_header_t data when sending:
       the BTL header sits at the very start of the buffer and the payload
       follows immediately after it. */
    frag->hdr = (mca_btl_ud_header_t*)frag->base.super.ptr;
    frag->segment.seg_addr.pval = frag->hdr + 1;

    /* The sg entry starts at the header so header and payload go out
       together. */
    frag->sg_entry.addr = (uintptr_t)frag->hdr;

    /* wr_id carries the frag pointer so a completion can be mapped back. */
    wr->wr_id = (uintptr_t)frag;
    wr->sg_list = &frag->sg_entry;
    wr->num_sge = 1;
    wr->opcode = IBV_WR_SEND;
    wr->send_flags = IBV_SEND_SIGNALED;
    wr->next = NULL;
    wr->wr.ud.remote_qkey = mca_btl_ofud_component.ib_qkey;
}
|
||||
|
||||
|
||||
/*
 * Constructor for user fragments: identical to a send fragment except for
 * the type tag, which must be overwritten after the send constructor has
 * run (the send constructor sets it to MCA_BTL_UD_FRAG_SEND).
 */
static void mca_btl_ud_user_frag_constructor(mca_btl_ud_frag_t* frag)
{
    mca_btl_ud_send_frag_constructor(frag);

    frag->type = MCA_BTL_UD_FRAG_USER;
}
|
||||
|
||||
|
||||
/*
 * Constructor for receive fragments.
 *
 * Sets up the descriptor as a single-segment destination and pre-fills the
 * receive work request.  The posted buffer starts at the beginning of the
 * fragment memory and is sized for the 40-byte IB header, the BTL header and
 * an eager-limit payload; the BTL header pointer is therefore offset by
 * sizeof(mca_btl_ud_ib_header_t) from the buffer start.
 *
 * The frag pointer is stored in wr_id through uintptr_t (as the other
 * pointer conversions in this function already do) so it is never truncated
 * where unsigned long is narrower than a pointer.
 */
static void mca_btl_ud_recv_frag_constructor(mca_btl_ud_frag_t* frag)
{
    struct ibv_recv_wr* wr = &frag->wr_desc.rd_desc;

    frag->type = MCA_BTL_UD_FRAG_RECV;
    mca_btl_ud_frag_common_constructor(frag);

    /* A receive frag has exactly one destination segment and no source. */
    frag->base.des_dst = &frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.des_src = NULL;
    frag->base.des_src_cnt = 0;

    /* Receive frag headers start 40 bytes later */
    frag->hdr = (mca_btl_ud_header_t*)((uintptr_t)frag->base.super.ptr +
                                       sizeof(mca_btl_ud_ib_header_t));
    frag->segment.seg_addr.pval = frag->hdr + 1;
    frag->segment.seg_len = mca_btl_ofud_module.super.btl_eager_limit;

    /* The posted buffer covers IB header + BTL header + payload. */
    frag->sg_entry.addr = (uintptr_t)frag->base.super.ptr;
    frag->sg_entry.length = mca_btl_ofud_module.super.btl_eager_limit +
            sizeof(mca_btl_ud_ib_header_t) + sizeof(mca_btl_ud_header_t);

    /* wr_id carries the frag pointer so a completion can be mapped back. */
    wr->wr_id = (uintptr_t)frag;
    wr->sg_list = &frag->sg_entry;
    wr->num_sge = 1;
    wr->next = NULL;
}
|
||||
|
||||
|
||||
/*
 * Class instances for the fragment types.  All of them derive from
 * mca_btl_base_descriptor_t and none has a destructor; they differ only in
 * the constructor that is run.
 */

/* Generic fragment: no constructor. */
OBJ_CLASS_INSTANCE(mca_btl_ud_frag_t, mca_btl_base_descriptor_t,
                   NULL, NULL);

/* Send fragment. */
OBJ_CLASS_INSTANCE(mca_btl_ud_send_frag_t, mca_btl_base_descriptor_t,
                   mca_btl_ud_send_frag_constructor, NULL);

/* User fragment. */
OBJ_CLASS_INSTANCE(mca_btl_ud_user_frag_t, mca_btl_base_descriptor_t,
                   mca_btl_ud_user_frag_constructor, NULL);

/* Receive fragment. */
OBJ_CLASS_INSTANCE(mca_btl_ud_recv_frag_t, mca_btl_base_descriptor_t,
                   mca_btl_ud_recv_frag_constructor, NULL);
|
||||
|
148
ompi/mca/btl/ofud/btl_ofud_frag.h
Обычный файл
148
ompi/mca/btl/ofud/btl_ofud_frag.h
Обычный файл
@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_UD_FRAG_H
|
||||
#define MCA_BTL_UD_FRAG_H
|
||||
|
||||
#define MCA_BTL_IB_FRAG_ALIGN (8)
|
||||
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
||||
|
||||
#include "btl_ofud.h"
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Fragment types
|
||||
*/
|
||||
typedef enum {
    MCA_BTL_UD_FRAG_SEND = 0,   /* set by the send frag constructor */
    MCA_BTL_UD_FRAG_USER = 1,   /* set by the user frag constructor */
    MCA_BTL_UD_FRAG_RECV = 2    /* set by the receive frag constructor */
} mca_btl_ud_frag_type_t;
|
||||
|
||||
|
||||
struct mca_btl_ud_reg_t {
|
||||
mca_mpool_base_registration_t base;
|
||||
struct ibv_mr* mr;
|
||||
};
|
||||
typedef struct mca_btl_ud_reg_t mca_btl_ud_reg_t;
|
||||
|
||||
|
||||
/*
 * UD adds a 40 byte global routing header (GRH).
 *
 * The sender does not include these bytes in its sg list; on the receiving
 * side they nevertheless show up at the front of the posted buffer, so
 * receive buffers must reserve room for them.
 */
typedef struct mca_btl_ud_ib_header_t {
    uint8_t ib_grh[40];
} mca_btl_ud_ib_header_t;
|
||||
|
||||
struct mca_btl_ud_header_t {
|
||||
/*uint32_t src_qpnum;*/
|
||||
mca_btl_base_tag_t tag;
|
||||
};
|
||||
typedef struct mca_btl_ud_header_t mca_btl_ud_header_t;
|
||||
|
||||
|
||||
/**
|
||||
* IB send fragment derived type.
|
||||
*/
|
||||
|
||||
struct mca_btl_ud_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_base_segment_t segment;
|
||||
|
||||
struct mca_btl_base_endpoint_t* endpoint;
|
||||
|
||||
mca_btl_ud_frag_type_t type;
|
||||
|
||||
union{
|
||||
struct ibv_recv_wr rd_desc;
|
||||
struct ibv_send_wr sr_desc;
|
||||
} wr_desc;
|
||||
struct ibv_sge sg_entry;
|
||||
|
||||
/* When this is a send frag, hdr points right after this, as expected.
|
||||
But when this is a receive frag, we have an extra 40 bytes provided
|
||||
by IB, so this points 40 bytes past the end of the frag. */
|
||||
mca_btl_ud_header_t* hdr;
|
||||
|
||||
mca_btl_ud_reg_t* ud_reg;
|
||||
};
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_frag_t);
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_t);
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_user_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_user_frag_t);
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_recv_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_recv_frag_t);
|
||||
|
||||
|
||||
/*
|
||||
* Allocate/return a UD/IB send/user fragment
|
||||
*/
|
||||
|
||||
/*
 * Take a fragment from the module's send_frags free list.
 *
 *   btl   pointer to the module (cast to mca_btl_ud_module_t*)
 *   frag  lvalue that receives the fragment pointer
 *   rc    lvalue handed to OMPI_FREE_LIST_GET for its status
 *
 * Macro arguments are parenthesized so that expression arguments bind
 * correctly under the casts.
 */
#define MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc)                                  \
{                                                                             \
    ompi_free_list_item_t *item;                                              \
    OMPI_FREE_LIST_GET(&((mca_btl_ud_module_t*)(btl))->send_frags, item, rc); \
    (frag) = (mca_btl_ud_frag_t*) item;                                       \
}
|
||||
|
||||
/*
 * Give a fragment back to the module's send_frags free list.
 *
 *   btl   pointer to the module (cast to mca_btl_ud_module_t*)
 *   frag  fragment pointer to return
 *
 * The btl argument is parenthesized so that an expression argument binds
 * correctly under the cast.
 */
#define MCA_BTL_UD_RETURN_FRAG(btl, frag)                                     \
{                                                                             \
    OMPI_FREE_LIST_RETURN(                                                    \
            &((mca_btl_ud_module_t*)(btl))->send_frags,                       \
            (ompi_free_list_item_t*)(frag));                                  \
}
|
||||
|
||||
|
||||
/*
 * Take a fragment from the module's user_frags free list.
 *
 *   btl   pointer to the module (cast to mca_btl_ud_module_t*)
 *   frag  lvalue that receives the fragment pointer
 *   rc    lvalue handed to OMPI_FREE_LIST_GET for its status
 *
 * Macro arguments are parenthesized so that expression arguments bind
 * correctly under the casts.
 */
#define MCA_BTL_UD_ALLOC_USER_FRAG(btl, frag, rc)                             \
{                                                                             \
    ompi_free_list_item_t *item;                                              \
    OMPI_FREE_LIST_GET(&((mca_btl_ud_module_t*)(btl))->user_frags, item, rc); \
    (frag) = (mca_btl_ud_frag_t*) item;                                       \
}
|
||||
|
||||
/*
 * Give a fragment back to the module's user_frags free list.
 *
 *   btl   pointer to the module (cast to mca_btl_ud_module_t*)
 *   frag  fragment pointer to return
 *
 * The btl argument is parenthesized so that an expression argument binds
 * correctly under the cast.
 */
#define MCA_BTL_UD_RETURN_USER_FRAG(btl, frag)                                \
{                                                                             \
    OMPI_FREE_LIST_RETURN(                                                    \
            &((mca_btl_ud_module_t*)(btl))->user_frags,                       \
            (ompi_free_list_item_t*)(frag));                                  \
}
|
||||
|
||||
|
||||
struct mca_btl_ud_module_t;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
206
ompi/mca/btl/ofud/btl_ofud_proc.c
Обычный файл
206
ompi/mca/btl/ofud/btl_ofud_proc.c
Обычный файл
@ -0,0 +1,206 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
|
||||
|
||||
#include "btl_ofud.h"
|
||||
#include "btl_ofud_proc.h"
|
||||
|
||||
|
||||
static void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc);
static void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc);

OBJ_CLASS_INSTANCE(mca_btl_ud_proc_t,
                   opal_list_item_t, mca_btl_ud_proc_construct,
                   mca_btl_ud_proc_destruct);

/*
 * Constructor: put every field into a known empty state and register the
 * new proc on the component-wide proc list.
 *
 * proc_addrs is explicitly set to NULL so that the destructor can free it
 * unconditionally, even when the address lookup never ran or failed.
 */
static void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc)
{
    proc->proc_ompi = NULL;
    proc->proc_addrs = NULL;
    proc->proc_addr_count = 0;
    proc->proc_endpoints = NULL;
    proc->proc_endpoint_count = 0;
    OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);

    /* add to the list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock);
    opal_list_append(&mca_btl_ofud_component.ud_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock);
}

/*
 * Destructor: unregister the proc from the component-wide list and release
 * everything it owns - the endpoint array, the address buffer obtained from
 * the modex, and the lock constructed above.
 */
static void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc)
{
    /* remove from list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock);
    opal_list_remove_item(&mca_btl_ofud_component.ud_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock);

    /* release resources; free(NULL) is a no-op */
    free(proc->proc_endpoints);
    proc->proc_endpoints = NULL;

    free(proc->proc_addrs);
    proc->proc_addrs = NULL;

    OBJ_DESTRUCT(&proc->proc_lock);
}
|
||||
|
||||
|
||||
/*
|
||||
* Look for an existing IB process instance based on the associated
|
||||
* ompi_proc_t instance.
|
||||
*/
|
||||
|
||||
/*
 * Walk the component-wide proc list under the component lock and return the
 * entry whose proc_ompi equals ompi_proc, or NULL when there is none.
 */
mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc)
{
    mca_btl_ud_proc_t* cursor;
    mca_btl_ud_proc_t* match = NULL;

    OPAL_THREAD_LOCK(&mca_btl_ofud_component.ud_lock);

    cursor = (mca_btl_ud_proc_t*)
            opal_list_get_first(&mca_btl_ofud_component.ud_procs);
    while(cursor != (mca_btl_ud_proc_t*)
            opal_list_get_end(&mca_btl_ofud_component.ud_procs)) {
        if(cursor->proc_ompi == ompi_proc) {
            match = cursor;
            break;
        }
        cursor = (mca_btl_ud_proc_t*)opal_list_get_next(cursor);
    }

    OPAL_THREAD_UNLOCK(&mca_btl_ofud_component.ud_lock);
    return match;
}
|
||||
|
||||
|
||||
/*
|
||||
 * Create an IB process structure. There is a one-to-one correspondence
|
||||
 * between an ompi_proc_t and a mca_btl_ud_proc_t instance. We cache
|
||||
* additional data (specifically the list of mca_btl_ud_endpoint_t instances,
|
||||
* and published addresses) associated w/ a given destination on this
|
||||
* datastructure.
|
||||
*/
|
||||
|
||||
/*
 * Return the UD proc structure for ompi_proc, creating it on first use.
 *
 * On creation the peer's published addresses are fetched through the modex
 * and an endpoint array with one slot per address is allocated.
 *
 * Returns NULL when the object cannot be allocated, the modex lookup fails,
 * the returned buffer is not a whole number of mca_btl_ud_addr_t entries,
 * the peer published no addresses, or the endpoint array cannot be
 * allocated.
 */
mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
{
    mca_btl_ud_proc_t* module_proc;
    size_t size;
    int rc;

    /* Reuse an existing structure for this ompi process if there is one. */
    module_proc = mca_btl_ud_proc_lookup_ompi(ompi_proc);
    if(NULL != module_proc) {
        return module_proc;
    }

    /* First time: create a new proc out of the ompi_proc. */
    module_proc = OBJ_NEW(mca_btl_ud_proc_t);
    if(NULL == module_proc) {
        /* allocation failed; nothing to clean up */
        return NULL;
    }

    module_proc->proc_endpoint_count = 0;
    module_proc->proc_ompi = ompi_proc;

    /* build a unique identifier (of arbitrary size) to represent the proc */
    module_proc->proc_guid = ompi_proc->proc_name;

    /* Start from a known value; the modex call below fills it in. */
    module_proc->proc_addrs = NULL;

    /* query for the peer address info */
    rc = mca_pml_base_modex_recv(&mca_btl_ofud_component.super.btl_version,
                                 ompi_proc, (void*)&module_proc->proc_addrs,
                                 &size);
    if(OMPI_SUCCESS != rc) {
        opal_output(0,
                "[%s:%d] mca_pml_base_modex_recv failed for peer [%ld,%ld,%ld]",
                __FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
        goto fail;
    }

    /* The buffer must hold a whole number of address records. */
    if((size % sizeof(mca_btl_ud_addr_t)) != 0) {
        opal_output(0, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
                __FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
        goto fail;
    }

    module_proc->proc_addr_count = size / sizeof(mca_btl_ud_addr_t);

    /* One endpoint slot per published address; a peer without addresses
       is treated as a failure, exactly like an allocation error. */
    if(0 == module_proc->proc_addr_count) {
        module_proc->proc_endpoints = NULL;
    } else {
        module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
            malloc(module_proc->proc_addr_count *
                   sizeof(mca_btl_base_endpoint_t*));
    }

    if(NULL == module_proc->proc_endpoints) {
        goto fail;
    }

    return module_proc;

fail:
    OBJ_RELEASE(module_proc);
    return NULL;
}
|
||||
|
||||
|
||||
/*
|
||||
* Insert an endpoint into the proc array and assign it an address.
|
||||
*
|
||||
* MUST be called with the proc lock held!
|
||||
*/
|
||||
|
||||
int mca_btl_ud_proc_insert(mca_btl_ud_proc_t* module_proc,
|
||||
mca_btl_base_endpoint_t* module_endpoint)
|
||||
{
|
||||
module_endpoint->rem_addr =
|
||||
module_proc->proc_addrs[module_proc->proc_endpoint_count];
|
||||
module_proc->proc_endpoints[module_proc->proc_endpoint_count++] =
|
||||
module_endpoint;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Remove an endpoint from the proc array.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_proc_remove(mca_btl_ud_proc_t* proc,
|
||||
mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
OPAL_THREAD_LOCK(&proc->proc_lock);
|
||||
for(i = 0; i < proc->proc_endpoint_count; i++) {
|
||||
if(proc->proc_endpoints[i] == endpoint) {
|
||||
memmove(proc->proc_endpoints + i, proc->proc_endpoints + i + 1,
|
||||
(proc->proc_endpoint_count -i - 1) *
|
||||
sizeof(mca_btl_base_endpoint_t*));
|
||||
if(--proc->proc_endpoint_count == 0) {
|
||||
OPAL_THREAD_UNLOCK(&proc->proc_lock);
|
||||
OBJ_RELEASE(proc);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&proc->proc_lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
79
ompi/mca/btl/ofud/btl_ofud_proc.h
Обычный файл
79
ompi/mca/btl/ofud/btl_ofud_proc.h
Обычный файл
@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_UD_PROC_H
|
||||
#define MCA_BTL_UD_PROC_H
|
||||
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
|
||||
#include "btl_ofud.h"
|
||||
#include "btl_ofud_endpoint.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Represents the state of a remote process and the set of addresses
|
||||
* that it exports. Also cache an instance of mca_btl_base_endpoint_t for
|
||||
* each BTL instance that attempts to open a connection to the process.
|
||||
*/
|
||||
|
||||
struct mca_btl_ud_proc_t {
|
||||
opal_list_item_t super;
|
||||
/**< allow proc to be placed on a list */
|
||||
|
||||
ompi_proc_t *proc_ompi;
|
||||
/**< pointer to corresponding ompi_proc_t */
|
||||
|
||||
orte_process_name_t proc_guid;
|
||||
/**< globally unique identifier for the process */
|
||||
|
||||
struct mca_btl_ud_addr_t* proc_addrs;
|
||||
size_t proc_addr_count;
|
||||
/**< number of addresses published by endpoint */
|
||||
|
||||
struct mca_btl_base_endpoint_t **proc_endpoints;
|
||||
/**< array of endpoints that have been created to access this proc */
|
||||
|
||||
size_t proc_endpoint_count;
|
||||
/**< number of endpoints */
|
||||
|
||||
opal_mutex_t proc_lock;
|
||||
/**< lock to protect against concurrent access to proc state */
|
||||
};
|
||||
typedef struct mca_btl_ud_proc_t mca_btl_ud_proc_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_proc_t);
|
||||
|
||||
|
||||
mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc);
|
||||
|
||||
mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc);
|
||||
|
||||
int mca_btl_ud_proc_insert(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*);
|
||||
|
||||
int mca_btl_ud_proc_remove(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
43
ompi/mca/btl/ofud/configure.m4
Обычный файл
43
ompi/mca/btl/ofud/configure.m4
Обычный файл
@ -0,0 +1,43 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
|
||||
# MCA_btl_ofud_CONFIG([action-if-can-compile],
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
AC_DEFUN([MCA_btl_ofud_CONFIG],[
    # run the shared OpenIB check and record the outcome
    OMPI_CHECK_OPENIB([btl_ofud],
                      [btl_ofud_happy="yes"],
                      [btl_ofud_happy="no"])

    # on success export the wrapper flags and run the first action
    # otherwise run the second action
    AS_IF([test "$btl_ofud_happy" = "yes"],
          [btl_ofud_WRAPPER_EXTRA_LDFLAGS="$btl_ofud_LDFLAGS"
           btl_ofud_WRAPPER_EXTRA_LIBS="$btl_ofud_LIBS"
           $1],
          [$2])

    # substitute in the things needed to build OFUD
    AC_SUBST([btl_ofud_CFLAGS])
    AC_SUBST([btl_ofud_CPPFLAGS])
    AC_SUBST([btl_ofud_LDFLAGS])
    AC_SUBST([btl_ofud_LIBS])
])dnl
|
26
ompi/mca/btl/ofud/configure.params
Обычный файл
26
ompi/mca/btl/ofud/configure.params
Обычный файл
@ -0,0 +1,26 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
# Source file used to locate this component
PARAM_INIT_FILE="btl_ofud.c"
# Name of the generated configuration header
PARAM_CONFIG_HEADER_FILE="ofud_config.h"
# Files produced by configure for this component
PARAM_CONFIG_FILES="Makefile"
|
Загрузка…
x
Ссылка в новой задаче
Block a user