Finally committing the UD BTL.
UD is the Unreliable Datagram transport for Infiniband, specifically OpenIB. This BTL is derived from the existing openib BTL, which is RC (Reliable Connection) based. Still a work in progress, as there is a lot of work left to do. Specifically, performance, scalability, and flow control need to be addressed. Currently I'm playing around with different methods for handling receive buffers, as well as profiling to figure out where the time is going. This commit was SVN r10271.
Этот коммит содержится в:
родитель
272ef9f412
Коммит
cca1616368
68
ompi/mca/btl/ud/Makefile.am
Обычный файл
68
ompi/mca/btl/ud/Makefile.am
Обычный файл
@ -0,0 +1,68 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Use the top-level Makefile.options
|
||||
|
||||
|
||||
|
||||
AM_CPPFLAGS=$(btl_ud_CPPFLAGS)
|
||||
|
||||
sources = \
|
||||
btl_ud.c \
|
||||
btl_ud.h \
|
||||
btl_ud_component.c \
|
||||
btl_ud_endpoint.c \
|
||||
btl_ud_endpoint.h \
|
||||
btl_ud_frag.c \
|
||||
btl_ud_frag.h \
|
||||
btl_ud_proc.c \
|
||||
btl_ud_proc.h
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if OMPI_BUILD_btl_ud_DSO
|
||||
lib =
|
||||
lib_sources =
|
||||
component = mca_btl_ud.la
|
||||
component_sources = $(sources)
|
||||
else
|
||||
lib = libmca_btl_ud.la
|
||||
lib_sources = $(sources)
|
||||
component =
|
||||
component_sources =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(libdir)/openmpi
|
||||
mcacomponent_LTLIBRARIES = $(component)
|
||||
mca_btl_ud_la_SOURCES = $(component_sources)
|
||||
mca_btl_ud_la_LDFLAGS = -module -avoid-version $(btl_ud_LDFLAGS)
|
||||
mca_btl_ud_la_LIBADD = \
|
||||
$(btl_ud_LIBS) \
|
||||
$(top_ompi_builddir)/ompi/libmpi.la \
|
||||
$(top_ompi_builddir)/orte/liborte.la \
|
||||
$(top_ompi_builddir)/opal/libopal.la
|
||||
|
||||
|
||||
noinst_LTLIBRARIES = $(lib)
|
||||
libmca_btl_ud_la_SOURCES = $(lib_sources)
|
||||
libmca_btl_ud_la_LDFLAGS= -module -avoid-version $(btl_ud_LDFLAGS)
|
||||
libmca_btl_ud_la_LIBADD=$(btl_ud_LIBS)
|
564
ompi/mca/btl/ud/btl_ud.c
Обычный файл
564
ompi/mca/btl/ud/btl_ud.c
Обычный файл
@ -0,0 +1,564 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/if.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
#include "btl_ud.h"
|
||||
#include "btl_ud_frag.h"
|
||||
#include "btl_ud_proc.h"
|
||||
#include "btl_ud_endpoint.h"
|
||||
#include "ompi/datatype/convertor.h"
|
||||
#include "ompi/datatype/datatype.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include "ompi/mca/mpool/openib/mpool_openib.h"
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
mca_btl_ud_module_t mca_btl_ud_module = {
|
||||
{
|
||||
&mca_btl_ud_component.super,
|
||||
0, /* max size of first fragment */
|
||||
0, /* min send fragment size */
|
||||
0, /* max send fragment size */
|
||||
0, /* min rdma fragment size */
|
||||
0, /* max rdma fragment size */
|
||||
0, /* exclusivity */
|
||||
0, /* latency */
|
||||
0, /* bandwidth */
|
||||
MCA_BTL_FLAGS_SEND,
|
||||
mca_btl_ud_add_procs,
|
||||
mca_btl_ud_del_procs,
|
||||
mca_btl_ud_register,
|
||||
mca_btl_ud_finalize,
|
||||
/* we need alloc free, pack */
|
||||
mca_btl_ud_alloc,
|
||||
mca_btl_ud_free,
|
||||
mca_btl_ud_prepare_src,
|
||||
NULL, /*mca_btl_ud_prepare_dst */
|
||||
mca_btl_ud_send,
|
||||
NULL, /*mca_btl_ud_put */
|
||||
NULL, /*mca_btl_ud_get */
|
||||
mca_btl_ud_dump
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* add a proc to this btl module
|
||||
* creates an endpoint that is setup on the
|
||||
* first send to the endpoint
|
||||
*/
|
||||
int mca_btl_ud_add_procs(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **ompi_procs,
|
||||
struct mca_btl_base_endpoint_t** peers,
|
||||
ompi_bitmap_t* reachable)
|
||||
{
|
||||
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
||||
int i, rc;
|
||||
|
||||
for(i = 0; i < (int) nprocs; i++) {
|
||||
|
||||
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
||||
mca_btl_ud_proc_t* ib_proc;
|
||||
mca_btl_base_endpoint_t* ib_peer;
|
||||
|
||||
if(NULL == (ib_proc = mca_btl_ud_proc_create(ompi_proc))) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check to make sure that the peer has at least as many interface
|
||||
* addresses exported as we are trying to use. If not, then
|
||||
* don't bind this PTL instance to the proc.
|
||||
*/
|
||||
|
||||
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
|
||||
|
||||
/* The btl_proc datastructure is shared by all IB PTL
|
||||
* instances that are trying to reach this destination.
|
||||
* Cache the peer instance on the btl_proc.
|
||||
*/
|
||||
ib_peer = OBJ_NEW(mca_btl_ud_endpoint_t);
|
||||
if(NULL == ib_peer) {
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
ib_peer->endpoint_btl = ud_btl;
|
||||
ib_peer->subnet = ud_btl->port_info.subnet;
|
||||
rc = mca_btl_ud_proc_insert(ib_proc, ib_peer);
|
||||
if(rc != OMPI_SUCCESS) {
|
||||
OBJ_RELEASE(ib_peer);
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
ompi_bitmap_set_bit(reachable, i);
|
||||
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||
peers[i] = ib_peer;
|
||||
}
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
if(mca_btl_ud_component.use_srq) {
|
||||
ud_btl->rd_num = mca_btl_ud_component.rd_num + log2(nprocs) * mca_btl_ud_component.srq_rd_per_peer;
|
||||
if(ud_btl->rd_num > mca_btl_ud_component.srq_rd_max)
|
||||
ud_btl->rd_num = mca_btl_ud_component.srq_rd_max;
|
||||
ud_btl->rd_low = ud_btl->rd_num - 1;
|
||||
}
|
||||
#endif
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* delete the proc as reachable from this btl module
|
||||
*/
|
||||
int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t ** peers)
|
||||
{
|
||||
BTL_DEBUG(("TODO\n"));
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
*Register callback function to support send/recv semantics
|
||||
*/
|
||||
int mca_btl_ud_register(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
||||
void* cbdata)
|
||||
{
|
||||
|
||||
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*) btl;
|
||||
|
||||
OPAL_THREAD_LOCK(&ud_btl->ib_lock);
|
||||
ud_btl->ib_reg[tag].cbfunc = cbfunc;
|
||||
ud_btl->ib_reg[tag].cbdata = cbdata;
|
||||
OPAL_THREAD_UNLOCK(&ud_btl->ib_lock);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Allocate a segment.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param size (IN) Request segment size.
|
||||
*
|
||||
* When allocating a segment we pull a pre-alllocated segment
|
||||
* from one of two free lists, an eager list and a max list
|
||||
*/
|
||||
mca_btl_base_descriptor_t* mca_btl_ud_alloc(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t size)
|
||||
{
|
||||
mca_btl_ud_frag_t* frag;
|
||||
int rc;
|
||||
|
||||
if(size <= mca_btl_ud_component.eager_limit){
|
||||
MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
|
||||
frag->segment.seg_len = size;
|
||||
} else if(size <= mca_btl_ud_component.max_send_size) {
|
||||
MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
|
||||
frag->segment.seg_len = size;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->base.des_flags = 0;
|
||||
return (mca_btl_base_descriptor_t*)frag;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a segment
|
||||
*
|
||||
* Return the segment to the appropriate
|
||||
* preallocated segment list
|
||||
*/
|
||||
int mca_btl_ud_free(struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des)
|
||||
{
|
||||
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)des;
|
||||
|
||||
if(frag->size == 0) {
|
||||
btl->btl_mpool->mpool_release(btl->btl_mpool,
|
||||
(mca_mpool_base_registration_t*)
|
||||
frag->ud_reg);
|
||||
MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag);
|
||||
|
||||
}
|
||||
else if(frag->size == mca_btl_ud_component.max_send_size){
|
||||
MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag);
|
||||
} else if(frag->size == mca_btl_ud_component.eager_limit){
|
||||
MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag);
|
||||
} else {
|
||||
BTL_ERROR(("invalid descriptor"));
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* register user buffer or pack
|
||||
* data into pre-registered buffer and return a
|
||||
* descriptor that can be
|
||||
* used for send/put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*
|
||||
* prepare source's behavior depends on the following:
|
||||
* Has a valid memory registration been passed to prepare_src?
|
||||
* if so we attempt to use the pre-registred user-buffer, if the memory registration
|
||||
* is to small (only a portion of the user buffer) then we must reregister the user buffer
|
||||
* Has the user requested the memory to be left pinned?
|
||||
* if so we insert the memory registration into a memory tree for later lookup, we
|
||||
* may also remove a previous registration if a MRU (most recently used) list of
|
||||
* registions is full, this prevents resources from being exhausted.
|
||||
* Is the requested size larger than the btl's max send size?
|
||||
* if so and we aren't asked to leave the registration pinned than we register the memory if
|
||||
* the users buffer is contiguous
|
||||
* Otherwise we choose from two free lists of pre-registered memory in which to pack the data into.
|
||||
*
|
||||
*/
|
||||
mca_btl_base_descriptor_t* mca_btl_ud_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct ompi_convertor_t* convertor,
|
||||
size_t reserve,
|
||||
size_t* size
|
||||
)
|
||||
{
|
||||
mca_btl_ud_module_t* ud_btl;
|
||||
mca_btl_ud_frag_t* frag;
|
||||
mca_mpool_openib_registration_t * ud_reg;
|
||||
struct iovec iov;
|
||||
uint32_t iov_count = 1;
|
||||
size_t max_data = *size;
|
||||
int32_t free_after;
|
||||
int rc;
|
||||
|
||||
ud_btl = (mca_btl_ud_module_t*) btl;
|
||||
ud_reg = (mca_mpool_openib_registration_t*) registration;
|
||||
|
||||
if(NULL != ud_reg && 0 == ompi_convertor_need_buffers(convertor)){
|
||||
size_t reg_len;
|
||||
|
||||
/* the memory is already pinned and we have contiguous user data */
|
||||
|
||||
MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc);
|
||||
if(NULL == frag){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = NULL;
|
||||
|
||||
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
|
||||
|
||||
frag->segment.seg_len = max_data;
|
||||
frag->segment.seg_addr.pval = iov.iov_base;
|
||||
|
||||
reg_len = (unsigned char*)ud_reg->base_reg.bound - (unsigned char*)iov.iov_base + 1;
|
||||
|
||||
frag->sg_entry.length = max_data;
|
||||
frag->sg_entry.lkey = ud_reg->mr->lkey;
|
||||
|
||||
frag->sg_entry.addr = (unsigned long) iov.iov_base;
|
||||
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
frag->base.des_flags = 0;
|
||||
frag->ud_reg = ud_reg;
|
||||
btl->btl_mpool->mpool_retain(btl->btl_mpool, (mca_mpool_base_registration_t*) ud_reg);
|
||||
return &frag->base;
|
||||
|
||||
} else if( max_data > btl->btl_max_send_size &&
|
||||
ompi_convertor_need_buffers(convertor) == 0 &&
|
||||
reserve == 0) {
|
||||
/* The user buffer is contigous and we are asked to send more than the max send size. */
|
||||
|
||||
MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc);
|
||||
if(NULL == frag){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = NULL;
|
||||
|
||||
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
|
||||
|
||||
frag->segment.seg_len = max_data;
|
||||
frag->segment.seg_addr.pval = iov.iov_base;
|
||||
frag->base.des_flags = 0;
|
||||
|
||||
|
||||
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
|
||||
iov.iov_base,
|
||||
max_data,
|
||||
0,
|
||||
(mca_mpool_base_registration_t**) &ud_reg);
|
||||
if(OMPI_SUCCESS != rc || NULL == ud_reg) {
|
||||
BTL_ERROR(("mpool_register(%p,%lu) failed", iov.iov_base, max_data));
|
||||
MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
frag->sg_entry.length = max_data;
|
||||
frag->sg_entry.lkey = ud_reg->mr->lkey;
|
||||
|
||||
frag->sg_entry.addr = (unsigned long) iov.iov_base;
|
||||
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
frag->ud_reg = ud_reg;
|
||||
|
||||
return &frag->base;
|
||||
|
||||
} else
|
||||
if (max_data+reserve <= btl->btl_eager_limit) {
|
||||
/* the data is small enough to fit in the eager frag and
|
||||
either we received no prepinned memory or leave pinned is
|
||||
not set
|
||||
*/
|
||||
MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
|
||||
|
||||
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
|
||||
*size = max_data;
|
||||
if( rc < 0 ) {
|
||||
MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segment.seg_len = max_data + reserve;
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
frag->base.des_flags = 0;
|
||||
|
||||
return &frag->base;
|
||||
|
||||
} else {
|
||||
MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
|
||||
if(NULL == frag) {
|
||||
return NULL;
|
||||
}
|
||||
if(max_data + reserve > btl->btl_max_send_size){
|
||||
max_data = btl->btl_max_send_size - reserve;
|
||||
}
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
|
||||
|
||||
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
|
||||
*size = max_data;
|
||||
|
||||
if( rc < 0 ) {
|
||||
MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
frag->segment.seg_len = max_data + reserve;
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
frag->base.des_flags=0;
|
||||
|
||||
return &frag->base;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl)
|
||||
{
|
||||
mca_btl_ud_module_t* ud_btl;
|
||||
ud_btl = (mca_btl_ud_module_t*) btl;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initiate a send.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_send(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
mca_btl_base_tag_t tag)
|
||||
|
||||
{
|
||||
int rc;
|
||||
|
||||
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)descriptor;
|
||||
MCA_BTL_UD_START_TIME(post_send);
|
||||
frag->endpoint = endpoint;
|
||||
frag->hdr->tag = tag;
|
||||
rc = mca_btl_ud_endpoint_send(endpoint, frag);
|
||||
MCA_BTL_UD_END_TIME(post_send);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Initialize the btl module by allocating a protection domain
|
||||
* and creating both the high and low priority completion queues
|
||||
*/
|
||||
int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
|
||||
{
|
||||
/* Allocate Protection Domain */
|
||||
struct ibv_context *ctx;
|
||||
|
||||
ctx = ud_btl->ib_dev_context;
|
||||
|
||||
ud_btl->ib_pd = ibv_alloc_pd(ctx);
|
||||
if(NULL == ud_btl->ib_pd) {
|
||||
BTL_ERROR(("error allocating pd for %s errno says %s\n",
|
||||
ibv_get_device_name(ud_btl->ib_dev),
|
||||
strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
if(mca_btl_ud_component.use_srq) {
|
||||
struct ibv_srq_init_attr attr;
|
||||
attr.attr.max_wr = mca_btl_ud_component.srq_rd_max;
|
||||
attr.attr.max_sge = mca_btl_ud_component.ib_sg_list_size;
|
||||
|
||||
ud_btl->srd_posted_hp = 0;
|
||||
ud_btl->srd_posted_lp = 0;
|
||||
|
||||
ud_btl->srq_hp = ibv_create_srq(ud_btl->ib_pd, &attr);
|
||||
if(NULL == ud_btl->srq_hp) {
|
||||
BTL_ERROR(("error in ibv_create_srq\n"));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
ud_btl->srq_lp = ibv_create_srq(ud_btl->ib_pd, &attr);
|
||||
if(NULL == ud_btl->srq_hp) {
|
||||
BTL_ERROR(("error in ibv_create_srq\n"));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
} else {
|
||||
ud_btl->srq_hp = NULL;
|
||||
ud_btl->srq_lp = NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Create the low and high priority completion queues */
|
||||
#if OMPI_MCA_BTL_OPENIB_IBV_CREATE_CQ_ARGS == 3
|
||||
ud_btl->ib_cq_lp =
|
||||
ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size, NULL);
|
||||
#else
|
||||
ud_btl->ib_cq_lp =
|
||||
ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size,
|
||||
NULL, NULL, 0);
|
||||
#endif
|
||||
|
||||
if(NULL == ud_btl->ib_cq_lp) {
|
||||
BTL_ERROR(("error creating low priority cq for %s errno says %s\n",
|
||||
ibv_get_device_name(ud_btl->ib_dev),
|
||||
strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
#if OMPI_MCA_BTL_OPENIB_IBV_CREATE_CQ_ARGS == 3
|
||||
ud_btl->ib_cq_hp =
|
||||
ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size, NULL);
|
||||
#else
|
||||
ud_btl->ib_cq_hp =
|
||||
ibv_create_cq(ctx, mca_btl_ud_component.ib_cq_size,
|
||||
NULL, NULL, 0);
|
||||
#endif
|
||||
|
||||
if(NULL == ud_btl->ib_cq_hp) {
|
||||
BTL_ERROR(("error creating high priority cq for %s errno says %s\n",
|
||||
ibv_get_device_name(ud_btl->ib_dev),
|
||||
strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* Set up the QPs for this BTL */
|
||||
if(OMPI_SUCCESS != mca_btl_ud_endpoint_init_qp(&ud_btl->super,
|
||||
ud_btl->ib_cq_hp,
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
ud_btl->srq_hp,
|
||||
#endif
|
||||
&ud_btl->qp_hp,
|
||||
ud_btl->psn_hp)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if(OMPI_SUCCESS != mca_btl_ud_endpoint_init_qp(&ud_btl->super,
|
||||
ud_btl->ib_cq_lp,
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
ud_btl->srq_lp,
|
||||
#endif
|
||||
&ud_btl->qp_lp,
|
||||
ud_btl->psn_lp)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Dump profiling information
|
||||
*/
|
||||
void mca_btl_ud_dump(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
int verbose)
|
||||
{
|
||||
|
||||
mca_btl_base_dump(btl, endpoint, verbose);
|
||||
}
|
||||
|
597
ompi/mca/btl/ud/btl_ud.h
Обычный файл
597
ompi/mca/btl/ud/btl_ud.h
Обычный файл
@ -0,0 +1,597 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*/
|
||||
#ifndef MCA_PTL_UD_H
|
||||
#define MCA_PTL_UD_H
|
||||
|
||||
/* Standard system includes */
|
||||
#include <sys/types.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Open MPI includes */
|
||||
#include "ompi/class/ompi_free_list.h"
|
||||
#include "ompi/class/ompi_bitmap.h"
|
||||
#include "orte/class/orte_pointer_array.h"
|
||||
#include "opal/class/opal_value_array.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/sys/timer.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
#include "btl_ud_endpoint.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MCA_BTL_IB_LEAVE_PINNED 1
|
||||
|
||||
/**
|
||||
* UD Infiniband (IB) BTL component.
|
||||
*/
|
||||
|
||||
struct mca_btl_ud_component_t {
|
||||
mca_btl_base_component_1_0_0_t super; /**< base BTL component */
|
||||
|
||||
uint32_t ib_max_btls;
|
||||
/**< maximum number of hcas available to the IB component */
|
||||
|
||||
uint32_t ib_num_btls;
|
||||
/**< number of hcas available to the IB component */
|
||||
|
||||
struct mca_btl_ud_module_t *ud_btls;
|
||||
/**< array of available PTLs */
|
||||
|
||||
int ib_free_list_num;
|
||||
/**< initial size of free lists */
|
||||
|
||||
int ib_free_list_max;
|
||||
/**< maximum size of free lists */
|
||||
|
||||
int ib_free_list_inc;
|
||||
/**< number of elements to alloc when growing free lists */
|
||||
|
||||
opal_list_t ib_procs;
|
||||
/**< list of ib proc structures */
|
||||
|
||||
opal_mutex_t ib_lock;
|
||||
/**< lock for accessing module state */
|
||||
|
||||
char* ib_mpool_name;
|
||||
/**< name of ib memory pool */
|
||||
|
||||
int32_t sd_num; /**< maximum number of send descriptors to post to a QP */
|
||||
int32_t rd_num; /**< number of receive descriptors to post to each QP */
|
||||
int32_t rd_low; /**< low water mark to reach before re-posting receive descriptors */
|
||||
|
||||
int32_t srq_rd_max; /* maximum number of receive descriptors posted */
|
||||
int32_t srq_rd_per_peer; /* number of receive descriptors to post per log2(peers) in SRQ mode */
|
||||
int32_t srq_sd_max; /* maximum number of send descriptors posted */
|
||||
|
||||
size_t eager_limit;
|
||||
size_t max_send_size;
|
||||
uint32_t reg_mru_len;
|
||||
uint32_t use_srq;
|
||||
|
||||
uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */
|
||||
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ*/
|
||||
uint32_t ib_pkey_ix;
|
||||
uint32_t ib_qkey;
|
||||
uint32_t ib_psn;
|
||||
uint32_t ib_service_level;
|
||||
uint32_t ib_src_path_bits;
|
||||
|
||||
}; typedef struct mca_btl_ud_component_t mca_btl_ud_component_t;
|
||||
|
||||
extern mca_btl_ud_component_t mca_btl_ud_component;
|
||||
|
||||
typedef mca_btl_base_recv_reg_t mca_btl_ud_recv_reg_t;
|
||||
|
||||
|
||||
/**
|
||||
* Profiling variables
|
||||
*/
|
||||
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
#define MCA_BTL_UD_ENABLE_PROFILE 1
|
||||
#else
|
||||
#define MCA_BTL_UD_ENABLE_PROFILE 0
|
||||
#endif
|
||||
|
||||
#if MCA_BTL_UD_ENABLE_PROFILE
|
||||
|
||||
#define MCA_BTL_UD_PROFILE_VAR(var) \
|
||||
opal_timer_t avg_ ## var; \
|
||||
opal_timer_t cnt_ ## var; \
|
||||
opal_timer_t tmp_ ## var
|
||||
|
||||
struct mca_btl_ud_profile_t
|
||||
{
|
||||
MCA_BTL_UD_PROFILE_VAR(post_send);
|
||||
MCA_BTL_UD_PROFILE_VAR(endpoint_send_conn);
|
||||
MCA_BTL_UD_PROFILE_VAR(ibv_post_send);
|
||||
MCA_BTL_UD_PROFILE_VAR(full_send);
|
||||
};
|
||||
|
||||
typedef struct mca_btl_ud_profile_t mca_btl_ud_profile_t;
|
||||
extern mca_btl_ud_profile_t mca_btl_ud_profile;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* IB PTL Interface
|
||||
*/
|
||||
struct mca_btl_ud_module_t {
|
||||
mca_btl_base_module_t super; /**< base PTL interface */
|
||||
mca_btl_ud_recv_reg_t ib_reg[256];
|
||||
mca_btl_ud_port_info_t port_info; /* contains only the subnet right now */
|
||||
uint8_t port_num; /**< ID of the PORT */
|
||||
struct ibv_device *ib_dev; /* the ib device */
|
||||
struct ibv_context *ib_dev_context;
|
||||
struct ibv_pd *ib_pd;
|
||||
struct ibv_cq *ib_cq_hp;
|
||||
struct ibv_cq *ib_cq_lp;
|
||||
struct ibv_port_attr* ib_port_attr;
|
||||
|
||||
ompi_free_list_t send_free_eager; /**< free list of eager buffer descriptors */
|
||||
ompi_free_list_t send_free_max; /**< free list of max buffer descriptors */
|
||||
ompi_free_list_t send_free_frag; /**< free list of frags only... used for pining memory */
|
||||
|
||||
ompi_free_list_t recv_free_eager; /**< High priority free list of buffer descriptors */
|
||||
ompi_free_list_t recv_free_max; /**< Low priority free list of buffer descriptors */
|
||||
|
||||
opal_list_t pending_frags_hp;
|
||||
/**< list of pending high priority frags */
|
||||
|
||||
opal_list_t pending_frags_lp;
|
||||
/**< list of pending low priority frags */
|
||||
|
||||
opal_mutex_t ib_lock; /**< module level lock */
|
||||
|
||||
size_t ib_inline_max; /**< max size of inline send*/
|
||||
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
struct ibv_srq *srq_hp;
|
||||
struct ibv_srq *srq_lp;
|
||||
int32_t srd_posted_hp;
|
||||
int32_t srd_posted_lp;
|
||||
#endif
|
||||
|
||||
int32_t rd_num;
|
||||
int32_t rd_low;
|
||||
|
||||
int32_t rd_posted_hp; /**< number of high priority descriptors posted */
|
||||
int32_t rd_posted_lp; /**< number of low priority descriptors posted */
|
||||
|
||||
int32_t sd_wqe_hp; /**< number of available send wqe entries */
|
||||
int32_t sd_wqe_lp; /**< number of available send wqe entries */
|
||||
|
||||
uint32_t psn_hp;
|
||||
uint32_t psn_lp;
|
||||
/* Local processes port sequence number (Low and High) */
|
||||
|
||||
struct ibv_qp* qp_hp;
|
||||
struct ibv_qp* qp_lp;
|
||||
/* Local QP (Low and High) */
|
||||
}; typedef struct mca_btl_ud_module_t mca_btl_ud_module_t;
|
||||
|
||||
struct mca_btl_ud_frag_t;
|
||||
extern mca_btl_ud_module_t mca_btl_ud_module;
|
||||
|
||||
/**
|
||||
* Register IB component parameters with the MCA framework
|
||||
*/
|
||||
extern int mca_btl_ud_component_open(void);
|
||||
|
||||
/**
|
||||
* Any final cleanup before being unloaded.
|
||||
*/
|
||||
extern int mca_btl_ud_component_close(void);
|
||||
|
||||
/**
|
||||
* IB component initialization.
|
||||
*
|
||||
* @param num_btl_modules (OUT) Number of BTLs returned in BTL array.
|
||||
* @param allow_multi_user_threads (OUT) Flag indicating wether BTL supports user threads (TRUE)
|
||||
* @param have_hidden_threads (OUT) Flag indicating wether BTL uses threads (TRUE)
|
||||
*
|
||||
* (1) read interface list from kernel and compare against component parameters
|
||||
* then create a BTL instance for selected interfaces
|
||||
* (2) setup IB listen socket for incoming connection attempts
|
||||
* (3) publish BTL addressing info
|
||||
*
|
||||
*/
|
||||
extern mca_btl_base_module_t** mca_btl_ud_component_init(
|
||||
int *num_btl_modules,
|
||||
bool allow_multi_user_threads,
|
||||
bool have_hidden_threads
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* IB component progress.
|
||||
*/
|
||||
extern int mca_btl_ud_component_progress(void);
|
||||
|
||||
|
||||
/**
|
||||
* Register a callback function that is called on receipt
|
||||
* of a fragment.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @return Status indicating if cleanup was successful
|
||||
*
|
||||
* When the process list changes, the PML notifies the BTL of the
|
||||
* change, to provide the opportunity to cleanup or release any
|
||||
* resources associated with the peer.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_register(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
||||
void* cbdata
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Cleanup any resources held by the BTL.
|
||||
*
|
||||
* @param btl BTL instance.
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_finalize(
|
||||
struct mca_btl_base_module_t* btl
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* PML->BTL notification of change in the process list.
|
||||
*
|
||||
* @param btl (IN)
|
||||
* @param nprocs (IN) Number of processes
|
||||
* @param procs (IN) Set of processes
|
||||
* @param peers (OUT) Set of (optional) peer addressing info.
|
||||
* @param peers (IN/OUT) Set of processes that are reachable via this BTL.
|
||||
* @return OMPI_SUCCESS or error status on failure.
|
||||
*
|
||||
*/
|
||||
|
||||
extern int mca_btl_ud_add_procs(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t** peers,
|
||||
ompi_bitmap_t* reachable
|
||||
);
|
||||
|
||||
/**
|
||||
* PML->BTL notification of change in the process list.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param nproc (IN) Number of processes.
|
||||
* @param procs (IN) Set of processes.
|
||||
* @param peers (IN) Set of peer data structures.
|
||||
* @return Status indicating if cleanup was successful
|
||||
*
|
||||
*/
|
||||
extern int mca_btl_ud_del_procs(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t nprocs,
|
||||
struct ompi_proc_t **procs,
|
||||
struct mca_btl_base_endpoint_t** peers
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* PML->BTL Initiate a send of the specified size.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param btl_base_peer (IN) BTL peer addressing
|
||||
* @param send_request (IN/OUT) Send request (allocated by PML via mca_btl_base_request_alloc_fn_t)
|
||||
* @param size (IN) Number of bytes PML is requesting BTL to deliver
|
||||
* @param flags (IN) Flags that should be passed to the peer via the message header.
|
||||
* @param request (OUT) OMPI_SUCCESS if the BTL was able to queue one or more fragments
|
||||
*/
|
||||
extern int mca_btl_ud_send(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* btl_peer,
|
||||
struct mca_btl_base_descriptor_t* descriptor,
|
||||
mca_btl_base_tag_t tag
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Allocate a descriptor.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param size (IN) Requested descriptor size.
|
||||
*/
|
||||
extern mca_btl_base_descriptor_t* mca_btl_ud_alloc(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
size_t size);
|
||||
|
||||
|
||||
/**
|
||||
* Return a segment allocated by this BTL.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param descriptor (IN) Allocated descriptor.
|
||||
*/
|
||||
extern int mca_btl_ud_free(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
mca_btl_base_descriptor_t* des);
|
||||
|
||||
|
||||
/**
|
||||
* Pack data and return a descriptor that can be
|
||||
* used for send/put.
|
||||
*
|
||||
* @param btl (IN) BTL module
|
||||
* @param peer (IN) BTL peer addressing
|
||||
*/
|
||||
mca_btl_base_descriptor_t* mca_btl_ud_prepare_src(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* peer,
|
||||
mca_mpool_base_registration_t* registration,
|
||||
struct ompi_convertor_t* convertor,
|
||||
size_t reserve,
|
||||
size_t* size
|
||||
);
|
||||
|
||||
|
||||
/**
|
||||
* Return a send fragment to the modules free list.
|
||||
*
|
||||
* @param btl (IN) BTL instance
|
||||
* @param frag (IN) IB send fragment
|
||||
*
|
||||
*/
|
||||
extern void mca_btl_ud_send_frag_return(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_ud_frag_t*
|
||||
);
|
||||
|
||||
|
||||
int mca_btl_ud_module_init(mca_btl_ud_module_t* ud_btl);
|
||||
|
||||
void mca_btl_ud_dump(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
struct mca_btl_base_endpoint_t* endpoint,
|
||||
int verbose);
|
||||
|
||||
|
||||
/*
|
||||
* Profiling stuff
|
||||
*/
|
||||
|
||||
#if MCA_BTL_UD_ENABLE_PROFILE
|
||||
|
||||
#define MCA_BTL_UD_START_TIME(var) \
|
||||
((mca_btl_ud_profile.tmp_ ## var) = opal_sys_timer_get_cycles())
|
||||
#define MCA_BTL_UD_END_TIME(var) \
|
||||
do { \
|
||||
mca_btl_ud_profile.avg_ ## var += \
|
||||
opal_sys_timer_get_cycles() - mca_btl_ud_profile.tmp_ ## var; \
|
||||
mca_btl_ud_profile.cnt_ ## var++; \
|
||||
} while(0)
|
||||
|
||||
#define MCA_BTL_UD_SHOW_TIME(var) \
|
||||
BTL_VERBOSE((" " #var " avg %lu cnt %lu", \
|
||||
(mca_btl_ud_profile.avg_ ## var) / (mca_btl_ud_profile.cnt_ ## var), \
|
||||
mca_btl_ud_profile.cnt_ ## var));
|
||||
|
||||
#else
|
||||
#define MCA_BTL_UD_START_TIME(var)
|
||||
#define MCA_BTL_UD_END_TIME(var)
|
||||
#define MCA_BTL_UD_SHOW_TIME(var)
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Post non-SRQ receive buffers
|
||||
*/
|
||||
|
||||
#define MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, \
|
||||
additional) \
|
||||
{ \
|
||||
do { \
|
||||
OPAL_THREAD_LOCK(&ud_btl->ib_lock); \
|
||||
if(ud_btl->rd_posted_hp <= mca_btl_ud_component.rd_low+additional && \
|
||||
ud_btl->rd_posted_hp < ud_btl->rd_num) { \
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_SUB(ud_btl->rd_num - \
|
||||
ud_btl->rd_posted_hp, \
|
||||
&ud_btl->recv_free_eager, \
|
||||
ud_btl->rd_posted_hp, \
|
||||
ud_btl->qp_hp); \
|
||||
} \
|
||||
OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \
|
||||
} while(0); \
|
||||
}
|
||||
|
||||
#define MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, \
|
||||
additional) { \
|
||||
do { \
|
||||
OPAL_THREAD_LOCK(&ud_btl->ib_lock); \
|
||||
if(ud_btl->rd_posted_lp <= mca_btl_ud_component.rd_low+additional && \
|
||||
ud_btl->rd_posted_lp < ud_btl->rd_num){ \
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_SUB(ud_btl->rd_num - \
|
||||
ud_btl->rd_posted_lp, \
|
||||
&ud_btl->recv_free_max, \
|
||||
ud_btl->rd_posted_lp, \
|
||||
ud_btl->qp_lp \
|
||||
); } \
|
||||
OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \
|
||||
} while(0); \
|
||||
}
|
||||
|
||||
#define MCA_BTL_UD_ENDPOINT_POST_RR_SUB(cnt, \
|
||||
frag_list, \
|
||||
rd_posted, \
|
||||
qp ) \
|
||||
do { \
|
||||
int32_t i; \
|
||||
int rc; \
|
||||
int32_t num_post = cnt; \
|
||||
struct ibv_recv_wr* bad_wr; \
|
||||
for(i = 0; i < num_post; i++) { \
|
||||
opal_list_item_t* item; \
|
||||
mca_btl_ud_frag_t* frag; \
|
||||
OMPI_FREE_LIST_WAIT(frag_list, item, rc); \
|
||||
frag = (mca_btl_ud_frag_t*) item; \
|
||||
frag->sg_entry.length = frag->size + sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t); \
|
||||
if(ibv_post_recv(qp, \
|
||||
&frag->wr_desc.rd_desc, \
|
||||
&bad_wr)) { \
|
||||
BTL_ERROR(("error posting receive errno says %s\n", strerror(errno))); \
|
||||
return OMPI_ERROR; \
|
||||
}\
|
||||
}\
|
||||
OPAL_THREAD_ADD32(&(rd_posted), num_post); \
|
||||
} while(0);
|
||||
|
||||
#if 0
|
||||
#define MCA_BTL_UD_ENDPOINT_POST_RR_SUB(cnt, \
|
||||
frag_list, \
|
||||
rd_posted, \
|
||||
qp ) \
|
||||
do { \
|
||||
int32_t i; \
|
||||
int rc; \
|
||||
int32_t num_post = cnt; \
|
||||
struct ibv_recv_wr* head_wr; \
|
||||
struct ibv_recv_wr* prev_wr; \
|
||||
opal_list_item_t* item; \
|
||||
mca_btl_ud_frag_t* frag; \
|
||||
OMPI_FREE_LIST_WAIT(frag_list, item, rc); \
|
||||
frag = (mca_btl_ud_frag_t*)item; \
|
||||
head_wr = &frag->wr_desc.rd_desc; \
|
||||
prev_wr = head_wr; \
|
||||
OPAL_OUTPUT((0, "posting %d recvs\n", num_post)); \
|
||||
for(i = 1; i < num_post; i++) { \
|
||||
OMPI_FREE_LIST_WAIT(frag_list, item, rc); \
|
||||
frag = (mca_btl_ud_frag_t*) item; \
|
||||
prev_wr->next = &frag->wr_desc.rd_desc; \
|
||||
prev_wr = prev_wr->next; \
|
||||
}\
|
||||
prev_wr->next = NULL; \
|
||||
if(ibv_post_recv(qp, head_wr, &prev_wr)) { \
|
||||
BTL_ERROR(("error posting receive errno says %s\n", strerror(errno))); \
|
||||
return OMPI_ERROR; \
|
||||
}\
|
||||
OPAL_THREAD_ADD32(&(rd_posted), num_post); \
|
||||
} while(0);
|
||||
#endif
|
||||
|
||||
#define BTL_OPENIB_INSERT_PENDING(frag, frag_list, tokens, lock) \
|
||||
do{ \
|
||||
OPAL_THREAD_LOCK(&lock); \
|
||||
opal_list_append(&frag_list, (opal_list_item_t *)frag); \
|
||||
OPAL_THREAD_UNLOCK(&lock); \
|
||||
OPAL_THREAD_ADD32(&tokens, 1); \
|
||||
} while(0);
|
||||
|
||||
|
||||
/*
|
||||
* Post SRQ receive buffers
|
||||
*/
|
||||
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
|
||||
#define MCA_BTL_UD_POST_SRR_HIGH(ud_btl, additional) \
|
||||
{ \
|
||||
do{ \
|
||||
OPAL_THREAD_LOCK(&ud_btl->ib_lock); \
|
||||
if(ud_btl->srd_posted_hp <= ud_btl->rd_low+additional && \
|
||||
ud_btl->srd_posted_hp < ud_btl->rd_num){ \
|
||||
MCA_BTL_UD_POST_SRR_SUB(ud_btl->rd_num - \
|
||||
ud_btl->srd_posted_hp, \
|
||||
ud_btl, \
|
||||
&ud_btl->recv_free_eager, \
|
||||
&ud_btl->srd_posted_hp, \
|
||||
ud_btl->srq_hp); \
|
||||
} \
|
||||
OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \
|
||||
} while(0); \
|
||||
}
|
||||
|
||||
#define MCA_BTL_UD_POST_SRR_LOW(ud_btl, additional) \
|
||||
{ \
|
||||
do { \
|
||||
OPAL_THREAD_LOCK(&ud_btl->ib_lock); \
|
||||
if(ud_btl->srd_posted_lp <= ud_btl->rd_low+additional && \
|
||||
ud_btl->srd_posted_lp < ud_btl->rd_num){ \
|
||||
MCA_BTL_UD_POST_SRR_SUB(ud_btl->rd_num - \
|
||||
ud_btl->srd_posted_lp, \
|
||||
ud_btl, \
|
||||
&ud_btl->recv_free_max, \
|
||||
&ud_btl->srd_posted_lp, \
|
||||
ud_btl->srq_lp); \
|
||||
} \
|
||||
OPAL_THREAD_UNLOCK(&ud_btl->ib_lock); \
|
||||
} while(0); \
|
||||
}
|
||||
|
||||
|
||||
#define MCA_BTL_UD_POST_SRR_SUB(cnt, \
|
||||
ud_btl, \
|
||||
frag_list, \
|
||||
srd_posted, \
|
||||
srq) \
|
||||
{\
|
||||
do { \
|
||||
int32_t i; \
|
||||
int32_t num_post = cnt; \
|
||||
opal_list_item_t* item = NULL; \
|
||||
mca_btl_ud_frag_t* frag = NULL; \
|
||||
struct ibv_recv_wr *bad_wr; \
|
||||
int32_t rc; \
|
||||
for(i = 0; i < num_post; i++) { \
|
||||
OMPI_FREE_LIST_WAIT(frag_list, item, rc); \
|
||||
frag = (mca_btl_ud_frag_t*) item; \
|
||||
frag->sg_entry.length = frag->size + \
|
||||
((unsigned char*) frag->segment.seg_addr.pval- \
|
||||
(unsigned char*) frag->hdr); \
|
||||
if(ibv_post_srq_recv(srq, &frag->wr_desc.rd_desc, &bad_wr)) { \
|
||||
BTL_ERROR(("error posting receive descriptors to shared receive queue: %s",\
|
||||
strerror(errno))); \
|
||||
return OMPI_ERROR; \
|
||||
}\
|
||||
}\
|
||||
OPAL_THREAD_ADD32(srd_posted, num_post); \
|
||||
} while(0);\
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
768
ompi/mca/btl/ud/btl_ud_component.c
Обычный файл
768
ompi/mca/btl/ud/btl_ud_component.c
Обычный файл
@ -0,0 +1,768 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "opal/util/if.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "opal/sys/timer.h"
|
||||
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "btl_ud.h"
|
||||
#include "btl_ud_frag.h"
|
||||
#include "btl_ud_endpoint.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
|
||||
|
||||
#include "ompi/datatype/convertor.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
#include <sysfs/libsysfs.h>
|
||||
#include <infiniband/verbs.h>
|
||||
#include <errno.h>
|
||||
#include <string.h> /* for strerror()*/
|
||||
|
||||
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
|
||||
|
||||
mca_btl_ud_component_t mca_btl_ud_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta information
|
||||
about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a pml v1.0.0 component (which also implies a
|
||||
specific MCA version) */
|
||||
|
||||
MCA_BTL_BASE_VERSION_1_0_0,
|
||||
|
||||
"ud", /* MCA component name */
|
||||
OMPI_MAJOR_VERSION, /* MCA component major version */
|
||||
OMPI_MINOR_VERSION, /* MCA component minor version */
|
||||
OMPI_RELEASE_VERSION, /* MCA component release version */
|
||||
mca_btl_ud_component_open, /* component open */
|
||||
mca_btl_ud_component_close /* component close */
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
|
||||
false
|
||||
},
|
||||
|
||||
mca_btl_ud_component_init,
|
||||
mca_btl_ud_component_progress,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Profiling information
|
||||
*/
|
||||
|
||||
#if MCA_BTL_UD_ENABLE_PROFILE
|
||||
mca_btl_ud_profile_t mca_btl_ud_profile = {0};
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* utility routines for parameter registration
|
||||
*/
|
||||
|
||||
static inline void mca_btl_ud_param_register_string(
|
||||
const char* param_name,
|
||||
const char* param_desc,
|
||||
const char* default_value,
|
||||
char** out_value)
|
||||
{
|
||||
mca_base_param_reg_string(&mca_btl_ud_component.super.btl_version,
|
||||
param_name,
|
||||
param_desc,
|
||||
false,
|
||||
false,
|
||||
default_value,
|
||||
out_value);
|
||||
}
|
||||
|
||||
static inline void mca_btl_ud_param_register_int(
|
||||
const char* param_name,
|
||||
const char* param_desc,
|
||||
int default_value,
|
||||
int* out_value)
|
||||
{
|
||||
mca_base_param_reg_int(&mca_btl_ud_component.super.btl_version,
|
||||
param_name,
|
||||
param_desc,
|
||||
false,
|
||||
false,
|
||||
default_value,
|
||||
out_value);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called by MCA framework to open the component, registers
|
||||
* component parameters.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_component_open(void)
|
||||
{
|
||||
/* initialize state */
|
||||
mca_btl_ud_component.ib_num_btls=0;
|
||||
mca_btl_ud_component.ud_btls=NULL;
|
||||
|
||||
/* initialize objects */
|
||||
OBJ_CONSTRUCT(&mca_btl_ud_component.ib_procs, opal_list_t);
|
||||
|
||||
/* register IB component parameters */
|
||||
mca_btl_ud_param_register_int ("max_btls", "maximum number of HCAs/ports to use",
|
||||
4, (int*)&mca_btl_ud_component.ib_max_btls);
|
||||
mca_btl_ud_param_register_int ("free_list_num", "intial size of free lists",
|
||||
8, &mca_btl_ud_component.ib_free_list_num);
|
||||
mca_btl_ud_param_register_int ("free_list_max", "maximum size of free lists",
|
||||
-1, &mca_btl_ud_component.ib_free_list_max);
|
||||
mca_btl_ud_param_register_int ("free_list_inc", "increment size of free lists",
|
||||
32, &mca_btl_ud_component.ib_free_list_inc);
|
||||
mca_btl_ud_param_register_string("mpool", "name of the memory pool to be used",
|
||||
"openib", &mca_btl_ud_component.ib_mpool_name);
|
||||
mca_btl_ud_param_register_int("reg_mru_len", "length of the registration cache most recently used list",
|
||||
16, (int*) &mca_btl_ud_component.reg_mru_len);
|
||||
mca_btl_ud_param_register_int("use_srq", "if 1 use the IB shared receive queue to post receive descriptors",
|
||||
0, (int*) &mca_btl_ud_component.use_srq);
|
||||
mca_btl_ud_param_register_int("ib_cq_size", "size of the IB completion queue",
|
||||
2000, (int*) &mca_btl_ud_component.ib_cq_size);
|
||||
mca_btl_ud_param_register_int("ib_sg_list_size", "size of IB segment list",
|
||||
4, (int*) &mca_btl_ud_component.ib_sg_list_size);
|
||||
mca_btl_ud_param_register_int("ib_pkey_ix", "IB pkey index",
|
||||
0, (int*) &mca_btl_ud_component.ib_pkey_ix);
|
||||
mca_btl_ud_param_register_int("ib_qkey", "IB qkey",
|
||||
0x01330133, (int*) &mca_btl_ud_component.ib_qkey);
|
||||
mca_btl_ud_param_register_int("ib_psn", "IB Packet sequence starting number",
|
||||
0, (int*) &mca_btl_ud_component.ib_psn);
|
||||
mca_btl_ud_param_register_int("ib_service_level", "IB service level",
|
||||
0, (int*) &mca_btl_ud_component.ib_service_level);
|
||||
mca_btl_ud_param_register_int("ib_src_path_bits", "IB source path bits",
|
||||
0, (int*) &mca_btl_ud_component.ib_src_path_bits);
|
||||
mca_btl_ud_param_register_int ("exclusivity", "BTL exclusivity",
|
||||
MCA_BTL_EXCLUSIVITY_DEFAULT, (int*) &mca_btl_ud_module.super.btl_exclusivity);
|
||||
mca_btl_ud_param_register_int("sd_num", "maximum descriptors to post to a QP",
|
||||
16, (int*) &mca_btl_ud_component.sd_num);
|
||||
mca_btl_ud_param_register_int("rd_num", "number of receive descriptors to post to a QP",
|
||||
500, (int*) &mca_btl_ud_component.rd_num);
|
||||
mca_btl_ud_param_register_int("rd_low", "low water mark before reposting occurs",
|
||||
300, (int*) &mca_btl_ud_component.rd_low);
|
||||
mca_btl_ud_param_register_int("srq_rd_max", "Max number of receive descriptors posted per SRQ.",
|
||||
1000, (int*) &mca_btl_ud_component.srq_rd_max);
|
||||
mca_btl_ud_param_register_int("srq_rd_per_peer", "Number of receive descriptors posted per peer. (SRQ)",
|
||||
16, (int*) &mca_btl_ud_component.srq_rd_per_peer);
|
||||
mca_btl_ud_param_register_int("srq_sd_max", "Maximum number of send descriptors posted. (SRQ)",
|
||||
8, &mca_btl_ud_component.srq_sd_max);
|
||||
|
||||
/* TODO - this assumes a 2k UD MTU - should query/do something more intelligent */
|
||||
mca_btl_ud_param_register_int ("eager_limit", "eager send limit",
|
||||
2047, (int*)&mca_btl_ud_module.super.btl_eager_limit);
|
||||
mca_btl_ud_param_register_int ("min_send_size", "minimum send size",
|
||||
2048, (int*)&mca_btl_ud_module.super.btl_min_send_size);
|
||||
mca_btl_ud_param_register_int ("max_send_size", "maximum send size",
|
||||
2048, (int*) &mca_btl_ud_module.super.btl_max_send_size);
|
||||
mca_btl_ud_param_register_int("bandwidth", "Approximate maximum bandwidth of interconnect",
|
||||
800, (int*) &mca_btl_ud_module.super.btl_bandwidth);
|
||||
|
||||
mca_btl_ud_module.super.btl_eager_limit -= sizeof(mca_btl_ud_header_t);
|
||||
mca_btl_ud_module.super.btl_max_send_size -= sizeof(mca_btl_ud_header_t);
|
||||
mca_btl_ud_component.max_send_size = mca_btl_ud_module.super.btl_max_send_size;
|
||||
mca_btl_ud_component.eager_limit = mca_btl_ud_module.super.btl_eager_limit;
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* component cleanup - sanity checking of queue lengths
|
||||
*/
|
||||
|
||||
int mca_btl_ud_component_close(void)
|
||||
{
|
||||
/* Calculate and print profiling numbers */
|
||||
MCA_BTL_UD_SHOW_TIME(post_send);
|
||||
MCA_BTL_UD_SHOW_TIME(endpoint_send_conn);
|
||||
MCA_BTL_UD_SHOW_TIME(ibv_post_send);
|
||||
MCA_BTL_UD_SHOW_TIME(full_send);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Register UD port information. The MCA framework
|
||||
* will make this available to all peers.
|
||||
*/
|
||||
|
||||
static int
|
||||
mca_btl_ud_modex_send(void)
|
||||
{
|
||||
int rc;
|
||||
size_t i;
|
||||
size_t size;
|
||||
mca_btl_ud_port_info_t *ports = NULL;
|
||||
|
||||
size = mca_btl_ud_component.ib_num_btls * sizeof (mca_btl_ud_port_info_t);
|
||||
if (size != 0) {
|
||||
ports = (mca_btl_ud_port_info_t *)malloc (size);
|
||||
if (NULL == ports) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
for (i = 0; i < mca_btl_ud_component.ib_num_btls; i++) {
|
||||
mca_btl_ud_module_t *btl = &mca_btl_ud_component.ud_btls[i];
|
||||
ports[i] = btl->port_info;
|
||||
}
|
||||
}
|
||||
rc = mca_pml_base_modex_send (&mca_btl_ud_component.super.btl_version, ports, size);
|
||||
if (NULL != ports) {
|
||||
free (ports);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* UD component initialization:
|
||||
* (1) read interface list from kernel and compare against component parameters
|
||||
* then create a BTL instance for selected interfaces
|
||||
* (2) post OOB receive for incoming connection attempts
|
||||
* (3) register BTL parameters with the MCA
|
||||
*/
|
||||
|
||||
mca_btl_base_module_t** mca_btl_ud_component_init(int *num_btl_modules,
|
||||
bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
struct ibv_device **ib_devs;
|
||||
struct ibv_device* ib_dev;
|
||||
int32_t num_devs;
|
||||
mca_btl_base_module_t** btls;
|
||||
uint32_t i,j, length;
|
||||
struct mca_mpool_base_resources_t mpool_resources;
|
||||
opal_list_t btl_list;
|
||||
mca_btl_ud_module_t* ud_btl;
|
||||
mca_btl_base_selected_module_t* ib_selected;
|
||||
opal_list_item_t* item;
|
||||
unsigned short seedv[3];
|
||||
|
||||
/* initialization */
|
||||
*num_btl_modules = 0;
|
||||
num_devs = 0;
|
||||
|
||||
seedv[0] = orte_process_info.my_name->vpid;
|
||||
seedv[1] = opal_sys_timer_get_cycles();
|
||||
seedv[2] = opal_sys_timer_get_cycles();
|
||||
seed48(seedv);
|
||||
|
||||
#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST
|
||||
ib_devs = ibv_get_device_list(&num_devs);
|
||||
#else
|
||||
/* Determine the number of hca's available on the host */
|
||||
dev_list = ibv_get_devices();
|
||||
if (NULL == dev_list) {
|
||||
mca_btl_base_error_no_nics("OpenIB", "HCA");
|
||||
mca_btl_ud_component.ib_num_btls = 0;
|
||||
mca_btl_ud_modex_send();
|
||||
return NULL;
|
||||
}
|
||||
dlist_start(dev_list);
|
||||
|
||||
dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
|
||||
num_devs++;
|
||||
#endif
|
||||
|
||||
if(0 == num_devs) {
|
||||
mca_btl_base_error_no_nics("OpenIB", "HCA");
|
||||
mca_btl_ud_modex_send();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST == 0
|
||||
/* Allocate space for the ib devices */
|
||||
ib_devs = (struct ibv_device**) malloc(num_devs * sizeof(struct ibv_dev*));
|
||||
if(NULL == ib_devs) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
dlist_start(dev_list);
|
||||
|
||||
i = 0;
|
||||
dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
|
||||
ib_devs[i++] = ib_dev;
|
||||
#endif
|
||||
|
||||
/** We must loop through all the hca id's, get there handles and
|
||||
for each hca we query the number of ports on the hca and set up
|
||||
a distinct btl module for each hca port */
|
||||
|
||||
OBJ_CONSTRUCT(&btl_list, opal_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_ud_component.ib_lock, opal_mutex_t);
|
||||
|
||||
|
||||
for(i = 0; (int32_t)i < num_devs
|
||||
&& mca_btl_ud_component.ib_num_btls < mca_btl_ud_component.ib_max_btls; i++){
|
||||
struct ibv_device_attr ib_dev_attr;
|
||||
struct ibv_context* ib_dev_context;
|
||||
|
||||
ib_dev = ib_devs[i];
|
||||
|
||||
ib_dev_context = ibv_open_device(ib_dev);
|
||||
if(!ib_dev_context) {
|
||||
BTL_ERROR((" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
|
||||
BTL_ERROR(("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Note ports are 1 based hence j = 1 */
|
||||
|
||||
for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++){
|
||||
struct ibv_port_attr* ib_port_attr;
|
||||
ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr));
|
||||
if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){
|
||||
BTL_ERROR(("error getting port attributes for device %s port number %d errno says %s",
|
||||
ibv_get_device_name(ib_dev), j, strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if( IBV_PORT_ACTIVE == ib_port_attr->state ){
|
||||
ud_btl = (mca_btl_ud_module_t*) malloc(sizeof(mca_btl_ud_module_t));
|
||||
memcpy(ud_btl, &mca_btl_ud_module, sizeof(mca_btl_ud_module));
|
||||
|
||||
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
||||
ib_selected->btl_module = (mca_btl_base_module_t*) ud_btl;
|
||||
ud_btl->ib_dev = ib_dev;
|
||||
ud_btl->ib_dev_context = ib_dev_context;
|
||||
ud_btl->port_num = (uint8_t) j;
|
||||
ud_btl->ib_port_attr = ib_port_attr;
|
||||
ud_btl->port_info.subnet = ib_port_attr->sm_lid; /* store the sm_lid for multi-nic support */
|
||||
|
||||
opal_list_append(&btl_list, (opal_list_item_t*) ib_selected);
|
||||
if(++mca_btl_ud_component.ib_num_btls >= mca_btl_ud_component.ib_max_btls)
|
||||
break;
|
||||
}
|
||||
else{
|
||||
free(ib_port_attr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Allocate space for btl modules */
|
||||
mca_btl_ud_component.ud_btls = (mca_btl_ud_module_t*)
|
||||
malloc(sizeof(mca_btl_ud_module_t) * mca_btl_ud_component.ib_num_btls);
|
||||
|
||||
if(NULL == mca_btl_ud_component.ud_btls) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return NULL;
|
||||
}
|
||||
btls = (struct mca_btl_base_module_t**)
|
||||
malloc(mca_btl_ud_component.ib_num_btls * sizeof(mca_btl_ud_module_t*));
|
||||
if(NULL == btls) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
for(i = 0; i < mca_btl_ud_component.ib_num_btls; i++){
|
||||
item = opal_list_remove_first(&btl_list);
|
||||
ib_selected = (mca_btl_base_selected_module_t*)item;
|
||||
ud_btl = (mca_btl_ud_module_t*) ib_selected->btl_module;
|
||||
memcpy(&(mca_btl_ud_component.ud_btls[i]), ud_btl , sizeof(mca_btl_ud_module_t));
|
||||
free(ib_selected);
|
||||
free(ud_btl);
|
||||
|
||||
ud_btl = &mca_btl_ud_component.ud_btls[i];
|
||||
ud_btl->rd_num = mca_btl_ud_component.rd_num;
|
||||
ud_btl->rd_low = mca_btl_ud_component.rd_low;
|
||||
|
||||
ud_btl->sd_wqe_lp = mca_btl_ud_component.sd_num;
|
||||
ud_btl->sd_wqe_hp = mca_btl_ud_component.sd_num;
|
||||
|
||||
/* Initialize module state */
|
||||
OBJ_CONSTRUCT(&ud_btl->ib_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->send_free_eager, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->send_free_max, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->send_free_frag, ompi_free_list_t);
|
||||
|
||||
OBJ_CONSTRUCT(&ud_btl->recv_free_eager, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->recv_free_max, ompi_free_list_t);
|
||||
|
||||
OBJ_CONSTRUCT(&ud_btl->pending_frags_hp, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ud_btl->pending_frags_lp, opal_list_t);
|
||||
|
||||
if(mca_btl_ud_module_init(ud_btl) != OMPI_SUCCESS) {
|
||||
#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST
|
||||
ibv_free_device_list(ib_devs);
|
||||
#else
|
||||
free(ib_devs);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
mpool_resources.ib_pd = ud_btl->ib_pd;
|
||||
|
||||
/* initialize the memory pool using the hca */
|
||||
ud_btl->super.btl_mpool =
|
||||
mca_mpool_base_module_create(mca_btl_ud_component.ib_mpool_name,
|
||||
&ud_btl->super,
|
||||
&mpool_resources);
|
||||
|
||||
if(NULL == ud_btl->super.btl_mpool) {
|
||||
BTL_ERROR(("error creating openib memory pool! aborting ud btl initialization"));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Initialize pool of send fragments */
|
||||
length = sizeof(mca_btl_ud_frag_t) +
|
||||
sizeof(mca_btl_ud_header_t) +
|
||||
ud_btl->super.btl_eager_limit +
|
||||
2*MCA_BTL_IB_FRAG_ALIGN;
|
||||
|
||||
ompi_free_list_init(&ud_btl->send_free_eager,
|
||||
length,
|
||||
OBJ_CLASS(mca_btl_ud_send_frag_eager_t),
|
||||
mca_btl_ud_component.ib_free_list_num,
|
||||
mca_btl_ud_component.ib_free_list_max,
|
||||
mca_btl_ud_component.ib_free_list_inc,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
ompi_free_list_init(&ud_btl->recv_free_eager,
|
||||
length + sizeof(mca_btl_ud_ib_header_t),
|
||||
OBJ_CLASS(mca_btl_ud_recv_frag_eager_t),
|
||||
mca_btl_ud_component.ib_free_list_num,
|
||||
mca_btl_ud_component.ib_free_list_max,
|
||||
mca_btl_ud_component.ib_free_list_inc,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
length = sizeof(mca_btl_ud_frag_t) +
|
||||
sizeof(mca_btl_ud_header_t) +
|
||||
ud_btl->super.btl_max_send_size +
|
||||
2*MCA_BTL_IB_FRAG_ALIGN;
|
||||
|
||||
ompi_free_list_init(&ud_btl->send_free_max,
|
||||
length,
|
||||
OBJ_CLASS(mca_btl_ud_send_frag_max_t),
|
||||
mca_btl_ud_component.ib_free_list_num,
|
||||
mca_btl_ud_component.ib_free_list_max,
|
||||
mca_btl_ud_component.ib_free_list_inc,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
/* Initialize pool of receive fragments */
|
||||
ompi_free_list_init (&ud_btl->recv_free_max,
|
||||
length + sizeof(mca_btl_ud_ib_header_t),
|
||||
OBJ_CLASS (mca_btl_ud_recv_frag_max_t),
|
||||
mca_btl_ud_component.ib_free_list_num,
|
||||
mca_btl_ud_component.ib_free_list_max,
|
||||
mca_btl_ud_component.ib_free_list_inc,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
length = sizeof(mca_btl_ud_frag_t) +
|
||||
sizeof(mca_btl_ud_header_t)+
|
||||
2*MCA_BTL_IB_FRAG_ALIGN;
|
||||
|
||||
ompi_free_list_init(&ud_btl->send_free_frag,
|
||||
length,
|
||||
OBJ_CLASS(mca_btl_ud_send_frag_frag_t),
|
||||
mca_btl_ud_component.ib_free_list_num,
|
||||
mca_btl_ud_component.ib_free_list_max,
|
||||
mca_btl_ud_component.ib_free_list_inc,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
/* Post receive descriptors */
|
||||
do {
|
||||
int32_t i;
|
||||
int rc;
|
||||
struct ibv_recv_wr* bad_wr;
|
||||
|
||||
for(i = 0; i < ud_btl->rd_num; i++) {
|
||||
mca_btl_ud_frag_t* frag;
|
||||
OMPI_FREE_LIST_WAIT(&ud_btl->recv_free_eager, frag, rc);
|
||||
frag->sg_entry.length = frag->size +
|
||||
sizeof(mca_btl_ud_header_t) +
|
||||
sizeof(mca_btl_ud_ib_header_t);
|
||||
if(ibv_post_recv(ud_btl->qp_hp,
|
||||
&frag->wr_desc.rd_desc, &bad_wr)) {
|
||||
BTL_ERROR(("error posting recv, errno %s\n",
|
||||
strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
OMPI_FREE_LIST_WAIT(&ud_btl->recv_free_max, frag, rc);
|
||||
frag->sg_entry.length = frag->size +
|
||||
sizeof(mca_btl_ud_header_t) +
|
||||
sizeof(mca_btl_ud_ib_header_t);
|
||||
if(ibv_post_recv(ud_btl->qp_lp,
|
||||
&frag->wr_desc.rd_desc, &bad_wr)) {
|
||||
BTL_ERROR(("error posting recv, errno %s\n",
|
||||
strerror(errno)));
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
} while(0);
|
||||
|
||||
|
||||
/* TODO - Put this somewhere else or clean up our macros */
|
||||
#if 0
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
if(mca_btl_ud_component.use_srq) {
|
||||
MCA_BTL_UD_POST_SRR_HIGH(ud_btl, 1);
|
||||
MCA_BTL_UD_POST_SRR_LOW(ud_btl, 1);
|
||||
} else {
|
||||
#endif
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, 0);
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, 0);
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
btls[i] = &ud_btl->super;
|
||||
}
|
||||
|
||||
/* Post OOB receive to support dynamic connection setup */
|
||||
mca_btl_ud_post_recv();
|
||||
mca_btl_ud_modex_send();
|
||||
|
||||
*num_btl_modules = mca_btl_ud_component.ib_num_btls;
|
||||
#if OMPI_MCA_BTL_OPENIB_HAVE_DEVICE_LIST
|
||||
ibv_free_device_list(ib_devs);
|
||||
#else
|
||||
free(ib_devs);
|
||||
#endif
|
||||
return btls;
|
||||
}
|
||||
|
||||
|
||||
static inline int mca_btl_ud_handle_incoming_hp(mca_btl_ud_module_t *,
|
||||
mca_btl_ud_frag_t *, size_t);
|
||||
|
||||
static inline int mca_btl_ud_handle_incoming_hp(
|
||||
mca_btl_ud_module_t *ud_btl,
|
||||
mca_btl_ud_frag_t *frag,
|
||||
size_t byte_len)
|
||||
{
|
||||
struct ibv_recv_wr* bad_wr;
|
||||
|
||||
/* advance the segment address past the header and adjust the length..*/
|
||||
frag->segment.seg_addr.pval = frag->hdr + 1;
|
||||
frag->segment.seg_len = byte_len -
|
||||
sizeof(mca_btl_ud_header_t) - sizeof(mca_btl_ud_ib_header_t);
|
||||
|
||||
/* call registered callback */
|
||||
ud_btl->ib_reg[frag->hdr->tag].cbfunc(&ud_btl->super,
|
||||
frag->hdr->tag, &frag->base,
|
||||
ud_btl->ib_reg[frag->hdr->tag].cbdata);
|
||||
|
||||
/* TODO - not resetting segment values here.. problem? */
|
||||
if(ibv_post_recv(ud_btl->qp_hp, &frag->wr_desc.rd_desc, &bad_wr)) {
|
||||
BTL_ERROR(("error posting recv, errno %s\n", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
#if 0
|
||||
OMPI_FREE_LIST_RETURN(&(ud_btl->recv_free_eager),
|
||||
(opal_list_item_t*) frag);
|
||||
|
||||
/* repost receive descriptors */
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
if(mca_btl_ud_component.use_srq) {
|
||||
OPAL_THREAD_ADD32((int32_t*) &ud_btl->srd_posted_hp, -1);
|
||||
MCA_BTL_UD_POST_SRR_HIGH(ud_btl, 0);
|
||||
} else {
|
||||
#endif
|
||||
OPAL_THREAD_ADD32((int32_t*) &ud_btl->rd_posted_hp, -1);
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, 0);
|
||||
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* IB component progress.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_component_progress()
|
||||
{
|
||||
uint32_t i;
|
||||
int count = 0,ne = 0, ret;
|
||||
mca_btl_ud_frag_t* frag;
|
||||
struct ibv_recv_wr* bad_wr;
|
||||
|
||||
/* Poll for completions */
|
||||
for(i = 0; i < mca_btl_ud_component.ib_num_btls; i++) {
|
||||
struct ibv_wc wc;
|
||||
mca_btl_ud_module_t* ud_btl = &mca_btl_ud_component.ud_btls[i];
|
||||
|
||||
ne=ibv_poll_cq(ud_btl->ib_cq_hp, 1, &wc);
|
||||
if(ne < 0 ) {
|
||||
BTL_ERROR(("error polling HP CQ with %d errno says %s\n",
|
||||
ne, strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
else if(1 == ne) {
|
||||
if(wc.status != IBV_WC_SUCCESS) {
|
||||
BTL_ERROR(("error polling HP CQ with status %d for wr_id %llu opcode %d\n",
|
||||
wc.status, wc.wr_id, wc.opcode));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* Handle work completions */
|
||||
switch(wc.opcode) {
|
||||
case IBV_WC_SEND :
|
||||
frag = (mca_btl_ud_frag_t*)(unsigned long)wc.wr_id;
|
||||
|
||||
#if MCA_BTL_UD_ENABLE_PROFILE
|
||||
mca_btl_ud_profile.avg_full_send +=
|
||||
opal_sys_timer_get_cycles() - frag->tm;
|
||||
mca_btl_ud_profile.cnt_full_send++;
|
||||
#endif
|
||||
|
||||
/* Process a completed send */
|
||||
frag->base.des_cbfunc(&ud_btl->super,
|
||||
frag->endpoint, &frag->base, OMPI_SUCCESS);
|
||||
|
||||
OPAL_THREAD_ADD32(&ud_btl->sd_wqe_hp, 1);
|
||||
if(!opal_list_is_empty(&ud_btl->pending_frags_hp)) {
|
||||
frag = (mca_btl_ud_frag_t*)
|
||||
opal_list_remove_first(&ud_btl->pending_frags_hp);
|
||||
mca_btl_ud_endpoint_post_send(ud_btl, frag->endpoint, frag);
|
||||
}
|
||||
|
||||
count++;
|
||||
break;
|
||||
|
||||
case IBV_WC_RECV:
|
||||
/* Process a RECV */
|
||||
frag = (mca_btl_ud_frag_t*)(unsigned long) wc.wr_id;
|
||||
ret = mca_btl_ud_handle_incoming_hp(ud_btl, frag, wc.byte_len);
|
||||
|
||||
if (ret != OMPI_SUCCESS)
|
||||
return ret;
|
||||
count++;
|
||||
break;
|
||||
|
||||
default:
|
||||
BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ne=ibv_poll_cq(ud_btl->ib_cq_lp, 1, &wc );
|
||||
if(ne < 0){
|
||||
BTL_ERROR(("error polling LP CQ with %d errno says %s",
|
||||
ne, strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
else if(1 == ne) {
|
||||
if(wc.status != IBV_WC_SUCCESS) {
|
||||
BTL_ERROR(("error polling LP CQ with status %d for wr_id %llu opcode %d",
|
||||
wc.status, wc.wr_id, wc.opcode));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* Handle n/w completions */
|
||||
switch(wc.opcode) {
|
||||
case IBV_WC_SEND:
|
||||
frag = (mca_btl_ud_frag_t*) (unsigned long) wc.wr_id;
|
||||
|
||||
/* Process a completed send - receiver must return tokens */
|
||||
frag->base.des_cbfunc(&ud_btl->super,
|
||||
frag->endpoint, &frag->base, OMPI_SUCCESS);
|
||||
|
||||
OPAL_THREAD_ADD32(&ud_btl->sd_wqe_lp, 1);
|
||||
if(!opal_list_is_empty(&ud_btl->pending_frags_lp)) {
|
||||
frag = (mca_btl_ud_frag_t*)
|
||||
opal_list_remove_first(&ud_btl->pending_frags_lp);
|
||||
mca_btl_ud_endpoint_post_send(ud_btl, frag->endpoint, frag);
|
||||
}
|
||||
|
||||
count++;
|
||||
break;
|
||||
|
||||
case IBV_WC_RECV:
|
||||
/* Process a RECV */
|
||||
frag = (mca_btl_ud_frag_t*) (unsigned long) wc.wr_id;
|
||||
|
||||
frag->segment.seg_addr.pval = frag->hdr + 1;
|
||||
frag->segment.seg_len =
|
||||
wc.byte_len - sizeof(mca_btl_ud_header_t) -
|
||||
sizeof(mca_btl_ud_ib_header_t);
|
||||
|
||||
/* call registered callback */
|
||||
ud_btl->ib_reg[frag->hdr->tag].cbfunc(&ud_btl->super,
|
||||
frag->hdr->tag, &frag->base,
|
||||
ud_btl->ib_reg[frag->hdr->tag].cbdata);
|
||||
|
||||
if(ibv_post_recv(ud_btl->qp_lp,
|
||||
&frag->wr_desc.rd_desc, &bad_wr)) {
|
||||
BTL_ERROR(("error posting recv, errno %s\n",
|
||||
strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
#if 0
|
||||
OMPI_FREE_LIST_RETURN(
|
||||
&(ud_btl->recv_free_max), (opal_list_item_t*) frag);
|
||||
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
if(mca_btl_ud_component.use_srq) {
|
||||
/* repost receive descriptors */
|
||||
OPAL_THREAD_ADD32((int32_t*) &ud_btl->srd_posted_lp, -1);
|
||||
MCA_BTL_UD_POST_SRR_LOW(ud_btl, 0);
|
||||
} else {
|
||||
#endif
|
||||
/* repost receive descriptors */
|
||||
OPAL_THREAD_ADD32((int32_t*) &ud_btl->rd_posted_lp, -1);
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, 0);
|
||||
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
count++;
|
||||
break;
|
||||
|
||||
default:
|
||||
BTL_ERROR(("Unhandled work completion opcode %d", wc.opcode));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
551
ompi/mca/btl/ud/btl_ud_endpoint.c
Обычный файл
551
ompi/mca/btl/ud/btl_ud_endpoint.c
Обычный файл
@ -0,0 +1,551 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
#include "ompi/types.h"
|
||||
#include "ompi/mca/pml/base/pml_base_sendreq.h"
|
||||
#include "orte/mca/ns/base/base.h"
|
||||
#include "orte/mca/oob/base/base.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/dss/dss.h"
|
||||
#include "btl_ud.h"
|
||||
#include "btl_ud_endpoint.h"
|
||||
#include "btl_ud_proc.h"
|
||||
#include "btl_ud_frag.h"
|
||||
#include "ompi/class/ompi_free_list.h"
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
|
||||
static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
|
||||
|
||||
|
||||
/*
|
||||
* post a send to the work queue
|
||||
*/
|
||||
inline int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
|
||||
mca_btl_ud_endpoint_t * endpoint,
|
||||
mca_btl_ud_frag_t * frag)
|
||||
{
|
||||
struct ibv_qp* ib_qp;
|
||||
struct ibv_send_wr* bad_wr;
|
||||
|
||||
/* Have to be careful here - UD adds a 40 byte header, but it is not
|
||||
included on the sending side. */
|
||||
frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t);
|
||||
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
|
||||
|
||||
if(frag->size == ud_btl->super.btl_eager_limit) {
|
||||
if(OPAL_THREAD_ADD32(&ud_btl->sd_wqe_hp, -1) < 0) {
|
||||
OPAL_THREAD_ADD32(&ud_btl->sd_wqe_hp, 1);
|
||||
opal_list_append(&ud_btl->pending_frags_hp,
|
||||
(opal_list_item_t*)frag);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ib_qp = ud_btl->qp_hp;
|
||||
frag->wr_desc.sr_desc.wr.ud.ah = endpoint->rmt_ah_hp;
|
||||
frag->wr_desc.sr_desc.wr.ud.remote_qpn =
|
||||
endpoint->rem_info.rem_qp_num_hp;
|
||||
|
||||
if(frag->sg_entry.length <= ud_btl->ib_inline_max) {
|
||||
frag->wr_desc.sr_desc.send_flags =
|
||||
IBV_SEND_SIGNALED|IBV_SEND_INLINE;
|
||||
}
|
||||
} else {
|
||||
if(OPAL_THREAD_ADD32(&ud_btl->sd_wqe_lp, -1) < 0) {
|
||||
OPAL_THREAD_ADD32(&ud_btl->sd_wqe_lp, 1);
|
||||
opal_list_append(&ud_btl->pending_frags_lp,
|
||||
(opal_list_item_t*)frag);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
ib_qp = ud_btl->qp_lp;
|
||||
frag->wr_desc.sr_desc.wr.ud.ah = endpoint->rmt_ah_lp;
|
||||
frag->wr_desc.sr_desc.wr.ud.remote_qpn =
|
||||
endpoint->rem_info.rem_qp_num_lp;
|
||||
}
|
||||
|
||||
/*BTL_VERBOSE(("Send to : %d, len : %d %d %d, frag : %p",
|
||||
endpoint->endpoint_proc->proc_guid.vpid,
|
||||
frag->sg_entry.length, frag->segment.seg_len,
|
||||
ud_btl->ib_inline_max, frag)); */
|
||||
|
||||
#if MCA_BTL_UD_ENABLE_PROFILE
|
||||
frag->tm = opal_sys_timer_get_cycles();
|
||||
#endif
|
||||
|
||||
MCA_BTL_UD_START_TIME(ibv_post_send);
|
||||
if(ibv_post_send(ib_qp, &frag->wr_desc.sr_desc, &bad_wr)) {
|
||||
BTL_ERROR(("error posting send request errno says %d %s\n",
|
||||
errno, strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
MCA_BTL_UD_END_TIME(ibv_post_send);
|
||||
|
||||
#if 0
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
if(mca_btl_ud_component.use_srq) {
|
||||
MCA_BTL_UD_POST_SRR_HIGH(ud_btl, 1);
|
||||
MCA_BTL_UD_POST_SRR_LOW(ud_btl, 1);
|
||||
} else {
|
||||
#endif
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_HIGH(ud_btl, 1);
|
||||
MCA_BTL_UD_ENDPOINT_POST_RR_LOW(ud_btl, 1);
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ud_endpoint_t,
|
||||
opal_list_item_t, mca_btl_ud_endpoint_construct,
|
||||
mca_btl_ud_endpoint_destruct);
|
||||
|
||||
/*
|
||||
* Initialize state of the endpoint instance.
|
||||
*
|
||||
*/
|
||||
|
||||
static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
endpoint->endpoint_btl = 0;
|
||||
endpoint->endpoint_proc = 0;
|
||||
endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
|
||||
OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
|
||||
|
||||
memset(&endpoint->rem_info, 0, sizeof(struct mca_btl_ud_rem_info_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Destroy a endpoint
|
||||
*
|
||||
*/
|
||||
|
||||
static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* Send connection information to remote endpoint using OOB
|
||||
*
|
||||
*/
|
||||
|
||||
static void mca_btl_ud_endpoint_send_cb(int status, orte_process_name_t* endpoint,
|
||||
orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata)
|
||||
{
|
||||
OBJ_RELEASE(buffer);
|
||||
}
|
||||
|
||||
|
||||
static int mca_btl_ud_endpoint_send_connect_data(mca_btl_ud_endpoint_t* endpoint)
|
||||
{
|
||||
mca_btl_ud_module_t* ud_btl = endpoint->endpoint_btl;
|
||||
orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
|
||||
int rc;
|
||||
|
||||
if(NULL == buffer) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* pack the info in the send buffer */
|
||||
rc = orte_dss.pack(buffer, &ud_btl->qp_hp->qp_num, 1, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dss.pack(buffer, &ud_btl->qp_lp->qp_num, 1, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dss.pack(buffer, &ud_btl->psn_hp, 1, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dss.pack(buffer, &ud_btl->psn_lp, 1, ORTE_UINT32);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dss.pack(buffer, &ud_btl->ib_port_attr->lid, 1, ORTE_UINT16);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
rc = orte_dss.pack(buffer,
|
||||
&((mca_btl_ud_endpoint_t*)endpoint)->subnet, 1, ORTE_UINT16);
|
||||
if(rc != ORTE_SUCCESS) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* send to endpoint */
|
||||
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid,
|
||||
buffer, ORTE_RML_TAG_DYNAMIC-1, 0, mca_btl_ud_endpoint_send_cb, NULL);
|
||||
|
||||
BTL_VERBOSE(("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
|
||||
ud_btl->qp_hp->qp_num,
|
||||
ud_btl->qp_lp->qp_num,
|
||||
endpoint->endpoint_btl->ib_port_attr->lid));
|
||||
|
||||
if(rc < 0) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Non blocking OOB recv callback.
|
||||
* Read incoming QP and other info, and if this endpoint
|
||||
* is trying to connect, reply with our QP info,
|
||||
* otherwise try to modify QP's and establish
|
||||
* reliable connection
|
||||
*
|
||||
*/
|
||||
|
||||
static void mca_btl_ud_endpoint_recv(
|
||||
int status,
|
||||
orte_process_name_t* endpoint,
|
||||
orte_buffer_t* buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
struct ibv_ah_attr ah_attr;
|
||||
mca_btl_ud_proc_t *ib_proc;
|
||||
mca_btl_ud_endpoint_t *ib_endpoint = NULL;
|
||||
mca_btl_ud_rem_info_t rem_info;
|
||||
mca_btl_ud_module_t* ud_btl;
|
||||
int rc;
|
||||
uint32_t i;
|
||||
size_t cnt = 1;
|
||||
|
||||
/* start by unpacking data first so we know who is knocking at
|
||||
our door */
|
||||
|
||||
rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_hp, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dss.unpack(buffer, &rem_info.rem_qp_num_lp, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dss.unpack(buffer, &rem_info.rem_psn_hp, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dss.unpack(buffer, &rem_info.rem_psn_lp, &cnt, ORTE_UINT32);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dss.unpack(buffer, &rem_info.rem_lid, &cnt, ORTE_UINT16);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
rc = orte_dss.unpack(buffer, &rem_info.rem_subnet, &cnt, ORTE_UINT16);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return;
|
||||
}
|
||||
|
||||
/*BTL_VERBOSE(("Received High Priority QP num = %d, Low Priority QP num %d, LID = %d",
|
||||
rem_info.rem_qp_num_hp,
|
||||
rem_info.rem_qp_num_lp,
|
||||
rem_info.rem_lid));*/
|
||||
|
||||
for(ib_proc = (mca_btl_ud_proc_t*)
|
||||
opal_list_get_first(&mca_btl_ud_component.ib_procs);
|
||||
ib_proc != (mca_btl_ud_proc_t*)
|
||||
opal_list_get_end(&mca_btl_ud_component.ib_procs);
|
||||
ib_proc = (mca_btl_ud_proc_t*)opal_list_get_next(ib_proc)) {
|
||||
|
||||
if(orte_ns.compare(ORTE_NS_CMP_ALL,
|
||||
&ib_proc->proc_guid, endpoint) == 0) {
|
||||
bool found = false;
|
||||
|
||||
/* Try to get the endpoint instance of this proc */
|
||||
for(i = 0; i < ib_proc->proc_endpoint_count; i++) {
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(ib_endpoint->rem_info.rem_lid &&
|
||||
ib_endpoint->rem_info.rem_lid == rem_info.rem_lid) {
|
||||
/* we've seen them before! */
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* If we haven't seen this remote lid before then try to match on
|
||||
endpoint */
|
||||
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(!ib_endpoint->rem_info.rem_lid &&
|
||||
ib_endpoint->subnet == rem_info.rem_subnet) {
|
||||
/* found a match based on subnet! */
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* try finding an open port, even if subnets don't match */
|
||||
for(i = 0; !found && i < ib_proc->proc_endpoint_count; i++) {
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if(!ib_endpoint->rem_info.rem_lid) {
|
||||
/* found an unused end-point */
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(!found) {
|
||||
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
|
||||
|
||||
/* Update status */
|
||||
if(ib_endpoint->endpoint_state == MCA_BTL_IB_CLOSED) {
|
||||
if(OMPI_SUCCESS !=
|
||||
mca_btl_ud_endpoint_send_connect_data(ib_endpoint)) {
|
||||
BTL_ERROR(("error sending connect request, error code %d", rc));
|
||||
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Always 'CONNECTED' at this point */
|
||||
ud_btl = ib_endpoint->endpoint_btl;
|
||||
memcpy(&ib_endpoint->rem_info,
|
||||
&rem_info, sizeof(mca_btl_ud_rem_info_t));
|
||||
|
||||
ah_attr.is_global = 0;
|
||||
ah_attr.dlid = rem_info.rem_lid;
|
||||
ah_attr.sl = mca_btl_ud_component.ib_service_level;
|
||||
ah_attr.src_path_bits = mca_btl_ud_component.ib_src_path_bits;
|
||||
ah_attr.port_num = ud_btl->port_num;
|
||||
|
||||
ib_endpoint->rmt_ah_hp = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
|
||||
if(NULL == ib_endpoint->rmt_ah_hp) {
|
||||
BTL_ERROR(("error creating address handle errno says %s\n",
|
||||
strerror(errno)));
|
||||
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
ib_endpoint->rmt_ah_lp = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
|
||||
if(NULL == ib_endpoint) {
|
||||
BTL_ERROR(("error creating address handle errno says %s\n",
|
||||
strerror(errno)));
|
||||
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
ib_endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
|
||||
|
||||
/*BTL_VERBOSE(("connected! QP num = %d, Low Priority QP num %d, LID = %d",
|
||||
ib_endpoint->rem_info.rem_qp_num_hp,
|
||||
ib_endpoint->rem_info.rem_qp_num_lp,
|
||||
ib_endpoint->rem_info.rem_lid));*/
|
||||
|
||||
/* Post our queued sends */
|
||||
while(!opal_list_is_empty(&(ib_endpoint->pending_send_frags))) {
|
||||
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)
|
||||
opal_list_remove_first(&(ib_endpoint->pending_send_frags));
|
||||
if(OMPI_SUCCESS != mca_btl_ud_endpoint_post_send(
|
||||
ud_btl, ib_endpoint, frag)) {
|
||||
BTL_ERROR(("ERROR posting send"));
|
||||
ib_endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Post the OOB recv (for receiving the peers information)
|
||||
*/
|
||||
void mca_btl_ud_post_recv()
|
||||
{
|
||||
orte_rml.recv_buffer_nb(
|
||||
ORTE_RML_NAME_ANY,
|
||||
ORTE_RML_TAG_DYNAMIC-1,
|
||||
ORTE_RML_PERSISTENT,
|
||||
mca_btl_ud_endpoint_recv,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt to send a fragment using a given endpoint. If the endpoint is not
|
||||
* connected, queue the fragment and start the connection as required.
|
||||
*/
|
||||
|
||||
int mca_btl_ud_endpoint_send(mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
int rc = OMPI_SUCCESS;
|
||||
bool call_progress = false;
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
|
||||
switch(endpoint->endpoint_state) {
|
||||
case MCA_BTL_IB_CONNECTED:
|
||||
{
|
||||
MCA_BTL_UD_START_TIME(endpoint_send_conn);
|
||||
rc = mca_btl_ud_endpoint_post_send(
|
||||
endpoint->endpoint_btl, endpoint, frag);
|
||||
MCA_BTL_UD_END_TIME(endpoint_send_conn);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
|
||||
return rc;
|
||||
}
|
||||
case MCA_BTL_IB_CLOSED:
|
||||
/* Send connection info over to remote endpoint */
|
||||
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
|
||||
rc = mca_btl_ud_endpoint_send_connect_data(endpoint);
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
BTL_ERROR(("error sending connect request, error code %d", rc));
|
||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* As long as we expect a message from the peer (in order to setup
|
||||
* the connection) let the event engine pool the OOB events. Note:
|
||||
* we increment it once per active connection.
|
||||
*/
|
||||
opal_progress_event_increment();
|
||||
call_progress = true;
|
||||
|
||||
/* No break here - fall through */
|
||||
case MCA_BTL_IB_CONNECTING:
|
||||
opal_list_append(&endpoint->pending_send_frags,
|
||||
(opal_list_item_t *)frag);
|
||||
break;
|
||||
case MCA_BTL_IB_FAILED:
|
||||
BTL_ERROR(("endpoint FAILED"));
|
||||
default:
|
||||
rc = OMPI_ERR_UNREACH;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
|
||||
if(call_progress) opal_progress();
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Create the queue pair note that this is just the initial
|
||||
* queue pair creation and we need to get the remote queue pair
|
||||
* info from the peer before the qp is usable,
|
||||
*/
|
||||
/* TODO - maybe start to push this off into its own file? */
|
||||
|
||||
int mca_btl_ud_endpoint_init_qp(
|
||||
mca_btl_base_module_t* btl,
|
||||
struct ibv_cq* cq,
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
struct ibv_srq* srq,
|
||||
#endif
|
||||
struct ibv_qp** qp,
|
||||
uint32_t lcl_psn
|
||||
)
|
||||
{
|
||||
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
||||
struct ibv_qp* my_qp;
|
||||
struct ibv_qp_attr qp_attr;
|
||||
struct ibv_qp_init_attr qp_init_attr;
|
||||
|
||||
memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
|
||||
|
||||
qp_init_attr.send_cq = cq;
|
||||
qp_init_attr.recv_cq = cq;
|
||||
qp_init_attr.cap.max_send_wr = mca_btl_ud_component.rd_num;
|
||||
qp_init_attr.cap.max_recv_wr = mca_btl_ud_component.rd_num;
|
||||
qp_init_attr.cap.max_send_sge = mca_btl_ud_component.ib_sg_list_size;
|
||||
qp_init_attr.cap.max_recv_sge = mca_btl_ud_component.ib_sg_list_size;
|
||||
qp_init_attr.qp_type = IBV_QPT_UD;
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
if(mca_btl_ud_component.use_srq) {
|
||||
qp_init_attr.srq = srq;
|
||||
}
|
||||
#endif
|
||||
my_qp = ibv_create_qp(ud_btl->ib_pd, &qp_init_attr);
|
||||
|
||||
if(NULL == my_qp) {
|
||||
BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
(*qp) = my_qp;
|
||||
if(0 == (ud_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) {
|
||||
BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data"));
|
||||
}
|
||||
|
||||
qp_attr.qp_state = IBV_QPS_INIT;
|
||||
qp_attr.pkey_index = mca_btl_ud_component.ib_pkey_ix;
|
||||
qp_attr.qkey = mca_btl_ud_component.ib_qkey;
|
||||
qp_attr.port_num = ud_btl->port_num;
|
||||
|
||||
if(ibv_modify_qp(*qp, &qp_attr,
|
||||
IBV_QP_STATE |
|
||||
IBV_QP_PKEY_INDEX |
|
||||
IBV_QP_PORT |
|
||||
IBV_QP_QKEY)) {
|
||||
BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
qp_attr.qp_state = IBV_QPS_RTR;
|
||||
if(ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE)) {
|
||||
BTL_ERROR(("error modifing QP to RTR errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
qp_attr.qp_state = IBV_QPS_RTS;
|
||||
qp_attr.sq_psn = lcl_psn;
|
||||
if (ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) {
|
||||
BTL_ERROR(("error modifying QP to RTS errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
149
ompi/mca/btl/ud/btl_ud_endpoint.h
Обычный файл
149
ompi/mca/btl/ud/btl_ud_endpoint.h
Обычный файл
@ -0,0 +1,149 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_IB_ENDPOINT_H
|
||||
#define MCA_BTL_IB_ENDPOINT_H
|
||||
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "opal/event/event.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "btl_ud_frag.h"
|
||||
#include "btl_ud.h"
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <infiniband/verbs.h>
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
#include "ompi/mca/mpool/openib/mpool_openib.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_endpoint_t);
|
||||
|
||||
|
||||
struct mca_btl_ud_frag_t;
|
||||
|
||||
struct mca_btl_ud_port_info_t {
|
||||
uint16_t subnet;
|
||||
};
|
||||
typedef struct mca_btl_ud_port_info_t mca_btl_ud_port_info_t;
|
||||
|
||||
|
||||
/**
|
||||
* State of IB endpoint connection.
|
||||
*/
|
||||
|
||||
typedef enum {
|
||||
/* Defines the state in which this BTL instance
|
||||
* has started the process of connection */
|
||||
MCA_BTL_IB_CONNECTING,
|
||||
|
||||
/* Connected ... both sender & receiver have
|
||||
* buffers associated with this connection */
|
||||
MCA_BTL_IB_CONNECTED,
|
||||
|
||||
/* Connection is closed, there are no resources
|
||||
* associated with this */
|
||||
MCA_BTL_IB_CLOSED,
|
||||
|
||||
/* Maximum number of retries have been used.
|
||||
* Report failure on send to upper layer */
|
||||
MCA_BTL_IB_FAILED
|
||||
} mca_btl_ud_endpoint_state_t;
|
||||
|
||||
struct mca_btl_ud_rem_info_t {
|
||||
|
||||
uint32_t rem_qp_num_hp;
|
||||
uint32_t rem_qp_num_lp;
|
||||
/* Remote QP number (Low and High priority) */
|
||||
|
||||
uint16_t rem_lid;
|
||||
/* Local identifier of the remote process */
|
||||
|
||||
|
||||
uint32_t rem_psn_hp;
|
||||
uint32_t rem_psn_lp;
|
||||
/* Remote processes port sequence number (Low and High) */
|
||||
|
||||
uint16_t rem_subnet;
|
||||
/* subnet of remote process */
|
||||
};
|
||||
typedef struct mca_btl_ud_rem_info_t mca_btl_ud_rem_info_t;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* An abstraction that represents a connection to a endpoint process.
|
||||
* An instance of mca_btl_base_endpoint_t is associated w/ each process
|
||||
* and BTL pair at startup. Normally connections are established as-needed.
|
||||
* The UD BTL is connectionless, so no connection is ever established.
|
||||
*/
|
||||
|
||||
struct mca_btl_base_endpoint_t {
|
||||
opal_list_item_t super;
|
||||
|
||||
struct mca_btl_ud_module_t* endpoint_btl;
|
||||
/**< BTL instance that created this connection */
|
||||
|
||||
struct mca_btl_ud_proc_t* endpoint_proc;
|
||||
/**< proc structure corresponding to endpoint */
|
||||
|
||||
mca_btl_ud_endpoint_state_t endpoint_state;
|
||||
/**< current state of the connection */
|
||||
|
||||
opal_mutex_t endpoint_lock;
|
||||
/**< lock for concurrent access to endpoint state */
|
||||
|
||||
opal_list_t pending_send_frags;
|
||||
/**< list of pending send frags for this endpoint */
|
||||
|
||||
mca_btl_ud_rem_info_t rem_info;
|
||||
|
||||
struct ibv_ah* rmt_ah_hp;
|
||||
struct ibv_ah* rmt_ah_lp;
|
||||
/* Local Address Handle (Low and High) */
|
||||
|
||||
uint16_t subnet; /**< subnet of this endpoint*/
|
||||
};
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||
typedef mca_btl_base_endpoint_t mca_btl_ud_endpoint_t;
|
||||
|
||||
int mca_btl_ud_endpoint_send(mca_btl_base_endpoint_t* endpoint, struct mca_btl_ud_frag_t* frag);
|
||||
inline int mca_btl_ud_endpoint_post_send(struct mca_btl_ud_module_t* ud_btl,
|
||||
mca_btl_ud_endpoint_t * endpoint,
|
||||
struct mca_btl_ud_frag_t * frag);
|
||||
int mca_btl_ud_endpoint_connect(mca_btl_base_endpoint_t*);
|
||||
void mca_btl_ud_post_recv(void);
|
||||
int mca_btl_ud_endpoint_init_qp(
|
||||
mca_btl_base_module_t* btl,
|
||||
struct ibv_cq* cq,
|
||||
#ifdef OMPI_MCA_BTL_OPENIB_HAVE_SRQ
|
||||
struct ibv_srq* srq,
|
||||
#endif
|
||||
struct ibv_qp** qp,
|
||||
uint32_t lcl_psn);
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
153
ompi/mca/btl/ud/btl_ud_frag.c
Обычный файл
153
ompi/mca/btl/ud/btl_ud_frag.c
Обычный файл
@ -0,0 +1,153 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "btl_ud_frag.h"
|
||||
#include "ompi/mca/mpool/openib/mpool_openib.h"
|
||||
|
||||
|
||||
|
||||
static inline void mca_btl_ud_frag_common_constructor( mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
frag->ud_reg = (mca_mpool_openib_registration_t*)frag->base.super.user_data;
|
||||
frag->sg_entry.lkey = frag->ud_reg->mr->lkey;
|
||||
frag->base.des_flags = 0;
|
||||
}
|
||||
|
||||
|
||||
static void mca_btl_ud_send_frag_common_constructor(mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
|
||||
mca_btl_ud_frag_common_constructor(frag);
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
frag->base.des_dst = NULL;
|
||||
frag->base.des_dst_cnt = 0;
|
||||
|
||||
/* We do not include the mca_btl_ud_ib_header_t data when sending */
|
||||
frag->hdr = (mca_btl_ud_header_t*)(frag+1);
|
||||
frag->segment.seg_addr.pval = frag->hdr + 1;
|
||||
|
||||
frag->sg_entry.addr = (unsigned long)frag->hdr;
|
||||
|
||||
frag->wr_desc.sr_desc.wr_id = (unsigned long) frag;
|
||||
frag->wr_desc.sr_desc.sg_list = &frag->sg_entry;
|
||||
frag->wr_desc.sr_desc.num_sge = 1;
|
||||
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
|
||||
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
|
||||
frag->wr_desc.sr_desc.next = NULL;
|
||||
frag->wr_desc.sr_desc.wr.ud.remote_qkey = mca_btl_ud_component.ib_qkey;
|
||||
}
|
||||
|
||||
static void mca_btl_ud_recv_frag_common_constructor(mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
mca_btl_ud_frag_common_constructor(frag);
|
||||
frag->base.des_dst = &frag->segment;
|
||||
frag->base.des_dst_cnt = 1;
|
||||
frag->base.des_src = NULL;
|
||||
frag->base.des_src_cnt = 0;
|
||||
|
||||
/* Receive frag headers start 40 bytes later */
|
||||
frag->hdr = (mca_btl_ud_header_t*)((unsigned char*)frag) +
|
||||
sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_ib_header_t);
|
||||
frag->segment.seg_addr.pval = frag->hdr + 1;
|
||||
|
||||
frag->sg_entry.addr = (unsigned long)(frag + 1);
|
||||
frag->segment.seg_len = frag->size;
|
||||
frag->sg_entry.length = frag->size +
|
||||
sizeof(mca_btl_ud_ib_header_t) + sizeof(mca_btl_ud_header_t);
|
||||
|
||||
frag->wr_desc.rd_desc.wr_id = (unsigned long) frag;
|
||||
frag->wr_desc.rd_desc.sg_list = &frag->sg_entry;
|
||||
frag->wr_desc.rd_desc.num_sge = 1;
|
||||
frag->wr_desc.rd_desc.next = NULL;
|
||||
}
|
||||
|
||||
static void mca_btl_ud_send_frag_eager_constructor(mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
frag->size = mca_btl_ud_component.eager_limit;
|
||||
mca_btl_ud_send_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
|
||||
static void mca_btl_ud_send_frag_max_constructor(mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
frag->size = mca_btl_ud_component.max_send_size;
|
||||
mca_btl_ud_send_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
static void mca_btl_ud_recv_frag_max_constructor(mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
frag->size = mca_btl_ud_component.max_send_size;
|
||||
mca_btl_ud_recv_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
|
||||
static void mca_btl_ud_recv_frag_eager_constructor(mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
frag->size = mca_btl_ud_component.eager_limit;
|
||||
mca_btl_ud_recv_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
static void mca_btl_ud_send_frag_frag_constructor(mca_btl_ud_frag_t* frag)
|
||||
{
|
||||
frag->size = 0;
|
||||
mca_btl_ud_send_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_ud_frag_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_ud_send_frag_eager_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_ud_send_frag_eager_constructor,
|
||||
NULL);
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_ud_send_frag_max_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_ud_send_frag_max_constructor,
|
||||
NULL);
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_ud_send_frag_frag_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_ud_send_frag_frag_constructor,
|
||||
NULL);
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_ud_recv_frag_eager_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_ud_recv_frag_eager_constructor,
|
||||
NULL);
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(
|
||||
mca_btl_ud_recv_frag_max_t,
|
||||
mca_btl_base_descriptor_t,
|
||||
mca_btl_ud_recv_frag_max_constructor,
|
||||
NULL);
|
||||
|
||||
|
150
ompi/mca/btl/ud/btl_ud_frag.h
Обычный файл
150
ompi/mca/btl/ud/btl_ud_frag.h
Обычный файл
@ -0,0 +1,150 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_UD_FRAG_H
|
||||
#define MCA_BTL_UD_FRAG_H
|
||||
|
||||
|
||||
#define MCA_BTL_IB_FRAG_ALIGN (8)
|
||||
#include "ompi_config.h"
|
||||
#include "btl_ud.h"
|
||||
|
||||
#include <infiniband/verbs.h>
|
||||
#include "ompi/mca/mpool/openib/mpool_openib.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_ud_frag_t);
|
||||
|
||||
/* UD adds a 40 byte global routing header */
|
||||
/* This works in strange ways - the sending side does not need to explicitly
|
||||
include this data in sg lists. Then, on the receiving side, the extra 40
|
||||
bytes magically appear. */
|
||||
struct mca_btl_ud_ib_header_t {
|
||||
uint8_t ib_grh[40];
|
||||
};
|
||||
typedef struct mca_btl_ud_ib_header_t mca_btl_ud_ib_header_t;
|
||||
|
||||
struct mca_btl_ud_header_t {
|
||||
mca_btl_base_tag_t tag;
|
||||
};
|
||||
typedef struct mca_btl_ud_header_t mca_btl_ud_header_t;
|
||||
|
||||
/**
|
||||
* IB send fragment derived type.
|
||||
*/
|
||||
struct mca_btl_ud_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_base_segment_t segment;
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
size_t size;
|
||||
union{
|
||||
struct ibv_recv_wr rd_desc;
|
||||
struct ibv_send_wr sr_desc;
|
||||
} wr_desc;
|
||||
struct ibv_sge sg_entry;
|
||||
|
||||
/* When this is a send frag, hdr points right after this, as expected.
|
||||
But when this is a receive frag, we have an extra 40 bytes provided
|
||||
by IB, so this points 40 bytes past the end of the frag. */
|
||||
mca_btl_ud_header_t *hdr;
|
||||
|
||||
mca_mpool_openib_registration_t* ud_reg;
|
||||
opal_timer_t tm;
|
||||
};
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_frag_t;
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_frag_t);
|
||||
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_eager_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_eager_t);
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_max_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_max_t);
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_send_frag_frag_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_send_frag_frag_t);
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_recv_frag_eager_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_recv_frag_eager_t);
|
||||
|
||||
typedef struct mca_btl_ud_frag_t mca_btl_ud_recv_frag_max_t;
|
||||
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_recv_frag_max_t);
|
||||
|
||||
|
||||
/*
|
||||
* Allocate an IB send descriptor
|
||||
*
|
||||
*/
|
||||
|
||||
#define MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc) \
|
||||
{ \
|
||||
\
|
||||
opal_list_item_t *item; \
|
||||
OMPI_FREE_LIST_WAIT(&((mca_btl_ud_module_t*)btl)->send_free_eager, item, rc); \
|
||||
frag = (mca_btl_ud_frag_t*) item; \
|
||||
}
|
||||
|
||||
#define MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag) \
|
||||
{ \
|
||||
OMPI_FREE_LIST_RETURN(&((mca_btl_ud_module_t*)btl)->send_free_eager, (opal_list_item_t*)(frag)); \
|
||||
}
|
||||
|
||||
|
||||
#define MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc) \
|
||||
{ \
|
||||
\
|
||||
opal_list_item_t *item; \
|
||||
OMPI_FREE_LIST_WAIT(&((mca_btl_ud_module_t*)btl)->send_free_max, item, rc); \
|
||||
frag = (mca_btl_ud_frag_t*) item; \
|
||||
}
|
||||
|
||||
#define MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag) \
|
||||
{ \
|
||||
OMPI_FREE_LIST_RETURN(&((mca_btl_ud_module_t*)btl)->send_free_max, (opal_list_item_t*)(frag)); \
|
||||
}
|
||||
|
||||
|
||||
#define MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc) \
|
||||
{ \
|
||||
\
|
||||
opal_list_item_t *item; \
|
||||
OMPI_FREE_LIST_WAIT(&((mca_btl_ud_module_t*)btl)->send_free_frag, item, rc); \
|
||||
frag = (mca_btl_ud_frag_t*) item; \
|
||||
}
|
||||
|
||||
#define MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag) \
|
||||
{ \
|
||||
OMPI_FREE_LIST_RETURN(&((mca_btl_ud_module_t*)btl)->send_free_frag, (opal_list_item_t*)(frag)); \
|
||||
}
|
||||
|
||||
|
||||
struct mca_btl_ud_module_t;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
181
ompi/mca/btl/ud/btl_ud_proc.c
Обычный файл
181
ompi/mca/btl/ud/btl_ud_proc.c
Обычный файл
@ -0,0 +1,181 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
|
||||
|
||||
#include "btl_ud.h"
|
||||
#include "btl_ud_proc.h"
|
||||
|
||||
static void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc);
|
||||
static void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc);
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_btl_ud_proc_t,
|
||||
opal_list_item_t, mca_btl_ud_proc_construct,
|
||||
mca_btl_ud_proc_destruct);
|
||||
|
||||
void mca_btl_ud_proc_construct(mca_btl_ud_proc_t* proc)
|
||||
{
|
||||
proc->proc_ompi = 0;
|
||||
proc->proc_port_count = 0;
|
||||
proc->proc_endpoints = 0;
|
||||
proc->proc_endpoint_count = 0;
|
||||
OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);
|
||||
/* add to list of all proc instance */
|
||||
OPAL_THREAD_LOCK(&mca_btl_ud_component.ib_lock);
|
||||
opal_list_append(&mca_btl_ud_component.ib_procs, &proc->super);
|
||||
OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleanup ib proc instance
|
||||
*/
|
||||
|
||||
void mca_btl_ud_proc_destruct(mca_btl_ud_proc_t* proc)
|
||||
{
|
||||
/* remove from list of all proc instances */
|
||||
OPAL_THREAD_LOCK(&mca_btl_ud_component.ib_lock);
|
||||
opal_list_remove_item(&mca_btl_ud_component.ib_procs, &proc->super);
|
||||
OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock);
|
||||
|
||||
/* release resources */
|
||||
if(NULL != proc->proc_endpoints) {
|
||||
free(proc->proc_endpoints);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Look for an existing IB process instances based on the associated
|
||||
* ompi_proc_t instance.
|
||||
*/
|
||||
static mca_btl_ud_proc_t* mca_btl_ud_proc_lookup_ompi(ompi_proc_t* ompi_proc)
|
||||
{
|
||||
mca_btl_ud_proc_t* ib_proc;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_btl_ud_component.ib_lock);
|
||||
|
||||
for(ib_proc = (mca_btl_ud_proc_t*)
|
||||
opal_list_get_first(&mca_btl_ud_component.ib_procs);
|
||||
ib_proc != (mca_btl_ud_proc_t*)
|
||||
opal_list_get_end(&mca_btl_ud_component.ib_procs);
|
||||
ib_proc = (mca_btl_ud_proc_t*)opal_list_get_next(ib_proc)) {
|
||||
if(ib_proc->proc_ompi == ompi_proc) {
|
||||
OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock);
|
||||
return ib_proc;
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&mca_btl_ud_component.ib_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a IB process structure. There is a one-to-one correspondence
|
||||
* between a ompi_proc_t and a mca_btl_ud_proc_t instance. We cache
|
||||
* additional data (specifically the list of mca_btl_ud_endpoint_t instances,
|
||||
* and published addresses) associated w/ a given destination on this
|
||||
* datastructure.
|
||||
*/
|
||||
|
||||
mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc)
|
||||
{
|
||||
mca_btl_ud_proc_t* module_proc = NULL;
|
||||
size_t size;
|
||||
int rc;
|
||||
|
||||
/* Check if we have already created a IB proc
|
||||
* structure for this ompi process */
|
||||
module_proc = mca_btl_ud_proc_lookup_ompi(ompi_proc);
|
||||
|
||||
if(module_proc != NULL) {
|
||||
/* Gotcha! */
|
||||
return module_proc;
|
||||
}
|
||||
|
||||
/* Oops! First time, gotta create a new IB proc
|
||||
* out of the ompi_proc ... */
|
||||
module_proc = OBJ_NEW(mca_btl_ud_proc_t);
|
||||
/* Initialize number of peer */
|
||||
module_proc->proc_endpoint_count = 0;
|
||||
module_proc->proc_ompi = ompi_proc;
|
||||
|
||||
/* build a unique identifier (of arbitrary
|
||||
* size) to represent the proc */
|
||||
module_proc->proc_guid = ompi_proc->proc_name;
|
||||
|
||||
|
||||
/* query for the peer address info */
|
||||
rc = mca_pml_base_modex_recv(
|
||||
&mca_btl_ud_component.super.btl_version,
|
||||
ompi_proc,
|
||||
(void*)&module_proc->proc_ports,
|
||||
&size
|
||||
);
|
||||
|
||||
|
||||
|
||||
if(OMPI_SUCCESS != rc) {
|
||||
opal_output(0, "[%s:%d] mca_pml_base_modex_recv failed for peer [%d,%d,%d]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((size % sizeof(mca_btl_ud_port_info_t)) != 0) {
|
||||
opal_output(0, "[%s:%d] invalid module address for peer [%d,%d,%d]",
|
||||
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
module_proc->proc_port_count = size/sizeof(mca_btl_ud_port_info_t);
|
||||
|
||||
|
||||
if (0 == module_proc->proc_port_count) {
|
||||
module_proc->proc_endpoints = NULL;
|
||||
} else {
|
||||
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
|
||||
malloc(module_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
|
||||
}
|
||||
|
||||
if(NULL == module_proc->proc_endpoints) {
|
||||
OBJ_RELEASE(module_proc);
|
||||
return NULL;
|
||||
}
|
||||
return module_proc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Note that this routine must be called with the lock on the process
|
||||
* already held. Insert a btl instance into the proc array and assign
|
||||
* it an address.
|
||||
*/
|
||||
int mca_btl_ud_proc_insert(mca_btl_ud_proc_t* module_proc,
|
||||
mca_btl_base_endpoint_t* module_endpoint)
|
||||
{
|
||||
/* insert into endpoint array */
|
||||
module_endpoint->endpoint_proc = module_proc;
|
||||
module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
72
ompi/mca/btl/ud/btl_ud_proc.h
Обычный файл
72
ompi/mca/btl/ud/btl_ud_proc.h
Обычный файл
@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_BTL_UD_PROC_H
|
||||
#define MCA_BTL_UD_PROC_H
|
||||
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "opal/class/opal_object.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "btl_ud.h"
|
||||
#include "btl_ud_endpoint.h"
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
OBJ_CLASS_DECLARATION(mca_btl_ud_proc_t);
|
||||
|
||||
/**
|
||||
* Represents the state of a remote process and the set of addresses
|
||||
* that it exports. Also cache an instance of mca_btl_base_endpoint_t for
|
||||
* each
|
||||
* BTL instance that attempts to open a connection to the process.
|
||||
*/
|
||||
struct mca_btl_ud_proc_t {
|
||||
opal_list_item_t super;
|
||||
/**< allow proc to be placed on a list */
|
||||
|
||||
ompi_proc_t *proc_ompi;
|
||||
/**< pointer to corresponding ompi_proc_t */
|
||||
|
||||
orte_process_name_t proc_guid;
|
||||
/**< globally unique identifier for the process */
|
||||
|
||||
struct mca_btl_ud_port_info_t* proc_ports;
|
||||
size_t proc_port_count;
|
||||
/**< number of ports published by endpoint */
|
||||
|
||||
struct mca_btl_base_endpoint_t **proc_endpoints;
|
||||
/**< array of endpoints that have been created to access this proc */
|
||||
|
||||
size_t proc_endpoint_count;
|
||||
/**< number of endpoints */
|
||||
|
||||
opal_mutex_t proc_lock;
|
||||
/**< lock to protect against concurrent access to proc state */
|
||||
};
|
||||
typedef struct mca_btl_ud_proc_t mca_btl_ud_proc_t;
|
||||
|
||||
mca_btl_ud_proc_t* mca_btl_ud_proc_create(ompi_proc_t* ompi_proc);
|
||||
int mca_btl_ud_proc_insert(mca_btl_ud_proc_t*, mca_btl_base_endpoint_t*);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
#endif
|
43
ompi/mca/btl/ud/configure.m4
Обычный файл
43
ompi/mca/btl/ud/configure.m4
Обычный файл
@ -0,0 +1,43 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
|
||||
# MCA_btl_ud_CONFIG([action-if-can-compile],
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
AC_DEFUN([MCA_btl_ud_CONFIG],[
|
||||
OMPI_CHECK_OPENIB([btl_ud],
|
||||
[btl_ud_happy="yes"],
|
||||
[btl_ud_happy="no"])
|
||||
|
||||
AS_IF([test "$btl_ud_happy" = "yes"],
|
||||
[btl_ud_WRAPPER_EXTRA_LDFLAGS="$btl_ud_LDFLAGS"
|
||||
btl_ud_WRAPPER_EXTRA_LIBS="$btl_ud_LIBS"
|
||||
$1],
|
||||
[$2])
|
||||
|
||||
|
||||
# substitute in the things needed to build ud
|
||||
AC_SUBST([btl_ud_CFLAGS])
|
||||
AC_SUBST([btl_ud_CPPFLAGS])
|
||||
AC_SUBST([btl_ud_LDFLAGS])
|
||||
AC_SUBST([btl_ud_LIBS])
|
||||
])dnl
|
26
ompi/mca/btl/ud/configure.params
Обычный файл
26
ompi/mca/btl/ud/configure.params
Обычный файл
@ -0,0 +1,26 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
# reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
|
||||
PARAM_INIT_FILE=btl_ud.c
|
||||
PARAM_CONFIG_HEADER_FILE="ud_config.h"
|
||||
PARAM_CONFIG_FILES="Makefile"
|
Загрузка…
x
Ссылка в новой задаче
Block a user