1
1

Remove the mvapi BTL. Woo hoo!

This commit was SVN r16483.
Этот коммит содержится в:
Jeff Squyres 2007-10-17 14:08:03 +00:00
родитель 0bf61a1b84
Коммит b7eeae0a74
16 изменённых файлов: 3 добавлений и 4972 удалений

3
NEWS
Просмотреть файл

@ -68,6 +68,9 @@ Trunk (not on release branches yet)
- Added checkpoint/restart process fault tolerance support. Initially
support a LAM/MPI-like protocol.
--> Expected: 1.3
- Removed "mvapi" BTL; all InfiniBand support now uses the OpenFabrics
driver stacks.
--> Expected: 1.3
- Fixed issue with pthread detection when compilers are not all
from the same vendor. Thanks to Ake Sandgren for the bug

Просмотреть файл

@ -1,69 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Use the top-level Makefile.options
CFLAGS = $(btl_mvapi_CFLAGS)
AM_CPPFLAGS = $(btl_mvapi_CPPFLAGS)

dist_pkgdata_DATA = help-mpi-btl-mvapi.txt

sources = \
    btl_mvapi.c \
    btl_mvapi.h \
    btl_mvapi_component.c \
    btl_mvapi_endpoint.c \
    btl_mvapi_endpoint.h \
    btl_mvapi_frag.c \
    btl_mvapi_frag.h \
    btl_mvapi_proc.c \
    btl_mvapi_proc.h \
    btl_mvapi_eager_rdma.h

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_btl_mvapi_DSO
lib =
lib_sources =
component = mca_btl_mvapi.la
component_sources = $(sources)
else
lib = libmca_btl_mvapi.la
lib_sources = $(sources)
component =
component_sources =
endif

mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_btl_mvapi_la_SOURCES = $(component_sources)
mca_btl_mvapi_la_LDFLAGS = -module -avoid-version $(btl_mvapi_LDFLAGS)
mca_btl_mvapi_la_LIBADD = \
    $(btl_mvapi_LIBS) \
    $(top_ompi_builddir)/ompi/libmpi.la \
    $(top_ompi_builddir)/orte/libopen-rte.la \
    $(top_ompi_builddir)/opal/libopen-pal.la

noinst_LTLIBRARIES = $(lib)
libmca_btl_mvapi_la_SOURCES = $(lib_sources)
# Fixed: stray "$" after -avoid-version would have been passed verbatim to
# libtool as "-avoid-version$" (compare the DSO rule above).
libmca_btl_mvapi_la_LDFLAGS = -module -avoid-version $(btl_mvapi_LDFLAGS)
libmca_btl_mvapi_la_LIBADD = $(btl_mvapi_LIBS)

Просмотреть файл

@ -1,856 +0,0 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/if.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "btl_mvapi.h"
#include "btl_mvapi_frag.h"
#include "btl_mvapi_proc.h"
#include "btl_mvapi_endpoint.h"
#include "ompi/datatype/convertor.h"
#include "ompi/datatype/datatype.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include <vapi_types.h>
#include <math.h> /* for log2 */
/* Default module instance ("vtable").  The initializers are positional and
 * must stay in the exact field order of mca_btl_base_module_t; the numeric
 * limits are filled in at component init time. */
mca_btl_mvapi_module_t mca_btl_mvapi_module = {
{
&mca_btl_mvapi_component.super,
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */
0, /* btl_rdma_pipeline_send_length */
0, /* btl_rdma_pipeline_frag_size */
0, /* btl_min_rdma_pipeline_size */
0, /* exclusivity */
0, /* latency */
0, /* bandwidth */
0, /* TODO this should be PUT btl flags */
mca_btl_mvapi_add_procs,
mca_btl_mvapi_del_procs,
mca_btl_mvapi_register,
mca_btl_mvapi_finalize,
/* we need alloc free, pack */
mca_btl_mvapi_alloc,
mca_btl_mvapi_free,
mca_btl_mvapi_prepare_src,
mca_btl_mvapi_prepare_dst,
mca_btl_mvapi_send,
mca_btl_mvapi_put,
mca_btl_mvapi_get,
mca_btl_mvapi_dump,
NULL, /* mpool */
NULL, /* error call back registration */
mca_btl_mvapi_ft_event
}
};
/*
* add a proc to this btl module
* creates an endpoint that is setup on the
* first send to the endpoint
*/
/*
 * Add procs to this BTL module: for each reachable peer, create an endpoint
 * (the IB connection itself is set up lazily on first send) and mark the
 * peer in the reachability bitmap.  On the first call, also scale the SRQ
 * receive-descriptor count with log2(nprocs).
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if an endpoint cannot
 * be allocated.
 */
int mca_btl_mvapi_add_procs(
struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **ompi_procs,
struct mca_btl_base_endpoint_t** peers,
ompi_bitmap_t* reachable)
{
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*)btl;
int i, rc;
for(i = 0; i < (int) nprocs; i++) {
struct ompi_proc_t* ompi_proc = ompi_procs[i];
mca_btl_mvapi_proc_t* ib_proc;
mca_btl_base_endpoint_t* ib_peer;
/* mvapi doesn't support heterogeneous yet... skip peers whose
 * architecture differs from ours */
if (ompi_proc_local()->proc_arch != ompi_proc->proc_arch) {
continue;
}
/* look up (or create) the shared per-proc structure */
if(NULL == (ib_proc = mca_btl_mvapi_proc_create(ompi_proc))) {
continue;
}
/*
 * Check to make sure that the peer has at least as many interface
 * addresses exported as we are trying to use. If not, then
 * don't bind this PTL instance to the proc.
 */
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
/* The btl_proc datastructure is shared by all IB PTL
 * instances that are trying to reach this destination.
 * Cache the peer instance on the btl_proc.
 */
ib_peer = OBJ_NEW(mca_btl_mvapi_endpoint_t);
if(NULL == ib_peer) {
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
ib_peer->endpoint_btl = mvapi_btl;
ib_peer->subnet = mvapi_btl->port_info.subnet;
rc = mca_btl_mvapi_proc_insert(ib_proc, ib_peer);
if(rc != OMPI_SUCCESS) {
/* insertion failed (e.g. subnet mismatch); drop this endpoint */
OBJ_RELEASE(ib_peer);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
continue;
}
ompi_bitmap_set_bit(reachable, i);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
peers[i] = ib_peer;
}
/* currently we only scale the srq the first time
add_procs is called, subsequent calls are ignored,
we should be able to change this to modify the SRQ but
I am unsure as to what this entails
*/
if( 0 == mvapi_btl->num_peers ) {
mvapi_btl->num_peers += nprocs;
if(mca_btl_mvapi_component.use_srq) {
/* rd_num grows logarithmically with the peer count, clamped at
 * srq_rd_max; the posting array is reallocated to match.
 * NOTE(review): the malloc result is not checked -- TODO confirm */
mvapi_btl->rd_num = mca_btl_mvapi_component.rd_num + log2(nprocs) * mca_btl_mvapi_component.srq_rd_per_peer;
if(mvapi_btl->rd_num > mca_btl_mvapi_component.srq_rd_max)
mvapi_btl->rd_num = mca_btl_mvapi_component.srq_rd_max;
mvapi_btl->rd_low = mvapi_btl->rd_num - 1;
free(mvapi_btl->rr_desc_post);
mvapi_btl->rr_desc_post = (VAPI_rr_desc_t*) malloc((mvapi_btl->rd_num * sizeof(VAPI_rr_desc_t)));
}
}
return OMPI_SUCCESS;
}
/*
* delete the proc as reachable from this btl module
*/
/*
 * Remove procs from this BTL module.
 *
 * Not implemented: endpoint teardown is never performed, so this merely
 * logs and reports success.
 */
int mca_btl_mvapi_del_procs(struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t ** peers)
{
    /* Stub */
    BTL_VERBOSE(("Stub\n"));

    return OMPI_SUCCESS;
}
/*
*Register callback function to support send/recv semantics
*/
/*
 * Register a receive callback for a given tag (send/recv semantics).
 * The (cbfunc, cbdata) pair is stored in the module's per-tag table
 * under the module lock.
 */
int mca_btl_mvapi_register(
struct mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_module_recv_cb_fn_t cbfunc,
void* cbdata)
{
    mca_btl_mvapi_module_t* module = (mca_btl_mvapi_module_t*) btl;
    mca_btl_mvapi_recv_reg_t* reg = &module->ib_reg[tag];

    OPAL_THREAD_LOCK(&module->ib_lock);
    reg->cbfunc = cbfunc;
    reg->cbdata = cbdata;
    OPAL_THREAD_UNLOCK(&module->ib_lock);

    return OMPI_SUCCESS;
}
/**
* Allocate a segment.
*
* @param btl (IN) BTL module
* @param size (IN) Request segment size.
*
* When allocating a segment we pull a pre-alllocated segment
* from one of two free lists, an eager list and a max list
*/
/**
 * Allocate a send descriptor.
 *
 * @param btl   (IN) BTL module
 * @param order (IN) requested ordering (ignored; mvapi is unordered)
 * @param size  (IN) requested payload size
 *
 * Pulls a pre-allocated fragment from the eager free list when the size
 * fits within eager_limit, otherwise from the max list when it fits within
 * max_send_size.  Returns NULL when the size is too large or no fragment
 * is available.
 *
 * BUG FIX: the previous code overwrote seg_len with
 * min(size, btl_eager_limit) after the allocation, silently truncating
 * fragments taken from the max list.  seg_len is now simply the requested
 * size, matching the list the fragment came from.
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_alloc(
struct mca_btl_base_module_t* btl,
uint8_t order,
size_t size)
{
    mca_btl_mvapi_frag_t* frag;
    int rc;

    if(size <= mca_btl_mvapi_component.eager_limit) {
        MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
    } else if(size <= mca_btl_mvapi_component.max_send_size) {
        MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
    } else {
        return NULL;
    }
    if(NULL == frag) {
        return NULL;
    }

    frag->segment.seg_len = size;
    frag->base.des_flags = 0;
    frag->base.order = MCA_BTL_NO_ORDER;
    return (mca_btl_base_descriptor_t*)frag;
}
/**
* Return a segment
*
* Return the segment to the appropriate
* preallocated segment list
*/
/**
 * Return a descriptor to its free list.
 *
 * If the fragment is a "frag" type carrying a registration created on its
 * behalf, the memory region is deregistered from the mpool first.
 */
int mca_btl_mvapi_free(
struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des)
{
    mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*)des;
    int must_deregister = (MCA_BTL_MVAPI_FRAG_FRAG == frag->type)
        && (frag->registration != NULL);

    if (must_deregister) {
        btl->btl_mpool->mpool_deregister(btl->btl_mpool,
            (mca_mpool_base_registration_t*) frag->registration);
        frag->registration = NULL;
    }
    MCA_BTL_IB_FRAG_RETURN(btl, frag);
    return OMPI_SUCCESS;
}
/**
* register user buffer or pack
* data into pre-registered buffer and return a
* descriptor that can be
* used for send/put.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL peer addressing
*
* prepare source's behavior depends on the following:
* Has a valid memory registration been passed to prepare_src?
* if so we attempt to use the pre-registred user-buffer, if the memory registration
* is to small (only a portion of the user buffer) then we must reregister the user buffer
* Has the user requested the memory to be left pinned?
* if so we insert the memory registration into a memory tree for later lookup, we
* may also remove a previous registration if a MRU (most recently used) list of
* registions is full, this prevents resources from being exhausted.
* Is the requested size larger than the btl's max send size?
* if so and we aren't asked to leave the registration pinned than we register the memory if
* the users buffer is contiguous
* Otherwise we choose from two free lists of pre-registered memory in which to pack the data into.
*
*/
/**
 * Register a user buffer or pack data into a pre-registered buffer and
 * return a descriptor usable for send/put.
 *
 * @param btl          (IN)  BTL module
 * @param endpoint     (IN)  BTL peer addressing (unused here)
 * @param registration (IN)  existing memory registration, or NULL
 * @param convertor    (IN)  datatype convertor describing the user data
 * @param order        (IN)  requested ordering (mvapi is unordered)
 * @param reserve      (IN)  header bytes to reserve before the payload
 * @param size         (IN/OUT) requested/actual payload size
 *
 * Zero-copy path: if the convertor needs no buffering and no header space
 * is reserved, and either a registration was supplied or the data exceeds
 * the max send size, the user buffer is used directly (registering it on
 * the fly when needed).  Otherwise the data is packed into an eager or max
 * fragment, truncating *size to what fits.
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size
)
{
mca_btl_mvapi_module_t* mvapi_btl;
mca_btl_mvapi_frag_t* frag = NULL;
mca_btl_mvapi_reg_t *mvapi_reg;
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data = *size;
int rc;
mvapi_btl = (mca_btl_mvapi_module_t*)btl;
/* zero-copy path: contiguous data, no header reserve */
if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) {
if(registration != NULL || max_data > btl->btl_max_send_size) {
MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
/* "pack" with a NULL base just resolves the user buffer address */
iov.iov_len = max_data;
iov.iov_base = NULL;
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
*size = max_data;
if(NULL == registration) {
/* register the user buffer on the fly; the frag owns the
 * registration and mca_btl_mvapi_free() releases it */
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
iov.iov_base, max_data, 0, &registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag);
return NULL;
}
frag->registration = (mca_btl_mvapi_reg_t*)registration;
}
mvapi_reg = (mca_btl_mvapi_reg_t*)registration;
frag->base.des_flags = 0;
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0; /* NOTE(review): duplicate assignment (also set above) */
frag->base.order = MCA_BTL_NO_ORDER;
frag->sg_entry.len = max_data;
frag->sg_entry.lkey = mvapi_reg->l_key;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t)iov.iov_base;
frag->segment.seg_len = max_data;
frag->segment.seg_addr.pval = iov.iov_base;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
"frag->segment.seg_key.key32[0] = %lu",
frag->sg_entry.lkey, frag->sg_entry.addr,
frag->segment.seg_key.key32[0]));
return &frag->base;
}
}
/* copy path: pack into a pre-registered eager or max fragment */
if(max_data + reserve <= btl->btl_eager_limit) {
/* the data is small enough to fit in the eager frag and
* memory is not prepinned */
MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
}
if(NULL == frag) {
/* the data doesn't fit into eager frag or eger frag is
* not available */
MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
/* truncate to the max send size; caller sends the rest later */
if(max_data + reserve > btl->btl_max_send_size) {
max_data = btl->btl_max_send_size - reserve;
}
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
if( rc < 0 ) {
MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag);
return NULL;
}
*size = max_data;
frag->segment.seg_len = max_data + reserve;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;
}
/**
* Prepare the dst buffer
*
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
* prepare dest's behavior depends on the following:
* Has a valid memory registration been passed to prepare_src?
* if so we attempt to use the pre-registred user-buffer, if the memory registration
* is to small (only a portion of the user buffer) then we must reregister the user buffer
* Has the user requested the memory to be left pinned?
* if so we insert the memory registration into a memory tree for later lookup, we
* may also remove a previous registration if a MRU (most recently used) list of
* registions is full, this prevents resources from being exhausted.
*/
/**
 * Prepare a destination descriptor for an RDMA transfer.
 *
 * @param btl          (IN)  BTL module
 * @param endpoint     (IN)  BTL peer addressing (unused here)
 * @param registration (IN)  existing memory registration, or NULL
 * @param convertor    (IN)  convertor positioned at the receive buffer
 * @param order        (IN)  requested ordering (mvapi is unordered)
 * @param reserve      (IN)  unused for the destination side
 * @param size         (IN)  length of the receive region
 *
 * Uses the user's buffer directly; registers it with the mpool when no
 * registration was passed in (the frag then owns the registration and
 * mca_btl_mvapi_free() releases it).  The remote key is exported in the
 * segment so the peer can RDMA into this region.
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size)
{
mca_btl_mvapi_module_t* mvapi_btl;
mca_btl_mvapi_frag_t* frag;
mca_btl_mvapi_reg_t *mvapi_reg;
int rc;
mvapi_btl = (mca_btl_mvapi_module_t*) btl;
MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc);
if(NULL == frag){
return NULL;
}
frag->segment.seg_len = *size;
/* point the segment at the user's buffer (no copy) */
ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
frag->base.des_flags = 0;
frag->base.order = MCA_BTL_NO_ORDER;
if(NULL == registration) {
/* we didn't get a memory registration passed in, so we have to register the region
* ourselves
*/
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
frag->segment.seg_addr.pval, *size, 0, &registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
BTL_ERROR(("mpool_register(%p,%lu) failed: base %p offset %lu",
frag->segment.seg_addr.pval, *size, convertor->pBaseBuf, convertor->bConverted));
MCA_BTL_IB_FRAG_RETURN(btl, frag);
return NULL;
}
frag->registration = (mca_btl_mvapi_reg_t*)registration;
}
mvapi_reg = (mca_btl_mvapi_reg_t*)registration;
frag->sg_entry.len = *size;
frag->sg_entry.lkey = mvapi_reg->l_key;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->segment.seg_addr.pval;
/* export the remote key so the peer can target this region */
frag->segment.seg_key.key32[0] =mvapi_reg->r_key;
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
return &frag->base;
}
/*
 * Finalize the BTL module.
 *
 * NOTE(review): no VAPI resources (protection domain, completion queues,
 * SRQs, async handler) are released here -- TODO confirm this teardown is
 * handled elsewhere or intentionally skipped.  The unused local cast of
 * btl was removed.
 */
int mca_btl_mvapi_finalize(struct mca_btl_base_module_t* btl)
{
    return OMPI_SUCCESS;
}
/*
* Initiate a send.
*/
/*
 * Initiate a send: stamp the fragment with its destination, tag and the
 * VAPI_SEND opcode, then hand it to the endpoint (which queues it if the
 * connection is not yet established).
 */
int mca_btl_mvapi_send(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor,
mca_btl_base_tag_t tag)
{
    mca_btl_mvapi_frag_t* send_frag = (mca_btl_mvapi_frag_t*)descriptor;

    send_frag->endpoint = endpoint;
    send_frag->hdr->tag = tag;
    send_frag->desc.sr_desc.opcode = VAPI_SEND;

    return mca_btl_mvapi_endpoint_send(endpoint, send_frag);
}
/*
* RDMA local buffer to remote buffer address.
*/
/*
 * RDMA write from the local source segment to the remote destination
 * address.  A low-priority send WQE token must be available; otherwise the
 * fragment is queued on the endpoint's pending list and posted later (the
 * queued case still returns OMPI_SUCCESS).  After a successful post,
 * receive descriptors are replenished (SRQ or per-endpoint).
 */
int mca_btl_mvapi_put( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor)
{
int rc;
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*) btl;
mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*) descriptor;
/* setup for queued requests */
frag->endpoint = endpoint;
frag->desc.sr_desc.opcode = VAPI_RDMA_WRITE;
/* check for a send wqe: atomically take a token, give it back and
 * queue the frag if none remained */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return OMPI_SUCCESS;
/* post descriptor */
} else {
frag->desc.sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_lp;
frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t) frag->base.des_dst->seg_addr.lval;
frag->desc.sr_desc.r_key = frag->base.des_dst->seg_key.key32[0];
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_src->seg_addr.pval;
frag->sg_entry.len = frag->base.des_src->seg_len;
if(VAPI_OK != VAPI_post_sr(mvapi_btl->nic, endpoint->lcl_qp_hndl_lp, &frag->desc.sr_desc)) {
rc = OMPI_ERROR;
} else {
rc = OMPI_SUCCESS;
}
/* replenish receive descriptors on both priority queues */
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1);
MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1);
} else
#endif
{
MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1);
MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1);
}
}
return rc;
}
/*
* RDMA read remote buffer to local buffer address.
*/
/*
 * RDMA read from the remote source segment into the local destination
 * buffer.  Needs both a low-priority send WQE token and a "get" token;
 * if either is exhausted the fragment is queued for later and OMPI_SUCCESS
 * is returned.
 *
 * NOTE(review): when the WQE token is exhausted the frag is queued on the
 * *btl* pending list (under ib_lock), but when only the get token is
 * exhausted it goes on the *endpoint* list -- verify this asymmetry is
 * intentional (mca_btl_mvapi_put() always uses the endpoint list).
 */
int mca_btl_mvapi_get( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor)
{
int rc;
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*) btl;
mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*) descriptor;
frag->endpoint = endpoint;
frag->desc.sr_desc.opcode = VAPI_RDMA_READ;
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);
opal_list_append(&mvapi_btl->pending_frags_lp, (opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);
return OMPI_SUCCESS;
/* check for a get token; on failure return both tokens taken so far */
} else if(OPAL_THREAD_ADD32(&endpoint->get_tokens,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
OPAL_THREAD_ADD32(&endpoint->get_tokens,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return OMPI_SUCCESS;
} else {
frag->desc.sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_lp;
frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t) frag->base.des_src->seg_addr.lval;
frag->desc.sr_desc.r_key = frag->base.des_src->seg_key.key32[0];
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_dst->seg_addr.pval;
frag->sg_entry.len = frag->base.des_dst->seg_len;
if(VAPI_OK != VAPI_post_sr(mvapi_btl->nic, endpoint->lcl_qp_hndl_lp, &frag->desc.sr_desc)) {
rc = OMPI_ERROR;
} else {
rc = OMPI_SUCCESS;
}
/* replenish receive descriptors on both priority queues */
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1);
MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1);
} else
#endif
{
MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1);
MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1);
}
}
return rc;
}
/*
* Asynchronous event handler to detect unforseen
* events. Usually, such events are catastrophic.
* Should have a robust mechanism to handle these
* events and abort the OMPI application if necessary.
*
*/
/*
 * Asynchronous event handler to detect unforeseen events.  Benign events
 * are logged verbosely; catastrophic ones are logged as errors.  An SRQ
 * limit event triggers reposting of receive buffers on every module.
 *
 * BUG FIX: the SRQ_LIMIT_REACHED and RECEIVE_QUEUE_DRAINED cases were
 * missing break statements and fell through into the default branch,
 * which logged a misleading "undefined event" error after the event had
 * already been handled.
 */
static void async_event_handler(VAPI_hca_hndl_t hca_hndl,
VAPI_event_record_t * event_p,
void *priv_data)
{
    switch (event_p->type) {
    case VAPI_QP_PATH_MIGRATED:
    case VAPI_EEC_PATH_MIGRATED:
    case VAPI_QP_COMM_ESTABLISHED:
    case VAPI_EEC_COMM_ESTABLISHED:
    case VAPI_SEND_QUEUE_DRAINED:
    case VAPI_PORT_ACTIVE:
    {
        /* benign/informational events */
        BTL_VERBOSE(("Got an asynchronous event: %s\n", VAPI_event_record_sym(event_p->type)));
        break;
    }
    case VAPI_CQ_ERROR:
    case VAPI_LOCAL_WQ_INV_REQUEST_ERROR:
    case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR:
    case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR:
    case VAPI_PATH_MIG_REQ_ERROR:
    case VAPI_LOCAL_EEC_CATASTROPHIC_ERROR:
    case VAPI_LOCAL_CATASTROPHIC_ERROR:
    case VAPI_PORT_ERROR:
    {
        /* catastrophic events: log the syndrome as well */
        BTL_ERROR(("Got an asynchronous event: %s (%s)",
                   VAPI_event_record_sym(event_p->type),
                   VAPI_event_syndrome_sym(event_p->syndrome)));
        break;
    }
#ifdef VAPI_FEATURE_SRQ
    case VAPI_SRQ_LIMIT_REACHED:
    {
        size_t i;
        BTL_ERROR(("SRQ limit is reached, posting more buffers %s\n", VAPI_event_record_sym(event_p->type)));
        for(i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) {
            mca_btl_mvapi_module_t* mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i];
            MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1);
            MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1);
        }
        break;
    }
    /* BWB - is this right? */
    case VAPI_RECEIVE_QUEUE_DRAINED:
    {
        fprintf(stderr, "VAPI_RECEIVE_QUEUE_DRAINEDD\n");
        break;
    }
#endif
    default:
        BTL_ERROR(("Warning!! Got an undefined "
                   "asynchronous event %s", VAPI_event_record_sym(event_p->type)));
    }
}
/*
* Initialize the btl module by allocating a protection domain
* and creating both the high and low priority completion queues
*/
/*
 * Initialize the BTL module: allocate a protection domain, optionally
 * create the high/low priority shared receive queues (SRQ), create both
 * completion queues and install the async event handler.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR on any VAPI failure.
 * NOTE(review): error paths do not release resources created earlier in
 * this function -- TODO confirm teardown is handled by the caller.
 */
int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl)
{
/* Allocate Protection Domain */
VAPI_ret_t ret;
uint32_t cqe_cnt = 0;
#ifdef VAPI_FEATURE_SRQ
VAPI_srq_attr_t srq_attr, srq_attr_out, srq_attr_mod;
VAPI_srq_attr_mask_t srq_attr_mask;
uint32_t max_outs_wr;
#endif
ret = VAPI_alloc_pd(mvapi_btl->nic, &mvapi_btl->ptag);
if(ret != VAPI_OK) {
BTL_ERROR(("error in VAPI_alloc_pd: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
mvapi_btl->srd_posted_hp = 0;
mvapi_btl->srd_posted_lp = 0;
srq_attr.pd_hndl = mvapi_btl->ptag;
srq_attr.max_outs_wr = mca_btl_mvapi_component.srq_rd_max;
srq_attr.max_sentries = mca_btl_mvapi_component.ib_sg_list_size;
/* fire the SRQ-limit async event when 90% of descriptors are consumed */
srq_attr_mod.srq_limit = mvapi_btl->rd_num * 0.9;
ret = VAPI_create_srq(mvapi_btl->nic,
&srq_attr,
&mvapi_btl->srq_hndl_hp,
&srq_attr_out);
if(ret != VAPI_OK) {
BTL_ERROR(("error in VAPI_create_srq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
srq_attr_mask = 0;
srq_attr_mask |= VAPI_SRQ_ATTR_LIMIT;
ret = VAPI_modify_srq
(
mvapi_btl->nic,
mvapi_btl->srq_hndl_hp,
&srq_attr_mod,
srq_attr_mask,
&max_outs_wr
);
if(ret != VAPI_OK) {
/* setting the limit is best-effort: some HCAs reject it */
/* BTL_ERROR(("error in VAPI_modify_srq: %s", VAPI_strerror(ret))); */
/* return OMPI_ERROR; */
}
ret = VAPI_create_srq(mvapi_btl->nic,
&srq_attr,
&mvapi_btl->srq_hndl_lp,
&srq_attr_out);
if(ret != VAPI_OK) {
BTL_ERROR(("error in VAPI_create_srq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
srq_attr_mask = 0;
srq_attr_mask |= VAPI_SRQ_ATTR_LIMIT;
ret = VAPI_modify_srq
(
mvapi_btl->nic,
mvapi_btl->srq_hndl_lp,
&srq_attr_mod,
srq_attr_mask,
&max_outs_wr
);
if(ret != VAPI_OK) {
/* best-effort, see above */
/* BTL_ERROR(("error in VAPI_modify_srq: %s", VAPI_strerror(ret))); */
/* return OMPI_ERROR; */
}
} else {
mvapi_btl->srq_hndl_hp = VAPI_INVAL_SRQ_HNDL;
mvapi_btl->srq_hndl_lp = VAPI_INVAL_SRQ_HNDL;
}
#endif /* VAPI_FEATURE_SRQ */
ret = VAPI_create_cq(mvapi_btl->nic, mca_btl_mvapi_component.ib_cq_size,
&mvapi_btl->cq_hndl_lp, &cqe_cnt);
if( VAPI_OK != ret) {
BTL_ERROR(("error in VAPI_create_cq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
ret = VAPI_create_cq(mvapi_btl->nic, mca_btl_mvapi_component.ib_cq_size,
&mvapi_btl->cq_hndl_hp, &cqe_cnt);
if( VAPI_OK != ret) {
BTL_ERROR(("error in VAPI_create_cq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
/* cqe_cnt holds the actual CQE capacity of the last CQ created */
if(cqe_cnt <= 0) {
BTL_ERROR(("error creating completion queue "));
return OMPI_ERROR;
}
ret = EVAPI_set_async_event_handler(mvapi_btl->nic,
async_event_handler, 0, &mvapi_btl->async_handler);
if(VAPI_OK != ret) {
BTL_ERROR(("error in EVAPI_set_async_event_handler: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
/*
* Dump state of btl/queues
*/
/*#include "orte/mca/ns/ns_types.h"*/
/*
 * Dump the state of an endpoint (and, in SRQ mode, the module's shared
 * queue counters) to opal_output for debugging.  The verbose argument is
 * currently unused.
 */
/*#include "orte/mca/ns/ns_types.h"*/
void mca_btl_mvapi_dump(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
int verbose)
{
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*)btl;
if( NULL == endpoint ) {
opal_output( 0, "No endpoint for this peer\n" );
return;
}
opal_output( 0, "endpoint with processor %s\n",
ORTE_NAME_PRINT( &(endpoint->endpoint_proc->proc_ompi->proc_name) ) );
/* map the connection state enum to a human-readable label */
opal_output( 0, "endpoint state: %s\n",
(endpoint->endpoint_state == MCA_BTL_IB_CONNECTING ? "connecting" :
(endpoint->endpoint_state == MCA_BTL_IB_CONNECT_ACK ? "waiting ack" :
(endpoint->endpoint_state == MCA_BTL_IB_WAITING_ACK ? "waiting final ack" :
(endpoint->endpoint_state == MCA_BTL_IB_CONNECTED ? "connected" :
(endpoint->endpoint_state == MCA_BTL_IB_CLOSED ? "closed" :
(endpoint->endpoint_state == MCA_BTL_IB_FAILED ? "failed" : "unknown")))))));
opal_output( 0, "pending send frags: %u\n", (unsigned int)opal_list_get_size(&endpoint->pending_send_frags) );
opal_output( 0, "pending frags hp : %u\n", (unsigned int)opal_list_get_size(&endpoint->pending_frags_hp) );
opal_output( 0, "pending frags lp : %u\n", (unsigned int)opal_list_get_size(&endpoint->pending_frags_lp) );
#ifdef VAPI_FEATURE_SRQ
if( mca_btl_mvapi_component.use_srq ) {
/* SRQ mode: counters live on the module, not the endpoint */
opal_output( 0, "mvapi_btl->srd_posted_hp %d\n", mvapi_btl->srd_posted_hp );
opal_output( 0, "mvapi_btl->srd_posted_lp %d\n", mvapi_btl->srd_posted_lp );
opal_output( 0, "mvapi_btl->sd_tokens_hp %d\n", mvapi_btl->sd_tokens_hp );
opal_output( 0, "mvapi_btl->sd_tokens_lp %d\n", mvapi_btl->sd_tokens_lp );
} else {
#endif /* VAPI_FEATURE_SRQ */
opal_output( 0, "sd_tokens_hp %d\n", endpoint->sd_tokens_hp );
opal_output( 0, "sd_tokens_lp %d\n", endpoint->sd_tokens_lp );
opal_output( 0, "get_tokens %d\n", endpoint->get_tokens );
opal_output( 0, "rd_posted_hp %d\n", endpoint->rd_posted_hp );
opal_output( 0, "rd_posted_lp %d\n", endpoint->rd_posted_lp );
opal_output( 0, "rd_credits_hp %d\n", endpoint->rd_credits_hp );
opal_output( 0, "rd_credits_lp %d\n", endpoint->rd_credits_lp );
opal_output( 0, "sd_credits_hp %d\n", endpoint->sd_credits_hp );
opal_output( 0, "sd_credits_lp %d\n", endpoint->sd_credits_lp );
#ifdef VAPI_FEATURE_SRQ
}
#endif /* VAPI_FEATURE_SRQ */
opal_output( 0, "sd_wqe_hp %d\n", endpoint->sd_wqe_hp );
opal_output( 0, "sd_wqe_lp %d\n", endpoint->sd_wqe_lp );
}
/*
 * Checkpoint/restart (fault tolerance) hook.  The mvapi BTL requires no
 * state-specific action for any phase, so every case is a deliberate
 * no-op and OMPI_SUCCESS is always returned.
 */
int mca_btl_mvapi_ft_event(int state) {
    switch(state) {
    case OPAL_CRS_CHECKPOINT:  /* nothing to quiesce */
    case OPAL_CRS_CONTINUE:    /* nothing to resume */
    case OPAL_CRS_RESTART:     /* nothing to rebuild */
    case OPAL_CRS_TERM:        /* nothing to tear down */
    default:
        break;
    }
    return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,532 +0,0 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PTL_IB_H
#define MCA_PTL_IB_H
/* Standard system includes */
#include <sys/types.h>
#include <string.h>
/* Open MPI includes */
#include "ompi/class/ompi_free_list.h"
#include "ompi/class/ompi_bitmap.h"
#include "orte/class/orte_pointer_array.h"
#include "opal/event/event.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "opal/util/output.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/base.h"
#include "btl_mvapi_endpoint.h"
#include <vapi.h>
#include <mtl_common.h>
#include <vapi_common.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
#define MCA_BTL_IB_LEAVE_PINNED 1
/**
 * InfiniBand (IB) BTL component: global, per-process configuration and
 * state shared by all mvapi BTL modules.  Most fields mirror MCA
 * parameters registered at component open time.
 */
struct mca_btl_mvapi_component_t {
mca_btl_base_component_1_0_1_t super; /**< base BTL component */
uint32_t ib_max_btls;
/**< maximum number of hcas available to the IB component */
uint32_t ib_num_btls;
/**< number of hcas available to the IB component */
struct mca_btl_mvapi_module_t *mvapi_btls;
/**< array of available PTLs */
int ib_free_list_num;
/**< initial size of free lists */
int ib_free_list_max;
/**< maximum size of free lists */
int ib_free_list_inc;
/**< number of elements to alloc when growing free lists */
opal_list_t ib_procs;
/**< list of ib proc structures */
opal_event_t ib_send_event;
/**< event structure for sends */
opal_event_t ib_recv_event;
/**< event structure for recvs */
opal_mutex_t ib_lock;
/**< lock for accessing module state */
char* ib_mpool_name;
/**< name of ib memory pool */
int32_t rd_num; /**< the number of receive descriptors to post to each queue pair */
int32_t rd_low; /**< low water mark to reach before posting additional receive descriptors */
int32_t rd_win; /**< ack credits when window size exceeded */
int32_t rd_rsv; /**< descriptors held in reserve for control messages */
/* number of srq send tokes available */
int32_t srq_sd_max;
int32_t srq_rd_max; /**< hard cap on SRQ receive descriptors */
int32_t srq_rd_per_peer;
/**< the number of recv desc posted per log(peer) in SRQ mode */
size_t eager_limit; /**< max payload for eager fragments */
size_t max_send_size; /**< max payload for a single send fragment */
uint32_t reg_mru_len;
uint32_t use_srq; /**< nonzero: use shared receive queues */
uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_wq_size; /**< Max outstanding WR on the WQ */
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ*/
uint32_t ib_pkey_ix; /**< partition key index */
uint32_t ib_psn; /**< packet sequence number */
uint32_t ib_qp_ous_rd_atom;
uint32_t ib_mtu;
uint32_t ib_min_rnr_timer;
uint32_t ib_timeout;
uint32_t ib_retry_count;
uint32_t ib_rnr_retry;
uint32_t ib_max_rdma_dst_ops;
uint32_t ib_service_level;
uint32_t ib_static_rate;
uint32_t ib_src_path_bits;
uint32_t use_eager_rdma; /**< nonzero: enable eager RDMA protocol */
uint32_t eager_rdma_threshold;
uint32_t eager_rdma_num;
uint32_t max_eager_rdma;
}; typedef struct mca_btl_mvapi_component_t mca_btl_mvapi_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_mvapi_component_t mca_btl_mvapi_component;
/* per-tag receive callback registration (filled by mca_btl_mvapi_register) */
typedef mca_btl_base_recv_reg_t mca_btl_mvapi_recv_reg_t;
/**
 * IB BTL module: per-HCA-port state (one instance per usable port).
 */
struct mca_btl_mvapi_module_t {
mca_btl_base_module_t super; /**< base PTL interface */
bool btl_inited; /* true once mca_btl_mvapi_module_init succeeded */
mca_btl_mvapi_recv_reg_t ib_reg[256]; /* per-tag receive callbacks */
mca_btl_mvapi_port_info_t port_info; /* contains only the subnet right now */
VAPI_hca_id_t hca_id; /**< ID of HCA */
IB_port_t port_id; /**< ID of the PORT */
VAPI_hca_port_t port; /**< IB port of this PTL */
VAPI_hca_hndl_t nic; /**< NIC handle */
VAPI_pd_hndl_t ptag; /**< Protection Domain tag */
VAPI_cq_hndl_t cq_hndl_hp; /**< High Priority Completion Queue handle */
VAPI_cq_hndl_t cq_hndl_lp; /**< Low Priority Completion Queue handle */
EVAPI_async_handler_hndl_t async_handler;
/**< Async event handler used to detect weird/unknown events */
ompi_free_list_t send_free_eager; /**< free list of eager buffer descriptors */
ompi_free_list_t send_free_max; /**< free list of max buffer descriptors */
ompi_free_list_t send_free_frag; /**< free list of frags only... used for pining memory */
ompi_free_list_t recv_free_eager; /**< High priority free list of buffer descriptors */
ompi_free_list_t recv_free_max; /**< Low priority free list of buffer descriptors */
opal_mutex_t ib_lock; /**< module level lock */
VAPI_rr_desc_t* rr_desc_post; /**< an array to allow posting of rr in one swoop */
#ifdef VAPI_FEATURE_SRQ
VAPI_srq_hndl_t srq_hndl_hp; /**< A high priority shared receive queue
runtime optional, can also use a receive queue
per queue pair.. */
VAPI_srq_hndl_t srq_hndl_lp; /**< A low priority shared receive queue */
#endif
size_t ib_inline_max; /**< max size of inline send*/
int32_t num_peers; /**< peer count, set on first add_procs */
int32_t srd_posted_hp; /**< number of high priority shared receive descriptors posted to the nic*/
int32_t srd_posted_lp; /**< number of low priority shared receive descriptors posted to the nic*/
int32_t rd_num; /**< number of receive descriptors to post to srq */
int32_t rd_low; /**< low water mark before reposting descriptors to srq */
int32_t sd_tokens_hp; /**< number of send tokens available on high priority srq */
int32_t sd_tokens_lp; /**< number of send tokens available on low priority srq */
opal_list_t pending_frags_hp; /**< list of pending high priority frags */
opal_list_t pending_frags_lp; /**< list of pending low priority frags */
opal_mutex_t eager_rdma_lock;
size_t eager_rdma_frag_size; /**< length of eager frag */
orte_pointer_array_t *eager_rdma_buffers; /**< RDMA buffers to poll */
uint32_t eager_rdma_buffers_count; /**< number of RDMA buffers */
}; typedef struct mca_btl_mvapi_module_t mca_btl_mvapi_module_t;
/** A VAPI memory registration: region handle plus local/remote access keys. */
struct mca_btl_mvapi_reg_t {
mca_mpool_base_registration_t base; /* base registration (must be first) */
VAPI_mr_hndl_t hndl; /* Memory region handle */
VAPI_lkey_t l_key; /* Local key to registered memory */
VAPI_rkey_t r_key; /* Remote key to registered memory */
};
typedef struct mca_btl_mvapi_reg_t mca_btl_mvapi_reg_t;
/*
 * Repost receive descriptors to the high-priority SRQ when the posted
 * count falls to the low-water mark (rd_low + additional).
 *
 * fix: the body is a single do{...}while(0) — the original wrapped the
 * do-while in an extra brace pair with a trailing semicolon, which breaks
 * "if (x) MACRO(...); else ..." at call sites.  Macro arguments are now
 * parenthesized against operator-precedence surprises.
 */
#define MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, additional)                   \
    do {                                                                     \
        OPAL_THREAD_LOCK(&(mvapi_btl)->ib_lock);                             \
        if((mvapi_btl)->srd_posted_hp <= (mvapi_btl)->rd_low+(additional) && \
           (mvapi_btl)->srd_posted_hp < (mvapi_btl)->rd_num) {               \
            MCA_BTL_MVAPI_POST_SRR_SUB((mvapi_btl)->rd_num -                 \
                                       (mvapi_btl)->srd_posted_hp,           \
                                       mvapi_btl,                            \
                                       &(mvapi_btl)->recv_free_eager,        \
                                       &(mvapi_btl)->srd_posted_hp,          \
                                       (mvapi_btl)->nic,                     \
                                       (mvapi_btl)->srq_hndl_hp);            \
        }                                                                    \
        OPAL_THREAD_UNLOCK(&(mvapi_btl)->ib_lock);                           \
    } while(0)
/*
 * Repost receive descriptors to the low-priority SRQ when the posted
 * count falls to the low-water mark (rd_low + additional).
 *
 * fix: plain do{...}while(0) body (the original's extra braces and
 * trailing semicolon break if/else call sites); arguments parenthesized.
 */
#define MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, additional)                    \
    do {                                                                     \
        OPAL_THREAD_LOCK(&(mvapi_btl)->ib_lock);                             \
        if((mvapi_btl)->srd_posted_lp <= (mvapi_btl)->rd_low+(additional) && \
           (mvapi_btl)->srd_posted_lp < (mvapi_btl)->rd_num) {               \
            MCA_BTL_MVAPI_POST_SRR_SUB((mvapi_btl)->rd_num -                 \
                                       (mvapi_btl)->srd_posted_lp,           \
                                       mvapi_btl,                            \
                                       &(mvapi_btl)->recv_free_max,          \
                                       &(mvapi_btl)->srd_posted_lp,          \
                                       (mvapi_btl)->nic,                     \
                                       (mvapi_btl)->srq_hndl_lp);            \
        }                                                                    \
        OPAL_THREAD_UNLOCK(&(mvapi_btl)->ib_lock);                           \
    } while(0)
/*
 * Worker for the POST_SRR_* macros: pull `cnt` fragments off `frag_list`,
 * build the receive work-request array and post it to the shared receive
 * queue in one call.  `srd_posted` is a pointer to the posted counter.
 *
 * fix: plain do{...}while(0) body (original wrapped it in extra braces
 * with a trailing semicolon, breaking if/else call sites); `%u` used for
 * the uint32_t rwqe_posted instead of `%d`.
 */
#define MCA_BTL_MVAPI_POST_SRR_SUB(cnt, mvapi_btl, frag_list, srd_posted,    \
                                   nic, srq_hndl)                            \
    do {                                                                     \
        int32_t i;                                                           \
        VAPI_ret_t ret;                                                      \
        uint32_t rwqe_posted = 0;                                            \
        int rc;                                                              \
        ompi_free_list_item_t* item = NULL;                                  \
        mca_btl_mvapi_frag_t* frag = NULL;                                   \
        VAPI_rr_desc_t* desc_post = (mvapi_btl)->rr_desc_post;               \
        for(i = 0; i < (cnt); i++) {                                         \
            OMPI_FREE_LIST_WAIT(frag_list, item, rc);                        \
            frag = (mca_btl_mvapi_frag_t*) item;                             \
            /* receive length covers the btl header plus the payload */     \
            frag->sg_entry.len = frag->size +                                \
                ((unsigned char*) frag->segment.seg_addr.pval -              \
                 (unsigned char*) frag->hdr);                                \
            desc_post[i] = frag->desc.rr_desc;                               \
        }                                                                    \
        ret = VAPI_post_srq( nic,                                            \
                             srq_hndl,                                       \
                             (cnt),                                          \
                             desc_post,                                      \
                             &rwqe_posted);                                  \
        if(VAPI_OK != ret) {                                                 \
            BTL_ERROR(("error posting receive descriptors to shared receive queue: %s", \
                       VAPI_strerror(ret)));                                 \
        } else if(rwqe_posted < 1) {                                         \
            BTL_ERROR(("error posting receive descriptors to shared receive queue, number of entries posted is %u", rwqe_posted)); \
        } else {                                                             \
            OPAL_THREAD_ADD32(srd_posted, (cnt));                            \
        }                                                                    \
    } while(0)
struct mca_btl_mvapi_frag_t;
/* Template module instance copied for each BTL module created at init. */
extern mca_btl_mvapi_module_t mca_btl_mvapi_module;
/**
 * Register IB component parameters with the MCA framework
 */
extern int mca_btl_mvapi_component_open(void);
/**
 * Any final cleanup before being unloaded.
 */
extern int mca_btl_mvapi_component_close(void);
/**
 * IB component initialization.
 *
 * @param num_btl_modules (OUT) Number of BTLs returned in BTL array.
 * @param allow_multi_user_threads (IN) Flag indicating whether BTL supports user threads.
 *        NOTE(review): documented upstream as OUT but passed by value — confirm intent.
 * @param have_hidden_threads (IN) Flag indicating whether BTL uses threads.
 *        NOTE(review): same by-value caveat as above.
 *
 * (1) read interface list from kernel and compare against component parameters
 * then create a BTL instance for selected interfaces
 * (2) setup IB listen socket for incoming connection attempts
 * (3) publish BTL addressing info
 *
 */
extern mca_btl_base_module_t** mca_btl_mvapi_component_init(
int *num_btl_modules,
bool allow_multi_user_threads,
bool have_hidden_threads
);
/**
 * IB component progress.
 */
extern int mca_btl_mvapi_component_progress( void );
/**
 * Register a callback function that is called on receipt of a fragment
 * carrying the given tag.
 *
 * @param btl (IN)    BTL module
 * @param tag (IN)    Tag that selects this callback
 * @param cbfunc (IN) Function invoked when a matching fragment arrives
 * @param cbdata (IN) Opaque data passed back to the callback
 * @return OMPI_SUCCESS or error status on failure
 */
int mca_btl_mvapi_register(
struct mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_module_recv_cb_fn_t cbfunc,
void* cbdata
);
/**
 * Cleanup any resources held by the BTL.
 *
 * @param btl BTL instance.
 * @return OMPI_SUCCESS or error status on failure.
 */
extern int mca_btl_mvapi_finalize(
struct mca_btl_base_module_t* btl
);
/**
 * PML->BTL notification of change in the process list.
 *
 * @param btl (IN)           BTL instance
 * @param nprocs (IN)        Number of processes
 * @param procs (IN)         Set of processes
 * @param peers (OUT)        Set of (optional) peer addressing info.
 * @param reachable (IN/OUT) Bitmap of processes that are reachable via this BTL.
 * @return OMPI_SUCCESS or error status on failure.
 */
extern int mca_btl_mvapi_add_procs(
struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t** peers,
ompi_bitmap_t* reachable
);
/**
 * PML->BTL notification of processes being removed from the process list.
 *
 * @param btl (IN)    BTL instance
 * @param nprocs (IN) Number of processes.
 * @param procs (IN)  Set of processes.
 * @param peers (IN)  Set of peer data structures.
 * @return Status indicating if cleanup was successful
 *
 */
extern int mca_btl_mvapi_del_procs(
struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_btl_base_endpoint_t** peers
);
/**
 * PML->BTL Initiate a send of the data described by the descriptor.
 *
 * @param btl (IN)        BTL instance
 * @param btl_peer (IN)   BTL peer addressing
 * @param descriptor (IN) Descriptor describing the data to deliver
 * @param tag (IN)        Tag selecting the receive callback on the peer
 * @return OMPI_SUCCESS or error status on failure
 */
extern int mca_btl_mvapi_send(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor,
mca_btl_base_tag_t tag
);
/**
 * PML->BTL Initiate an RDMA put (write) of the data described by the
 * descriptor to the remote peer.
 *
 * @param btl (IN)        BTL instance
 * @param btl_peer (IN)   BTL peer addressing
 * @param descriptor (IN) Descriptor holding local source and remote
 *                        destination segments
 * @return OMPI_SUCCESS or error status on failure
 */
extern int mca_btl_mvapi_put(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor  /* fix: was "decriptor" */
);
/**
 * PML->BTL Initiate an RDMA get (read) of the data described by the
 * descriptor from the remote peer.
 *
 * @param btl (IN)        BTL instance
 * @param btl_peer (IN)   BTL peer addressing
 * @param descriptor (IN) Descriptor holding remote source and local
 *                        destination segments
 * @return OMPI_SUCCESS or error status on failure
 */
extern int mca_btl_mvapi_get(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* btl_peer,
struct mca_btl_base_descriptor_t* descriptor  /* fix: was "decriptor" */
);
/**
 * Allocate a descriptor.
 *
 * @param btl (IN)   BTL module
 * @param order (IN) Ordering/priority tag for the descriptor
 * @param size (IN)  Requested descriptor size.
 */
extern mca_btl_base_descriptor_t* mca_btl_mvapi_alloc(
struct mca_btl_base_module_t* btl,
uint8_t order,
size_t size);
/**
 * Return a descriptor allocated by this BTL.
 *
 * @param btl (IN) BTL module
 * @param des (IN) Allocated descriptor.
 */
extern int mca_btl_mvapi_free(
struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des);
/**
 * Pack data and return a descriptor that can be used for send/put.
 *
 * @param btl (IN)          BTL module
 * @param peer (IN)         BTL peer addressing
 * @param registration (IN) Existing memory registration for the data, or NULL
 * @param convertor (IN)    Convertor describing the data to pack
 * @param order (IN)        Ordering/priority tag
 * @param reserve (IN)      Bytes to reserve ahead of the payload
 * @param size (IN/OUT)     Requested / actually packed payload size
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size
);
/**
 * Allocate a descriptor initialized for RDMA write.
 * Parameters are as for mca_btl_mvapi_prepare_src.
 */
extern mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size);
/**
 * Return a send fragment to the module's free list.
 *
 * @param btl (IN) BTL instance
 * @param (IN)     IB send fragment (unnamed parameter)
 *
 */
extern void mca_btl_mvapi_send_frag_return(
struct mca_btl_base_module_t* btl,
struct mca_btl_mvapi_frag_t*
);
/*
 * Dump state of btl/queues
 */
extern void mca_btl_mvapi_dump(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
int verbose
);
/** One-time initialization of a freshly created module instance. */
int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t* mvapi_btl);
/**
 * Fault Tolerance Event Notification Function
 * @param state Checkpoint State
 * @return OMPI_SUCCESS or failure status
 */
int mca_btl_mvapi_ft_event(int state);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,88 +0,0 @@
/*
* Copyright (c) 2006 Voltaire All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_MVAPI_EAGER_RDMA_BUF_H
#define MCA_BTL_MVAPI_EAGER_RDMA_BUF_H
#include "ompi_config.h"
#include "btl_mvapi.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
struct mca_btl_mvapi_reg_t;
struct mca_btl_mvapi_frag_t;
/** Local (receiving) side of an eager-RDMA connection. */
struct mca_btl_mvapi_eager_rdma_local_t {
ompi_ptr_t base; /**< buffer for RDMAing eager messages */
struct mca_btl_mvapi_frag_t *frags; /**< fragments backing the buffer */
struct mca_btl_mvapi_reg_t *reg; /**< registration covering the buffer */
uint16_t head; /**< RDMA buffer to poll */
uint16_t tail; /**< Needed for credit management */
int32_t credits; /**< number of RDMA credits */
#if OMPI_ENABLE_DEBUG
uint32_t seq; /**< debug-only sequence number */
#endif
opal_mutex_t lock; /**< guard access to RDMA buffer */
};
typedef struct mca_btl_mvapi_eager_rdma_local_t mca_btl_mvapi_eager_rdma_local_t;
/** Remote (sending) side of an eager-RDMA connection. */
struct mca_btl_mvapi_eager_rdma_remote_t {
ompi_ptr_t base; /**< address of remote buffer */
uint64_t rkey; /**< RKey for accessing remote buffer */
uint16_t head; /**< RDMA buffer to post to */
int32_t tokens; /**< number of RDMA tokens */
#if OMPI_ENABLE_DEBUG
uint32_t seq; /**< debug-only sequence number */
#endif
};
typedef struct mca_btl_mvapi_eager_rdma_remote_t mca_btl_mvapi_eager_rdma_remote_t;
/* True when fragment F belongs to the eager-RDMA protocol. */
#define MCA_BTL_MVAPI_RDMA_FRAG(F) ((F)->type == MCA_BTL_MVAPI_FRAG_EAGER_RDMA)
/* Ownership flag values stored in the last byte of footer u.buf. */
#define EAGER_RDMA_BUFFER_REMOTE (0)
#define EAGER_RDMA_BUFFER_LOCAL (0xff)
/* The footer packs a 24-bit size and the 8-bit ownership flag into one
 * 32-bit word; which byte holds the flag depends on endianness (the flag
 * always lives in u.buf[3]). */
#ifdef WORDS_BIGENDIAN
#define MCA_BTL_MVAPI_RDMA_FRAG_GET_SIZE(F) ((F)->u.size >> 8)
#define MCA_BTL_MVAPI_RDMA_FRAG_SET_SIZE(F, S) \
((F)->u.size = (S) << 8)
#else
#define MCA_BTL_MVAPI_RDMA_FRAG_GET_SIZE(F) ((F)->u.size & 0x00ffffff)
#define MCA_BTL_MVAPI_RDMA_FRAG_SET_SIZE(F, S) \
((F)->u.size = (S) & 0x00ffffff)
#endif
/* Volatile reads: the flag byte is written by the peer's RDMA. */
#define MCA_BTL_MVAPI_RDMA_FRAG_LOCAL(F) \
(((volatile uint8_t*)(F)->ftr->u.buf)[3] != EAGER_RDMA_BUFFER_REMOTE)
#define MCA_BTL_MVAPI_RDMA_FRAG_REMOTE(F) \
(!MCA_BTL_MVAPI_RDMA_FRAG_LOCAL(F))
/* Flip buffer ownership; F is a footer here, not a fragment. */
#define MCA_BTL_MVAPI_RDMA_MAKE_REMOTE(F) do { \
((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_REMOTE; \
}while (0)
#define MCA_BTL_MVAPI_RDMA_MAKE_LOCAL(F) do { \
((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_LOCAL; \
}while (0)
/* Fragment at index I of endpoint E's local eager-RDMA buffer. */
#define MCA_BTL_MVAPI_GET_LOCAL_RDMA_FRAG(E, I) \
(&(E)->eager_rdma_local.frags[(I)])
/* Advance a circular index over the eager_rdma_num buffers. */
#define MCA_BTL_MVAPI_RDMA_NEXT_INDEX(I) do { \
(I) = ((I) + 1) % \
mca_btl_mvapi_component.eager_rdma_num; \
} while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,257 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_IB_ENDPOINT_H
#define MCA_BTL_IB_ENDPOINT_H
#include "opal/class/opal_list.h"
#include "opal/event/event.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "btl_mvapi_frag.h"
#include "btl_mvapi.h"
#include "btl_mvapi_eager_rdma.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include <vapi.h>
#include <mtl_common.h>
#include <vapi_common.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_btl_mvapi_endpoint_t);
struct mca_btl_mvapi_frag_t;
/** Per-port addressing info published to peers via the modex. */
struct mca_btl_mvapi_port_info_t {
uint32_t subnet; /**< IB subnet the port belongs to */
};
typedef struct mca_btl_mvapi_port_info_t mca_btl_mvapi_port_info_t;
/**
 * State of IB endpoint connection (three-way connect handshake).
 */
typedef enum {
/* Defines the state in which this BTL instance
 * has started the process of connection */
MCA_BTL_IB_CONNECTING,
/* Waiting for ack from endpoint */
MCA_BTL_IB_CONNECT_ACK,
/* Waiting for final connection ACK from endpoint */
MCA_BTL_IB_WAITING_ACK,
/* Connected ... both sender & receiver have
 * buffers associated with this connection */
MCA_BTL_IB_CONNECTED,
/* Connection is closed, there are no resources
 * associated with this */
MCA_BTL_IB_CLOSED,
/* Maximum number of retries have been used.
 * Report failure on send to upper layer */
MCA_BTL_IB_FAILED
} mca_btl_mvapi_endpoint_state_t;
/** Remote-side connection info exchanged while setting up the QPs. */
struct mca_btl_mvapi_rem_info_t {
VAPI_qp_num_t rem_qp_num_hp;
/* High priority remote side QP number */
VAPI_qp_num_t rem_qp_num_lp;
/* Low priority remote side QP number */
IB_lid_t rem_lid;
/* Local identifier of the remote process */
uint32_t rem_subnet;
/* subnet of remote process */
} ;
typedef struct mca_btl_mvapi_rem_info_t mca_btl_mvapi_rem_info_t;
/**
 * An abstraction that represents a connection to a endpoint process.
 * An instance of mca_btl_base_endpoint_t is associated w/ each process
 * and BTL pair at startup. However, connections to the endpoint
 * are established dynamically on an as-needed basis:
 */
struct mca_btl_base_endpoint_t {
opal_list_item_t super; /**< allows the endpoint to live on opal lists */
struct mca_btl_mvapi_module_t* endpoint_btl;
/**< BTL instance that created this connection */
struct mca_btl_mvapi_proc_t* endpoint_proc;
/**< proc structure corresponding to endpoint */
mca_btl_mvapi_endpoint_state_t endpoint_state;
/**< current state of the connection */
size_t endpoint_retries;
/**< number of connection retries attempted */
double endpoint_tstamp;
/**< timestamp of when the first connection was attempted */
opal_mutex_t endpoint_lock;
/**< lock for concurrent access to endpoint state */
opal_list_t pending_send_frags;
/**< list of pending send frags for this endpoint */
opal_list_t pending_frags_hp; /**< list of pending high priority frags */
opal_list_t pending_frags_lp; /**< list of pending low priority frags */
mca_btl_mvapi_rem_info_t rem_info; /**< cached remote-side QP/LID info */
VAPI_qp_hndl_t lcl_qp_hndl_hp; /* High priority local QP handle */
VAPI_qp_hndl_t lcl_qp_hndl_lp; /* Low priority local QP handle */
VAPI_qp_prop_t lcl_qp_prop_hp; /* High priority local QP properties */
VAPI_qp_prop_t lcl_qp_prop_lp; /* Low priority local QP properties */
int32_t sd_tokens_hp; /**< number of high priority send tokens */
int32_t sd_tokens_lp; /**< number of low priority send tokens */
int32_t get_tokens; /**< number of available get tokens */
int32_t rd_posted_hp; /**< number of high priority descriptors posted to the nic*/
int32_t rd_posted_lp; /**< number of low priority descriptors posted to the nic*/
int32_t rd_credits_hp; /**< number of high priority credits to return to peer */
int32_t rd_credits_lp; /**< number of low priority credits to return to peer */
int32_t sd_credits_hp; /**< number of send wqe entries being used to return credits */
int32_t sd_credits_lp; /**< number of send wqe entries being used to return credits */
int32_t sd_wqe_hp; /**< number of available high priority send wqe entries */
int32_t sd_wqe_lp; /**< number of available low priority send wqe entries */
uint32_t subnet; /**< subnet id of this endpoint — NOTE(review): confirm local vs remote */
uint32_t eager_recv_count; /**< number of eager received */
mca_btl_mvapi_eager_rdma_remote_t eager_rdma_remote;
/**< info about remote RDMA buffer */
mca_btl_mvapi_eager_rdma_local_t eager_rdma_local;
/**< info about local RDMA buffer */
int32_t eager_rdma_index; /**< index into RDMA buffers pointer array */
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_mvapi_endpoint_t;
/** Send (or queue) a fragment on this endpoint. */
int mca_btl_mvapi_endpoint_send(mca_btl_base_endpoint_t* endpoint, struct mca_btl_mvapi_frag_t* frag);
/** Initiate connection establishment for this endpoint. */
int mca_btl_mvapi_endpoint_connect(mca_btl_base_endpoint_t*);
/** Return accumulated high-priority receive credits to the peer. */
void mca_btl_mvapi_endpoint_send_credits_hp(mca_btl_base_endpoint_t*);
/** Return accumulated low-priority receive credits to the peer. */
void mca_btl_mvapi_endpoint_send_credits_lp(mca_btl_base_endpoint_t*);
void mca_btl_mvapi_post_recv(void);
/** Set up the local eager-RDMA buffer and advertise it to the peer. */
void mca_btl_mvapi_endpoint_connect_eager_rdma(mca_btl_mvapi_endpoint_t*);
/*
 * Repost receive descriptors to the endpoint's high-priority QP when the
 * posted count falls to the low-water mark (rd_low + additional).
 *
 * fix: plain do{...}while(0) body — the original wrapped the do-while in
 * an extra brace pair with a trailing semicolon, which breaks
 * "if (x) MACRO(...); else ..." at call sites.
 */
#define MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, additional)            \
    do {                                                                     \
        mca_btl_mvapi_module_t * mvapi_btl = (endpoint)->endpoint_btl;       \
        OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);                               \
        if((endpoint)->rd_posted_hp <= mca_btl_mvapi_component.rd_low+(additional) && \
           (endpoint)->rd_posted_hp < mvapi_btl->rd_num) {                   \
            MCA_BTL_MVAPI_ENDPOINT_POST_RR_SUB(mvapi_btl->rd_num -           \
                                               (endpoint)->rd_posted_hp,     \
                                               endpoint,                     \
                                               &mvapi_btl->recv_free_eager,  \
                                               endpoint->rd_posted_hp,       \
                                               endpoint->rd_credits_hp,      \
                                               mvapi_btl->nic,               \
                                               endpoint->lcl_qp_hndl_hp);    \
        }                                                                    \
        OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);                             \
    } while(0)
/*
 * Repost receive descriptors to the endpoint's low-priority QP when the
 * posted count falls to the low-water mark (rd_low + additional).
 *
 * fix: plain do{...}while(0) body (original's extra braces and trailing
 * semicolon break if/else call sites).
 */
#define MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, additional)             \
    do {                                                                     \
        mca_btl_mvapi_module_t * mvapi_btl = (endpoint)->endpoint_btl;       \
        OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);                               \
        if((endpoint)->rd_posted_lp <= mca_btl_mvapi_component.rd_low+(additional) && \
           (endpoint)->rd_posted_lp < mvapi_btl->rd_num) {                   \
            MCA_BTL_MVAPI_ENDPOINT_POST_RR_SUB(mvapi_btl->rd_num -           \
                                               (endpoint)->rd_posted_lp,     \
                                               endpoint,                     \
                                               &mvapi_btl->recv_free_max,    \
                                               endpoint->rd_posted_lp,       \
                                               endpoint->rd_credits_lp,      \
                                               mvapi_btl->nic,               \
                                               endpoint->lcl_qp_hndl_lp);    \
        }                                                                    \
        OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);                             \
    } while(0)
/*
 * Worker for the ENDPOINT_POST_RR_* macros: pull `cnt` fragments off
 * `frag_list`, attach them to the endpoint, and post the receive
 * work-request array to the given QP in one EVAPI call.  `rd_posted` and
 * `rd_credits` are lvalues updated on success.
 *
 * fix: plain do{...}while(0) body (original wrapped it in extra braces
 * with a trailing semicolon, breaking if/else call sites).
 */
#define MCA_BTL_MVAPI_ENDPOINT_POST_RR_SUB(cnt, my_endpoint, frag_list,      \
                                           rd_posted, rd_credits, nic, qp)   \
    do {                                                                     \
        int32_t i;                                                           \
        int rc;                                                              \
        int32_t num_post = (cnt);                                            \
        mca_btl_mvapi_module_t *mvapi_btl = (my_endpoint)->endpoint_btl;     \
        VAPI_rr_desc_t* desc_post = mvapi_btl->rr_desc_post;                 \
        for(i = 0; i < num_post; i++) {                                      \
            ompi_free_list_item_t* item;                                     \
            mca_btl_mvapi_frag_t* frag = NULL;                               \
            OMPI_FREE_LIST_WAIT(frag_list, item, rc);                        \
            frag = (mca_btl_mvapi_frag_t*) item;                             \
            frag->endpoint = (my_endpoint);                                  \
            /* receive length covers the btl header plus the payload */     \
            frag->sg_entry.len = frag->size +                                \
                ((unsigned char*) frag->segment.seg_addr.pval -              \
                 (unsigned char*) frag->hdr);                                \
            desc_post[i] = frag->desc.rr_desc;                               \
        }                                                                    \
        rc = EVAPI_post_rr_list( nic,                                        \
                                 qp,                                         \
                                 num_post,                                   \
                                 desc_post);                                 \
        if(VAPI_OK != rc) {                                                  \
            BTL_ERROR(("error posting receive descriptors: %s",              \
                       VAPI_strerror(rc)));                                  \
        } else {                                                             \
            OPAL_THREAD_ADD32(&(rd_posted), num_post);                       \
            OPAL_THREAD_ADD32(&(rd_credits), num_post);                      \
        }                                                                    \
    } while(0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,160 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_mvapi_frag.h"
/*
 * Shared constructor logic for every fragment type: derive the header and
 * segment pointers from the free-list allocation and seed the
 * scatter/gather entry with the registration's local key.
 * NOTE: relies on frag->size having been set by the caller beforehand.
 */
static void mca_btl_mvapi_frag_common_constructor( mca_btl_mvapi_frag_t* frag)
{
/* the free-list allocation carries the memory registration */
mca_btl_mvapi_reg_t* mem_hndl =
(mca_btl_mvapi_reg_t*)frag->base.super.registration;
/* btl header sits at the very start of the allocated buffer */
frag->hdr = (mca_btl_mvapi_header_t*)frag->base.super.ptr;
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_mvapi_header_t);
/* init the segment address to start after the btl header */
frag->segment.seg_len = frag->size;
frag->sg_entry.lkey = mem_hndl->l_key;
frag->segment.seg_key.key32[0] = frag->sg_entry.lkey;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr;
frag->base.des_flags = 0;
}
/*
 * Send-side constructor: mark the segment as the (single) source and
 * pre-build the VAPI send work request.
 */
static void mca_btl_mvapi_send_frag_common_constructor(mca_btl_mvapi_frag_t* frag)
{
mca_btl_mvapi_frag_common_constructor(frag);
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
/* signaled so the completion appears on the CQ */
frag->desc.sr_desc.comp_type = VAPI_SIGNALED;
frag->desc.sr_desc.opcode = VAPI_SEND;
frag->desc.sr_desc.remote_qkey = 0;
frag->desc.sr_desc.sg_lst_len = 1;
frag->desc.sr_desc.sg_lst_p = &frag->sg_entry;
/* stash the frag pointer in the work-request id */
frag->desc.sr_desc.id = (VAPI_virt_addr_t) (MT_virt_addr_t) frag;
}
/*
 * Receive-side constructor: mark the segment as the (single) destination
 * and pre-build the VAPI receive work request.
 */
static void mca_btl_mvapi_recv_frag_common_constructor(mca_btl_mvapi_frag_t* frag)
{
mca_btl_mvapi_frag_common_constructor(frag);
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
/* signaled so the completion appears on the CQ */
frag->desc.rr_desc.comp_type = VAPI_SIGNALED;
frag->desc.rr_desc.opcode = VAPI_RECEIVE;
frag->desc.rr_desc.sg_lst_len = 1;
frag->desc.rr_desc.sg_lst_p = &frag->sg_entry;
/* stash the frag pointer in the work-request id */
frag->desc.rr_desc.id = (VAPI_virt_addr_t) (MT_virt_addr_t) frag;
}
/* Per-type constructors.  Each sets frag->size and frag->type BEFORE
 * running the common constructor, which reads frag->size — the order of
 * these statements matters. */
static void mca_btl_mvapi_send_frag_eager_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.eager_limit;
frag->type = MCA_BTL_MVAPI_FRAG_EAGER;
mca_btl_mvapi_send_frag_common_constructor(frag);
}
static void mca_btl_mvapi_send_frag_max_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.max_send_size;
frag->type = MCA_BTL_MVAPI_FRAG_MAX;
mca_btl_mvapi_send_frag_common_constructor(frag);
}
static void mca_btl_mvapi_recv_frag_max_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.max_send_size;
frag->type = MCA_BTL_MVAPI_FRAG_MAX;
mca_btl_mvapi_recv_frag_common_constructor(frag);
}
static void mca_btl_mvapi_recv_frag_eager_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.eager_limit;
frag->type = MCA_BTL_MVAPI_FRAG_EAGER;
mca_btl_mvapi_recv_frag_common_constructor(frag);
/* eager receive frags carry a footer just past the payload; start it
 * marked as remote-owned */
frag->ftr = (mca_btl_mvapi_footer_t*)((char*)frag->segment.seg_addr.pval
+ frag->size);
MCA_BTL_MVAPI_RDMA_MAKE_REMOTE(frag->ftr);
}
static void mca_btl_mvapi_send_frag_frag_constructor(mca_btl_mvapi_frag_t* frag)
{
/* size 0: the payload is the user's own (separately registered) buffer */
frag->size = 0;
frag->type = MCA_BTL_MVAPI_FRAG_FRAG;
frag->registration = NULL;
mca_btl_mvapi_send_frag_common_constructor(frag);
}
/* Class instances for the fragment types.  The base fragment class has no
 * constructor/destructor of its own; each specialized type installs a
 * constructor that sets size/type and runs the common setup. */
OBJ_CLASS_INSTANCE(
mca_btl_mvapi_frag_t,
mca_btl_base_descriptor_t,
NULL,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_mvapi_send_frag_eager_t,
mca_btl_base_descriptor_t,
mca_btl_mvapi_send_frag_eager_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_mvapi_send_frag_max_t,
mca_btl_base_descriptor_t,
mca_btl_mvapi_send_frag_max_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_mvapi_send_frag_frag_t,
mca_btl_base_descriptor_t,
mca_btl_mvapi_send_frag_frag_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_mvapi_recv_frag_eager_t,
mca_btl_base_descriptor_t,
mca_btl_mvapi_recv_frag_eager_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_mvapi_recv_frag_max_t,
mca_btl_base_descriptor_t,
mca_btl_mvapi_recv_frag_max_constructor,
NULL);

Просмотреть файл

@ -1,204 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_IB_FRAG_H
#define MCA_BTL_IB_FRAG_H
#define MCA_BTL_IB_FRAG_ALIGN (8)
#include "ompi_config.h"
#include "btl_mvapi.h"
#include <vapi.h>
#include <mtl_common.h>
#include <vapi_common.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
struct mca_btl_mvapi_reg_t;
/** Header prepended to every fragment payload; carries the PML tag and
 *  piggy-backed flow-control credit counts. */
struct mca_btl_mvapi_header_t {
mca_btl_base_tag_t tag; /**< selects the receive callback on the peer */
int16_t credits; /**< receive credits returned to the sender */
int16_t rdma_credits; /**< eager-RDMA credits returned to the sender */
};
typedef struct mca_btl_mvapi_header_t mca_btl_mvapi_header_t;
/** Footer used by the eager-RDMA protocol; byte u.buf[3] doubles as the
 *  local/remote buffer-ownership flag (see btl_mvapi_eager_rdma.h). */
struct mca_btl_mvapi_footer_t {
#if OMPI_ENABLE_DEBUG
uint32_t seq; /**< debug-only sequence number */
#endif
union {
uint32_t size; /**< 24-bit payload size packed with the flag byte */
uint8_t buf[4];
} u;
};
typedef struct mca_btl_mvapi_footer_t mca_btl_mvapi_footer_t;
/** Types of BTL-internal control messages. */
typedef enum {
MCA_BTL_MVAPI_CONTROL_NOOP,
MCA_BTL_MVAPI_CONTROL_RDMA
} mca_btl_mvapi_control_t;
/** Header of a BTL-internal control message. */
struct mca_btl_mvapi_control_header_t {
mca_btl_mvapi_control_t type;
};
typedef struct mca_btl_mvapi_control_header_t mca_btl_mvapi_control_header_t;
/** Control message advertising a local eager-RDMA buffer to the peer. */
struct mca_btl_mvapi_eager_rdma_header_t {
mca_btl_mvapi_control_header_t control;
ompi_ptr_t rdma_start; /**< base address of the RDMA buffer */
uint64_t rkey; /**< remote key for accessing the buffer */
};
typedef struct mca_btl_mvapi_eager_rdma_header_t mca_btl_mvapi_eager_rdma_header_t;
/** Fragment flavors — one per free list, plus eager-RDMA. */
enum mca_btl_mvapi_frag_type_t {
MCA_BTL_MVAPI_FRAG_EAGER,
MCA_BTL_MVAPI_FRAG_MAX,
MCA_BTL_MVAPI_FRAG_FRAG,
MCA_BTL_MVAPI_FRAG_EAGER_RDMA
};
typedef enum mca_btl_mvapi_frag_type_t mca_btl_mvapi_frag_type_t;
/** A fragment is posted either as a receive or as a send work request. */
union mca_btl_mvapi_frag_desc_t {
VAPI_rr_desc_t rr_desc;
VAPI_sr_desc_t sr_desc;
};
typedef union mca_btl_mvapi_frag_desc_t mca_btl_mvapi_frag_desc_t;
/**
 * IB send fragment derived type.
 */
struct mca_btl_mvapi_frag_t {
mca_btl_base_descriptor_t base; /**< base descriptor (must be first member) */
mca_btl_base_segment_t segment; /**< the single data segment */
struct mca_btl_base_endpoint_t *endpoint; /**< endpoint the frag is posted to */
size_t size; /**< payload capacity of this fragment */
int rc;
mca_btl_mvapi_frag_type_t type; /**< which free list / protocol this frag belongs to */
mca_btl_mvapi_frag_desc_t desc; /**< VAPI send or receive work request */
VAPI_sg_lst_entry_t sg_entry; /**< scatter/gather entry covering the buffer */
mca_btl_mvapi_header_t *hdr; /**< BTL header at the start of the buffer */
mca_btl_mvapi_footer_t *ftr; /**< footer (set for eager receive frags only) */
struct mca_btl_mvapi_reg_t *registration; /**< registration for user buffers (FRAG type) */
ompi_free_list_t* my_list; /**< free list this frag was allocated from */
};
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_frag_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_send_frag_eager_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_send_frag_eager_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_send_frag_max_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_send_frag_max_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_send_frag_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_send_frag_frag_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_recv_frag_eager_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_recv_frag_eager_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_recv_frag_max_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_recv_frag_max_t);
/*
* Allocate an IB send descriptor
*
*/
/*
 * Allocate an IB send descriptor from one of the module's free lists,
 * remembering which list it came from (for MCA_BTL_IB_FRAG_RETURN).
 *
 * fix: bodies are do{...}while(0) instead of bare { } blocks so the
 * macros compose safely with if/else at call sites; macro arguments are
 * parenthesized.
 */
#define MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc)                           \
    do {                                                                     \
        ompi_free_list_item_t *item;                                         \
        OMPI_FREE_LIST_WAIT(&((mca_btl_mvapi_module_t*)(btl))->send_free_eager, item, rc); \
        frag = (mca_btl_mvapi_frag_t*) item;                                 \
        frag->my_list = &((mca_btl_mvapi_module_t*)(btl))->send_free_eager;  \
    } while(0)
#define MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc)                             \
    do {                                                                     \
        ompi_free_list_item_t *item;                                         \
        OMPI_FREE_LIST_WAIT(&((mca_btl_mvapi_module_t*)(btl))->send_free_max, item, rc); \
        frag = (mca_btl_mvapi_frag_t*) item;                                 \
        frag->my_list = &((mca_btl_mvapi_module_t*)(btl))->send_free_max;    \
    } while(0)
#define MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc)                            \
    do {                                                                     \
        ompi_free_list_item_t *item;                                         \
        OMPI_FREE_LIST_WAIT(&((mca_btl_mvapi_module_t*)(btl))->send_free_frag, item, rc); \
        frag = (mca_btl_mvapi_frag_t*) item;                                 \
        frag->my_list = &((mca_btl_mvapi_module_t*)(btl))->send_free_frag;   \
    } while(0)
/* Return a fragment to the free list it was allocated from. */
#define MCA_BTL_IB_FRAG_RETURN(btl, frag)                                    \
    do {                                                                     \
        OMPI_FREE_LIST_RETURN((frag)->my_list,                               \
                              (ompi_free_list_item_t*)(frag));               \
    } while(0)
/*
 * Re-issue a deferred operation for a pending fragment, dispatching on
 * the stored send-descriptor opcode.
 *
 * NOTE: this macro references a variable named `mvapi_btl` that must be
 * in scope at the expansion site — it is NOT a macro parameter.
 *
 * fix: "invalide" -> "invalid" in the default-case error message.
 */
#define MCA_BTL_IB_FRAG_PROGRESS(frag) \
do { \
    switch(frag->desc.sr_desc.opcode) { \
    case VAPI_SEND: \
        if(OMPI_SUCCESS != mca_btl_mvapi_endpoint_send(frag->endpoint, frag)) { \
            BTL_ERROR(("error in posting pending send\n")); \
        } \
        break; \
    case VAPI_RDMA_WRITE: \
        if(OMPI_SUCCESS != mca_btl_mvapi_put((mca_btl_base_module_t*) mvapi_btl, \
                                             frag->endpoint, \
                                             (mca_btl_base_descriptor_t*) frag)) { \
            BTL_ERROR(("error in posting pending rdma write\n")); \
        } \
        break; \
    case VAPI_RDMA_READ: \
        if(OMPI_SUCCESS != mca_btl_mvapi_get((mca_btl_base_module_t *) mvapi_btl, \
                                             frag->endpoint, \
                                             (mca_btl_base_descriptor_t*) frag)) { \
            BTL_ERROR(("error in posting pending rdma read\n")); \
        } \
        break; \
    default: \
        BTL_ERROR(("error in posting pending operation, invalid opcode %d\n", frag->desc.sr_desc.opcode)); \
        break; \
    } \
} while (0)
struct mca_btl_mvapi_module_t;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,192 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_hash_table.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "btl_mvapi.h"
#include "btl_mvapi_proc.h"
/* Constructor/destructor (defined below) are file-local; registered with
 * the OPAL class system here. */
static void mca_btl_mvapi_proc_construct(mca_btl_mvapi_proc_t* proc);
static void mca_btl_mvapi_proc_destruct(mca_btl_mvapi_proc_t* proc);
OBJ_CLASS_INSTANCE(mca_btl_mvapi_proc_t,
opal_list_item_t, mca_btl_mvapi_proc_construct,
mca_btl_mvapi_proc_destruct);
/*
 * Construct an IB proc instance and append it to the component's global
 * list of procs.
 *
 * fix: marked `static` to match its forward declaration above; pointer
 * members are initialized with NULL rather than 0.
 */
static void mca_btl_mvapi_proc_construct(mca_btl_mvapi_proc_t* proc)
{
    proc->proc_ompi = NULL;
    proc->proc_port_count = 0;
    proc->proc_endpoints = NULL;
    proc->proc_endpoint_count = 0;
    OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);
    /* add to list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_mvapi_component.ib_lock);
    opal_list_append(&mca_btl_mvapi_component.ib_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_mvapi_component.ib_lock);
}
/*
 * Cleanup ib proc instance: remove it from the component's global list
 * and release its resources.
 *
 * fixes: marked `static` to match its forward declaration; the
 * proc_lock constructed in mca_btl_mvapi_proc_construct is now
 * OBJ_DESTRUCTed (it was previously leaked); the redundant NULL guard
 * around free() is dropped (free(NULL) is a no-op).
 */
static void mca_btl_mvapi_proc_destruct(mca_btl_mvapi_proc_t* proc)
{
    /* remove from list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_mvapi_component.ib_lock);
    opal_list_remove_item(&mca_btl_mvapi_component.ib_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_mvapi_component.ib_lock);
    /* release resources */
    free(proc->proc_endpoints);
    OBJ_DESTRUCT(&proc->proc_lock);
}
/*
 * Find the cached IB proc instance, if any, that wraps the given
 * ompi_proc_t.  Returns NULL when no instance exists yet.  The global
 * proc list is locked for the duration of the scan.
 */
static mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_lookup_ompi(ompi_proc_t* ompi_proc)
{
    mca_btl_mvapi_proc_t* found = NULL;
    opal_list_item_t* item;

    OPAL_THREAD_LOCK(&mca_btl_mvapi_component.ib_lock);
    for (item = opal_list_get_first(&mca_btl_mvapi_component.ib_procs);
         item != opal_list_get_end(&mca_btl_mvapi_component.ib_procs);
         item = opal_list_get_next(item)) {
        mca_btl_mvapi_proc_t* ib_proc = (mca_btl_mvapi_proc_t*) item;
        if (ib_proc->proc_ompi == ompi_proc) {
            found = ib_proc;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&mca_btl_mvapi_component.ib_lock);
    return found;
}
/*
 * Create a IB process structure. There is a one-to-one correspondence
 * between a ompi_proc_t and a mca_btl_mvapi_proc_t instance. We cache
 * additional data (specifically the list of mca_btl_mvapi_endpoint_t
 * instances, and published addresses) associated w/ a given destination
 * on this datastructure.
 */
mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc)
{
    mca_btl_mvapi_proc_t* mvapi_proc = NULL;
    size_t size;
    int rc;

    /* Check if we have already created a IB proc
     * structure for this ompi process */
    mvapi_proc = mca_btl_mvapi_proc_lookup_ompi(ompi_proc);
    if(mvapi_proc != NULL) {
        /* Gotcha! */
        return mvapi_proc;
    }

    /* Oops! First time, gotta create a new IB proc
     * out of the ompi_proc ... */
    mvapi_proc = OBJ_NEW(mca_btl_mvapi_proc_t);
    if(NULL == mvapi_proc) {
        /* fix: OBJ_NEW can fail; previously the NULL result was
         * dereferenced unconditionally */
        return NULL;
    }
    /* Initialize number of peer */
    mvapi_proc->proc_endpoint_count = 0;
    mvapi_proc->proc_ompi = ompi_proc;
    /* build a unique identifier (of arbitrary
     * size) to represent the proc */
    mvapi_proc->proc_guid = ompi_proc->proc_name;
    /* query for the peer address info published via the modex */
    rc = ompi_modex_recv(
                         &mca_btl_mvapi_component.super.btl_version,
                         ompi_proc,
                         (void*)&mvapi_proc->proc_ports,
                         &size
                         );
    if(OMPI_SUCCESS != rc) {
        opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
                    __FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
        OBJ_RELEASE(mvapi_proc);
        return NULL;
    }
    /* the received blob must be a whole number of port_info records */
    if((size % sizeof(mca_btl_mvapi_port_info_t)) != 0) {
        opal_output(0, "[%s:%d] invalid mvapi address for peer %s",
                    __FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
        OBJ_RELEASE(mvapi_proc);
        return NULL;
    }
    mvapi_proc->proc_port_count = size/sizeof(mca_btl_mvapi_port_info_t);
    if (0 == mvapi_proc->proc_port_count) {
        /* NOTE(review): a peer exporting zero ports also falls into the
         * NULL check below and is released/treated as a failure —
         * confirm that is the intended behavior. */
        mvapi_proc->proc_endpoints = NULL;
    } else {
        mvapi_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
            malloc(mvapi_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
    }
    if(NULL == mvapi_proc->proc_endpoints) {
        OBJ_RELEASE(mvapi_proc);
        return NULL;
    }
    return mvapi_proc;
}
/*
 * Insert a btl endpoint instance into the proc's endpoint array and
 * point the endpoint back at the proc.  One slot exists per published
 * port; fails with OMPI_ERR_OUT_OF_RESOURCE when the array is full.
 *
 * Note that this routine must be called with the lock on the process
 * already held.
 */
int mca_btl_mvapi_proc_insert(mca_btl_mvapi_proc_t* mvapi_proc,
                              mca_btl_base_endpoint_t* mvapi_endpoint)
{
    const size_t used = mvapi_proc->proc_endpoint_count;

    /* one endpoint slot per published port -- reject when exhausted */
    if (used >= mvapi_proc->proc_port_count) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mvapi_endpoint->endpoint_proc = mvapi_proc;
    mvapi_proc->proc_endpoints[used] = mvapi_endpoint;
    mvapi_proc->proc_endpoint_count = used + 1;
    return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,72 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_IB_PROC_H
#define MCA_BTL_IB_PROC_H
#include "orte/mca/ns/ns.h"
#include "opal/class/opal_object.h"
#include "ompi/proc/proc.h"
#include "btl_mvapi.h"
#include "btl_mvapi_endpoint.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
* Represents the state of a remote process and the set of addresses
* that it exports. Also cache an instance of mca_btl_base_endpoint_t for
* each
* BTL instance that attempts to open a connection to the process.
*/
struct mca_btl_mvapi_proc_t {
opal_list_item_t super;
/**< allow proc to be placed on a list */
ompi_proc_t *proc_ompi;
/**< pointer to corresponding ompi_proc_t */
orte_process_name_t proc_guid;
/**< globally unique identifier for the process */
struct mca_btl_mvapi_port_info_t* proc_ports;
/**< array of port info records received from the peer via the modex
     (see mca_btl_mvapi_proc_create) */
size_t proc_port_count;
/**< number of ports published by endpoint */
struct mca_btl_base_endpoint_t **proc_endpoints;
/**< array of endpoints that have been created to access this proc */
size_t proc_endpoint_count;
/**< number of endpoints (slots used in proc_endpoints) */
opal_mutex_t proc_lock;
/**< lock to protect against concurrent access to proc state */
};
mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc);
int mca_btl_mvapi_proc_insert(mca_btl_mvapi_proc_t*, mca_btl_base_endpoint_t*);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,50 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_MVAPI_RDMA_BUF_H
#define MCA_BTL_MVAPI_RDMA_BUF_H
#include "ompi_config.h"
#include "btl_mvapi.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
#if 0
/* NOTE(review): dead code -- this structure is compiled out via #if 0
 * and is not referenced by anything visible in this header; it appears
 * to describe a registered RDMA buffer region (local and remote sides)
 * but the semantics of the fields cannot be confirmed from here. */
struct mca_btl_mvapi_rdma_buf_t {
void* base;
size_t entry_size;
uint32_t entry_cnt;
void* current;
opal_mutex_t lock;
mca_mpool_base_registration_t* reg;
uint32_t tokens;
void* rem_addr;
size_t rem_size;
uint32_t rem_cnt;
void* rem_current;
VAPI_rkey_t r_key;
};
typedef struct mca_btl_mvapi_rdma_buf_t mca_btl_mvapi_rdma_buf_t;
#endif
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,51 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_btl_mvapi_CONFIG([action-if-can-compile],
#                      [action-if-cant-compile])
# ------------------------------------------------
# Configure-time probe for the mvapi BTL: runs OMPI_CHECK_MVAPI to see
# whether the MVAPI headers/libraries are usable, propagates the
# resulting link flags to the wrapper compilers, and sanitizes CFLAGS
# for the component build.
AC_DEFUN([MCA_btl_mvapi_CONFIG],[
OMPI_CHECK_MVAPI([btl_mvapi],
[btl_mvapi_happy="yes"],
[btl_mvapi_happy="no"])
AS_IF([test "$btl_mvapi_happy" = "yes"],
[btl_mvapi_WRAPPER_EXTRA_LDFLAGS="$btl_mvapi_LDFLAGS"
# expose the MVAPI link flags through mpicc and friends
btl_mvapi_WRAPPER_EXTRA_LIBS="$btl_mvapi_LIBS"
$1],
[$2])
# Many of the vapi.h files floating around don't obey ISO99 C
# standard, so cause oodles of warnings with -pedantic and
# -Wundef. Remove them from CFLAGS, which is then used to
# forcefully override CFLAGS in the makefile for MVAPI
# components
btl_mvapi_CFLAGS="`echo $CFLAGS | sed 's/-pedantic//g'`"
btl_mvapi_CFLAGS="`echo $btl_mvapi_CFLAGS | sed 's/-Wundef//g'`"
AS_IF([test "$btl_mvapi_CFLAGS" != "$CFLAGS" -a "$btl_mvapi_happy" = "yes"],
[AC_MSG_WARN([Removed -pedantic and -Wundef from CFLAGS for
mvapi component because some vapi.h files are not really ANSI C])])
# substitute in the things needed to build mvapi
AC_SUBST([btl_mvapi_CFLAGS])
AC_SUBST([btl_mvapi_CPPFLAGS])
AC_SUBST([btl_mvapi_LDFLAGS])
AC_SUBST([btl_mvapi_LIBS])
])dnl

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,41 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
#
[btl_mvapi:retry-exceeded]
The retry count is a down counter initialized on creation of the QP. Retry
count is defined in the InfiniBand Spec 1.2 (12.7.38):
The total number of times that the sender wishes the receiver to retry
timeout, packet sequence, etc. errors before posting a completion error.
Note that two mca parameters are involved here:
btl_mvapi_ib_retry_count - The number of times the sender will attempt to
retry (defaulted to 7, the maximum value).
btl_mvapi_ib_timeout - The local ack timeout parameter (defaulted to 10). The
actual timeout value used is calculated as:
(4.096 micro-seconds * 2^btl_mvapi_ib_timeout).
See InfiniBand Spec 1.2 (12.7.34) for more details.
What to do next:
Note the hosts on which this error has occurred; it has been observed
that rebooting or removing a particular host from the job can resolve
this issue. If you are able to identify a specific cause or additional
troubleshooting information, please report it to devel@open-mpi.org.