1
1

Remove the mvapi BTL. Woo hoo!

This commit was SVN r16483.
Этот коммит содержится в:
Jeff Squyres 2007-10-17 14:08:03 +00:00
родитель 0bf61a1b84
Коммит b7eeae0a74
16 изменённых файлов: 3 добавлений и 4972 удалений

3
NEWS
Просмотреть файл

@ -68,6 +68,9 @@ Trunk (not on release branches yet)
- Added checkpoint/restart process fault tolerance support. Initially
support a LAM/MPI-like protocol.
--> Expected: 1.3
- Removed "mvapi" BTL; all InfiniBand support now uses the OpenFabrics
driver stacks.
--> Expected: 1.3
- Fixed issue with pthread detection when compilers are not all
from the same vendor. Thanks to Ake Sandgren for the bug

Просмотреть файл

@ -1,69 +0,0 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Use the top-level Makefile.options
CFLAGS = $(btl_mvapi_CFLAGS)
AM_CPPFLAGS = $(btl_mvapi_CPPFLAGS)

# Help-message catalog installed alongside the component.
dist_pkgdata_DATA=help-mpi-btl-mvapi.txt

sources = \
btl_mvapi.c \
btl_mvapi.h \
btl_mvapi_component.c \
btl_mvapi_endpoint.c \
btl_mvapi_endpoint.h \
btl_mvapi_frag.c \
btl_mvapi_frag.h \
btl_mvapi_proc.c \
btl_mvapi_proc.h \
btl_mvapi_eager_rdma.h

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_btl_mvapi_DSO
lib =
lib_sources =
component = mca_btl_mvapi.la
component_sources = $(sources)
else
lib = libmca_btl_mvapi.la
lib_sources = $(sources)
component =
component_sources =
endif

mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component)
mca_btl_mvapi_la_SOURCES = $(component_sources)
mca_btl_mvapi_la_LDFLAGS = -module -avoid-version $(btl_mvapi_LDFLAGS)
mca_btl_mvapi_la_LIBADD = \
$(btl_mvapi_LIBS) \
$(top_ompi_builddir)/ompi/libmpi.la \
$(top_ompi_builddir)/orte/libopen-rte.la \
$(top_ompi_builddir)/opal/libopen-pal.la

noinst_LTLIBRARIES = $(lib)
libmca_btl_mvapi_la_SOURCES = $(lib_sources)
# Fix: the static-build LDFLAGS had a stray '$' after -avoid-version
# ("-avoid-version$"), which corrupts the flag; it now matches the
# DSO-build line above.
libmca_btl_mvapi_la_LDFLAGS = -module -avoid-version $(btl_mvapi_LDFLAGS)
libmca_btl_mvapi_la_LIBADD = $(btl_mvapi_LIBS)

Просмотреть файл

@ -1,856 +0,0 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/if.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "btl_mvapi.h"
#include "btl_mvapi_frag.h"
#include "btl_mvapi_proc.h"
#include "btl_mvapi_endpoint.h"
#include "ompi/datatype/convertor.h"
#include "ompi/datatype/datatype.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include <vapi_types.h>
#include <math.h> /* for log2 */
/*
 * Default module template: each mvapi BTL module instance is created as
 * a copy of this.  The numeric limits (fragment sizes, exclusivity,
 * latency, bandwidth) are left at 0 here and filled in later from MCA
 * parameters; the function table wires this module into the generic
 * BTL framework.  Entries are positional against
 * mca_btl_base_module_t, so do not reorder them.
 */
mca_btl_mvapi_module_t mca_btl_mvapi_module = {
{
&mca_btl_mvapi_component.super,
0, /* max size of first fragment */
0, /* min send fragment size */
0, /* max send fragment size */
0, /* btl_rdma_pipeline_send_length */
0, /* btl_rdma_pipeline_frag_size */
0, /* btl_min_rdma_pipeline_size */
0, /* exclusivity */
0, /* latency */
0, /* bandwidth */
0, /* TODO this should be PUT btl flags */
mca_btl_mvapi_add_procs,
mca_btl_mvapi_del_procs,
mca_btl_mvapi_register,
mca_btl_mvapi_finalize,
/* we need alloc free, pack */
mca_btl_mvapi_alloc,
mca_btl_mvapi_free,
mca_btl_mvapi_prepare_src,
mca_btl_mvapi_prepare_dst,
mca_btl_mvapi_send,
mca_btl_mvapi_put,
mca_btl_mvapi_get,
mca_btl_mvapi_dump,
NULL, /* mpool */
NULL, /* error call back registration */
mca_btl_mvapi_ft_event
}
};
/*
 * add a proc to this btl module
 * creates an endpoint that is setup on the
 * first send to the endpoint
 *
 * For each remote proc: peers with a different architecture are
 * skipped (no heterogeneous support), an endpoint object is created
 * and cached on the shared per-proc structure, and the peer is marked
 * reachable in the bitmap.  On the first call only, the SRQ receive
 * descriptor count is scaled with log2(nprocs).
 */
int mca_btl_mvapi_add_procs(
struct mca_btl_base_module_t* btl,
size_t nprocs,
struct ompi_proc_t **ompi_procs,
struct mca_btl_base_endpoint_t** peers,
ompi_bitmap_t* reachable)
{
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*)btl;
int i, rc;
for(i = 0; i < (int) nprocs; i++) {
struct ompi_proc_t* ompi_proc = ompi_procs[i];
mca_btl_mvapi_proc_t* ib_proc;
mca_btl_base_endpoint_t* ib_peer;
/* mvapi doesn't support heterogeneous yet... */
if (ompi_proc_local()->proc_arch != ompi_proc->proc_arch) {
continue;
}
if(NULL == (ib_proc = mca_btl_mvapi_proc_create(ompi_proc))) {
continue;
}
/*
 * Check to make sure that the peer has at least as many interface
 * addresses exported as we are trying to use. If not, then
 * don't bind this PTL instance to the proc.
 */
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
/* The btl_proc datastructure is shared by all IB PTL
 * instances that are trying to reach this destination.
 * Cache the peer instance on the btl_proc.
 */
ib_peer = OBJ_NEW(mca_btl_mvapi_endpoint_t);
if(NULL == ib_peer) {
/* allocation failure is fatal for the whole call, not just
 * this peer; unlock before bailing out */
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
ib_peer->endpoint_btl = mvapi_btl;
ib_peer->subnet = mvapi_btl->port_info.subnet;
rc = mca_btl_mvapi_proc_insert(ib_proc, ib_peer);
if(rc != OMPI_SUCCESS) {
/* insertion failed (e.g. subnet mismatch); drop this peer but
 * keep processing the rest */
OBJ_RELEASE(ib_peer);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
continue;
}
ompi_bitmap_set_bit(reachable, i);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
peers[i] = ib_peer;
}
/* currently we only scale the srq the first time
add_procs is called, subsequent calls are ignored,
we should be able to change this to modify the SRQ but
I am unsure as to what this entails
*/
if( 0 == mvapi_btl->num_peers ) {
mvapi_btl->num_peers += nprocs;
if(mca_btl_mvapi_component.use_srq) {
/* grow rd_num with log2 of the peer count, clamped at srq_rd_max */
mvapi_btl->rd_num = mca_btl_mvapi_component.rd_num + log2(nprocs) * mca_btl_mvapi_component.srq_rd_per_peer;
if(mvapi_btl->rd_num > mca_btl_mvapi_component.srq_rd_max)
mvapi_btl->rd_num = mca_btl_mvapi_component.srq_rd_max;
mvapi_btl->rd_low = mvapi_btl->rd_num - 1;
/* re-size the scratch array used to batch-post receive descriptors.
 * NOTE(review): the malloc result is not checked; a later post would
 * dereference NULL on OOM -- confirm whether callers guard this. */
free(mvapi_btl->rr_desc_post);
mvapi_btl->rr_desc_post = (VAPI_rr_desc_t*) malloc((mvapi_btl->rd_num * sizeof(VAPI_rr_desc_t)));
}
}
return OMPI_SUCCESS;
}
/*
 * Remove procs from this BTL module's reachability set.
 *
 * Deliberately unimplemented for mvapi: endpoint teardown happens
 * elsewhere, so this merely logs that it was invoked and reports
 * success.  All parameters are accepted but unused.
 */
int mca_btl_mvapi_del_procs(struct mca_btl_base_module_t* btl,
                            size_t nprocs,
                            struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t ** peers)
{
    BTL_VERBOSE(("Stub\n"));
    return OMPI_SUCCESS;
}
/*
 * Register a receive callback for the given active-message tag.
 *
 * The per-tag callback table lives on the module; updates are guarded
 * by the module lock so concurrent registrations are safe.  Always
 * returns OMPI_SUCCESS.
 */
int mca_btl_mvapi_register(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_tag_t tag,
    mca_btl_base_module_recv_cb_fn_t cbfunc,
    void* cbdata)
{
    mca_btl_mvapi_module_t* module = (mca_btl_mvapi_module_t*)btl;

    OPAL_THREAD_LOCK(&module->ib_lock);
    module->ib_reg[tag].cbfunc = cbfunc;
    module->ib_reg[tag].cbdata = cbdata;
    OPAL_THREAD_UNLOCK(&module->ib_lock);

    return OMPI_SUCCESS;
}
/**
 * Allocate a segment.
 *
 * @param btl   (IN) BTL module
 * @param order (IN) ordering hint (ignored; result is MCA_BTL_NO_ORDER)
 * @param size  (IN) requested segment size
 * @return descriptor with seg_len == size, or NULL if size exceeds the
 *         max send size or the free list is exhausted.
 *
 * When allocating a segment we pull a pre-allocated segment from one of
 * two free lists: the eager list when the request fits in the eager
 * limit, otherwise the max list when it fits in the max send size.
 *
 * Fix: the original set seg_len correctly per branch, then re-capped it
 * to the eager limit unconditionally, silently truncating fragments
 * drawn from the max list.  The redundant clobber is removed.  The
 * unused local 'mvapi_btl' is also removed.
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_alloc(
    struct mca_btl_base_module_t* btl,
    uint8_t order,
    size_t size)
{
    mca_btl_mvapi_frag_t* frag;
    int rc;

    if (size <= mca_btl_mvapi_component.eager_limit) {
        MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
    } else if (size <= mca_btl_mvapi_component.max_send_size) {
        MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
    } else {
        /* larger than any pre-registered buffer we keep */
        return NULL;
    }
    if (NULL == frag) {
        return NULL;
    }

    frag->segment.seg_len = size;
    frag->base.des_flags = 0;
    frag->base.order = MCA_BTL_NO_ORDER;
    return (mca_btl_base_descriptor_t*)frag;
}
/**
 * Return a segment.
 *
 * Returns the descriptor to its originating free list.  If this is a
 * FRAG-type descriptor carrying a registration we created on the fly
 * (in prepare_src/prepare_dst), that registration is released through
 * the mpool first and the pointer cleared to avoid a double release.
 */
int mca_btl_mvapi_free(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_descriptor_t* des)
{
    mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*)des;
    int owns_registration =
        (MCA_BTL_MVAPI_FRAG_FRAG == frag->type) && (frag->registration != NULL);

    if (owns_registration) {
        btl->btl_mpool->mpool_deregister(
            btl->btl_mpool,
            (mca_mpool_base_registration_t*)frag->registration);
        frag->registration = NULL;
    }

    MCA_BTL_IB_FRAG_RETURN(btl, frag);
    return OMPI_SUCCESS;
}
/**
 * register user buffer or pack
 * data into pre-registered buffer and return a
 * descriptor that can be
 * used for send/put.
 *
 * @param btl (IN) BTL module
 * @param endpoint (IN) BTL peer addressing
 *
 * prepare source's behavior depends on the following:
 * Has a valid memory registration been passed to prepare_src?
 * if so we attempt to use the pre-registred user-buffer, if the memory registration
 * is to small (only a portion of the user buffer) then we must reregister the user buffer
 * Has the user requested the memory to be left pinned?
 * if so we insert the memory registration into a memory tree for later lookup, we
 * may also remove a previous registration if a MRU (most recently used) list of
 * registions is full, this prevents resources from being exhausted.
 * Is the requested size larger than the btl's max send size?
 * if so and we aren't asked to leave the registration pinned than we register the memory if
 * the users buffer is contiguous
 * Otherwise we choose from two free lists of pre-registered memory in which to pack the data into.
 *
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size
)
{
mca_btl_mvapi_module_t* mvapi_btl;
mca_btl_mvapi_frag_t* frag = NULL;
mca_btl_mvapi_reg_t *mvapi_reg;
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data = *size;
int rc;
mvapi_btl = (mca_btl_mvapi_module_t*)btl;
/* Zero-copy path: contiguous data with no reserved header space, and
 * either already registered or too big to copy into a send buffer. */
if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) {
if(registration != NULL || max_data > btl->btl_max_send_size) {
MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
/* iov_base == NULL asks the convertor for the data's address
 * instead of packing into a buffer */
iov.iov_len = max_data;
iov.iov_base = NULL;
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
*size = max_data;
if(NULL == registration) {
/* register the user buffer on the fly; the frag remembers the
 * registration so mca_btl_mvapi_free releases it */
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
iov.iov_base, max_data, 0, &registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag);
return NULL;
}
frag->registration = (mca_btl_mvapi_reg_t*)registration;
}
mvapi_reg = (mca_btl_mvapi_reg_t*)registration;
/* NOTE(review): des_flags is assigned 0 twice below; harmless but
 * redundant */
frag->base.des_flags = 0;
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
frag->base.order = MCA_BTL_NO_ORDER;
frag->sg_entry.len = max_data;
frag->sg_entry.lkey = mvapi_reg->l_key;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t)iov.iov_base;
frag->segment.seg_len = max_data;
frag->segment.seg_addr.pval = iov.iov_base;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
"frag->segment.seg_key.key32[0] = %lu",
frag->sg_entry.lkey, frag->sg_entry.addr,
frag->segment.seg_key.key32[0]));
return &frag->base;
}
}
/* Copy path: pack into a pre-registered send buffer. */
if(max_data + reserve <= btl->btl_eager_limit) {
/* the data is small enough to fit in the eager frag and
* memory is not prepinned */
MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
}
if(NULL == frag) {
/* the data doesn't fit into eager frag or eger frag is
* not available */
MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
/* clamp the payload so header + data fit in a max-size buffer;
 * the caller learns the reduced size via *size */
if(max_data + reserve > btl->btl_max_send_size) {
max_data = btl->btl_max_send_size - reserve;
}
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
if( rc < 0 ) {
MCA_BTL_IB_FRAG_RETURN(mvapi_btl, frag);
return NULL;
}
*size = max_data;
frag->segment.seg_len = max_data + reserve;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
frag->base.order = MCA_BTL_NO_ORDER;
return &frag->base;
}
/**
 * Prepare the dst buffer
 *
 * @param btl (IN) BTL module
 * @param peer (IN) BTL peer addressing
 * prepare dest's behavior depends on the following:
 * Has a valid memory registration been passed to prepare_src?
 * if so we attempt to use the pre-registred user-buffer, if the memory registration
 * is to small (only a portion of the user buffer) then we must reregister the user buffer
 * Has the user requested the memory to be left pinned?
 * if so we insert the memory registration into a memory tree for later lookup, we
 * may also remove a previous registration if a MRU (most recently used) list of
 * registions is full, this prevents resources from being exhausted.
 *
 * Always zero-copy on the receive side: the user buffer itself is
 * described (and registered if necessary) as the RDMA destination;
 * nothing is packed.  The remote key is exposed in seg_key so the peer
 * can RDMA-write into it.
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
uint8_t order,
size_t reserve,
size_t* size)
{
mca_btl_mvapi_module_t* mvapi_btl;
mca_btl_mvapi_frag_t* frag;
mca_btl_mvapi_reg_t *mvapi_reg;
int rc;
mvapi_btl = (mca_btl_mvapi_module_t*) btl;
MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc);
if(NULL == frag){
return NULL;
}
frag->segment.seg_len = *size;
/* the destination address is wherever the convertor currently points
 * in the user buffer */
ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
frag->base.des_flags = 0;
frag->base.order = MCA_BTL_NO_ORDER;
if(NULL == registration) {
/* we didn't get a memory registration passed in, so we have to register the region
* ourselves
*/
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
frag->segment.seg_addr.pval, *size, 0, &registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
BTL_ERROR(("mpool_register(%p,%lu) failed: base %p offset %lu",
frag->segment.seg_addr.pval, *size, convertor->pBaseBuf, convertor->bConverted));
MCA_BTL_IB_FRAG_RETURN(btl, frag);
return NULL;
}
/* frag owns this registration; released in mca_btl_mvapi_free */
frag->registration = (mca_btl_mvapi_reg_t*)registration;
}
mvapi_reg = (mca_btl_mvapi_reg_t*)registration;
frag->sg_entry.len = *size;
frag->sg_entry.lkey = mvapi_reg->l_key;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->segment.seg_addr.pval;
/* expose the remote key so the peer can target this buffer */
frag->segment.seg_key.key32[0] =mvapi_reg->r_key;
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
return &frag->base;
}
/*
 * Tear down a BTL module instance.
 *
 * Fix: removed the unused local 'mvapi_btl' (assigned but never read).
 *
 * NOTE(review): no VAPI resources (protection domain, completion
 * queues, SRQs, async handler) are released here; this body only
 * reports success.  Confirm whether teardown is handled elsewhere
 * before relying on this to reclaim HCA resources.
 */
int mca_btl_mvapi_finalize(struct mca_btl_base_module_t* btl)
{
    (void)btl;  /* intentionally unused */
    return OMPI_SUCCESS;
}
/*
 * Initiate a send.
 *
 * Tags the fragment with the active-message tag, marks its work request
 * as a plain VAPI SEND, and hands it to the endpoint layer, which posts
 * it (or queues it until the connection is established).
 */
int mca_btl_mvapi_send(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* descriptor,
    mca_btl_base_tag_t tag)
{
    mca_btl_mvapi_frag_t* sendfrag = (mca_btl_mvapi_frag_t*)descriptor;

    sendfrag->endpoint = endpoint;
    sendfrag->hdr->tag = tag;
    sendfrag->desc.sr_desc.opcode = VAPI_SEND;

    return mca_btl_mvapi_endpoint_send(endpoint, sendfrag);
}
/*
 * RDMA local buffer to remote buffer address.
 *
 * Reserves a low-priority send WQE slot via an atomic decrement; if
 * none is available the fragment is queued on the endpoint and
 * OMPI_SUCCESS is returned (it will be posted later).  Otherwise the
 * RDMA-write work request is filled in from the descriptor's src/dst
 * segments and posted, and receive descriptors are replenished.
 */
int mca_btl_mvapi_put( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor)
{
int rc;
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*) btl;
mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*) descriptor;
/* setup for queued requests */
frag->endpoint = endpoint;
frag->desc.sr_desc.opcode = VAPI_RDMA_WRITE;
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
/* no WQE: undo the reservation and queue the frag on the endpoint */
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return OMPI_SUCCESS;
/* post descriptor */
} else {
/* target = remote dst segment; source = local src segment */
frag->desc.sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_lp;
frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t) frag->base.des_dst->seg_addr.lval;
frag->desc.sr_desc.r_key = frag->base.des_dst->seg_key.key32[0];
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_src->seg_addr.pval;
frag->sg_entry.len = frag->base.des_src->seg_len;
if(VAPI_OK != VAPI_post_sr(mvapi_btl->nic, endpoint->lcl_qp_hndl_lp, &frag->desc.sr_desc)) {
rc = OMPI_ERROR;
} else {
rc = OMPI_SUCCESS;
}
/* keep the receive side stocked (shared or per-endpoint queues) */
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1);
MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1);
} else
#endif
{
MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1);
MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1);
}
}
return rc;
}
/*
 * RDMA read remote buffer to local buffer address.
 *
 * Like put, but additionally rate-limited by per-endpoint "get tokens"
 * (outstanding-RDMA-read credits).  Either resource being exhausted
 * queues the fragment for later and returns OMPI_SUCCESS.
 */
int mca_btl_mvapi_get( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor)
{
int rc;
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*) btl;
mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*) descriptor;
frag->endpoint = endpoint;
frag->desc.sr_desc.opcode = VAPI_RDMA_READ;
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
/* NOTE(review): this path queues on the BTL-level list under the
 * module lock, while put() and the token path below queue on the
 * endpoint list under the endpoint lock -- confirm the asymmetry
 * is intentional before reusing this pattern. */
OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);
opal_list_append(&mvapi_btl->pending_frags_lp, (opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);
return OMPI_SUCCESS;
/* check for a get token */
} else if(OPAL_THREAD_ADD32(&endpoint->get_tokens,-1) < 0) {
/* roll back both reservations before queueing */
OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,1);
OPAL_THREAD_ADD32(&endpoint->get_tokens,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_frags_lp, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return OMPI_SUCCESS;
} else {
/* read FROM the remote src segment INTO the local dst segment */
frag->desc.sr_desc.remote_qp = endpoint->rem_info.rem_qp_num_lp;
frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t) frag->base.des_src->seg_addr.lval;
frag->desc.sr_desc.r_key = frag->base.des_src->seg_key.key32[0];
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_dst->seg_addr.pval;
frag->sg_entry.len = frag->base.des_dst->seg_len;
if(VAPI_OK != VAPI_post_sr(mvapi_btl->nic, endpoint->lcl_qp_hndl_lp, &frag->desc.sr_desc)) {
rc = OMPI_ERROR;
} else {
rc = OMPI_SUCCESS;
}
/* replenish receive descriptors */
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1);
MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1);
} else
#endif
{
MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1);
MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1);
}
}
return rc;
}
/*
 * Asynchronous event handler to detect unforeseen events.  Usually,
 * such events are catastrophic.  Should have a robust mechanism to
 * handle these events and abort the OMPI application if necessary.
 *
 * Fixes: the VAPI_SRQ_LIMIT_REACHED and VAPI_RECEIVE_QUEUE_DRAINED
 * cases were missing 'break' statements and fell through into the
 * default branch, logging a spurious "undefined event" error after
 * handling the event.  Also corrects the "DRAINEDD" typo in the
 * drained-queue message.
 */
static void async_event_handler(VAPI_hca_hndl_t hca_hndl,
                                VAPI_event_record_t * event_p,
                                void *priv_data)
{
    switch (event_p->type) {
    /* Informational events: just log at verbose level. */
    case VAPI_QP_PATH_MIGRATED:
    case VAPI_EEC_PATH_MIGRATED:
    case VAPI_QP_COMM_ESTABLISHED:
    case VAPI_EEC_COMM_ESTABLISHED:
    case VAPI_SEND_QUEUE_DRAINED:
    case VAPI_PORT_ACTIVE:
    {
        BTL_VERBOSE(("Got an asynchronous event: %s\n", VAPI_event_record_sym(event_p->type)));
        break;
    }
    /* Error events: report symbol and syndrome. */
    case VAPI_CQ_ERROR:
    case VAPI_LOCAL_WQ_INV_REQUEST_ERROR:
    case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR:
    case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR:
    case VAPI_PATH_MIG_REQ_ERROR:
    case VAPI_LOCAL_EEC_CATASTROPHIC_ERROR:
    case VAPI_LOCAL_CATASTROPHIC_ERROR:
    case VAPI_PORT_ERROR:
    {
        BTL_ERROR(("Got an asynchronous event: %s (%s)",
                   VAPI_event_record_sym(event_p->type),
                   VAPI_event_syndrome_sym(event_p->syndrome)));
        break;
    }
#ifdef VAPI_FEATURE_SRQ
    case VAPI_SRQ_LIMIT_REACHED:
    {
        /* Low-water mark hit on a shared receive queue: repost buffers
         * on every module. */
        size_t i;
        BTL_ERROR(("SRQ limit is reached, posting more buffers %s\n", VAPI_event_record_sym(event_p->type)));
        for(i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) {
            mca_btl_mvapi_module_t* mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i];
            MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1);
            MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1);
        }
        break;
    }
#endif
    /* BWB - is this right? */
#ifdef VAPI_FEATURE_SRQ
    case VAPI_RECEIVE_QUEUE_DRAINED: {
        fprintf(stderr, "VAPI_RECEIVE_QUEUE_DRAINED\n");
        break;
    }
#endif
    default:
        BTL_ERROR(("Warning!! Got an undefined "
                   "asynchronous event %s", VAPI_event_record_sym(event_p->type)));
        break;
    }
}
/*
 * Initialize the btl module by allocating a protection domain
 * and creating both the high and low priority completion queues
 *
 * Sequence: allocate PD; optionally create high- and low-priority
 * shared receive queues (with an SRQ-limit watermark); create the two
 * completion queues; install the async event handler.  Any hard VAPI
 * failure aborts with OMPI_ERROR.
 */
int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl)
{
/* Allocate Protection Domain */
VAPI_ret_t ret;
uint32_t cqe_cnt = 0;
#ifdef VAPI_FEATURE_SRQ
VAPI_srq_attr_t srq_attr, srq_attr_out, srq_attr_mod;
VAPI_srq_attr_mask_t srq_attr_mask;
uint32_t max_outs_wr;
#endif
ret = VAPI_alloc_pd(mvapi_btl->nic, &mvapi_btl->ptag);
if(ret != VAPI_OK) {
BTL_ERROR(("error in VAPI_alloc_pd: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
mvapi_btl->srd_posted_hp = 0;
mvapi_btl->srd_posted_lp = 0;
srq_attr.pd_hndl = mvapi_btl->ptag;
srq_attr.max_outs_wr = mca_btl_mvapi_component.srq_rd_max;
srq_attr.max_sentries = mca_btl_mvapi_component.ib_sg_list_size;
/* fire the SRQ-limit async event when the queue falls to 90% of
 * rd_num outstanding descriptors */
srq_attr_mod.srq_limit = mvapi_btl->rd_num * 0.9;
ret = VAPI_create_srq(mvapi_btl->nic,
&srq_attr,
&mvapi_btl->srq_hndl_hp,
&srq_attr_out);
if(ret != VAPI_OK) {
BTL_ERROR(("error in VAPI_create_srq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
srq_attr_mask = 0;
srq_attr_mask |= VAPI_SRQ_ATTR_LIMIT;
/* set the SRQ low-water limit; failure is deliberately ignored
 * (some HCAs don't support it) -- see the commented-out error
 * handling below */
ret = VAPI_modify_srq
(
mvapi_btl->nic,
mvapi_btl->srq_hndl_hp,
&srq_attr_mod,
srq_attr_mask,
&max_outs_wr
);
if(ret != VAPI_OK) {
/* BTL_ERROR(("error in VAPI_modify_srq: %s", VAPI_strerror(ret))); */
/* return OMPI_ERROR; */
}
/* same again for the low-priority SRQ */
ret = VAPI_create_srq(mvapi_btl->nic,
&srq_attr,
&mvapi_btl->srq_hndl_lp,
&srq_attr_out);
if(ret != VAPI_OK) {
BTL_ERROR(("error in VAPI_create_srq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
srq_attr_mask = 0;
srq_attr_mask |= VAPI_SRQ_ATTR_LIMIT;
ret = VAPI_modify_srq
(
mvapi_btl->nic,
mvapi_btl->srq_hndl_lp,
&srq_attr_mod,
srq_attr_mask,
&max_outs_wr
);
if(ret != VAPI_OK) {
/* BTL_ERROR(("error in VAPI_modify_srq: %s", VAPI_strerror(ret))); */
/* return OMPI_ERROR; */
}
} else {
mvapi_btl->srq_hndl_hp = VAPI_INVAL_SRQ_HNDL;
mvapi_btl->srq_hndl_lp = VAPI_INVAL_SRQ_HNDL;
}
#endif /* VAPI_FEATURE_SRQ */
ret = VAPI_create_cq(mvapi_btl->nic, mca_btl_mvapi_component.ib_cq_size,
&mvapi_btl->cq_hndl_lp, &cqe_cnt);
if( VAPI_OK != ret) {
BTL_ERROR(("error in VAPI_create_cq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
ret = VAPI_create_cq(mvapi_btl->nic, mca_btl_mvapi_component.ib_cq_size,
&mvapi_btl->cq_hndl_hp, &cqe_cnt);
if( VAPI_OK != ret) {
BTL_ERROR(("error in VAPI_create_cq: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
/* NOTE(review): cqe_cnt is overwritten by the second create, so this
 * sanity check only covers the high-priority CQ */
if(cqe_cnt <= 0) {
BTL_ERROR(("error creating completion queue "));
return OMPI_ERROR;
}
ret = EVAPI_set_async_event_handler(mvapi_btl->nic,
async_event_handler, 0, &mvapi_btl->async_handler);
if(VAPI_OK != ret) {
BTL_ERROR(("error in EVAPI_set_async_event_handler: %s", VAPI_strerror(ret)));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
/*
 * Dump state of btl/queues
 *
 * Diagnostic aid: prints the given endpoint's connection state and the
 * depths of its pending-fragment lists and credit/token counters.  In
 * SRQ mode the shared (module-level) counters are printed instead of
 * the per-endpoint ones.  The 'verbose' flag is currently unused.
 */
/*#include "orte/mca/ns/ns_types.h"*/
void mca_btl_mvapi_dump(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
int verbose)
{
mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*)btl;
if( NULL == endpoint ) {
opal_output( 0, "No endpoint for this peer\n" );
return;
}
opal_output( 0, "endpoint with processor %s\n",
ORTE_NAME_PRINT( &(endpoint->endpoint_proc->proc_ompi->proc_name) ) );
/* map the connection state machine to a human-readable label */
opal_output( 0, "endpoint state: %s\n",
(endpoint->endpoint_state == MCA_BTL_IB_CONNECTING ? "connecting" :
(endpoint->endpoint_state == MCA_BTL_IB_CONNECT_ACK ? "waiting ack" :
(endpoint->endpoint_state == MCA_BTL_IB_WAITING_ACK ? "waiting final ack" :
(endpoint->endpoint_state == MCA_BTL_IB_CONNECTED ? "connected" :
(endpoint->endpoint_state == MCA_BTL_IB_CLOSED ? "closed" :
(endpoint->endpoint_state == MCA_BTL_IB_FAILED ? "failed" : "unknown")))))));
opal_output( 0, "pending send frags: %u\n", (unsigned int)opal_list_get_size(&endpoint->pending_send_frags) );
opal_output( 0, "pending frags hp : %u\n", (unsigned int)opal_list_get_size(&endpoint->pending_frags_hp) );
opal_output( 0, "pending frags lp : %u\n", (unsigned int)opal_list_get_size(&endpoint->pending_frags_lp) );
#ifdef VAPI_FEATURE_SRQ
if( mca_btl_mvapi_component.use_srq ) {
/* SRQ mode: receive descriptors and send tokens are shared per
 * module, not per endpoint */
opal_output( 0, "mvapi_btl->srd_posted_hp %d\n", mvapi_btl->srd_posted_hp );
opal_output( 0, "mvapi_btl->srd_posted_lp %d\n", mvapi_btl->srd_posted_lp );
opal_output( 0, "mvapi_btl->sd_tokens_hp %d\n", mvapi_btl->sd_tokens_hp );
opal_output( 0, "mvapi_btl->sd_tokens_lp %d\n", mvapi_btl->sd_tokens_lp );
} else {
#endif /* VAPI_FEATURE_SRQ */
opal_output( 0, "sd_tokens_hp %d\n", endpoint->sd_tokens_hp );
opal_output( 0, "sd_tokens_lp %d\n", endpoint->sd_tokens_lp );
opal_output( 0, "get_tokens %d\n", endpoint->get_tokens );
opal_output( 0, "rd_posted_hp %d\n", endpoint->rd_posted_hp );
opal_output( 0, "rd_posted_lp %d\n", endpoint->rd_posted_lp );
opal_output( 0, "rd_credits_hp %d\n", endpoint->rd_credits_hp );
opal_output( 0, "rd_credits_lp %d\n", endpoint->rd_credits_lp );
opal_output( 0, "sd_credits_hp %d\n", endpoint->sd_credits_hp );
opal_output( 0, "sd_credits_lp %d\n", endpoint->sd_credits_lp );
#ifdef VAPI_FEATURE_SRQ
}
#endif /* VAPI_FEATURE_SRQ */
opal_output( 0, "sd_wqe_hp %d\n", endpoint->sd_wqe_hp );
opal_output( 0, "sd_wqe_lp %d\n", endpoint->sd_wqe_lp );
}
/*
 * Checkpoint/restart (fault-tolerance) event hook.
 *
 * This BTL takes no state-specific action for any C/R phase
 * (checkpoint, continue, restart, terminate, or anything else); it
 * simply acknowledges the event.  Always returns OMPI_SUCCESS.
 */
int mca_btl_mvapi_ft_event(int state) {
    switch (state) {
    case OPAL_CRS_CHECKPOINT:
    case OPAL_CRS_CONTINUE:
    case OPAL_CRS_RESTART:
    case OPAL_CRS_TERM:
    default:
        /* nothing to do for any phase */
        break;
    }
    return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,532 +0,0 @@
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PTL_IB_H
#define MCA_PTL_IB_H
/* Standard system includes */
#include <sys/types.h>
#include <string.h>
/* Open MPI includes */
#include "ompi/class/ompi_free_list.h"
#include "ompi/class/ompi_bitmap.h"
#include "orte/class/orte_pointer_array.h"
#include "opal/event/event.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "opal/util/output.h"
#include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/base.h"
#include "btl_mvapi_endpoint.h"
#include <vapi.h>
#include <mtl_common.h>
#include <vapi_common.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
#define MCA_BTL_IB_LEAVE_PINNED 1
/**
 * Infiniband (IB) BTL component.
 *
 * Singleton holding MCA-parameter-driven configuration (free-list
 * sizing, receive-descriptor watermarks, SRQ tuning, QP attributes,
 * eager-RDMA settings) plus the array of per-HCA module instances.
 */
struct mca_btl_mvapi_component_t {
mca_btl_base_component_1_0_1_t super; /**< base BTL component */
uint32_t ib_max_btls;
/**< maximum number of hcas available to the IB component */
uint32_t ib_num_btls;
/**< number of hcas available to the IB component */
struct mca_btl_mvapi_module_t *mvapi_btls;
/**< array of available PTLs */
int ib_free_list_num;
/**< initial size of free lists */
int ib_free_list_max;
/**< maximum size of free lists */
int ib_free_list_inc;
/**< number of elements to alloc when growing free lists */
opal_list_t ib_procs;
/**< list of ib proc structures */
opal_event_t ib_send_event;
/**< event structure for sends */
opal_event_t ib_recv_event;
/**< event structure for recvs */
opal_mutex_t ib_lock;
/**< lock for accessing module state */
char* ib_mpool_name;
/**< name of ib memory pool */
int32_t rd_num; /**< the number of receive descriptors to post to each queue pair */
int32_t rd_low; /**< low water mark to reach before posting additional receive descriptors */
int32_t rd_win; /**< ack credits when window size exceeded */
int32_t rd_rsv; /**< descriptors held in reserve for control messages */
/* number of srq send tokes available */
int32_t srq_sd_max;
int32_t srq_rd_max; /**< hard cap on receive descriptors posted to an SRQ */
int32_t srq_rd_per_peer;
/**< the number of recv desc posted per log(peer) in SRQ mode */
size_t eager_limit; /**< max payload for eager (small-message) frags */
size_t max_send_size; /**< max payload for send (non-RDMA) frags */
uint32_t reg_mru_len;
uint32_t use_srq; /**< nonzero: use shared receive queues instead of per-QP RQs */
uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_wq_size; /**< Max outstanding WR on the WQ */
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ*/
uint32_t ib_pkey_ix;
uint32_t ib_psn;
uint32_t ib_qp_ous_rd_atom;
uint32_t ib_mtu;
uint32_t ib_min_rnr_timer;
uint32_t ib_timeout;
uint32_t ib_retry_count;
uint32_t ib_rnr_retry;
uint32_t ib_max_rdma_dst_ops;
uint32_t ib_service_level;
uint32_t ib_static_rate;
uint32_t ib_src_path_bits;
uint32_t use_eager_rdma; /**< nonzero: enable the eager-RDMA fast path */
uint32_t eager_rdma_threshold;
uint32_t eager_rdma_num;
uint32_t max_eager_rdma;
}; typedef struct mca_btl_mvapi_component_t mca_btl_mvapi_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_mvapi_component_t mca_btl_mvapi_component;
typedef mca_btl_base_recv_reg_t mca_btl_mvapi_recv_reg_t;
/**
 * IB PTL Interface
 *
 * Per-HCA-port module state: VAPI handles (PD, CQs, optional SRQs,
 * async handler), the fragment free lists, SRQ descriptor accounting,
 * pending-fragment queues, and eager-RDMA bookkeeping.
 */
struct mca_btl_mvapi_module_t {
mca_btl_base_module_t super; /**< base PTL interface */
bool btl_inited;
mca_btl_mvapi_recv_reg_t ib_reg[256]; /* per-tag receive callbacks (indexed by mca_btl_base_tag_t) */
mca_btl_mvapi_port_info_t port_info; /* contains only the subnet right now */
VAPI_hca_id_t hca_id; /**< ID of HCA */
IB_port_t port_id; /**< ID of the PORT */
VAPI_hca_port_t port; /**< IB port of this PTL */
VAPI_hca_hndl_t nic; /**< NIC handle */
VAPI_pd_hndl_t ptag; /**< Protection Domain tag */
VAPI_cq_hndl_t cq_hndl_hp; /**< High Priority Completion Queue handle */
VAPI_cq_hndl_t cq_hndl_lp; /**< Low Priority Completion Queue handle */
EVAPI_async_handler_hndl_t async_handler;
/**< Async event handler used to detect weird/unknown events */
ompi_free_list_t send_free_eager; /**< free list of eager buffer descriptors */
ompi_free_list_t send_free_max; /**< free list of max buffer descriptors */
ompi_free_list_t send_free_frag; /**< free list of frags only... used for pining memory */
ompi_free_list_t recv_free_eager; /**< High priority free list of buffer descriptors */
ompi_free_list_t recv_free_max; /**< Low priority free list of buffer descriptors */
opal_mutex_t ib_lock; /**< module level lock */
VAPI_rr_desc_t* rr_desc_post; /**< an array to allow posting of rr in one swoop */
#ifdef VAPI_FEATURE_SRQ
VAPI_srq_hndl_t srq_hndl_hp; /**< A high priority shared receive queue
runtime optional, can also use a receive queue
per queue pair.. */
VAPI_srq_hndl_t srq_hndl_lp; /**< A low priority shared receive queue */
#endif
size_t ib_inline_max; /**< max size of inline send*/
int32_t num_peers; /* set once on the first add_procs call; used to scale the SRQ */
int32_t srd_posted_hp; /**< number of high priority shared receive descriptors posted to the nic*/
int32_t srd_posted_lp; /**< number of low priority shared receive descriptors posted to the nic*/
int32_t rd_num; /**< number of receive descriptors to post to srq */
int32_t rd_low; /**< low water mark before reposting descriptors to srq */
int32_t sd_tokens_hp; /**< number of send tokens available on high priority srq */
int32_t sd_tokens_lp; /**< number of send tokens available on low priority srq */
opal_list_t pending_frags_hp; /**< list of pending high priority frags */
opal_list_t pending_frags_lp; /**< list of pending low priority frags */
opal_mutex_t eager_rdma_lock;
size_t eager_rdma_frag_size; /**< length of eager frag */
orte_pointer_array_t *eager_rdma_buffers; /**< RDMA buffers to poll */
uint32_t eager_rdma_buffers_count; /**< number of RDMA buffers */
}; typedef struct mca_btl_mvapi_module_t mca_btl_mvapi_module_t;
/**
 * Memory-registration record for a buffer pinned for VAPI use.
 * Wraps the generic mpool registration with the VAPI handle and keys.
 */
struct mca_btl_mvapi_reg_t {
    mca_mpool_base_registration_t base;  /**< generic mpool registration (must be first) */
    VAPI_mr_hndl_t hndl; /* Memory region handle */
    VAPI_lkey_t l_key;   /* Local key to registered memory */
    VAPI_rkey_t r_key;   /* Remote key to registered memory */
};
typedef struct mca_btl_mvapi_reg_t mca_btl_mvapi_reg_t;
/**
 * Repost receive descriptors to the high-priority shared receive queue
 * when the posted count falls to the low-water mark (rd_low + additional).
 *
 * Fixed to expand to a single do { ... } while(0) statement: the previous
 * form wrapped the do/while in an extra { ... } with a stray ';', which
 * produces a compound statement plus an empty statement at the call site
 * and breaks "if (x) MACRO(); else ..." usage.
 */
#define MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, \
                                    additional) \
    do { \
        OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); \
        if(mvapi_btl->srd_posted_hp <= mvapi_btl->rd_low+additional && \
           mvapi_btl->srd_posted_hp < mvapi_btl->rd_num){ \
            MCA_BTL_MVAPI_POST_SRR_SUB(mvapi_btl->rd_num - \
                                       mvapi_btl->srd_posted_hp, \
                                       mvapi_btl, \
                                       &mvapi_btl->recv_free_eager, \
                                       &mvapi_btl->srd_posted_hp, \
                                       mvapi_btl->nic, \
                                       mvapi_btl->srq_hndl_hp); \
        } \
        OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); \
    } while(0)
/**
 * Repost receive descriptors to the low-priority shared receive queue
 * when the posted count falls to the low-water mark (rd_low + additional).
 *
 * Fixed to expand to a single do { ... } while(0) statement: the previous
 * form wrapped the do/while in an extra { ... } with a stray ';', which
 * breaks "if (x) MACRO(); else ..." call sites.
 */
#define MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, \
                                   additional) \
    do { \
        OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); \
        if(mvapi_btl->srd_posted_lp <= mvapi_btl->rd_low+additional && \
           mvapi_btl->srd_posted_lp < mvapi_btl->rd_num){ \
            MCA_BTL_MVAPI_POST_SRR_SUB(mvapi_btl->rd_num - \
                                       mvapi_btl->srd_posted_lp, \
                                       mvapi_btl, \
                                       &mvapi_btl->recv_free_max, \
                                       &mvapi_btl->srd_posted_lp, \
                                       mvapi_btl->nic, \
                                       mvapi_btl->srq_hndl_lp); \
        } \
        OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); \
    } while(0)
/**
 * Worker for the POST_SRR_{HIGH,LOW} macros: pull cnt fragments off the
 * given free list, fill in their scatter entries, and post them to the
 * shared receive queue in a single VAPI_post_srq call.
 *
 * Caller must hold mvapi_btl->ib_lock (both callers above do).
 *
 * Fixed to expand to a single do { ... } while(0) statement (the previous
 * extra { ... } plus stray ';' broke if/else call sites).
 *
 * NOTE(review): on partial success (1 <= rwqe_posted < cnt) this still
 * credits the full cnt to *srd_posted — confirm against VAPI_post_srq
 * semantics before relying on the counter being exact.
 */
#define MCA_BTL_MVAPI_POST_SRR_SUB(cnt, \
                                   mvapi_btl, \
                                   frag_list, \
                                   srd_posted, \
                                   nic, \
                                   srq_hndl) \
    do { \
        int32_t i; \
        VAPI_ret_t ret; \
        uint32_t rwqe_posted = 0; \
        int rc; \
        ompi_free_list_item_t* item = NULL; \
        mca_btl_mvapi_frag_t* frag = NULL; \
        VAPI_rr_desc_t* desc_post = mvapi_btl->rr_desc_post; \
        for(i = 0; i < cnt; i++) { \
            OMPI_FREE_LIST_WAIT(frag_list, item, rc); \
            frag = (mca_btl_mvapi_frag_t*) item; \
            /* receive length covers the btl header that precedes the segment */ \
            frag->sg_entry.len = frag->size + \
                ((unsigned char*) frag->segment.seg_addr.pval- \
                 (unsigned char*) frag->hdr); \
            desc_post[i] = frag->desc.rr_desc; \
        } \
        ret = VAPI_post_srq( nic, \
                             srq_hndl, \
                             cnt, \
                             desc_post, \
                             &rwqe_posted); \
        if(VAPI_OK != ret) { \
            BTL_ERROR(("error posting receive descriptors to shared receive queue: %s",\
                       VAPI_strerror(ret))); \
        } else if(rwqe_posted < 1) { \
            BTL_ERROR(("error posting receive descriptors to shared receive queue, number of entries posted is %d", rwqe_posted)); \
        } else {\
            OPAL_THREAD_ADD32(srd_posted, cnt); \
        }\
    } while(0)
struct mca_btl_mvapi_frag_t;

/** The single module template instantiated per selected HCA port */
extern mca_btl_mvapi_module_t mca_btl_mvapi_module;

/**
 * Register IB component parameters with the MCA framework
 */
extern int mca_btl_mvapi_component_open(void);

/**
 * Any final cleanup before being unloaded.
 */
extern int mca_btl_mvapi_component_close(void);

/**
 * IB component initialization.
 *
 * @param num_btl_modules (OUT) Number of BTLs returned in BTL array.
 * @param allow_multi_user_threads (OUT) Flag indicating whether BTL supports user threads (TRUE)
 * @param have_hidden_threads (OUT) Flag indicating whether BTL uses threads (TRUE)
 *
 * (1) read interface list from kernel and compare against component parameters
 *     then create a BTL instance for selected interfaces
 * (2) setup IB listen socket for incoming connection attempts
 * (3) publish BTL addressing info
 */
extern mca_btl_base_module_t** mca_btl_mvapi_component_init(
    int *num_btl_modules,
    bool allow_multi_user_threads,
    bool have_hidden_threads
);

/**
 * IB component progress: poll the completion queues of all modules.
 */
extern int mca_btl_mvapi_component_progress( void );
/**
 * Register a callback function that is called on receipt
 * of a fragment.
 *
 * @param btl (IN)    BTL module
 * @param tag (IN)    Tag the callback is registered against
 * @param cbfunc (IN) Function invoked when a fragment with this tag is received
 * @param cbdata (IN) Opaque data passed back to cbfunc
 * @return OMPI_SUCCESS or error status on failure
 */
int mca_btl_mvapi_register(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_tag_t tag,
    mca_btl_base_module_recv_cb_fn_t cbfunc,
    void* cbdata
);

/**
 * Cleanup any resources held by the BTL.
 *
 * @param btl BTL instance.
 * @return OMPI_SUCCESS or error status on failure.
 */
extern int mca_btl_mvapi_finalize(
    struct mca_btl_base_module_t* btl
);

/**
 * PML->BTL notification of change in the process list.
 *
 * @param btl (IN)        BTL instance
 * @param nprocs (IN)     Number of processes
 * @param procs (IN)      Set of processes
 * @param peers (OUT)     Set of (optional) peer addressing info
 * @param reachable (IN/OUT) Bitmap of processes reachable via this BTL
 * @return OMPI_SUCCESS or error status on failure.
 */
extern int mca_btl_mvapi_add_procs(
    struct mca_btl_base_module_t* btl,
    size_t nprocs,
    struct ompi_proc_t **procs,
    struct mca_btl_base_endpoint_t** peers,
    ompi_bitmap_t* reachable
);

/**
 * PML->BTL notification of change in the process list; release
 * any resources associated with the departing peers.
 *
 * @param btl (IN)    BTL instance
 * @param nprocs (IN) Number of processes.
 * @param procs (IN)  Set of processes.
 * @param peers (IN)  Set of peer data structures.
 * @return Status indicating if cleanup was successful
 */
extern int mca_btl_mvapi_del_procs(
    struct mca_btl_base_module_t* btl,
    size_t nprocs,
    struct ompi_proc_t **procs,
    struct mca_btl_base_endpoint_t** peers
);
/**
 * PML->BTL Initiate a send of the data described by the descriptor.
 *
 * @param btl (IN)        BTL instance
 * @param btl_peer (IN)   BTL peer addressing
 * @param descriptor (IN) Descriptor describing the data to deliver
 * @param tag (IN)        Tag passed to the peer via the message header
 * @return OMPI_SUCCESS if the BTL was able to queue one or more fragments
 */
extern int mca_btl_mvapi_send(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* btl_peer,
    struct mca_btl_base_descriptor_t* descriptor,
    mca_btl_base_tag_t tag
);

/**
 * PML->BTL Initiate an RDMA put of the data described by the descriptor.
 *
 * (Doc was previously copy-pasted from the send prototype and described
 * parameters that do not exist here; the parameter name was also
 * misspelled "decriptor".)
 *
 * @param btl (IN)        BTL instance
 * @param btl_peer (IN)   BTL peer addressing
 * @param descriptor (IN) Descriptor with source and destination segments
 * @return OMPI_SUCCESS if the BTL was able to queue one or more fragments
 */
extern int mca_btl_mvapi_put(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* btl_peer,
    struct mca_btl_base_descriptor_t* descriptor
);

/**
 * PML->BTL Initiate an RDMA get of the data described by the descriptor.
 *
 * @param btl (IN)        BTL instance
 * @param btl_peer (IN)   BTL peer addressing
 * @param descriptor (IN) Descriptor with source and destination segments
 * @return OMPI_SUCCESS if the BTL was able to queue one or more fragments
 */
extern int mca_btl_mvapi_get(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* btl_peer,
    struct mca_btl_base_descriptor_t* descriptor
);
/**
 * Allocate a descriptor.
 *
 * @param btl (IN)   BTL module
 * @param order (IN) Ordering requirement (BTL framework convention — confirm against base btl.h)
 * @param size (IN)  Requested descriptor size.
 * @return Allocated descriptor, or NULL on failure
 */
extern mca_btl_base_descriptor_t* mca_btl_mvapi_alloc(
    struct mca_btl_base_module_t* btl,
    uint8_t order,
    size_t size);

/**
 * Return a segment allocated by this BTL.
 *
 * @param btl (IN) BTL module
 * @param des (IN) Allocated descriptor.
 * @return OMPI_SUCCESS or error status on failure
 */
extern int mca_btl_mvapi_free(
    struct mca_btl_base_module_t* btl,
    mca_btl_base_descriptor_t* des);

/**
 * Pack data and return a descriptor that can be
 * used for send/put.
 *
 * @param btl (IN)          BTL module
 * @param peer (IN)         BTL peer addressing
 * @param registration (IN) Existing memory registration, or NULL
 * @param convertor (IN)    Datatype convertor positioned at the data to pack
 * @param order (IN)        Ordering requirement
 * @param reserve (IN)      Bytes to reserve at the head of the segment for headers
 * @param size (IN/OUT)     Requested size in; actual prepared size out
 */
mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_src(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* peer,
    mca_mpool_base_registration_t* registration,
    struct ompi_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size
);

/**
 * Allocate a descriptor initialized for RDMA write (destination side).
 *
 * @param btl (IN)          BTL module
 * @param peer (IN)         BTL peer addressing
 * @param registration (IN) Existing memory registration, or NULL
 * @param convertor (IN)    Datatype convertor describing the receive buffer
 * @param order (IN)        Ordering requirement
 * @param reserve (IN)      Bytes to reserve at the head of the segment
 * @param size (IN/OUT)     Requested size in; actual prepared size out
 */
extern mca_btl_base_descriptor_t* mca_btl_mvapi_prepare_dst(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* peer,
    mca_mpool_base_registration_t* registration,
    struct ompi_convertor_t* convertor,
    uint8_t order,
    size_t reserve,
    size_t* size);
/**
 * Return a send fragment to the module's free list.
 *
 * @param btl (IN)  BTL instance
 * @param frag (IN) IB send fragment
 */
extern void mca_btl_mvapi_send_frag_return(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_mvapi_frag_t*
);

/*
 * Dump state of btl/queues (diagnostic aid).
 */
extern void mca_btl_mvapi_dump(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    int verbose
);

/** One-time setup of a module's VAPI resources (queues, free lists). */
int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t* mvapi_btl);

/**
 * Fault Tolerance Event Notification Function
 * @param state Checkpoint State
 * @return OMPI_SUCCESS or failure status
 */
int mca_btl_mvapi_ft_event(int state);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,88 +0,0 @@
/*
* Copyright (c) 2006 Voltaire All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_MVAPI_EAGER_RDMA_BUF_H
#define MCA_BTL_MVAPI_EAGER_RDMA_BUF_H

#include "ompi_config.h"
#include "btl_mvapi.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

struct mca_btl_mvapi_reg_t;
struct mca_btl_mvapi_frag_t;

/**
 * Local (receiving) side of an eager-RDMA ring buffer.
 */
struct mca_btl_mvapi_eager_rdma_local_t {
    ompi_ptr_t base;                     /**< buffer for RDMAing eager messages */
    struct mca_btl_mvapi_frag_t *frags;  /**< fragment descriptors over the buffer */
    struct mca_btl_mvapi_reg_t *reg;     /**< memory registration covering the buffer */
    uint16_t head;                       /**< RDMA buffer to poll */
    uint16_t tail;                       /**< needed for credit management */
    int32_t credits;                     /**< number of RDMA credits */
#if OMPI_ENABLE_DEBUG
    uint32_t seq;                        /* debug-only sequence counter */
#endif
    opal_mutex_t lock;                   /**< guard access to RDMA buffer */
};
typedef struct mca_btl_mvapi_eager_rdma_local_t mca_btl_mvapi_eager_rdma_local_t;

/**
 * Sending side's view of the peer's eager-RDMA ring buffer.
 */
struct mca_btl_mvapi_eager_rdma_remote_t {
    ompi_ptr_t base;   /**< address of remote buffer */
    uint64_t rkey;     /**< RKey for accessing remote buffer */
    uint16_t head;     /**< RDMA buffer to post to */
    int32_t tokens;    /**< number of rdma tokens */
#if OMPI_ENABLE_DEBUG
    uint32_t seq;      /* debug-only sequence counter */
#endif
};
typedef struct mca_btl_mvapi_eager_rdma_remote_t mca_btl_mvapi_eager_rdma_remote_t;

/* True if the fragment belongs to an eager-RDMA ring buffer. */
#define MCA_BTL_MVAPI_RDMA_FRAG(F) ((F)->type == MCA_BTL_MVAPI_FRAG_EAGER_RDMA)

/* Ownership marker byte stored in footer u.buf[3] (see MAKE_REMOTE/LOCAL). */
#define EAGER_RDMA_BUFFER_REMOTE (0)
#define EAGER_RDMA_BUFFER_LOCAL (0xff)

/* The fragment size shares footer u.size with the ownership byte: only 24
 * bits hold the size so that byte u.buf[3] is left for the flag on either
 * endianness. */
#ifdef WORDS_BIGENDIAN
#define MCA_BTL_MVAPI_RDMA_FRAG_GET_SIZE(F) ((F)->u.size >> 8)
#define MCA_BTL_MVAPI_RDMA_FRAG_SET_SIZE(F, S) \
    ((F)->u.size = (S) << 8)
#else
#define MCA_BTL_MVAPI_RDMA_FRAG_GET_SIZE(F) ((F)->u.size & 0x00ffffff)
#define MCA_BTL_MVAPI_RDMA_FRAG_SET_SIZE(F, S) \
    ((F)->u.size = (S) & 0x00ffffff)
#endif

/* Ownership tests/transitions on the footer flag byte (volatile: the byte
 * is written by remote RDMA). */
#define MCA_BTL_MVAPI_RDMA_FRAG_LOCAL(F) \
    (((volatile uint8_t*)(F)->ftr->u.buf)[3] != EAGER_RDMA_BUFFER_REMOTE)
#define MCA_BTL_MVAPI_RDMA_FRAG_REMOTE(F) \
    (!MCA_BTL_MVAPI_RDMA_FRAG_LOCAL(F))
#define MCA_BTL_MVAPI_RDMA_MAKE_REMOTE(F) do { \
    ((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_REMOTE; \
}while (0)
#define MCA_BTL_MVAPI_RDMA_MAKE_LOCAL(F) do { \
    ((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_LOCAL; \
}while (0)

/* Fragment descriptor at ring index I of endpoint E's local buffer. */
#define MCA_BTL_MVAPI_GET_LOCAL_RDMA_FRAG(E, I) \
    (&(E)->eager_rdma_local.frags[(I)])

/* Advance ring index I, wrapping at the configured ring size. */
#define MCA_BTL_MVAPI_RDMA_NEXT_INDEX(I) do { \
    (I) = ((I) + 1) % \
        mca_btl_mvapi_component.eager_rdma_num; \
} while (0)

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,257 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_IB_ENDPOINT_H
#define MCA_BTL_IB_ENDPOINT_H
#include "opal/class/opal_list.h"
#include "opal/event/event.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "btl_mvapi_frag.h"
#include "btl_mvapi.h"
#include "btl_mvapi_eager_rdma.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include <vapi.h>
#include <mtl_common.h>
#include <vapi_common.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_btl_mvapi_endpoint_t);

struct mca_btl_mvapi_frag_t;

/** Per-port addressing info exchanged at startup (see proc modex). */
struct mca_btl_mvapi_port_info_t {
    uint32_t subnet;  /**< IB subnet identifier of the port */
};
typedef struct mca_btl_mvapi_port_info_t mca_btl_mvapi_port_info_t;
/**
 * State of IB endpoint connection (progresses through the
 * three-way connection handshake to CONNECTED, or to FAILED).
 */
typedef enum {
    /* Defines the state in which this BTL instance
     * has started the process of connection */
    MCA_BTL_IB_CONNECTING,

    /* Waiting for ack from endpoint */
    MCA_BTL_IB_CONNECT_ACK,

    /* Waiting for final connection ACK from endpoint */
    MCA_BTL_IB_WAITING_ACK,

    /* Connected ... both sender & receiver have
     * buffers associated with this connection */
    MCA_BTL_IB_CONNECTED,

    /* Connection is closed, there are no resources
     * associated with this */
    MCA_BTL_IB_CLOSED,

    /* Maximum number of retries have been used.
     * Report failure on send to upper layer */
    MCA_BTL_IB_FAILED
} mca_btl_mvapi_endpoint_state_t;
/** Remote-side connection info received during the QP handshake. */
struct mca_btl_mvapi_rem_info_t {
    VAPI_qp_num_t rem_qp_num_hp;  /* High priority remote side QP number */
    VAPI_qp_num_t rem_qp_num_lp;  /* Low priority remote side QP number */
    IB_lid_t rem_lid;             /* Local identifier of the remote process */
    uint32_t rem_subnet;          /* subnet of remote process */
} ;
typedef struct mca_btl_mvapi_rem_info_t mca_btl_mvapi_rem_info_t;
/**
 * An abstraction that represents a connection to a endpoint process.
 * An instance of mca_btl_base_endpoint_t is associated w/ each process
 * and BTL pair at startup. However, connections to the endpoint
 * are established dynamically on an as-needed basis:
 */
struct mca_btl_base_endpoint_t {
    opal_list_item_t super;  /**< allows the endpoint to live on opal lists */

    struct mca_btl_mvapi_module_t* endpoint_btl;
    /**< BTL instance that created this connection */

    struct mca_btl_mvapi_proc_t* endpoint_proc;
    /**< proc structure corresponding to endpoint */

    mca_btl_mvapi_endpoint_state_t endpoint_state;
    /**< current state of the connection */

    size_t endpoint_retries;
    /**< number of connection retries attempted */

    double endpoint_tstamp;
    /**< timestamp of when the first connection was attempted */

    opal_mutex_t endpoint_lock;
    /**< lock for concurrent access to endpoint state */

    opal_list_t pending_send_frags;
    /**< list of pending send frags for this endpoint */

    opal_list_t pending_frags_hp;  /**< list of pending high priority frags */
    opal_list_t pending_frags_lp;  /**< list of pending low priority frags */

    mca_btl_mvapi_rem_info_t rem_info;  /**< remote QP/LID info from handshake */

    VAPI_qp_hndl_t lcl_qp_hndl_hp;  /* High priority local QP handle */
    VAPI_qp_hndl_t lcl_qp_hndl_lp;  /* Low priority local QP handle */
    VAPI_qp_prop_t lcl_qp_prop_hp;  /* High priority local QP properties */
    VAPI_qp_prop_t lcl_qp_prop_lp;  /* Low priority local QP properties */

    /* flow-control accounting, split by priority level */
    int32_t sd_tokens_hp;  /**< number of high priority send tokens */
    int32_t sd_tokens_lp;  /**< number of low priority send tokens */
    int32_t get_tokens;    /**< number of available get tokens */

    int32_t rd_posted_hp;  /**< number of high priority descriptors posted to the nic*/
    int32_t rd_posted_lp;  /**< number of low priority descriptors posted to the nic*/
    int32_t rd_credits_hp; /**< number of high priority credits to return to peer */
    int32_t rd_credits_lp; /**< number of low priority credits to return to peer */
    int32_t sd_credits_hp; /**< number of send wqe entries being used to return credits */
    int32_t sd_credits_lp; /**< number of send wqe entries being used to return credits */
    int32_t sd_wqe_hp;     /**< number of available high priority send wqe entries */
    int32_t sd_wqe_lp;     /**< number of available low priority send wqe entries */

    uint32_t subnet;  /**< IB subnet of this endpoint's port */

    uint32_t eager_recv_count;  /**< number of eager received */
    mca_btl_mvapi_eager_rdma_remote_t eager_rdma_remote;
    /**< info about remote RDMA buffer */
    mca_btl_mvapi_eager_rdma_local_t eager_rdma_local;
    /**< info about local RDMA buffer */
    int32_t eager_rdma_index;  /**< index into RDMA buffers pointer array */
};

typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
typedef mca_btl_base_endpoint_t mca_btl_mvapi_endpoint_t;
/** Queue (or immediately post) a send fragment on the endpoint. */
int mca_btl_mvapi_endpoint_send(mca_btl_base_endpoint_t* endpoint, struct mca_btl_mvapi_frag_t* frag);

/** Start the connection handshake with the remote endpoint. */
int mca_btl_mvapi_endpoint_connect(mca_btl_base_endpoint_t*);

/** Return accumulated high-priority receive credits to the peer. */
void mca_btl_mvapi_endpoint_send_credits_hp(mca_btl_base_endpoint_t*);

/** Return accumulated low-priority receive credits to the peer. */
void mca_btl_mvapi_endpoint_send_credits_lp(mca_btl_base_endpoint_t*);

void mca_btl_mvapi_post_recv(void);

/** Set up the eager-RDMA ring buffer with this endpoint. */
void mca_btl_mvapi_endpoint_connect_eager_rdma(mca_btl_mvapi_endpoint_t*);
/**
 * Repost receive descriptors to the endpoint's high-priority QP when the
 * posted count falls to the low-water mark (rd_low + additional).
 *
 * Fixed to expand to a single do { ... } while(0) statement: the previous
 * extra { ... } wrapper plus stray ';' broke "if (x) MACRO(); else"
 * call sites.
 */
#define MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, \
                                            additional) \
    do { \
        mca_btl_mvapi_module_t * mvapi_btl = endpoint->endpoint_btl; \
        OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); \
        if(endpoint->rd_posted_hp <= mca_btl_mvapi_component.rd_low+additional && \
           endpoint->rd_posted_hp < mvapi_btl->rd_num){ \
            MCA_BTL_MVAPI_ENDPOINT_POST_RR_SUB(mvapi_btl->rd_num - \
                                               endpoint->rd_posted_hp, \
                                               endpoint, \
                                               &mvapi_btl->recv_free_eager, \
                                               endpoint->rd_posted_hp, \
                                               endpoint->rd_credits_hp, \
                                               mvapi_btl->nic, \
                                               endpoint->lcl_qp_hndl_hp); \
        } \
        OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); \
    } while(0)
/**
 * Repost receive descriptors to the endpoint's low-priority QP when the
 * posted count falls to the low-water mark (rd_low + additional).
 *
 * Fixed to expand to a single do { ... } while(0) statement (see
 * MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH for rationale).
 */
#define MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, \
                                           additional) \
    do { \
        mca_btl_mvapi_module_t * mvapi_btl = endpoint->endpoint_btl; \
        OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); \
        if(endpoint->rd_posted_lp <= mca_btl_mvapi_component.rd_low+additional && \
           endpoint->rd_posted_lp < mvapi_btl->rd_num){ \
            MCA_BTL_MVAPI_ENDPOINT_POST_RR_SUB(mvapi_btl->rd_num - \
                                               endpoint->rd_posted_lp, \
                                               endpoint, \
                                               &mvapi_btl->recv_free_max, \
                                               endpoint->rd_posted_lp, \
                                               endpoint->rd_credits_lp, \
                                               mvapi_btl->nic, \
                                               endpoint->lcl_qp_hndl_lp); \
        } \
        OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); \
    } while(0)
/**
 * Worker for the ENDPOINT_POST_RR_{HIGH,LOW} macros: pull cnt fragments
 * off the free list, fill in their scatter entries, and post them to the
 * endpoint's QP in one EVAPI_post_rr_list call.  On success both the
 * posted-descriptor count and the credits owed to the peer are bumped.
 *
 * Caller must hold mvapi_btl->ib_lock (both callers above do).
 *
 * Fixed to expand to a single do { ... } while(0) statement (the previous
 * extra { ... } plus stray ';' broke if/else call sites).
 */
#define MCA_BTL_MVAPI_ENDPOINT_POST_RR_SUB(cnt, \
                                           my_endpoint, \
                                           frag_list, \
                                           rd_posted, \
                                           rd_credits, \
                                           nic, \
                                           qp ) \
    do { \
        int32_t i; \
        int rc; \
        int32_t num_post = cnt; \
        mca_btl_mvapi_module_t *mvapi_btl = my_endpoint->endpoint_btl; \
        VAPI_rr_desc_t* desc_post = mvapi_btl->rr_desc_post; \
        for(i = 0; i < num_post; i++) { \
            ompi_free_list_item_t* item; \
            mca_btl_mvapi_frag_t* frag = NULL; \
            OMPI_FREE_LIST_WAIT(frag_list, item, rc); \
            frag = (mca_btl_mvapi_frag_t*) item; \
            frag->endpoint = my_endpoint; \
            /* receive length covers the btl header preceding the segment */ \
            frag->sg_entry.len = frag->size + \
                ((unsigned char*) frag->segment.seg_addr.pval- \
                 (unsigned char*) frag->hdr); \
            desc_post[i] = frag->desc.rr_desc; \
        } \
        rc = EVAPI_post_rr_list( nic, \
                                 qp, \
                                 num_post, \
                                 desc_post); \
        if(VAPI_OK != rc) { \
            BTL_ERROR(("error posting receive descriptors: %s",\
                       VAPI_strerror(rc))); \
        } else { \
            OPAL_THREAD_ADD32(&(rd_posted), num_post); \
            OPAL_THREAD_ADD32(&(rd_credits), num_post); \
        }\
    } while(0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,160 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_mvapi_frag.h"
/*
 * Shared setup for every mvapi fragment: wire the btl header, the data
 * segment, and the VAPI scatter/gather entry onto the pre-registered
 * buffer backing this descriptor.
 */
static void mca_btl_mvapi_frag_common_constructor( mca_btl_mvapi_frag_t* frag)
{
    mca_btl_mvapi_reg_t* reg =
        (mca_btl_mvapi_reg_t*)frag->base.super.registration;

    frag->hdr = (mca_btl_mvapi_header_t*)frag->base.super.ptr;
    /* the data segment starts right after the btl header */
    frag->segment.seg_addr.pval =
        ((unsigned char* )frag->hdr) + sizeof(mca_btl_mvapi_header_t);
    frag->segment.seg_len = frag->size;

    frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr;
    frag->sg_entry.lkey = reg->l_key;
    frag->segment.seg_key.key32[0] = frag->sg_entry.lkey;

    frag->base.des_flags = 0;
}

/*
 * Send-side setup: one source segment, no destination, and a signaled
 * VAPI send work request whose id points back at the fragment.
 */
static void mca_btl_mvapi_send_frag_common_constructor(mca_btl_mvapi_frag_t* frag)
{
    VAPI_sr_desc_t* sr = &frag->desc.sr_desc;

    mca_btl_mvapi_frag_common_constructor(frag);

    frag->base.des_src = &frag->segment;
    frag->base.des_src_cnt = 1;
    frag->base.des_dst = NULL;
    frag->base.des_dst_cnt = 0;

    sr->comp_type = VAPI_SIGNALED;
    sr->opcode = VAPI_SEND;
    sr->remote_qkey = 0;
    sr->sg_lst_len = 1;
    sr->sg_lst_p = &frag->sg_entry;
    sr->id = (VAPI_virt_addr_t) (MT_virt_addr_t) frag;
}

/*
 * Receive-side setup: one destination segment, no source, and a signaled
 * VAPI receive work request whose id points back at the fragment.
 */
static void mca_btl_mvapi_recv_frag_common_constructor(mca_btl_mvapi_frag_t* frag)
{
    VAPI_rr_desc_t* rr = &frag->desc.rr_desc;

    mca_btl_mvapi_frag_common_constructor(frag);

    frag->base.des_dst = &frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.des_src = NULL;
    frag->base.des_src_cnt = 0;

    rr->comp_type = VAPI_SIGNALED;
    rr->opcode = VAPI_RECEIVE;
    rr->sg_lst_len = 1;
    rr->sg_lst_p = &frag->sg_entry;
    rr->id = (VAPI_virt_addr_t) (MT_virt_addr_t) frag;
}
/* Eager send fragment: sized by the component eager limit. */
static void mca_btl_mvapi_send_frag_eager_constructor(mca_btl_mvapi_frag_t* frag)
{
    frag->type = MCA_BTL_MVAPI_FRAG_EAGER;
    frag->size = mca_btl_mvapi_component.eager_limit;
    mca_btl_mvapi_send_frag_common_constructor(frag);
}

/* Max-size send fragment: sized by the component max send size. */
static void mca_btl_mvapi_send_frag_max_constructor(mca_btl_mvapi_frag_t* frag)
{
    frag->type = MCA_BTL_MVAPI_FRAG_MAX;
    frag->size = mca_btl_mvapi_component.max_send_size;
    mca_btl_mvapi_send_frag_common_constructor(frag);
}

/* Max-size receive fragment. */
static void mca_btl_mvapi_recv_frag_max_constructor(mca_btl_mvapi_frag_t* frag)
{
    frag->type = MCA_BTL_MVAPI_FRAG_MAX;
    frag->size = mca_btl_mvapi_component.max_send_size;
    mca_btl_mvapi_recv_frag_common_constructor(frag);
}

/*
 * Eager receive fragment: also places the eager-RDMA footer directly
 * after the data segment and marks the buffer as remotely owned.
 */
static void mca_btl_mvapi_recv_frag_eager_constructor(mca_btl_mvapi_frag_t* frag)
{
    frag->type = MCA_BTL_MVAPI_FRAG_EAGER;
    frag->size = mca_btl_mvapi_component.eager_limit;
    mca_btl_mvapi_recv_frag_common_constructor(frag);

    frag->ftr = (mca_btl_mvapi_footer_t*)((char*)frag->segment.seg_addr.pval
                                          + frag->size);
    MCA_BTL_MVAPI_RDMA_MAKE_REMOTE(frag->ftr);
}

/*
 * Zero-length "frag" descriptor used for pinning user memory; it carries
 * no registration of its own until one is attached.
 */
static void mca_btl_mvapi_send_frag_frag_constructor(mca_btl_mvapi_frag_t* frag)
{
    frag->type = MCA_BTL_MVAPI_FRAG_FRAG;
    frag->size = 0;
    frag->registration = NULL;
    mca_btl_mvapi_send_frag_common_constructor(frag);
}
/* Base fragment class: no per-size constructor. */
OBJ_CLASS_INSTANCE(
    mca_btl_mvapi_frag_t,
    mca_btl_base_descriptor_t,
    NULL,
    NULL);

/* Eager-limit send fragments. */
OBJ_CLASS_INSTANCE(
    mca_btl_mvapi_send_frag_eager_t,
    mca_btl_base_descriptor_t,
    mca_btl_mvapi_send_frag_eager_constructor,
    NULL);

/* Max-send-size send fragments. */
OBJ_CLASS_INSTANCE(
    mca_btl_mvapi_send_frag_max_t,
    mca_btl_base_descriptor_t,
    mca_btl_mvapi_send_frag_max_constructor,
    NULL);

/* Zero-length fragments used to pin user memory. */
OBJ_CLASS_INSTANCE(
    mca_btl_mvapi_send_frag_frag_t,
    mca_btl_base_descriptor_t,
    mca_btl_mvapi_send_frag_frag_constructor,
    NULL);

/* Eager-limit receive fragments (with eager-RDMA footer). */
OBJ_CLASS_INSTANCE(
    mca_btl_mvapi_recv_frag_eager_t,
    mca_btl_base_descriptor_t,
    mca_btl_mvapi_recv_frag_eager_constructor,
    NULL);

/* Max-send-size receive fragments. */
OBJ_CLASS_INSTANCE(
    mca_btl_mvapi_recv_frag_max_t,
    mca_btl_base_descriptor_t,
    mca_btl_mvapi_recv_frag_max_constructor,
    NULL);

Просмотреть файл

@ -1,204 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_IB_FRAG_H
#define MCA_BTL_IB_FRAG_H
#define MCA_BTL_IB_FRAG_ALIGN (8)
#include "ompi_config.h"
#include "btl_mvapi.h"
#include <vapi.h>
#include <mtl_common.h>
#include <vapi_common.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
struct mca_btl_mvapi_reg_t;
struct mca_btl_mvapi_header_t {
mca_btl_base_tag_t tag;
int16_t credits;
int16_t rdma_credits;
};
typedef struct mca_btl_mvapi_header_t mca_btl_mvapi_header_t;
struct mca_btl_mvapi_footer_t {
#if OMPI_ENABLE_DEBUG
uint32_t seq;
#endif
union {
uint32_t size;
uint8_t buf[4];
} u;
};
typedef struct mca_btl_mvapi_footer_t mca_btl_mvapi_footer_t;
typedef enum {
MCA_BTL_MVAPI_CONTROL_NOOP,
MCA_BTL_MVAPI_CONTROL_RDMA
} mca_btl_mvapi_control_t;
struct mca_btl_mvapi_control_header_t {
mca_btl_mvapi_control_t type;
};
typedef struct mca_btl_mvapi_control_header_t mca_btl_mvapi_control_header_t;
struct mca_btl_mvapi_eager_rdma_header_t {
mca_btl_mvapi_control_header_t control;
ompi_ptr_t rdma_start;
uint64_t rkey;
};
typedef struct mca_btl_mvapi_eager_rdma_header_t mca_btl_mvapi_eager_rdma_header_t;
enum mca_btl_mvapi_frag_type_t {
MCA_BTL_MVAPI_FRAG_EAGER,
MCA_BTL_MVAPI_FRAG_MAX,
MCA_BTL_MVAPI_FRAG_FRAG,
MCA_BTL_MVAPI_FRAG_EAGER_RDMA
};
typedef enum mca_btl_mvapi_frag_type_t mca_btl_mvapi_frag_type_t;
union mca_btl_mvapi_frag_desc_t {
VAPI_rr_desc_t rr_desc;
VAPI_sr_desc_t sr_desc;
};
typedef union mca_btl_mvapi_frag_desc_t mca_btl_mvapi_frag_desc_t;
/**
* IB send fragment derived type.
*/
struct mca_btl_mvapi_frag_t {
mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segment;
struct mca_btl_base_endpoint_t *endpoint;
size_t size;
int rc;
mca_btl_mvapi_frag_type_t type;
mca_btl_mvapi_frag_desc_t desc;
VAPI_sg_lst_entry_t sg_entry;
mca_btl_mvapi_header_t *hdr;
mca_btl_mvapi_footer_t *ftr;
struct mca_btl_mvapi_reg_t *registration;
ompi_free_list_t* my_list;
};
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_frag_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_send_frag_eager_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_send_frag_eager_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_send_frag_max_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_send_frag_max_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_send_frag_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_send_frag_frag_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_recv_frag_eager_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_recv_frag_eager_t);
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_recv_frag_max_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_recv_frag_max_t);
/*
* Allocate an IB send descriptor
*
*/
#define MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_mvapi_module_t*)btl)->send_free_eager, item, rc); \
frag = (mca_btl_mvapi_frag_t*) item; \
frag->my_list = &((mca_btl_mvapi_module_t*)btl)->send_free_eager; \
}
#define MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_mvapi_module_t*)btl)->send_free_max, item, rc); \
frag = (mca_btl_mvapi_frag_t*) item; \
frag->my_list = &((mca_btl_mvapi_module_t*)btl)->send_free_max; \
}
#define MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_mvapi_module_t*)btl)->send_free_frag, item, rc); \
frag = (mca_btl_mvapi_frag_t*) item; \
frag->my_list = &((mca_btl_mvapi_module_t*)btl)->send_free_frag; \
}
#define MCA_BTL_IB_FRAG_RETURN(btl, frag) \
{ \
OMPI_FREE_LIST_RETURN(frag->my_list, \
(ompi_free_list_item_t*)(frag)); \
}
#define MCA_BTL_IB_FRAG_PROGRESS(frag) \
do { \
switch(frag->desc.sr_desc.opcode) { \
case VAPI_SEND: \
if(OMPI_SUCCESS != mca_btl_mvapi_endpoint_send(frag->endpoint, frag)) { \
BTL_ERROR(("error in posting pending send\n")); \
} \
break; \
case VAPI_RDMA_WRITE: \
if(OMPI_SUCCESS != mca_btl_mvapi_put((mca_btl_base_module_t*) mvapi_btl, \
frag->endpoint, \
(mca_btl_base_descriptor_t*) frag)) { \
BTL_ERROR(("error in posting pending rdma write\n")); \
} \
break; \
case VAPI_RDMA_READ: \
if(OMPI_SUCCESS != mca_btl_mvapi_get((mca_btl_base_module_t *) mvapi_btl, \
frag->endpoint, \
(mca_btl_base_descriptor_t*) frag)) { \
BTL_ERROR(("error in posting pending rdma read\n")); \
} \
break; \
default: \
BTL_ERROR(("error in posting pending operation, invalide opcode %d\n", frag->desc.sr_desc.opcode)); \
break; \
} \
} while (0)
struct mca_btl_mvapi_module_t;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,192 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "opal/class/opal_hash_table.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "btl_mvapi.h"
#include "btl_mvapi_proc.h"
static void mca_btl_mvapi_proc_construct(mca_btl_mvapi_proc_t* proc);
static void mca_btl_mvapi_proc_destruct(mca_btl_mvapi_proc_t* proc);

/* mvapi proc objects are opal_list_items so the component can keep them
 * on its global ib_procs list. */
OBJ_CLASS_INSTANCE(mca_btl_mvapi_proc_t,
                   opal_list_item_t, mca_btl_mvapi_proc_construct,
                   mca_btl_mvapi_proc_destruct);
/*
 * Initialize an mvapi proc instance and register it on the component's
 * global list of procs.
 *
 * Fix: pointer members are now initialized with NULL rather than the
 * integer literal 0 (idiomatic and clearer; see also proc_destruct).
 */
void mca_btl_mvapi_proc_construct(mca_btl_mvapi_proc_t* proc)
{
    proc->proc_ompi = NULL;
    proc->proc_port_count = 0;
    proc->proc_endpoints = NULL;
    proc->proc_endpoint_count = 0;
    OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);
    /* add to list of all proc instance */
    OPAL_THREAD_LOCK(&mca_btl_mvapi_component.ib_lock);
    opal_list_append(&mca_btl_mvapi_component.ib_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_mvapi_component.ib_lock);
}
/*
 * Cleanup ib proc instance: remove it from the component's global list
 * and release its endpoint array.
 *
 * Fixes: free(NULL) is a no-op, so the redundant NULL guard is gone;
 * the proc_lock that proc_construct OBJ_CONSTRUCTs is now symmetrically
 * OBJ_DESTRUCTed here (it was previously leaked).
 */
void mca_btl_mvapi_proc_destruct(mca_btl_mvapi_proc_t* proc)
{
    /* remove from list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_mvapi_component.ib_lock);
    opal_list_remove_item(&mca_btl_mvapi_component.ib_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_mvapi_component.ib_lock);

    /* release resources */
    free(proc->proc_endpoints);
    OBJ_DESTRUCT(&proc->proc_lock);
}
/*
 * Find the mvapi proc that wraps the given ompi_proc_t, or NULL if none
 * has been created yet.  The component's proc list is walked under the
 * global ib_lock.
 */
static mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_lookup_ompi(ompi_proc_t* ompi_proc)
{
    mca_btl_mvapi_proc_t* match = NULL;
    opal_list_item_t* item;

    OPAL_THREAD_LOCK(&mca_btl_mvapi_component.ib_lock);
    for (item = opal_list_get_first(&mca_btl_mvapi_component.ib_procs);
         item != opal_list_get_end(&mca_btl_mvapi_component.ib_procs);
         item = opal_list_get_next(item)) {
        mca_btl_mvapi_proc_t* ib_proc = (mca_btl_mvapi_proc_t*) item;
        if (ib_proc->proc_ompi == ompi_proc) {
            match = ib_proc;
            break;
        }
    }
    OPAL_THREAD_UNLOCK(&mca_btl_mvapi_component.ib_lock);

    return match;
}
/*
 * Create a IB process structure. There is a one-to-one correspondence
 * between a ompi_proc_t and a mca_btl_mvapi_proc_t instance. We cache
 * additional data (specifically the list of mca_btl_mvapi_endpoint_t
 * instances, and published addresses) associated w/ a given destination
 * on this datastructure.
 *
 * Fixes: (1) a peer that published zero ports was deliberately given a
 * NULL endpoint array, but the subsequent unconditional NULL check then
 * released the proc and returned NULL anyway — the allocation-failure
 * check now applies only when port_count > 0; (2) the OBJ_NEW result is
 * checked before use.
 *
 * @param ompi_proc (IN) generic proc to wrap
 * @return new or cached mvapi proc, or NULL on error
 */
mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc)
{
    mca_btl_mvapi_proc_t* mvapi_proc = NULL;
    size_t size;
    int rc;

    /* Check if we have already created a IB proc
     * structure for this ompi process */
    mvapi_proc = mca_btl_mvapi_proc_lookup_ompi(ompi_proc);
    if(mvapi_proc != NULL) {
        /* Gotcha! */
        return mvapi_proc;
    }

    /* Oops! First time, gotta create a new IB proc
     * out of the ompi_proc ... */
    mvapi_proc = OBJ_NEW(mca_btl_mvapi_proc_t);
    if(NULL == mvapi_proc) {
        return NULL;
    }

    /* Initialize number of peer */
    mvapi_proc->proc_endpoint_count = 0;
    mvapi_proc->proc_ompi = ompi_proc;

    /* build a unique identifier (of arbitrary
     * size) to represent the proc */
    mvapi_proc->proc_guid = ompi_proc->proc_name;

    /* query for the peer address info published via the modex */
    rc = ompi_modex_recv(
        &mca_btl_mvapi_component.super.btl_version,
        ompi_proc,
        (void*)&mvapi_proc->proc_ports,
        &size
    );
    if(OMPI_SUCCESS != rc) {
        opal_output(0, "[%s:%d] ompi_modex_recv failed for peer %s",
                    __FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
        OBJ_RELEASE(mvapi_proc);
        return NULL;
    }

    if((size % sizeof(mca_btl_mvapi_port_info_t)) != 0) {
        opal_output(0, "[%s:%d] invalid mvapi address for peer %s",
                    __FILE__,__LINE__,ORTE_NAME_PRINT(&ompi_proc->proc_name));
        OBJ_RELEASE(mvapi_proc);
        return NULL;
    }

    mvapi_proc->proc_port_count = size/sizeof(mca_btl_mvapi_port_info_t);

    if (0 == mvapi_proc->proc_port_count) {
        /* a peer with no usable ports is valid — just no endpoints */
        mvapi_proc->proc_endpoints = NULL;
    } else {
        mvapi_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
            malloc(mvapi_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
        if(NULL == mvapi_proc->proc_endpoints) {
            OBJ_RELEASE(mvapi_proc);
            return NULL;
        }
    }

    return mvapi_proc;
}
/*
 * Note that this routine must be called with the lock on the process
 * already held.  Insert a btl endpoint into the proc's endpoint array;
 * fails with OMPI_ERR_OUT_OF_RESOURCE once every published port already
 * has an endpoint.
 */
int mca_btl_mvapi_proc_insert(mca_btl_mvapi_proc_t* mvapi_proc,
                              mca_btl_base_endpoint_t* mvapi_endpoint)
{
    size_t idx = mvapi_proc->proc_endpoint_count;

    /* no slot left in the endpoint array */
    if (idx >= mvapi_proc->proc_port_count) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mvapi_endpoint->endpoint_proc = mvapi_proc;
    mvapi_proc->proc_endpoints[idx] = mvapi_endpoint;
    mvapi_proc->proc_endpoint_count = idx + 1;

    return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,72 +0,0 @@
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 * University Research and Technology
 * Corporation. All rights reserved.
 * Copyright (c) 2004-2006 The University of Tennessee and The University
 * of Tennessee Research Foundation. All rights
 * reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#ifndef MCA_BTL_IB_PROC_H
#define MCA_BTL_IB_PROC_H
#include "orte/mca/ns/ns.h"
#include "opal/class/opal_object.h"
#include "ompi/proc/proc.h"
#include "btl_mvapi.h"
#include "btl_mvapi_endpoint.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
 * Represents the state of a remote process and the set of addresses
 * that it exports. Also cache an instance of mca_btl_base_endpoint_t for
 * each
 * BTL instance that attempts to open a connection to the process.
 */
struct mca_btl_mvapi_proc_t {
opal_list_item_t super;
/**< allow proc to be placed on a list */
ompi_proc_t *proc_ompi;
/**< pointer to corresponding ompi_proc_t */
orte_process_name_t proc_guid;
/**< globally unique identifier for the process */
struct mca_btl_mvapi_port_info_t* proc_ports;
/**< array of port info published by the peer via the modex */
size_t proc_port_count;
/**< number of ports published by endpoint */
struct mca_btl_base_endpoint_t **proc_endpoints;
/**< array of endpoints that have been created to access this proc */
size_t proc_endpoint_count;
/**< number of endpoints */
opal_mutex_t proc_lock;
/**< lock to protect against concurrent access to proc state */
};
typedef struct mca_btl_mvapi_proc_t mca_btl_mvapi_proc_t;
OBJ_CLASS_DECLARATION(mca_btl_mvapi_proc_t);
/** Look up or create the proc instance for the given ompi_proc_t;
 *  returns NULL on modex or allocation failure. */
mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc);
/** Insert an endpoint into the proc's endpoint array; caller must hold
 *  the proc lock.  Returns OMPI_SUCCESS or OMPI_ERR_OUT_OF_RESOURCE. */
int mca_btl_mvapi_proc_insert(mca_btl_mvapi_proc_t*, mca_btl_base_endpoint_t*);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,50 +0,0 @@
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 * University Research and Technology
 * Corporation. All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 * of Tennessee Research Foundation. All rights
 * reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 * University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 * All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
#ifndef MCA_BTL_MVAPI_RDMA_BUF_H
#define MCA_BTL_MVAPI_RDMA_BUF_H
#include "ompi_config.h"
#include "btl_mvapi.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* NOTE(review): this entire struct is compiled out via #if 0 and is
 * referenced nowhere in the visible code -- dead scaffolding for a
 * never-finished RDMA buffer abstraction. */
#if 0
struct mca_btl_mvapi_rdma_buf_t {
void* base;
size_t entry_size;
uint32_t entry_cnt;
void* current;
opal_mutex_t lock;
mca_mpool_base_registration_t* reg;
uint32_t tokens;
void* rem_addr;
size_t rem_size;
uint32_t rem_cnt;
void* rem_current;
VAPI_rkey_t r_key;
};
typedef struct mca_btl_mvapi_rdma_buf_t mca_btl_mvapi_rdma_buf_t;
#endif
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Просмотреть файл

@ -1,51 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# MCA_btl_mvapi_CONFIG([action-if-can-compile],
# [action-if-cant-compile])
# ------------------------------------------------
AC_DEFUN([MCA_btl_mvapi_CONFIG],[
# Probe for a usable MVAPI installation; sets btl_mvapi_{CPPFLAGS,
# LDFLAGS,LIBS} on success.
OMPI_CHECK_MVAPI([btl_mvapi],
[btl_mvapi_happy="yes"],
[btl_mvapi_happy="no"])
# On success, propagate the linker flags to the wrapper compilers and
# run the caller's action-if-can-compile; otherwise run the failure
# action.
AS_IF([test "$btl_mvapi_happy" = "yes"],
[btl_mvapi_WRAPPER_EXTRA_LDFLAGS="$btl_mvapi_LDFLAGS"
btl_mvapi_WRAPPER_EXTRA_LIBS="$btl_mvapi_LIBS"
$1],
[$2])
# Many of the vapi.h files floating around don't obey ISO99 C
# standard, so cause oodles of warnings with -pedantic and
# -Wundef. Remove them from CFLAGS, which is then used to
# forcefully override CFLAGS in the makefile for MVAPI
# components
btl_mvapi_CFLAGS="`echo $CFLAGS | sed 's/-pedantic//g'`"
btl_mvapi_CFLAGS="`echo $btl_mvapi_CFLAGS | sed 's/-Wundef//g'`"
AS_IF([test "$btl_mvapi_CFLAGS" != "$CFLAGS" -a "$btl_mvapi_happy" = "yes"],
[AC_MSG_WARN([Removed -pedantic and -Wundef from CFLAGS for
mvapi component because some vapi.h files are not really ANSI C])])
# substitute in the things needed to build mvapi
AC_SUBST([btl_mvapi_CFLAGS])
AC_SUBST([btl_mvapi_CPPFLAGS])
AC_SUBST([btl_mvapi_LDFLAGS])
AC_SUBST([btl_mvapi_LIBS])
])dnl

Просмотреть файл

@ -1,24 +0,0 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007 Los Alamos National Security, LLC. All rights
# reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_CONFIG_FILES="Makefile"

Просмотреть файл

@ -1,41 +0,0 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
#
[btl_mvapi:retry-exceeded]
The retry count is a down counter initialized on creation of the QP. Retry
count is defined in the InfiniBand Spec 1.2 (12.7.38):
The total number of times that the sender wishes the receiver to retry
timeout, packet sequence, etc. errors before posting a completion error.
Note that two mca parameters are involved here:
btl_mvapi_ib_retry_count - The number of times the sender will attempt to
retry (defaulted to 7, the maximum value).
btl_mvapi_ib_timeout - The local ack timeout parameter (defaulted to 10). The
actual timeout value used is calculated as:
(4.096 micro-seconds * 2^btl_mvapi_ib_timeout).
See InfiniBand Spec 1.2 (12.7.34) for more details.
What to do next:
One item to note is the set of hosts on which this error occurred; it has
been observed that rebooting or removing a particular host from the job can
resolve this issue. Should you be able to identify a specific cause or
additional troubleshooting information, please report it to devel@open-mpi.org.