2007-07-06 03:42:54 +04:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
|
|
|
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
|
|
|
* reserved.
|
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "ompi_config.h"
|
|
|
|
#include "opal/prefetch.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
#include "ompi/datatype/convertor.h"
|
|
|
|
#include "ompi/datatype/datatype.h"
|
|
|
|
#include "ompi/mca/btl/btl.h"
|
|
|
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
|
|
|
#include "ompi/mca/mpool/base/base.h"
|
|
|
|
#include "ompi/mca/mpool/mpool.h"
|
|
|
|
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
|
|
|
|
|
|
|
#include "btl_ofud.h"
|
|
|
|
#include "btl_ofud_frag.h"
|
|
|
|
#include "btl_ofud_proc.h"
|
|
|
|
#include "btl_ofud_endpoint.h"
|
|
|
|
|
|
|
|
|
|
|
|
mca_btl_ud_module_t mca_btl_ofud_module = {
|
|
|
|
{
|
|
|
|
&mca_btl_ofud_component.super,
|
|
|
|
0, /* eager_limit */
|
|
|
|
0, /* min_send_size */
|
|
|
|
0, /* max_send_size */
|
|
|
|
0, /* rdma_pipeline_send_length */
|
|
|
|
0, /* rdma_pipeline_frag_size */
|
|
|
|
0, /* min_rdma_pipeline_size */
|
|
|
|
0, /* exclusivity */
|
|
|
|
0, /* latency */
|
|
|
|
0, /* bandwidth */
|
|
|
|
MCA_BTL_FLAGS_SEND,
|
|
|
|
mca_btl_ud_add_procs,
|
|
|
|
mca_btl_ud_del_procs,
|
|
|
|
mca_btl_ud_register,
|
|
|
|
mca_btl_ud_finalize,
|
|
|
|
mca_btl_ud_alloc,
|
|
|
|
mca_btl_ud_free,
|
|
|
|
mca_btl_ud_prepare_src,
|
|
|
|
NULL, /*mca_btl_ud_prepare_dst */
|
|
|
|
mca_btl_ud_send,
|
|
|
|
NULL, /*mca_btl_ud_put */
|
|
|
|
NULL, /*mca_btl_ud_get */
|
|
|
|
mca_btl_base_dump,
|
|
|
|
NULL, /* mpool */
|
|
|
|
NULL, /* register error */
|
2007-07-11 00:37:42 +04:00
|
|
|
mca_btl_ud_ft_event
|
2007-07-06 03:42:54 +04:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add procs to this BTL module, receiving endpoint information from the modex.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_ud_add_procs(struct mca_btl_base_module_t* btl,
|
|
|
|
size_t nprocs,
|
|
|
|
struct ompi_proc_t **ompi_procs,
|
|
|
|
struct mca_btl_base_endpoint_t** peers,
|
|
|
|
ompi_bitmap_t* reachable)
|
|
|
|
{
|
|
|
|
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
|
|
|
struct ibv_ah_attr ah_attr;
|
|
|
|
int i, rc;
|
|
|
|
|
|
|
|
/* Set up the endpoint lookup table if it hasn't been already */
|
|
|
|
/* We do this here so we can initialize the table to a reasonable size
|
|
|
|
based on nprocs */
|
|
|
|
#if 0
|
|
|
|
if(NULL == ud_btl->ep_lookup) {
|
|
|
|
ud_btl->ep_lookup = malloc(sizeof(opal_hash_table_t));
|
|
|
|
OBJ_CONSTRUCT(ud_btl->ep_lookup, opal_hash_table_t);
|
|
|
|
opal_hash_table_init(ud_btl->ep_lookup, nprocs);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for(i = 0; i < (int)nprocs; i++) {
|
|
|
|
struct ompi_proc_t* ompi_proc = ompi_procs[i];
|
|
|
|
mca_btl_ud_proc_t* ib_proc;
|
|
|
|
mca_btl_base_endpoint_t* ib_peer;
|
|
|
|
|
|
|
|
if(NULL == (ib_proc = mca_btl_ud_proc_create(ompi_proc))) {
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* The btl_proc datastructure is shared by all IB BTL instances that are
|
|
|
|
* trying to reach this destination. Cache the peer instance on the
|
|
|
|
* btl_proc.
|
|
|
|
*/
|
|
|
|
ib_peer = OBJ_NEW(mca_btl_ud_endpoint_t);
|
|
|
|
if(NULL == ib_peer) {
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
OPAL_THREAD_LOCK(&ib_proc->proc_lock);
|
|
|
|
rc = mca_btl_ud_proc_insert(ib_proc, ib_peer);
|
|
|
|
if(rc != OMPI_SUCCESS) {
|
|
|
|
OBJ_RELEASE(ib_peer);
|
|
|
|
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
BTL_VERBOSE(("modex_recv QP num %d, LID = %d",
|
|
|
|
ib_peer->rem_addr.qp_num, ib_peer->rem_addr.lid));
|
|
|
|
|
|
|
|
/* Set up IB address handles for the endpoint */
|
|
|
|
ah_attr.is_global = 0;
|
|
|
|
ah_attr.dlid = ib_peer->rem_addr.lid;
|
|
|
|
ah_attr.sl = mca_btl_ofud_component.ib_service_level;
|
|
|
|
ah_attr.src_path_bits = mca_btl_ofud_component.ib_src_path_bits;
|
|
|
|
ah_attr.port_num = ud_btl->ib_port_num;
|
|
|
|
|
|
|
|
ib_peer->rmt_ah = ibv_create_ah(ud_btl->ib_pd, &ah_attr);
|
|
|
|
if(NULL == ib_peer->rmt_ah) {
|
|
|
|
BTL_ERROR(("error creating address handle: %s\n", strerror(errno)));
|
|
|
|
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Insert a pointer to the endpoint in the BTL lookup table */
|
|
|
|
#if 0
|
|
|
|
opal_hash_table_set_value_uint64(ud_btl->ep_lookup,
|
|
|
|
((uint64_t)ib_peer->rem_addr.lid << 32) |
|
|
|
|
ib_peer->rem_addr.qp_num,
|
|
|
|
ib_peer);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
ompi_bitmap_set_bit(reachable, i);
|
|
|
|
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
|
|
|
peers[i] = ib_peer;
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Delete the proc as reachable from this btl module
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_ud_del_procs(struct mca_btl_base_module_t* btl,
|
|
|
|
size_t nprocs,
|
|
|
|
struct ompi_proc_t** procs,
|
|
|
|
struct mca_btl_base_endpoint_t** peers)
|
|
|
|
{
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
for(i = 0; i < nprocs; i++) {
|
|
|
|
mca_btl_ud_endpoint_t* endpoint = (mca_btl_ud_endpoint_t*)peers[i];
|
|
|
|
mca_btl_ud_proc_t* proc = mca_btl_ud_proc_lookup_ompi(procs[i]);
|
|
|
|
#if 0
|
|
|
|
opal_hash_table_remove_value_uint64(ud_btl->ep_lookup,
|
|
|
|
((uint64_t)endpoint->rem_addr.lid << 32) |
|
|
|
|
endpoint->rem_addr.qp_num);
|
|
|
|
#endif
|
|
|
|
if(NULL != proc) {
|
|
|
|
mca_btl_ud_proc_remove(proc, endpoint);
|
|
|
|
}
|
|
|
|
|
|
|
|
OBJ_RELEASE(endpoint);
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register callback function to support send/recv semantics
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_ud_register(struct mca_btl_base_module_t* btl,
|
|
|
|
mca_btl_base_tag_t tag,
|
|
|
|
mca_btl_base_module_recv_cb_fn_t cbfunc,
|
|
|
|
void* cbdata)
|
|
|
|
{
|
|
|
|
|
|
|
|
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
|
|
|
|
|
|
|
OPAL_THREAD_LOCK(&ud_btl->ud_lock);
|
|
|
|
ud_btl->ib_reg[tag].cbfunc = cbfunc;
|
|
|
|
ud_btl->ib_reg[tag].cbdata = cbdata;
|
|
|
|
OPAL_THREAD_UNLOCK(&ud_btl->ud_lock);
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Allocate a segment.
|
|
|
|
*
|
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param size (IN) Request segment size.
|
|
|
|
*
|
|
|
|
* When allocating a segment we pull a pre-alllocated segment
|
|
|
|
* from one of two free lists, an eager list and a max list
|
|
|
|
*/
|
|
|
|
|
|
|
|
mca_btl_base_descriptor_t* mca_btl_ud_alloc(struct mca_btl_base_module_t* btl,
|
|
|
|
uint8_t order,
|
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
mca_btl_ud_frag_t* frag = NULL;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
if(OPAL_LIKELY(size <= mca_btl_ofud_module.super.btl_eager_limit)) {
|
|
|
|
MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc);
|
|
|
|
}
|
|
|
|
|
|
|
|
if(NULL == frag) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
frag->base.order = MCA_BTL_NO_ORDER;
|
|
|
|
frag->segment.seg_len = size;
|
|
|
|
return (mca_btl_base_descriptor_t*)frag;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return a segment
|
|
|
|
*
|
|
|
|
* Return the segment to the appropriate
|
|
|
|
* preallocated segment list
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_ud_free(struct mca_btl_base_module_t* btl,
|
|
|
|
mca_btl_base_descriptor_t* des)
|
|
|
|
{
|
|
|
|
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)des;
|
|
|
|
|
|
|
|
if(OPAL_LIKELY(frag->type == MCA_BTL_UD_FRAG_SEND)) {
|
|
|
|
MCA_BTL_UD_RETURN_FRAG(btl, frag);
|
|
|
|
} else if(frag->type == MCA_BTL_UD_FRAG_USER && frag->ud_reg != NULL) {
|
|
|
|
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
|
|
|
|
(mca_mpool_base_registration_t*)frag->ud_reg);
|
|
|
|
MCA_BTL_UD_RETURN_USER_FRAG(btl, frag);
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* register user buffer or pack
|
|
|
|
* data into pre-registered buffer and return a
|
|
|
|
* descriptor that can be
|
|
|
|
* used for send/put.
|
|
|
|
*
|
|
|
|
* @param btl (IN) BTL module
|
|
|
|
* @param peer (IN) BTL peer addressing
|
|
|
|
*
|
|
|
|
* prepare source's behavior depends on the following:
|
|
|
|
* Has a valid memory registration been passed to prepare_src?
|
|
|
|
* if so we attempt to use the pre-registred user-buffer, if the memory
|
|
|
|
* registration is to small (only a portion of the user buffer) then we must
|
|
|
|
* reregister the user buffer
|
|
|
|
* Has the user requested the memory to be left pinned?
|
|
|
|
* if so we insert the memory registration into a memory tree for later
|
|
|
|
* lookup, we may also remove a previous registration if a MRU (most recently
|
|
|
|
* used) list of registions is full, this prevents resources from being
|
|
|
|
* exhausted.
|
|
|
|
* Is the requested size larger than the btl's max send size?
|
|
|
|
* if so and we aren't asked to leave the registration pinned then we
|
|
|
|
* register the memory if the user's buffer is contiguous.
|
|
|
|
* Otherwise we choose from two free lists of pre-registered memory in which
|
|
|
|
* to pack the data into.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
mca_btl_base_descriptor_t* mca_btl_ud_prepare_src(
|
|
|
|
struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
|
|
mca_mpool_base_registration_t* registration,
|
|
|
|
struct ompi_convertor_t* convertor,
|
|
|
|
uint8_t order,
|
|
|
|
size_t reserve,
|
|
|
|
size_t* size)
|
|
|
|
{
|
|
|
|
mca_btl_ud_frag_t* frag = NULL;
|
|
|
|
struct iovec iov;
|
|
|
|
uint32_t iov_count = 1;
|
|
|
|
size_t max_data = *size;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
if(ompi_convertor_need_buffers(convertor) == 0 && reserve == 0 &&
|
|
|
|
(registration != NULL || max_data > btl->btl_max_send_size)) {
|
|
|
|
/* The user buffer is contigous and we are asked to send more than
|
|
|
|
the max send size. */
|
|
|
|
|
|
|
|
MCA_BTL_UD_ALLOC_USER_FRAG(btl, frag, rc);
|
|
|
|
if(NULL == frag) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
iov.iov_len = max_data;
|
|
|
|
iov.iov_base = NULL;
|
|
|
|
|
|
|
|
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
|
|
|
|
|
|
|
frag->segment.seg_len = max_data;
|
|
|
|
frag->segment.seg_addr.pval = iov.iov_base;
|
|
|
|
frag->base.des_flags = 0;
|
|
|
|
frag->base.order = MCA_BTL_NO_ORDER;
|
|
|
|
|
|
|
|
if(NULL == registration) {
|
|
|
|
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, iov.iov_base,
|
|
|
|
max_data, 0, ®istration);
|
|
|
|
if(OMPI_SUCCESS != rc || NULL == registration) {
|
|
|
|
MCA_BTL_UD_RETURN_USER_FRAG(btl, frag);
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
frag->ud_reg = (mca_btl_ud_reg_t*)registration;
|
|
|
|
|
|
|
|
frag->sg_entry.lkey = frag->ud_reg->mr->lkey;
|
|
|
|
frag->sg_entry.addr = (unsigned long)iov.iov_base;
|
|
|
|
|
|
|
|
frag->base.des_src = &frag->segment;
|
|
|
|
frag->base.des_src_cnt = 1;
|
|
|
|
frag->base.des_dst = NULL;
|
|
|
|
frag->base.des_dst_cnt = 0;
|
|
|
|
*size = max_data;
|
|
|
|
return &frag->base;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(max_data + reserve > btl->btl_eager_limit) {
|
|
|
|
max_data = btl->btl_eager_limit - reserve;
|
|
|
|
}
|
|
|
|
|
|
|
|
MCA_BTL_UD_ALLOC_FRAG(btl, frag, rc);
|
|
|
|
if(OPAL_UNLIKELY(NULL == frag)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
iov.iov_len = max_data;
|
|
|
|
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
|
|
|
|
|
|
|
|
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
|
|
|
if(OPAL_UNLIKELY(rc < 0)) {
|
|
|
|
MCA_BTL_UD_RETURN_FRAG(btl, frag);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
frag->segment.seg_len = max_data + reserve;
|
|
|
|
frag->sg_entry.length =
|
|
|
|
max_data + reserve + sizeof(mca_btl_ud_header_t);
|
|
|
|
|
|
|
|
frag->base.des_src = &frag->segment;
|
|
|
|
frag->base.des_src_cnt = 1;
|
|
|
|
frag->base.des_dst = NULL;
|
|
|
|
frag->base.des_dst_cnt = 0;
|
|
|
|
frag->base.des_flags = 0;
|
|
|
|
frag->base.order = MCA_BTL_NO_ORDER;
|
|
|
|
*size = max_data;
|
|
|
|
|
|
|
|
return &frag->base;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int mca_btl_ud_finalize(struct mca_btl_base_module_t* btl)
|
|
|
|
{
|
|
|
|
mca_btl_ud_module_t* ud_btl = (mca_btl_ud_module_t*)btl;
|
|
|
|
int32_t i;
|
|
|
|
|
|
|
|
for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) {
|
|
|
|
ibv_destroy_qp(ud_btl->ib_qp[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
ibv_dealloc_pd(ud_btl->ib_pd);
|
|
|
|
|
|
|
|
OBJ_DESTRUCT(&ud_btl->ud_lock);
|
|
|
|
OBJ_DESTRUCT(&ud_btl->pending_frags);
|
|
|
|
OBJ_DESTRUCT(&ud_btl->send_frags);
|
|
|
|
OBJ_DESTRUCT(&ud_btl->user_frags);
|
|
|
|
OBJ_DESTRUCT(&ud_btl->recv_frags);
|
|
|
|
mca_mpool_base_module_destroy(ud_btl->super.btl_mpool);
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initiate a send.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_ud_send(struct mca_btl_base_module_t* btl,
|
|
|
|
struct mca_btl_base_endpoint_t* endpoint,
|
|
|
|
struct mca_btl_base_descriptor_t* descriptor,
|
|
|
|
mca_btl_base_tag_t tag)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
mca_btl_ud_frag_t* frag = (mca_btl_ud_frag_t*)descriptor;
|
|
|
|
MCA_BTL_UD_START_TIME(post_send);
|
|
|
|
frag->endpoint = endpoint;
|
|
|
|
frag->hdr->tag = tag;
|
|
|
|
|
|
|
|
rc = mca_btl_ud_endpoint_post_send((mca_btl_ud_module_t*)btl, frag);
|
|
|
|
|
|
|
|
MCA_BTL_UD_END_TIME(post_send);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RDMA Memory Pool (de)register callbacks
|
|
|
|
*/
|
|
|
|
|
|
|
|
static int mca_btl_ud_reg_mr(void* reg_data, void* base, size_t size,
|
|
|
|
mca_mpool_base_registration_t* reg)
|
|
|
|
{
|
|
|
|
mca_btl_ud_module_t* mod = (mca_btl_ud_module_t*)reg_data;
|
|
|
|
mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg;
|
|
|
|
|
|
|
|
ud_reg->mr = ibv_reg_mr(mod->ib_pd, base, size, IBV_ACCESS_LOCAL_WRITE |
|
|
|
|
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
|
|
|
|
|
|
|
if(NULL == ud_reg->mr)
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int mca_btl_ud_dereg_mr(void* reg_data,
|
|
|
|
mca_mpool_base_registration_t* reg)
|
|
|
|
{
|
|
|
|
mca_btl_ud_reg_t* ud_reg = (mca_btl_ud_reg_t*)reg;
|
|
|
|
|
|
|
|
if(ud_reg->mr != NULL) {
|
|
|
|
if(ibv_dereg_mr(ud_reg->mr)) {
|
|
|
|
opal_output(0, "%s: error unpinning UD memory: %s\n",
|
|
|
|
__func__, strerror(errno));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ud_reg->mr = NULL;
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a single UD queue pair. Since UD is connectionless, the QP is
|
|
|
|
* useable immediately.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* TODO - can remove cq/psn args now with only one type of frag */
|
|
|
|
static int mca_btl_ud_init_qp(mca_btl_ud_module_t* ud_btl,
|
|
|
|
struct ibv_cq* cq,
|
|
|
|
struct ibv_qp** qp,
|
|
|
|
uint32_t lcl_psn)
|
|
|
|
{
|
|
|
|
struct ibv_qp_attr qp_attr;
|
|
|
|
struct ibv_qp_init_attr qp_init_attr;
|
|
|
|
|
|
|
|
memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
|
|
|
|
|
|
|
|
qp_init_attr.send_cq = cq;
|
|
|
|
qp_init_attr.recv_cq = cq;
|
|
|
|
qp_init_attr.cap.max_send_wr = mca_btl_ofud_component.sd_num;
|
|
|
|
qp_init_attr.cap.max_recv_wr = mca_btl_ofud_component.rd_num;
|
|
|
|
qp_init_attr.cap.max_send_sge = 1;
|
|
|
|
qp_init_attr.cap.max_recv_sge = 1;
|
|
|
|
/* TODO - find the best value for max_inline_data */
|
|
|
|
qp_init_attr.cap.max_inline_data = 200;
|
|
|
|
qp_init_attr.qp_type = IBV_QPT_UD;
|
|
|
|
|
|
|
|
*qp = ibv_create_qp(ud_btl->ib_pd, &qp_init_attr);
|
|
|
|
if(NULL == *qp) {
|
|
|
|
BTL_ERROR(("error creating QP: %s\n", strerror(errno)));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(0 == (ud_btl->ib_inline_max = qp_init_attr.cap.max_inline_data)) {
|
|
|
|
BTL_ERROR(("ibv_create_qp: returned 0 byte(s) for max inline data"));
|
|
|
|
}
|
|
|
|
|
|
|
|
BTL_VERBOSE((0, "ib_inline_max %d\n", ud_btl->ib_inline_max));
|
|
|
|
|
|
|
|
qp_attr.qp_state = IBV_QPS_INIT;
|
|
|
|
qp_attr.pkey_index = mca_btl_ofud_component.ib_pkey_ix;
|
|
|
|
qp_attr.qkey = mca_btl_ofud_component.ib_qkey;
|
|
|
|
qp_attr.port_num = ud_btl->ib_port_num;
|
|
|
|
|
|
|
|
if(ibv_modify_qp(*qp, &qp_attr,
|
|
|
|
IBV_QP_STATE | IBV_QP_PKEY_INDEX |
|
|
|
|
IBV_QP_PORT | IBV_QP_QKEY)) {
|
|
|
|
BTL_ERROR(("error modifying QP to INIT: %s", strerror(errno)));
|
|
|
|
goto destroy_qp;
|
|
|
|
}
|
|
|
|
|
|
|
|
qp_attr.qp_state = IBV_QPS_RTR;
|
|
|
|
if(ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE)) {
|
|
|
|
BTL_ERROR(("error modifing QP to RTR: %s", strerror(errno)));
|
|
|
|
goto destroy_qp;
|
|
|
|
}
|
|
|
|
|
|
|
|
qp_attr.qp_state = IBV_QPS_RTS;
|
|
|
|
qp_attr.sq_psn = lcl_psn;
|
|
|
|
if (ibv_modify_qp(*qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) {
|
|
|
|
BTL_ERROR(("error modifying QP to RTS: %s", strerror(errno)));
|
|
|
|
goto destroy_qp;
|
|
|
|
}
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
|
|
|
|
destroy_qp:
|
|
|
|
ibv_destroy_qp(*qp);
|
|
|
|
*qp = NULL;
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the btl module by allocating a protection domain,
|
|
|
|
* memory pool, completion queue, and free lists
|
|
|
|
*/
|
|
|
|
|
|
|
|
int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
|
|
|
|
{
|
|
|
|
struct mca_mpool_base_resources_t mpool_resources;
|
|
|
|
struct ibv_context *ctx = ud_btl->ib_dev_context;
|
|
|
|
struct ibv_recv_wr* bad_wr;
|
|
|
|
mca_btl_ud_frag_t* frag;
|
|
|
|
ompi_free_list_item_t* item;
|
2007-11-02 02:38:50 +03:00
|
|
|
uint32_t length,length_payload;
|
2007-07-06 03:42:54 +04:00
|
|
|
int32_t rc, i;
|
|
|
|
|
|
|
|
ud_btl->sd_wqe = mca_btl_ofud_component.sd_num;
|
|
|
|
|
|
|
|
ud_btl->ib_pd = ibv_alloc_pd(ctx);
|
|
|
|
if(NULL == ud_btl->ib_pd) {
|
|
|
|
BTL_ERROR(("error allocating PD for %s: %s\n",
|
|
|
|
ibv_get_device_name(ud_btl->ib_dev), strerror(errno)));
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
mpool_resources.reg_data = (void*)ud_btl;
|
|
|
|
mpool_resources.sizeof_reg = sizeof(mca_btl_ud_reg_t);
|
|
|
|
mpool_resources.register_mem = mca_btl_ud_reg_mr;
|
|
|
|
mpool_resources.deregister_mem = mca_btl_ud_dereg_mr;
|
|
|
|
ud_btl->super.btl_mpool =
|
|
|
|
mca_mpool_base_module_create(mca_btl_ofud_component.ud_mpool_name,
|
|
|
|
&ud_btl->super, &mpool_resources);
|
|
|
|
|
|
|
|
if(NULL == ud_btl->super.btl_mpool) {
|
|
|
|
BTL_ERROR(("error creating IB mpool for %s: %s\n",
|
|
|
|
ibv_get_device_name(ud_btl->ib_dev), strerror(errno)));
|
|
|
|
goto dealloc_pd;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Create the completion queue */
|
|
|
|
length = mca_btl_ofud_component.rd_num + mca_btl_ofud_component.sd_num;
|
|
|
|
|
|
|
|
ud_btl->ib_cq = ibv_create_cq(ctx, length, NULL, NULL, 0);
|
|
|
|
if(NULL == ud_btl->ib_cq) {
|
|
|
|
BTL_ERROR(("error creating CQ for %s: %s\n",
|
|
|
|
ibv_get_device_name(ud_btl->ib_dev), strerror(errno)));
|
|
|
|
goto mpool_destroy;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Set up our packet sequence numbers */
|
|
|
|
ud_btl->addr.psn = lrand48() & 0xffffff;
|
|
|
|
|
|
|
|
/* Set up the QPs for this BTL */
|
|
|
|
for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) {
|
|
|
|
if(OMPI_SUCCESS != mca_btl_ud_init_qp(ud_btl,
|
|
|
|
ud_btl->ib_cq, &ud_btl->ib_qp[i], ud_btl->addr.psn)) {
|
|
|
|
goto qp_destroy;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Place our QP numbers in our local address information */
|
|
|
|
ud_btl->addr.qp_num = ud_btl->ib_qp[0]->qp_num;
|
|
|
|
ud_btl->ib_qp_next = 0;
|
|
|
|
|
|
|
|
/*ud_btl->rd_posted = mca_btl_ofud_component.rd_num_init;*/
|
|
|
|
|
|
|
|
/* Initialize pool of receive fragments first, since an error may occur */
|
|
|
|
/* TODO - no need for a free list with a static buffer count */
|
|
|
|
OBJ_CONSTRUCT(&ud_btl->recv_frags, ompi_free_list_t);
|
|
|
|
length = sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_header_t) +
|
|
|
|
ud_btl->super.btl_eager_limit + 2 * MCA_BTL_IB_FRAG_ALIGN;
|
|
|
|
|
2007-11-02 02:38:50 +03:00
|
|
|
length_payload=sizeof(mca_btl_ud_frag_t) + sizeof(mca_btl_ud_header_t) +
|
|
|
|
ud_btl->super.btl_eager_limit + 2 * MCA_BTL_IB_FRAG_ALIGN -
|
|
|
|
sizeof(mca_btl_ud_recv_frag_t);
|
|
|
|
|
|
|
|
ompi_free_list_init_new(&ud_btl->recv_frags,
|
2007-07-06 03:42:54 +04:00
|
|
|
length + sizeof(mca_btl_ud_ib_header_t),
|
2007-11-02 02:38:50 +03:00
|
|
|
CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
OBJ_CLASS(mca_btl_ud_recv_frag_t),
|
2007-11-02 02:38:50 +03:00
|
|
|
length_payload,CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
mca_btl_ofud_component.rd_num,
|
|
|
|
mca_btl_ofud_component.rd_num,
|
|
|
|
mca_btl_ofud_component.rd_num,
|
|
|
|
ud_btl->super.btl_mpool);
|
|
|
|
#if 0
|
2007-11-02 02:38:50 +03:00
|
|
|
ompi_free_list_init_new(&ud_btl->recv_frags,
|
2007-07-06 03:42:54 +04:00
|
|
|
length + sizeof(mca_btl_ud_ib_header_t),
|
2007-11-02 02:38:50 +03:00
|
|
|
CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
OBJ_CLASS(mca_btl_ud_recv_frag_t),
|
2007-11-02 02:38:50 +03:00
|
|
|
length_payload,CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
mca_btl_ofud_component.rd_num_init,
|
|
|
|
mca_btl_ofud_component.rd_num_max,
|
|
|
|
mca_btl_ofud_component.rd_num_inc,
|
|
|
|
ud_btl->super.btl_mpool);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Post receive descriptors */
|
|
|
|
for(i = 0; i < mca_btl_ofud_component.rd_num; i++) {
|
|
|
|
OMPI_FREE_LIST_GET(&ud_btl->recv_frags, item, rc);
|
|
|
|
frag = (mca_btl_ud_frag_t*)item;
|
|
|
|
|
|
|
|
if(NULL == frag) {
|
|
|
|
BTL_ERROR(("error getting receive buffer from free list\n"));
|
|
|
|
goto obj_destruct;
|
|
|
|
}
|
|
|
|
|
|
|
|
frag->type = MCA_BTL_UD_FRAG_RECV;
|
|
|
|
frag->sg_entry.length = mca_btl_ofud_module.super.btl_eager_limit +
|
|
|
|
sizeof(mca_btl_ud_header_t) + sizeof(mca_btl_ud_ib_header_t);
|
|
|
|
if(ibv_post_recv(ud_btl->ib_qp[0],
|
|
|
|
&frag->wr_desc.rd_desc, &bad_wr)) {
|
|
|
|
BTL_ERROR(("error posting recv, errno %s\n", strerror(errno)));
|
|
|
|
goto obj_destruct;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* No more errors anticipated - initialize everything else */
|
|
|
|
OBJ_CONSTRUCT(&ud_btl->ud_lock, opal_mutex_t);
|
|
|
|
OBJ_CONSTRUCT(&ud_btl->pending_frags, opal_list_t);
|
|
|
|
OBJ_CONSTRUCT(&ud_btl->send_frags, ompi_free_list_t);
|
|
|
|
OBJ_CONSTRUCT(&ud_btl->user_frags, ompi_free_list_t);
|
|
|
|
|
2007-11-02 02:38:50 +03:00
|
|
|
ompi_free_list_init_new(&ud_btl->send_frags,
|
2007-07-06 03:42:54 +04:00
|
|
|
length,
|
2007-11-02 02:38:50 +03:00
|
|
|
CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
OBJ_CLASS(mca_btl_ud_send_frag_t),
|
2007-11-02 02:38:50 +03:00
|
|
|
length_payload,CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
mca_btl_ofud_component.sd_num >> 1,
|
2007-10-30 00:51:28 +03:00
|
|
|
-1,
|
2007-07-06 03:42:54 +04:00
|
|
|
mca_btl_ofud_component.sd_num << 2,
|
|
|
|
ud_btl->super.btl_mpool);
|
|
|
|
|
|
|
|
/* Initialize pool of user fragments */
|
|
|
|
length = sizeof(mca_btl_ud_frag_t) +
|
|
|
|
sizeof(mca_btl_ud_header_t) + 2 * MCA_BTL_IB_FRAG_ALIGN;
|
|
|
|
|
2007-11-02 12:19:55 +03:00
|
|
|
length_payload = sizeof(mca_btl_ud_frag_t) +
|
2007-11-02 02:38:50 +03:00
|
|
|
sizeof(mca_btl_ud_header_t) + 2 * MCA_BTL_IB_FRAG_ALIGN-
|
|
|
|
sizeof(mca_btl_ud_user_frag_t);
|
|
|
|
|
|
|
|
ompi_free_list_init_new(&ud_btl->user_frags,
|
2007-07-06 03:42:54 +04:00
|
|
|
length,
|
2007-11-02 02:38:50 +03:00
|
|
|
CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
OBJ_CLASS(mca_btl_ud_user_frag_t),
|
2007-11-02 02:38:50 +03:00
|
|
|
length_payload,CACHE_LINE_SIZE,
|
2007-07-06 03:42:54 +04:00
|
|
|
mca_btl_ofud_component.sd_num >> 1,
|
2007-10-30 00:51:28 +03:00
|
|
|
-1,
|
2007-07-06 03:42:54 +04:00
|
|
|
mca_btl_ofud_component.sd_num << 2,
|
|
|
|
ud_btl->super.btl_mpool);
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
|
|
|
|
obj_destruct:
|
|
|
|
OBJ_DESTRUCT(&ud_btl->recv_frags);
|
|
|
|
qp_destroy:
|
|
|
|
for(i = 0; i < MCA_BTL_UD_NUM_QP; i++) {
|
|
|
|
ibv_destroy_qp(ud_btl->ib_qp[i]);
|
|
|
|
}
|
|
|
|
mpool_destroy:
|
|
|
|
mca_mpool_base_module_destroy(ud_btl->super.btl_mpool);
|
|
|
|
dealloc_pd:
|
|
|
|
ibv_dealloc_pd(ud_btl->ib_pd);
|
|
|
|
return OMPI_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-07-11 00:37:42 +04:00
|
|
|
int mca_btl_ud_ft_event(int state) {
|
2007-07-06 03:42:54 +04:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|