1
1
This commit was SVN r5864.
Этот коммит содержится в:
Galen Shipman 2005-05-25 15:27:33 +00:00
родитель 92f34e848c
Коммит ddc19805ab
24 изменённых файлов: 4347 добавлений и 0 удалений

0
src/mca/bmi/ib/.ompi_ignore Обычный файл
Просмотреть файл

2
src/mca/bmi/ib/.ompi_unignore Обычный файл
Просмотреть файл

@@ -0,0 +1,2 @@
twoodall
gshipman

60
src/mca/bmi/ib/Makefile.am Обычный файл
Просмотреть файл

@@ -0,0 +1,60 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Use the top-level Makefile.options
include $(top_ompi_srcdir)/config/Makefile.options
# Sources for the ib BMI convenience library (static build); the DSO
# build reuses this list.
# NOTE(review): bmi_ib_endpoint.h is listed without a matching
# bmi_ib_endpoint.c -- confirm the endpoint implementation lives in one
# of the other listed files (e.g. bmi_ib_peer.c).
libmca_bmi_ib_la_SOURCES = \
bmi_ib.c \
bmi_ib.h \
bmi_ib_addr.h \
bmi_ib_component.c \
bmi_ib_endpoint.h \
bmi_ib_frag.c \
bmi_ib_frag.h \
bmi_ib_memory.c \
bmi_ib_peer.c \
bmi_ib_peer.h \
bmi_ib_priv.c \
bmi_ib_priv.h \
bmi_ib_proc.c \
bmi_ib_proc.h \
bmi_ib_recvfrag.c \
bmi_ib_recvfrag.h \
bmi_ib_sendfrag.c \
bmi_ib_sendfrag.h \
bmi_ib_vapi.h
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
#
# BUG FIX: the conditional previously tested OMPI_BUILD_bmi_sm_DSO
# (copy/paste from the "sm" component); it must test this component's
# own flag, otherwise the ib component is built in the wrong mode.
if OMPI_BUILD_bmi_ib_DSO
component_noinst =
component_install = mca_bmi_ib.la
else
component_noinst = libmca_bmi_ib.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
# BUG FIX: mca_bmi_ib_la_SOURCES was never set, so the DSO build
# produced an empty module; reuse the convenience-library source list.
mca_bmi_ib_la_SOURCES = $(libmca_bmi_ib_la_SOURCES)
mca_bmi_ib_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_bmi_ib_la_LDFLAGS = -module -avoid-version

481
src/mca/bmi/ib/bmi_ib.c Обычный файл
Просмотреть файл

@@ -0,0 +1,481 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <string.h>
#include "util/output.h"
#include "util/if.h"
#include "mca/pml/pml.h"
#include "mca/bmi/bmi.h"
#include "bmi_ib.h"
#include "bmi_ib_frag.h"
/*
 * Template IB BMI module instance.  The attribute fields are zero here
 * and are filled in from MCA parameters in mca_bmi_ib_component_open();
 * the whole struct is memcpy'd into each per-HCA module in
 * mca_bmi_ib_component_init().
 */
mca_bmi_ib_module_t mca_bmi_ib_module = {
{
&mca_bmi_ib_component.super,
0, /* max size of first fragment */
0, /* min fragment size */
0, /* max fragment size */
0, /* exclusivity */
0, /* latency */
0, /* bandwidth */
MCA_PTL_PUT, /* bmi flags */
mca_bmi_ib_add_procs,
mca_bmi_ib_del_procs,
mca_bmi_ib_register,
mca_bmi_ib_finalize,
/* we need alloc free, pack */
mca_bmi_ib_alloc,
mca_bmi_ib_free,
mca_bmi_ib_pack,
mca_bmi_ib_send,
mca_bmi_ib_put,
NULL /* get */
}
};
/*
 * PML->BMI notification of a change in the process list.  For each
 * process: create (or look up) the shared proc structure, allocate an
 * endpoint, insert it on the proc, and record it in the peers /
 * reachable output arrays.
 *
 * @param bmi        (IN)  BMI module instance
 * @param nprocs     (IN)  number of processes
 * @param ompi_procs (IN)  set of processes
 * @param peers      (OUT) per-process endpoint addressing info
 * @param reachable  (OUT) bitmap of processes reachable via this BMI
 * @return OMPI_SUCCESS or error status on failure.
 */
int mca_bmi_ib_add_procs(
    struct mca_bmi_base_module_t* bmi,
    size_t nprocs,
    struct ompi_proc_t **ompi_procs,
    struct mca_bmi_base_endpoint_t** peers,
    ompi_bitmap_t* reachable)
{
    mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi;
    size_t i;  /* size_t: avoids signed/unsigned mismatch with nprocs */
    int rc;

    for(i = 0; i < nprocs; i++) {
        struct ompi_proc_t* ompi_proc = ompi_procs[i];
        mca_bmi_ib_proc_t* ib_proc;
        mca_bmi_base_endpoint_t* ib_peer;

        if(NULL == (ib_proc = mca_bmi_ib_proc_create(ompi_proc))) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /*
         * Check to make sure that the peer has at least as many interface
         * addresses exported as we are trying to use. If not, then
         * don't bind this PTL instance to the proc.
         */
        OMPI_THREAD_LOCK(&ib_proc->proc_lock);

        /* The bmi_proc datastructure is shared by all IB PTL
         * instances that are trying to reach this destination.
         * Cache the peer instance on the bmi_proc.
         */
        ib_peer = OBJ_NEW(mca_bmi_ib_endpoint_t);
        if(NULL == ib_peer) {
            /* BUG FIX: the unlocks below referenced the undeclared
             * identifier "module_proc"; the lock taken above is
             * ib_proc->proc_lock. */
            OMPI_THREAD_UNLOCK(&ib_proc->proc_lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        ib_peer->peer_bmi = ib_bmi;
        rc = mca_bmi_ib_proc_insert(ib_proc, ib_peer);
        if(rc != OMPI_SUCCESS) {
            /* insertion failed: drop the endpoint and skip this proc */
            OBJ_RELEASE(ib_peer);
            OMPI_THREAD_UNLOCK(&ib_proc->proc_lock);
            continue;
        }
        ompi_bitmap_set_bit(reachable, i);
        OMPI_THREAD_UNLOCK(&ib_proc->proc_lock);
        peers[i] = ib_peer;
    }
    return OMPI_SUCCESS;
}
/*
 * PML->BMI notification that processes are being removed.
 * Stub: performs no cleanup yet and always reports success.
 */
int mca_bmi_ib_del_procs(struct mca_bmi_base_module_t* bmi,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_bmi_base_endpoint_t ** peers)
{
/* Stub */
D_PRINT("Stub\n");
return OMPI_SUCCESS;
}
/*
 * Record a receive callback for the given tag in this module's per-tag
 * table (ib_reg).
 *
 * @param bmi    (IN) BMI module instance
 * @param tag    (IN) tag to register for (index into ib_reg)
 * @param cbfunc (IN) callback function
 * @param cbdata (IN) opaque user data stored alongside the callback
 * @return OMPI_SUCCESS always.
 */
int mca_bmi_ib_register(
    struct mca_bmi_base_module_t* bmi,
    mca_bmi_base_tag_t tag,
    mca_bmi_base_module_recv_cb_fn_t cbfunc,
    void* cbdata)
{
    mca_bmi_ib_module_t* ib_module = (mca_bmi_ib_module_t*) bmi;
    ib_module->ib_reg[tag].cbdata = cbdata;
    ib_module->ib_reg[tag].cbfunc = cbfunc;
    return OMPI_SUCCESS;
}
/**
 * Allocate a descriptor (fragment).
 *
 * @param bmi  (IN) BMI module
 * @param size (IN) Requested segment size.
 * @return fragment descriptor, or NULL if the request cannot be
 *         satisfied (free-list exhausted, or size exceeds the eager
 *         fragment size -- large allocations not implemented yet).
 */
extern mca_bmi_base_descriptor_t* mca_bmi_ib_alloc(
    struct mca_bmi_base_module_t* bmi,
    size_t size)
{
    mca_bmi_ib_frag_t* frag = NULL;
    int rc;

    if(size <= mca_bmi_ib_component.first_fragment_size) {
        MCA_BMI_IB_FRAG_ALLOC1(frag,rc);
        /* BUG FIX: the allocation result was never checked.  Assumes the
         * ALLOC macro leaves frag NULL on failure -- TODO confirm. */
        if(NULL == frag) {
            return NULL;
        }
    } else {
        /* BUG FIX: this branch was empty, so the function returned an
         * uninitialized pointer for requests larger than the eager
         * size.  Large fragments are not implemented; return NULL so
         * the caller sees a detectable failure. */
        return NULL;
    }
    return (mca_bmi_base_descriptor_t*)frag;
}
/**
 * Return a descriptor previously obtained from mca_bmi_ib_alloc()
 * to its free list.
 *
 * @param bmi (IN) BMI module
 * @param des (IN) descriptor to release
 * @return OMPI_SUCCESS always.
 */
extern int mca_bmi_ib_free(
    struct mca_bmi_base_module_t* bmi,
    mca_bmi_base_descriptor_t* des)
{
    mca_bmi_ib_frag_t* frag = (mca_bmi_ib_frag_t*)des;
    MCA_BMI_IB_FRAG_RETURN1(frag);
    /* BUG FIX: function is declared int but had no return statement
     * (undefined behavior if the caller uses the value). */
    return OMPI_SUCCESS;
}
/**
 * Pack data into a descriptor for sending.
 *
 * @param bmi       (IN) BMI module
 * @param peer      (IN) BMI peer addressing
 * @param convertor (IN) data-type convertor for the user buffer
 * @param reserve   (IN) bytes to reserve (presumably for a header --
 *                       TODO confirm)
 * @param size      (IN/OUT) requested/packed size
 *
 * Not yet implemented: always returns NULL.
 */
struct mca_bmi_base_descriptor_t* mca_bmi_ib_pack(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_endpoint_t* peer,
struct ompi_convertor_t* convertor,
size_t reserve,
size_t* size)
{
return NULL;
}
/*
 * Release any resources held by this BMI module.
 * Stub: nothing is released yet; always reports success.
 */
int mca_bmi_ib_finalize(struct mca_bmi_base_module_t* bmi)
{
/* Stub */
D_PRINT("Stub\n");
return OMPI_SUCCESS;
}
/*
 * PML->BMI: initialize a (cached) send request by attaching a send
 * fragment taken from this module's send_free list.
 *
 * @param bmi     (IN) BMI module instance
 * @param request (IN) request to initialize
 * @return OMPI_SUCCESS, or the free-list error code if no fragment
 *         could be obtained.
 */
int mca_bmi_ib_request_init( struct mca_bmi_base_module_t* bmi,
    struct mca_bmi_base_send_request_t* request)
{
    mca_bmi_ib_module_t* ib_module = (mca_bmi_ib_module_t*)bmi;
    ompi_list_item_t* item;
    mca_bmi_ib_send_frag_t* frag;
    int rc;

    OMPI_FREE_LIST_GET(&ib_module->send_free, item, rc);
    frag = (mca_bmi_ib_send_frag_t*)item;
    if(NULL == frag) {
        return rc;
    }
    ((mca_bmi_ib_send_request_t*) request)->req_frag = frag;
    return OMPI_SUCCESS;
}
/*
 * PML->BMI: tear down a cached send request, returning its send
 * fragment to this module's send_free list.
 *
 * @param bmi     (IN) BMI module instance
 * @param request (IN) request being removed from the cache
 */
void mca_bmi_ib_request_fini( struct mca_bmi_base_module_t* bmi,
    struct mca_bmi_base_send_request_t* request)
{
    mca_bmi_ib_send_request_t* ib_request = (mca_bmi_ib_send_request_t*)request;
    mca_bmi_ib_module_t* ib_module = (mca_bmi_ib_module_t*)bmi;
    OMPI_FREE_LIST_RETURN(&ib_module->send_free,
        (ompi_list_item_t*)ib_request->req_frag);
}
/*
 * Initiate a send. If this is the first fragment, use the fragment
 * descriptor allocated with the send requests, otherwise obtain
 * one from the free list. Initialize the fragment and forward
 * on to the peer.
 *
 * NOTE(review): this body does not compile as committed.  It appears to
 * be a partial port of the old PTL send path to the new BMI interface:
 *  - "rc" is declared twice (lines below);
 *  - "offset", "size", "flags", "sendreq", "sendfrag", "hdr" and
 *    "hdr_length" are referenced but never declared -- they were
 *    parameters/locals of the old PTL signature;
 *  - there is an orphaned "} else {" (marked below) left over from a
 *    removed "is the data contiguous?" conditional.
 * The annotations below document the intended flow so the port can be
 * completed; nothing here is changed.
 */
int mca_bmi_ib_send(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_endpoint_t* bmi_peer,
struct mca_bmi_base_descriptor_t* descriptor,
mca_bmi_base_tag_t tag)
{
mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi;
mca_bmi_ib_frag_t* frag = (mca_bmi_ib_frag_t*)descriptor;
frag->tag = tag;
frag->type = MCA_BMI_IB_FRAG_SEND;
int rc = OMPI_SUCCESS;
frag->rc = rc;
ompi_convertor_t *convertor;
/* NOTE(review): redeclares rc (already declared above). */
int rc, freeAfter;
unsigned int iov_count, max_data;
struct iovec iov;
/* first fragment (eager send) and first fragment of long
 * protocol can use the convertor initialized on the request,
 * remaining fragments must copy/reinit the convertor as the
 * transfer could be in parallel.
 */
if( offset <= mca_bmi_ib_module.super.bmi_first_frag_size ) {
convertor = &sendreq->req_send.req_convertor;
} else {
convertor = &sendfrag->frag_send.frag_base.frag_convertor;
ompi_convertor_copy(&sendreq->req_send.req_convertor, convertor);
ompi_convertor_init_for_send( convertor,
0,
sendreq->req_send.req_base.req_datatype,
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_addr,
offset,
NULL );
}
/* if data is contigous, convertor will return an offset
 * into users buffer - otherwise will return an allocated buffer
 * that holds the packed data
 */
/* Pack directly past the (match or rendezvous) header in the
 * pre-registered IB buffer. */
if((flags & MCA_PTL_FLAGS_ACK) == 0) {
iov.iov_base = &sendfrag->ib_buf.buf[sizeof(mca_bmi_base_match_header_t)];
} else {
iov.iov_base = &sendfrag->ib_buf.buf[sizeof(mca_bmi_base_rendezvous_header_t)];
}
iov.iov_len = size;
iov_count = 1;
max_data = size;
if((rc = ompi_convertor_pack(convertor,&iov, &iov_count, &max_data, &freeAfter)) < 0) {
ompi_output(0, "Unable to pack data");
return rc;
}
/* adjust size to reflect actual number of bytes packed by convertor */
size = iov.iov_len;
sendfrag->frag_send.frag_base.frag_addr = iov.iov_base;
sendfrag->frag_send.frag_base.frag_size = iov.iov_len;
/* NOTE(review): orphaned "} else {" -- its matching "if" (size == 0
 * / nothing-to-pack case, by the look of the branch body) was lost. */
} else {
sendfrag->frag_send.frag_base.frag_addr = NULL;
sendfrag->frag_send.frag_base.frag_size = 0;
}
/* fragment state */
sendfrag->frag_send.frag_base.frag_owner = &bmi_peer->peer_bmi->super;
sendfrag->frag_send.frag_request = sendreq;
sendfrag->frag_send.frag_base.frag_peer = bmi_peer;
sendfrag->frag_progressed = 0;
/* Initialize header */
hdr = (mca_bmi_base_header_t *) &sendfrag->ib_buf.buf[0];
hdr->hdr_common.hdr_flags = flags;
hdr->hdr_match.hdr_contextid = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_match.hdr_msg_seq = sendreq->req_send.req_base.req_sequence;
if((flags & MCA_PTL_FLAGS_ACK) == 0) {
hdr->hdr_common.hdr_type = MCA_PTL_HDR_TYPE_MATCH;
hdr_length = sizeof(mca_bmi_base_match_header_t);
} else {
/* NOTE(review): this branch builds a rendezvous header but also
 * sets hdr_type to MCA_PTL_HDR_TYPE_MATCH -- presumably it should
 * be the rendezvous header type; confirm against the PML. */
hdr->hdr_common.hdr_type = MCA_PTL_HDR_TYPE_MATCH;
hdr->hdr_rndv.hdr_frag_length = sendfrag->frag_send.frag_base.frag_size;
hdr->hdr_rndv.hdr_src_ptr.lval = 0; /* for VALGRIND/PURIFY - REPLACE WITH MACRO */
hdr->hdr_rndv.hdr_src_ptr.pval = sendfrag;
hdr_length = sizeof(mca_bmi_base_rendezvous_header_t);
}
/* Update the offset after actual fragment size is determined,
 * and before attempting to send the fragment */
sendreq->req_offset += size;
IB_SET_SEND_DESC_LEN((&sendfrag->ib_buf), (hdr_length + size));
if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_send(bmi_peer, sendfrag))) {
return rc;
}
/* if this is the entire message - signal request is complete */
if(sendreq->req_send.req_bytes_packed == size) {
ompi_request_complete( &(sendreq->req_send.req_base.req_ompi) );
}
return OMPI_SUCCESS;
}
/*
 * RDMA local buffer to remote buffer address.
 * Not yet implemented: always returns OMPI_ERR_NOT_IMPLEMENTED.
 */
int mca_bmi_ib_put( struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_endpoint_t* bmi_peer,
struct mca_bmi_base_send_request_t* req, size_t offset,
size_t size, int flags)
{
return OMPI_ERR_NOT_IMPLEMENTED;
}
/*
 * On a match send an ack to the peer.
 *
 * Builds an ACK packet in send_frag's pre-registered IB buffer:
 * echoes the sender's rendezvous src_ptr back, advertises the matched
 * request plus the destination address/length for the remaining data
 * (so the peer can RDMA directly into the user buffer), registers that
 * destination region, and posts the send immediately.
 *
 * @param ib_bmi    (IN) module the recv fragment arrived on
 * @param send_frag (IN) fragment used to carry the ACK
 * @param recv_frag (IN) matched receive fragment being acknowledged
 */
static void mca_bmi_ib_ack(
mca_bmi_ib_module_t *ib_bmi,
mca_bmi_ib_send_frag_t *send_frag,
mca_bmi_ib_recv_frag_t *recv_frag)
{
mca_bmi_base_header_t *hdr;
mca_bmi_base_recv_request_t *request;
mca_bmi_ib_endpoint_t *ib_peer;
ib_buffer_t *ib_buf;
int recv_len;
int len_to_reg, len_added = 0;
void *addr_to_reg, *ack_buf;
/* Header starts at beginning of registered
 * buffer space */
hdr = (mca_bmi_base_header_t *)
&send_frag->ib_buf.buf[0];
request = recv_frag->super.frag_request;
/* Amount of data we have already received */
recv_len =
recv_frag->super.frag_base.frag_header.hdr_rndv.hdr_frag_length;
hdr->hdr_common.hdr_type = MCA_PTL_HDR_TYPE_ACK;
hdr->hdr_common.hdr_flags = 0;
/* Remote side send descriptor */
hdr->hdr_ack.hdr_src_ptr =
recv_frag->super.frag_base.frag_header.hdr_rndv.hdr_src_ptr;
/* Matched request from recv side */
hdr->hdr_ack.hdr_dst_match.lval = 0;
hdr->hdr_ack.hdr_dst_match.pval = request;
hdr->hdr_ack.hdr_dst_addr.lval = 0;
/* RDMA target: user buffer offset past the data already received */
addr_to_reg = (void*)((char*)request->req_recv.req_base.req_addr + recv_len);
hdr->hdr_ack.hdr_dst_addr.pval = addr_to_reg;
len_to_reg = request->req_recv.req_bytes_packed - recv_len;
hdr->hdr_ack.hdr_dst_size = len_to_reg;
A_PRINT("Dest addr : %p, RDMA Len : %d",
hdr->hdr_ack.hdr_dst_addr.pval,
hdr->hdr_ack.hdr_dst_size);
/* IB-specific payload (memory key etc. -- see prepare_ack) goes
 * directly after the ack header */
ack_buf = (void*) ((char*) (&send_frag->ib_buf.buf[0]) +
sizeof(mca_bmi_base_ack_header_t));
/* Prepare ACK packet with IB specific stuff */
mca_bmi_ib_prepare_ack(ib_bmi, addr_to_reg, len_to_reg,
ack_buf, &len_added);
/* Send it right away! */
ib_peer = (mca_bmi_ib_endpoint_t *)
recv_frag->super.frag_base.frag_peer;
ib_buf = &send_frag->ib_buf;
IB_SET_SEND_DESC_LEN(ib_buf,
(sizeof(mca_bmi_base_ack_header_t) + len_added));
mca_bmi_ib_post_send(ib_bmi, ib_peer, &send_frag->ib_buf, send_frag);
/* fragment state
 * NOTE(review): these fields are filled in *after* the send is
 * posted -- confirm the completion path cannot observe them unset. */
send_frag->frag_send.frag_base.frag_owner = &ib_bmi->super;
send_frag->frag_send.frag_base.frag_peer = recv_frag->super.frag_base.frag_peer;
send_frag->frag_send.frag_base.frag_addr = NULL;
send_frag->frag_send.frag_base.frag_size = 0;
}
/*
 * A posted receive has been matched - if required send an
 * ack back to the peer and process the fragment. Copy the
 * data to user buffer
 *
 * @param bmi  (IN) BMI module the fragment arrived on
 * @param frag (IN) matched receive fragment
 */
void mca_bmi_ib_matched(
mca_bmi_base_module_t* bmi,
mca_bmi_base_recv_frag_t* frag)
{
mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi;
mca_bmi_base_recv_request_t *request;
mca_bmi_base_header_t *header;
mca_bmi_ib_recv_frag_t *recv_frag;
header = &frag->frag_base.frag_header;
request = frag->frag_request;
recv_frag = (mca_bmi_ib_recv_frag_t*) frag;
D_PRINT("Matched frag\n");
/* Sender requested an ACK: allocate a send fragment and reply.
 * NOTE(review): on descriptor exhaustion the ACK is only logged and
 * dropped -- confirm the peer can recover from a lost ACK. */
if (header->hdr_common.hdr_flags & MCA_PTL_FLAGS_ACK) {
mca_bmi_ib_send_frag_t *send_frag;
send_frag = mca_bmi_ib_alloc_send_frag(ib_bmi, NULL);
if(NULL == send_frag) {
ompi_output(0, "Cannot get send descriptor");
} else {
mca_bmi_ib_ack(ib_bmi, send_frag, recv_frag);
}
}
/* Process the fragment */
/* IN TCP case, IO_VEC is first allocated.
 * then recv the data, and copy if needed,
 * But in ELAN cases, we save the data into an
 * unex buffer if the recv descriptor is not posted
 * (for too long) (TODO).
 * We then need to copy from
 * unex_buffer to application buffer */
if ((header->hdr_common.hdr_type & MCA_PTL_HDR_TYPE_MATCH) &&
(header->hdr_match.hdr_msg_length > 0)) {
struct iovec iov;
ompi_proc_t *proc;
unsigned int iov_count, max_data;
int freeAfter;
iov.iov_base = frag->frag_base.frag_addr;
iov.iov_len = frag->frag_base.frag_size;
/* use the sender's convertor so heterogeneous peers unpack
 * correctly */
proc = ompi_comm_peer_lookup(request->req_recv.req_base.req_comm,
request->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
ompi_convertor_copy(proc->proc_convertor, &frag->frag_base.frag_convertor);
ompi_convertor_init_for_recv( &frag->frag_base.frag_convertor,
0,
request->req_recv.req_base.req_datatype,
request->req_recv.req_base.req_count,
request->req_recv.req_base.req_addr,
0, /* fragment offset */
NULL );
ompi_convertor_unpack(&frag->frag_base.frag_convertor, &iov, &iov_count, &max_data, &freeAfter);
}
mca_bmi_ib_recv_frag_done(header, frag, request);
}

353
src/mca/bmi/ib/bmi_ib.h Обычный файл
Просмотреть файл

@@ -0,0 +1,353 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PTL_IB_H
#define MCA_PTL_IB_H
/* Standard system includes */
#include <sys/types.h>
#include <string.h>
/* Open MPI includes */
#include "class/ompi_free_list.h"
#include "class/ompi_bitmap.h"
#include "event/event.h"
#include "mca/pml/pml.h"
#include "mca/bmi/bmi.h"
#include "util/output.h"
/* InfiniBand VAPI includes */
#include "bmi_ib_vapi.h"
#include "bmi_ib_addr.h"
#include "bmi_ib_proc.h"
#include "bmi_ib_peer.h"
#include "bmi_ib_priv.h"
/* Other IB bmi includes */
#include "bmi_ib_sendreq.h"
#include "bmi_ib_recvfrag.h"
#include "bmi_ib_sendfrag.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
 * Receive-callback registration: one entry per tag in the module's
 * ib_reg[] table (filled in by mca_bmi_ib_register()).
 */
struct mca_bmi_ib_registration_t {
    mca_bmi_base_module_recv_cb_fn_t cbfunc; /**< registered callback */
    void *cbdata;                            /**< opaque user data for cbfunc */
}; /* BUG FIX: the terminating ';' was missing */
/* BUG FIX: typedef was missing; the unprefixed type name is used by
 * the ib_reg[] member of mca_bmi_ib_module_t below. */
typedef struct mca_bmi_ib_registration_t mca_bmi_ib_registration_t;
struct mca_bmi_ib_component_t {
mca_bmi_base_component_1_0_0_t super; /**< base BMI component */
uint32_t ib_num_bmis;
/**< number of hcas available to the IB component */
struct mca_bmi_ib_module_t *ib_bmis;
/**< array of available PTLs */
int ib_free_list_num;
/**< initial size of free lists */
int ib_free_list_max;
/**< maximum size of free lists */
int ib_free_list_inc;
/**< number of elements to alloc when growing free lists */
ompi_free_list_t ib_send_requests;
/**< free list of ib send requests -- sendreq + IB */
ompi_free_list_t ib_send_frags;
/**< free list of ib send fragments */
ompi_free_list_t ib_recv_frags;
/**< free list of ib recv fragments */
ompi_list_t ib_procs;
/**< list of ib proc structures */
ompi_event_t ib_send_event;
/**< event structure for sends */
ompi_event_t ib_recv_event;
/**< event structure for recvs */
ompi_mutex_t ib_lock;
/**< lock for accessing module state */
int ib_mem_registry_hints_log_size;
/**< log2 size of hints hash array used by memory registry */
};
typedef struct mca_bmi_ib_component_t mca_bmi_ib_component_t;
struct mca_bmi_ib_recv_frag_t;
extern mca_bmi_ib_component_t mca_bmi_ib_component;
/**
 * IB BMI module: per-HCA interface state (one instance per HCA found
 * by mca_bmi_ib_component_init()).
 */
struct mca_bmi_ib_module_t {
    mca_bmi_base_module_t super; /**< base PTL interface */
    bool bmi_inited; /**< presumably set once module init completes -- confirm */
    mca_bmi_ib_registration_t ib_reg[256]; /**< per-tag recv callback table */
    VAPI_hca_id_t hca_id; /**< ID of HCA */
    VAPI_hca_port_t port; /**< IB port of this PTL */
    VAPI_hca_hndl_t nic; /**< NIC handle */
    VAPI_pd_hndl_t ptag; /**< Protection Domain tag */
    VAPI_cq_hndl_t cq_hndl; /**< Completion Queue handle */
    EVAPI_async_handler_hndl_t async_handler;
    /**< Async event handler used to detect weird/unknown events */
    mca_bmi_ib_mem_registry_t mem_registry; /**< registry of memory regions */
    ompi_free_list_t ib_frags1; /**< free list of buffer descriptors */
    ompi_free_list_t send_free;
    /**< free list of send fragments; BUG FIX: member was missing though
     * it is constructed in component_init and used by request_init/fini */
    ompi_list_t repost; /**< list of buffers to repost */
};
typedef struct mca_bmi_ib_module_t mca_bmi_ib_module_t;
/* BUG FIX: was "extern mca_bmi_ib_module_tmca_bmi_ib_module;" -- the
 * space between the type name and the identifier was missing. */
extern mca_bmi_ib_module_t mca_bmi_ib_module;
/**
 * IB FIN header: fragment header extended with the memory-region
 * address/size it refers to.
 */
typedef struct mca_bmi_ib_fin_header_t mca_bmi_ib_fin_header_t;
struct mca_bmi_ib_fin_header_t {
mca_bmi_base_frag_header_t frag_hdr; /**< common fragment header */
ompi_ptr_t mr_addr; /**< address of the memory region */
uint64_t mr_size; /**< size of the memory region */
};
/**
 * Register IB component parameters with the MCA framework
 */
extern int mca_bmi_ib_component_open(void);
/**
 * Any final cleanup before being unloaded.
 */
extern int mca_bmi_ib_component_close(void);
/**
 * IB component initialization.
 *
 * @param num_bmi_modules (OUT) Number of BMIs returned in BMI array.
 * @param allow_multi_user_threads (OUT) Flag indicating whether BMI supports user threads (TRUE)
 * @param have_hidden_threads (OUT) Flag indicating whether BMI uses threads (TRUE)
 *
 * NOTE(review): both bool parameters are passed by value (here and in
 * the definition), so they cannot actually be OUT parameters -- the
 * doc comment and the signature disagree; confirm intent.
 *
 * (1) read interface list from kernel and compare against component parameters
 * then create a BMI instance for selected interfaces
 * (2) setup IB listen socket for incoming connection attempts
 * (3) publish BMI addressing info
 *
 */
extern mca_bmi_base_module_t** mca_bmi_ib_component_init(
int *num_bmi_modules,
bool allow_multi_user_threads,
bool have_hidden_threads
);
/**
 * IB component control.
 */
extern int mca_bmi_ib_component_control(
int param,
void* value,
size_t size
);
/**
 * IB component progress.
 */
extern int mca_bmi_ib_component_progress(
mca_bmi_tstamp_t tstamp
);
/**
* Cleanup any resources held by the BMI.
*
* @param bmi BMI instance.
* @return OMPI_SUCCESS or error status on failure.
*/
extern int mca_bmi_ib_finalize(
struct mca_bmi_base_module_t* bmi
);
/**
* PML->BMI notification of change in the process list.
*
* @param bmi (IN)
* @param nprocs (IN) Number of processes
* @param procs (IN) Set of processes
* @param peers (OUT) Set of (optional) peer addressing info.
* @param peers (IN/OUT) Set of processes that are reachable via this BMI.
* @return OMPI_SUCCESS or error status on failure.
*
*/
extern int mca_bmi_ib_add_procs(
struct mca_bmi_base_module_t* bmi,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_bmi_base_endpoint_t** peers,
ompi_bitmap_t* reachable
);
/**
* PML->BMI notification of change in the process list.
*
* @param bmi (IN) BMI instance
* @param nproc (IN) Number of processes.
* @param procs (IN) Set of processes.
* @param peers (IN) Set of peer data structures.
* @return Status indicating if cleanup was successful
*
*/
extern int mca_bmi_ib_del_procs(
struct mca_bmi_base_module_t* bmi,
size_t nprocs,
struct ompi_proc_t **procs,
struct mca_bmi_base_endpoint_t** peers
);
/**
* PML->BMI Initialize a send request for TCP cache.
*
* @param bmi (IN) BMI instance
* @param request (IN) Pointer to allocated request.
*
**/
extern int mca_bmi_ib_request_init(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_send_request_t*
);
/**
* PML->BMI Cleanup a send request that is being removed from the cache.
*
* @param bmi (IN) BMI instance
* @param request (IN) Pointer to allocated request.
*
**/
extern void mca_bmi_ib_request_fini(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_send_request_t*
);
/**
* PML->BMI Return a send request to the BMI modules free list.
*
* @param bmi (IN) BMI instance
* @param request (IN) Pointer to allocated request.
*
*/
extern void mca_bmi_ib_request_return(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_send_request_t*
);
/**
* PML->BMI Notification that a receive fragment has been matched.
*
* @param bmi (IN) BMI instance
* @param recv_frag (IN) Receive fragment
*
*/
extern void mca_bmi_ib_matched(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_recv_frag_t* frag
);
/**
* PML->BMI Initiate a send of the specified size.
*
* @param bmi (IN) BMI instance
* @param bmi_base_peer (IN) BMI peer addressing
* @param send_request (IN/OUT) Send request (allocated by PML via mca_bmi_base_request_alloc_fn_t)
* @param size (IN) Number of bytes PML is requesting BMI to deliver
* @param flags (IN) Flags that should be passed to the peer via the message header.
* @param request (OUT) OMPI_SUCCESS if the BMI was able to queue one or more fragments
*/
extern int mca_bmi_ib_send(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_endpoint_t* bmi_peer,
struct mca_bmi_base_send_request_t*,
size_t offset,
size_t size,
int flags
);
/**
* PML->BMI Initiate a put of the specified size.
*
* @param bmi (IN) BMI instance
* @param bmi_base_peer (IN) BMI peer addressing
* @param send_request (IN/OUT) Send request (allocated by PML via mca_bmi_base_request_alloc_fn_t)
* @param size (IN) Number of bytes PML is requesting BMI to deliver
* @param flags (IN) Flags that should be passed to the peer via the message header.
* @param request (OUT) OMPI_SUCCESS if the BMI was able to queue one or more fragments
*/
extern int mca_bmi_ib_put(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_base_endpoint_t* bmi_peer,
struct mca_bmi_base_send_request_t*,
size_t offset,
size_t size,
int flags
);
/**
* Return a recv fragment to the modules free list.
*
* @param bmi (IN) BMI instance
* @param frag (IN) IB receive fragment
*
*/
extern void mca_bmi_ib_recv_frag_return(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_ib_recv_frag_t* frag
);
/**
* Return a send fragment to the modules free list.
*
* @param bmi (IN) BMI instance
* @param frag (IN) IB send fragment
*
*/
extern void mca_bmi_ib_send_frag_return(
struct mca_bmi_base_module_t* bmi,
struct mca_bmi_ib_send_frag_t*
);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

24
src/mca/bmi/ib/bmi_ib_addr.h Обычный файл
Просмотреть файл

@@ -0,0 +1,24 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_ADDR_H
#define MCA_BMI_IB_ADDR_H
#include "bmi_ib.h"
#endif

357
src/mca/bmi/ib/bmi_ib_component.c Обычный файл
Просмотреть файл

@@ -0,0 +1,357 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/* #include <hh_common.h> */
/* Open MPI includes */
#include "ompi_config.h"
#include "include/constants.h"
#include "event/event.h"
#include "util/if.h"
#include "util/argv.h"
#include "util/output.h"
#include "mca/pml/pml.h"
#include "mca/bmi/bmi.h"
#include "mca/base/mca_base_param.h"
#include "mca/base/mca_base_module_exchange.h"
#include "mca/errmgr/errmgr.h"
/* IB bmi includes */
#include "bmi_ib.h"
/*
 * The IB component instance: MCA meta data plus the component's
 * init/control/progress entry points.
 */
mca_bmi_ib_component_t mca_bmi_ib_component = {
{
/* First, the mca_base_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a bmi v1.0.0 component (which also implies a
specific MCA version) */
MCA_BMI_BASE_VERSION_1_0_0,
"ib", /* MCA component name */
1, /* MCA component major version */
0, /* MCA component minor version */
0, /* MCA component release version */
mca_bmi_ib_component_open, /* component open */
mca_bmi_ib_component_close /* component close */
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
false
},
mca_bmi_ib_component_init,
mca_bmi_ib_component_control,
mca_bmi_ib_component_progress,
}
};
/*
 * Parameter-registration helper: register a string MCA parameter under
 * "bmi_ib_<param_name>" and return its (possibly user-overridden)
 * value.
 */
static inline char* mca_bmi_ib_param_register_string(
    const char* param_name,
    const char* default_value)
{
    int id;
    char* value = NULL;

    id = mca_base_param_register_string("bmi","ib",param_name,NULL,default_value);
    mca_base_param_lookup_string(id, &value);
    return value;
}
/*
 * Parameter-registration helper: register an integer MCA parameter
 * under "bmi_ib_<param_name>" and return its (possibly user-overridden)
 * value; falls back to default_value if the lookup does not set it.
 */
static inline int mca_bmi_ib_param_register_int(
    const char* param_name,
    int default_value)
{
    int value = default_value;
    int id = mca_base_param_register_int("bmi","ib",param_name,NULL,default_value);
    mca_base_param_lookup_int(id,&value);
    return value;
}
/*
 * Called by MCA framework to open the component, registers
 * component parameters.
 */
int mca_bmi_ib_component_open(void)
{
/* register component parameters */
mca_bmi_ib_module.super.bmi_exclusivity =
mca_bmi_ib_param_register_int ("exclusivity", 0);
mca_bmi_ib_module.super.bmi_first_frag_size =
mca_bmi_ib_param_register_int ("first_frag_size",
(MCA_BMI_IB_FIRST_FRAG_SIZE
- sizeof(mca_bmi_base_header_t)));
mca_bmi_ib_module.super.bmi_min_frag_size =
mca_bmi_ib_param_register_int ("min_frag_size",
(MCA_BMI_IB_FIRST_FRAG_SIZE
- sizeof(mca_bmi_base_header_t)));
/* NOTE(review): 2<<30 == 2^31 overflows a 32-bit signed int
 * (undefined behavior); probably ~2GB was intended -- confirm and
 * use 1<<30 or a wider type. */
mca_bmi_ib_module.super.bmi_max_frag_size =
mca_bmi_ib_param_register_int ("max_frag_size", 2<<30);
/* register IB component parameters */
mca_bmi_ib_component.ib_free_list_num =
mca_bmi_ib_param_register_int ("free_list_num", 8);
mca_bmi_ib_component.ib_free_list_max =
mca_bmi_ib_param_register_int ("free_list_max", 1024);
mca_bmi_ib_component.ib_free_list_inc =
mca_bmi_ib_param_register_int ("free_list_inc", 32);
mca_bmi_ib_component.ib_mem_registry_hints_log_size =
mca_bmi_ib_param_register_int ("hints_log_size", 8);
/* initialize global state */
mca_bmi_ib_component.ib_num_bmis=0;
mca_bmi_ib_component.ib_bmis=NULL;
OBJ_CONSTRUCT(&mca_bmi_ib_component.ib_procs, ompi_list_t);
OBJ_CONSTRUCT (&mca_bmi_ib_component.ib_recv_frags, ompi_free_list_t);
return OMPI_SUCCESS;
}
/*
 * component cleanup - sanity checking of queue lengths
 * Stub: no cleanup performed yet.
 */
int mca_bmi_ib_component_close(void)
{
D_PRINT("");
/* Stub */
return OMPI_SUCCESS;
}
/*
 * IB component initialization:
 * (1) query the available HCAs and create one BMI module per HCA
 * (2) initialize free lists and per-module state
 * (3) post an OOB receive to support dynamic connection setup
 *
 * @param num_bmi_modules         (OUT) number of modules in the returned array
 * @param enable_progress_threads (IN)  progress-thread flag (unused here)
 * @param enable_mpi_threads      (IN)  MPI-thread flag (unused here)
 * @return malloc'd array of module pointers, or NULL on error.
 */
mca_bmi_base_module_t** mca_bmi_ib_component_init(int *num_bmi_modules,
                                                  bool enable_progress_threads,
                                                  bool enable_mpi_threads)
{
    VAPI_ret_t vapi_ret;
    VAPI_hca_id_t* hca_ids;
    mca_bmi_base_module_t** bmis;
    uint32_t i;

    /* initialization */
    *num_bmi_modules = 0;

    /* query the number of available hcas */
    vapi_ret=EVAPI_list_hcas(0, &(mca_bmi_ib_component.ib_num_bmis), NULL);
    if( VAPI_EAGAIN != vapi_ret || 0 == mca_bmi_ib_component.ib_num_bmis ) {
        ompi_output(0,"Warning: no IB HCAs found\n");
        return NULL;
    }
    hca_ids = (VAPI_hca_id_t*) malloc(mca_bmi_ib_component.ib_num_bmis * sizeof(VAPI_hca_id_t));
    if(NULL == hca_ids) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return NULL;
    }
    vapi_ret=EVAPI_list_hcas(mca_bmi_ib_component.ib_num_bmis, &mca_bmi_ib_component.ib_num_bmis, hca_ids);
    if( VAPI_OK != vapi_ret ) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(hca_ids);  /* BUG FIX: hca_ids was leaked on this path */
        return NULL;
    }

    /* Allocate space for bmi modules */
    mca_bmi_ib_component.ib_bmis = (mca_bmi_ib_module_t*) malloc(sizeof(mca_bmi_ib_module_t) *
            mca_bmi_ib_component.ib_num_bmis);
    if(NULL == mca_bmi_ib_component.ib_bmis) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(hca_ids);  /* BUG FIX: leak */
        return NULL;
    }
    bmis = (struct mca_bmi_base_module_t**)
        malloc(mca_bmi_ib_component.ib_num_bmis * sizeof(struct mca_bmi_ib_module_t*));
    if(NULL == bmis) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(hca_ids);  /* BUG FIX: leak */
        return NULL;
    }

    /* Initialize pool of receive fragments */
    ompi_free_list_init (&(mca_bmi_ib_component.ib_recv_frags),
            sizeof (mca_bmi_ib_recv_frag_t),
            OBJ_CLASS (mca_bmi_ib_recv_frag_t),
            mca_bmi_ib_component.ib_free_list_num,
            mca_bmi_ib_component.ib_free_list_max,
            mca_bmi_ib_component.ib_free_list_inc, NULL);

    /* Initialize each module */
    for(i = 0; i < mca_bmi_ib_component.ib_num_bmis; i++) {
        mca_bmi_ib_module_t* ib_bmi = &mca_bmi_ib_component.ib_bmis[i];

        /* Initialize the modules function pointers */
        memcpy(ib_bmi, &mca_bmi_ib_module, sizeof(mca_bmi_ib_module));

        /* Initialize module state */
        OBJ_CONSTRUCT(&ib_bmi->send_free, ompi_free_list_t);
        OBJ_CONSTRUCT(&ib_bmi->repost, ompi_list_t);
        ompi_free_list_init(&ib_bmi->send_free,
                sizeof(mca_bmi_ib_send_frag_t),
                OBJ_CLASS(mca_bmi_ib_send_frag_t),
                mca_bmi_ib_component.ib_free_list_num,
                mca_bmi_ib_component.ib_free_list_max,
                mca_bmi_ib_component.ib_free_list_inc,
                NULL);
        memcpy(ib_bmi->hca_id, hca_ids[i], sizeof(ib_bmi->hca_id));
        if(mca_bmi_ib_module_init(ib_bmi) != OMPI_SUCCESS) {
            free(bmis);  /* BUG FIX: bmis was leaked on this path */
            free(hca_ids);
            return NULL;
        }

        /* Initialize the send descriptors */
        if(mca_bmi_ib_send_frag_register(ib_bmi) != OMPI_SUCCESS) {
            free(bmis);  /* BUG FIX: bmis was leaked on this path */
            free(hca_ids);
            return NULL;
        }
        bmis[i] = &ib_bmi->super;
    }

    /* Post OOB receive to support dynamic connection setup */
    mca_bmi_ib_post_recv();

    *num_bmi_modules = mca_bmi_ib_component.ib_num_bmis;
    free(hca_ids);
    return bmis;
}
/*
* IB component control
*/
int mca_bmi_ib_component_control(int param, void* value, size_t size)
{
return OMPI_SUCCESS;
}
/*
* IB component progress.
*/
/*
 * Poll the completion queue (cq_hndl) on the given NIC once and
 * classify the result:
 *   *comp_type <- IB_COMP_SEND / IB_COMP_RECV / IB_COMP_RDMA_W for a
 *                 successful send / recv / RDMA-write completion,
 *                 IB_COMP_ERROR for a failed or unknown completion,
 *                 IB_COMP_NOTHING when the CQ is empty;
 *   *comp_addr <- the work-request id (the fragment pointer) or NULL.
 * Callers pass the *addresses* of their comp_type/comp_addr locals --
 * the macro dereferences both arguments.
 */
#define MCA_BMI_IB_DRAIN_NETWORK(nic, cq_hndl, comp_type, comp_addr) \
{ \
VAPI_ret_t ret; \
VAPI_wc_desc_t comp; \
\
ret = VAPI_poll_cq(nic, cq_hndl, &comp); \
if(VAPI_OK == ret) { \
if(comp.status != VAPI_SUCCESS) { \
ompi_output(0, "Got error : %s, Vendor code : %d Frag : %p", \
VAPI_wc_status_sym(comp.status), \
comp.vendor_err_syndrome, comp.id); \
*comp_type = IB_COMP_ERROR; \
*comp_addr = NULL; \
} else { \
if(VAPI_CQE_SQ_SEND_DATA == comp.opcode) { \
*comp_type = IB_COMP_SEND; \
*comp_addr = (void*) (unsigned long) comp.id; \
} else if(VAPI_CQE_RQ_SEND_DATA == comp.opcode) { \
*comp_type = IB_COMP_RECV; \
*comp_addr = (void*) (unsigned long) comp.id; \
} else if(VAPI_CQE_SQ_RDMA_WRITE == comp.opcode) { \
*comp_type = IB_COMP_RDMA_W; \
*comp_addr = (void*) (unsigned long) comp.id; \
} else { \
ompi_output(0, "VAPI_poll_cq: returned unknown opcode : %d\n", \
comp.opcode); \
*comp_type = IB_COMP_ERROR; \
*comp_addr = NULL; \
} \
} \
} else { \
/* No completions from the network */ \
*comp_type = IB_COMP_NOTHING; \
*comp_addr = NULL; \
} \
}
/*
 * IB component progress: poll each module's completion queue once and
 * dispatch the completed work request (send completion, incoming
 * receive, or RDMA write).
 *
 * @param tstamp  current timestamp (unused in this implementation)
 * @return number of completions processed across all modules
 */
int mca_bmi_ib_component_progress(mca_bmi_tstamp_t tstamp)
{
    int i;
    int count = 0;
    /* Poll for completions -- at most one per module per call */
    for(i = 0; i < mca_bmi_ib_component.ib_num_bmis; i++) {
        mca_bmi_ib_module_t* ib_bmi = &mca_bmi_ib_component.ib_bmis[i];
        int comp_type = IB_COMP_NOTHING;
        void* comp_addr;
        MCA_BMI_IB_DRAIN_NETWORK(ib_bmi->nic, ib_bmi->cq_hndl, &comp_type, &comp_addr);
        /* Handle n/w completions */
        switch(comp_type) {
            case IB_COMP_SEND :
                /* Process a completed send */
                mca_bmi_ib_send_frag_send_complete(ib_bmi, (mca_bmi_ib_send_frag_t*)comp_addr);
                count++;
                break;
            case IB_COMP_RECV :
                /* Process incoming receives */
                mca_bmi_ib_process_recv(ib_bmi, comp_addr);
                /* Re post recv buffers.
                 * NOTE(review): buffers are batched on the repost list and
                 * only flushed once more than one is pending -- presumably
                 * to amortize posting cost; confirm the threshold of 1 is
                 * intentional. */
                if(ompi_list_get_size(&ib_bmi->repost) <= 1) {
                    ompi_list_append(&ib_bmi->repost, (ompi_list_item_t*)comp_addr);
                } else {
                    ompi_list_item_t* item;
                    while(NULL != (item = ompi_list_remove_first(&ib_bmi->repost))) {
                        mca_bmi_ib_buffer_repost(ib_bmi->nic, item);
                    }
                    mca_bmi_ib_buffer_repost(ib_bmi->nic, comp_addr);
                }
                count++;
                break;
            case IB_COMP_RDMA_W :
                ompi_output(0, "%s:%d RDMA not implemented\n", __FILE__,__LINE__);
                count++;
                break;
            case IB_COMP_NOTHING:
                /* queue was empty for this module */
                break;
            default:
                ompi_output(0, "Errorneous network completion");
                break;
        }
    }
    return count;
}

136
src/mca/bmi/ib/bmi_ib_endpoint.h Обычный файл
Просмотреть файл

@ -0,0 +1,136 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_PEER_H
#define MCA_BMI_IB_PEER_H
#include "class/ompi_list.h"
#include "event/event.h"
#include "mca/pml/pml.h"
#include "mca/bmi/bmi.h"
#include "bmi_ib_recvfrag.h"
#include "bmi_ib_sendfrag.h"
#include "bmi_ib_priv.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_bmi_ib_endpoint_t);
/**
 * State of an IB peer connection.  Connections progress
 * CLOSED -> CONNECTING -> CONNECT_ACK -> CONNECTED, or end in FAILED.
 */
typedef enum {
    /* Defines the state in which this BMI instance
     * has started the process of connection */
    MCA_BMI_IB_CONNECTING,
    /* Waiting for ack from peer */
    MCA_BMI_IB_CONNECT_ACK,
    /* Connected ... both sender & receiver have
     * buffers associated with this connection */
    MCA_BMI_IB_CONNECTED,
    /* Connection is closed, there are no resources
     * associated with this */
    MCA_BMI_IB_CLOSED,
    /* Maximum number of retries have been used.
     * Report failure on send to upper layer */
    MCA_BMI_IB_FAILED
} mca_bmi_ib_endpoint_state_t;
/**
 * An abstraction that represents a connection to a peer process.
 * An instance of mca_bmi_base_endpoint_t is associated w/ each process
 * and BMI pair at startup. However, connections to the peer
 * are established dynamically on an as-needed basis:
 */
struct mca_bmi_base_endpoint_t {
    ompi_list_item_t super;                 /**< list-item base so endpoints can be queued */
    struct mca_bmi_ib_module_t* peer_bmi;   /**< BMI instance that created this connection */
    struct mca_bmi_ib_proc_t* peer_proc;    /**< proc structure corresponding to peer */
    mca_bmi_ib_endpoint_state_t peer_state; /**< current state of the connection */
    size_t peer_retries;                    /**< number of connection retries attempted */
    double peer_tstamp;                     /**< timestamp of when the first connection was attempted */
    ompi_mutex_t peer_send_lock;            /**< lock for concurrent access to peer state (send path) */
    ompi_mutex_t peer_recv_lock;            /**< lock for concurrent access to peer state (recv path) */
    ompi_list_t pending_send_frags;         /**< sends queued until the connection is established */
    VAPI_qp_num_t rem_qp_num;               /**< remote side QP number (learned via OOB exchange) */
    IB_lid_t rem_lid;                       /**< local identifier (LID) of the remote process */
    VAPI_qp_hndl_t lcl_qp_hndl;             /**< local QP handle */
    VAPI_qp_prop_t lcl_qp_prop;             /**< local QP properties */
    ib_buffer_t *lcl_recv;                  /**< receive buffers posted for this connection */
};
typedef struct mca_bmi_base_endpoint_t mca_bmi_base_endpoint_t;
typedef struct mca_bmi_base_endpoint_t mca_bmi_ib_endpoint_t;
int mca_bmi_ib_peer_send(mca_bmi_base_endpoint_t*, mca_bmi_ib_send_frag_t*);
int mca_bmi_ib_peer_connect(mca_bmi_base_endpoint_t*);
void mca_bmi_ib_post_recv(void);
void mca_bmi_ib_progress_send_frags(mca_bmi_ib_endpoint_t*);
/*
 * Debug helper: dump the connection state of an endpoint.
 *
 * Bug fix: the body referenced a variable literally named `peer`
 * instead of the macro parameter `peer_ptr`, so the macro only
 * compiled when the caller happened to have a `peer` in scope.
 * Also wrapped in do/while(0) so it behaves as a single statement.
 *
 * NOTE(review): the peer_conn/lres/rres fields dumped here do not
 * appear in the endpoint structure defined in this header -- this
 * macro looks stale and should be updated to the current layout.
 */
#define DUMP_PEER(peer_ptr) do { \
    ompi_output(0, "[%s:%d] ", __FILE__, __LINE__); \
    ompi_output(0, "Dumping peer %d state", \
        (peer_ptr)->peer_proc->proc_guid.vpid); \
    ompi_output(0, "Local QP hndl : %d", \
        (peer_ptr)->peer_conn->lres->qp_hndl); \
    ompi_output(0, "Local QP num : %d", \
        (peer_ptr)->peer_conn->lres->qp_prop.qp_num); \
    ompi_output(0, "Remote QP num : %d", \
        (peer_ptr)->peer_conn->rres->qp_num); \
    ompi_output(0, "Remote LID : %d", \
        (peer_ptr)->peer_conn->rres->lid); \
} while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

127
src/mca/bmi/ib/bmi_ib_frag.h Обычный файл
Просмотреть файл

@ -0,0 +1,127 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_SEND_FRAG_H
#define MCA_BMI_IB_SEND_FRAG_H
#include "ompi_config.h"
#include "mca/bmi/base/bmi_base_sendreq.h"
#include "mca/bmi/base/bmi_base_sendfrag.h"
#include "bmi_ib_priv.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_bmi_ib_send_frag_t);
/** Classification of an IB fragment by the operation it carries. */
typedef enum {
    MCA_BMI_IB_FRAG_SEND,   /**< ordinary send fragment */
    MCA_BMI_IB_FRAG_PUT,    /**< RDMA put fragment */
    MCA_BMI_IB_FRAG_GET,    /**< RDMA get fragment */
    MCA_BMI_IB_FRAG_ACK     /**< acknowledgment fragment */
} mca_bmi_ib_frag_type_t;
/**
 * IB send fragment derived type.
 */
struct mca_bmi_ib_frag_t {
    mca_bmi_base_descriptor_t base;          /**< base descriptor (must stay first member) */
    mca_bmi_base_segment_t segment;          /**< single data segment described by this frag */
    struct mca_bmi_base_endpoint_t *endpoint; /**< destination endpoint */
    mca_bmi_ib_frag_type_t type;             /**< send / put / get / ack */
    mca_bmi_base_tag_t tag;                  /**< match tag for the upper layer */
    size_t size;                             /**< payload size in bytes */
    int rc;                                  /**< completion status */
    bool frag_ack_pending;                   /**< true while an ack is outstanding */
};
typedef struct mca_bmi_ib_frag_t mca_bmi_ib_frag_t;
/*
 * Allocate an IB send descriptor from the frags1 free list.
 * Blocks (OMPI_FREE_LIST_WAIT) until a fragment is available;
 * rc receives the free-list status.
 *
 * NOTE(review): `mca_bmi_ib_module` is used here as a global variable
 * name while elsewhere in this component it is the module *type*
 * (mca_bmi_ib_module_t); verify the intended free-list owner.
 */
#define MCA_BMI_IB_FRAG_ALLOC1(frag, rc) \
{ \
 \
    ompi_list_item_t *item; \
    OMPI_FREE_LIST_WAIT(&mca_bmi_ib_module.ib_frags1, item, rc); \
    frag = (mca_bmi_ib_frag_t*) item; \
}
/*
 * Return a fragment to the frags1 free list.
 *
 * Bug fix: mca_bmi_ib_frag_t has no `super` member (its first member
 * is `base`), so `&frag->super` did not compile.  Cast the fragment
 * pointer itself to the list-item type, mirroring the inverse cast in
 * MCA_BMI_IB_FRAG_ALLOC1.
 */
#define MCA_BMI_IB_FRAG_RETURN1(frag) \
{ \
    OMPI_FREE_LIST_RETURN(&mca_bmi_ib_module.ib_frags1, \
        (ompi_list_item_t*)(frag)); \
}
/*
 * Register (pin) the memory backing every pre-allocated send fragment
 * on the module's frags1 free list, and pre-build its send descriptor.
 *
 * Bug fix: the body used `ib_send_frag`, which was never declared --
 * the original declared an unused `mca_bmi_ib_frag_t *ib_frag`
 * instead.  Declare the variable that is actually used.
 *
 * NOTE(review): defining a non-inline function in a header risks
 * multiple-definition link errors if included from more than one
 * translation unit; consider moving this body to bmi_ib_sendfrag.c
 * (a matching prototype already exists below).
 *
 * @param ib_bmi  module whose send fragments are registered
 * @return OMPI_SUCCESS, or OMPI_ERROR if any registration fails
 */
int mca_bmi_ib_send_frag_register(mca_bmi_ib_module_t *ib_bmi)
{
    int i, rc, num_send_frags;
    ompi_list_item_t *item;
    ompi_free_list_t *flist = &ib_bmi->ib_frags1;
    ib_buffer_t *ib_buf_ptr;
    mca_bmi_ib_send_frag_t *ib_send_frag;
    num_send_frags = ompi_list_get_size(&(flist->super));
    item = ompi_list_get_first(&((flist)->super));
    /* Register the buffers */
    for(i = 0; i < num_send_frags;
            item = ompi_list_get_next(item), i++) {
        ib_send_frag = (mca_bmi_ib_send_frag_t *) item;
        ib_send_frag->frag_progressed = 0;
        ib_buf_ptr = (ib_buffer_t *) &ib_send_frag->ib_buf;
        rc = mca_bmi_ib_register_mem(ib_bmi->nic, ib_bmi->ptag,
                (void*) ib_buf_ptr->buf,
                MCA_BMI_IB_FIRST_FRAG_SIZE,
                &ib_buf_ptr->hndl);
        if(rc != OMPI_SUCCESS) {
            return OMPI_ERROR;
        }
        IB_PREPARE_SEND_DESC(ib_buf_ptr, 0,
                MCA_BMI_IB_FIRST_FRAG_SIZE, ib_buf_ptr);
    }
    return OMPI_SUCCESS;
}
struct mca_bmi_ib_module_t;
mca_bmi_ib_send_frag_t* mca_bmi_ib_alloc_send_frag(
struct mca_bmi_ib_module_t* ib_bmi,
mca_bmi_base_send_request_t* request);
int mca_bmi_ib_send_frag_register(struct mca_bmi_ib_module_t *bmi);
void mca_bmi_ib_send_frag_send_complete(struct mca_bmi_ib_module_t *bmi, mca_bmi_ib_send_frag_t*);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

311
src/mca/bmi/ib/bmi_ib_memory.c Обычный файл
Просмотреть файл

@ -0,0 +1,311 @@
/* Standard system includes */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Open MPI includes */
/* Other IB BMI includes */
#include "bmi_ib.h"
#include "bmi_ib_memory.h"
#include "bmi_ib_priv.h"
static void mca_bmi_ib_mem_registry_construct(ompi_object_t *object);
static void mca_bmi_ib_mem_registry_destruct(ompi_object_t *object);
static void mca_bmi_ib_mem_registry_info_construct(ompi_object_t *object);
static void mca_bmi_ib_mem_registry_info_destruct(ompi_object_t *object);
static int mca_bmi_ib_mem_registry_info_compare(void *key1, void *key2);
static int mca_bmi_ib_mem_registry_real_deregister(
mca_bmi_ib_mem_registry_t *registry,
mca_bmi_ib_mem_registry_info_t *info);
OBJ_CLASS_INSTANCE(mca_bmi_ib_mem_registry_info_t, ompi_list_item_t,
mca_bmi_ib_mem_registry_info_construct, mca_bmi_ib_mem_registry_info_destruct);
OBJ_CLASS_INSTANCE(mca_bmi_ib_mem_registry_t, ompi_rb_tree_t, mca_bmi_ib_mem_registry_construct,
mca_bmi_ib_mem_registry_destruct);
/*
 * Construct a memory registry: initialize the red/black tree of
 * registered regions, the free list of info records, and the hints
 * (lookup cache) array.
 *
 * Bug fix: the original re-assigned hints_log_size from the component
 * parameter a second time *after* the sanity clamp below, undoing the
 * lower bound (and making the subsequent shift use the unclamped,
 * possibly non-positive, value).  The redundant assignment is removed.
 */
static void mca_bmi_ib_mem_registry_construct(ompi_object_t *object)
{
    mca_bmi_ib_mem_registry_t *registry = (mca_bmi_ib_mem_registry_t *)object;
    int i;
    ompi_rb_tree_init(&(registry->rb_tree), mca_bmi_ib_mem_registry_info_compare);
    OBJ_CONSTRUCT(&(registry->info_free_list), ompi_free_list_t);
    ompi_free_list_init(&registry->info_free_list, sizeof(mca_bmi_ib_mem_registry_info_t),
        OBJ_CLASS(mca_bmi_ib_mem_registry_info_t), 32, -1, 32, NULL);
    registry->hints_log_size = mca_bmi_ib_component.ib_mem_registry_hints_log_size;
    /* sanity check -- enforce lower bound for hash calculation */
    if (registry->hints_log_size < 1) {
        registry->hints_log_size = 1;
    }
    registry->hints = (ompi_ptr_t *)malloc((1 << registry->hints_log_size) *
        sizeof(ompi_ptr_t));
    /* a failed malloc leaves hints_size 0, which disables the cache */
    registry->hints_size = (registry->hints) ? (1 << registry->hints_log_size) : 0;
    for (i = 0; i < registry->hints_size; i++) {
        registry->hints[i].pval = (void *)NULL;
    }
    registry->ib_bmi = NULL;
    registry->evictable = NULL;
    return;
}
/*
 * Destruct a memory registry: release the info free list and the
 * hints cache.
 */
static void mca_bmi_ib_mem_registry_destruct(ompi_object_t *object)
{
    /* memory regions that are being tracked are not deregistered here */
    mca_bmi_ib_mem_registry_t *registry = (mca_bmi_ib_mem_registry_t *)object;
    OBJ_DESTRUCT(&(registry->info_free_list));
    if (registry->hints_size != 0) {
        free(registry->hints);
        registry->hints = (ompi_ptr_t *)NULL;
        registry->hints_size = 0;
    }
    return;
}
/*
 * Construct an info record: no region, no references, invalid VAPI
 * handle, zeroed request/reply descriptors.
 */
static void mca_bmi_ib_mem_registry_info_construct(ompi_object_t *object)
{
    mca_bmi_ib_mem_registry_info_t *info = (mca_bmi_ib_mem_registry_info_t *)object;
    info->next = NULL;          /* not on the evictable list */
    info->ref_cnt = 0;
    info->hndl = VAPI_INVAL_HNDL;
    memset(&(info->request), 0, sizeof(VAPI_mr_t));
    memset(&(info->reply), 0, sizeof(VAPI_mr_t));
    return;
}
/*
 * Destruct an info record -- nothing to release; VAPI deregistration
 * happens in mca_bmi_ib_mem_registry_real_deregister().
 */
static void mca_bmi_ib_mem_registry_info_destruct(ompi_object_t *object)
{
    return;
}
static int mca_bmi_ib_mem_registry_info_compare(void *request, void *treenode)
{
int result;
VAPI_mr_t *mr1 = (VAPI_mr_t *)request;
VAPI_mr_t *mr2 = (VAPI_mr_t *)treenode;
uint64_t start1 = mr1->start;
uint64_t start2 = mr2->start;
uint64_t end1 = start1 + mr1->size;
uint64_t end2 = start2 + mr2->size;
if (end1 < start2) {
/* non-overlapping mr1 < mr2 */
result = -1;
}
else if (start1 > end2) {
/* non-overlapping mr1 > mr2 */
result = 1;
}
else if ((end1 <= end2) && (start1 >= start2)) {
/* completely overlapping mr1 and mr2 (mr2 may be bigger) */
if ((mr1->acl & mr2->acl) == mr1->acl) {
/* minimum access permissions met */
result = 0;
}
else {
/* oops -- access permissions not good enough */
result = 1;
}
}
else if (start1 < start2) {
/* partially overlapping mr1 < mr2 */
result = -1;
}
else {
/* partially overlapping mr1 > mr2 */
result = 1;
}
return result;
}
/*
 * Remove `info` from the registry's singly linked evictable list (it
 * is being reused, so it must no longer be a candidate for eviction).
 *
 * Bug fix: when `info` was the list head the original set
 * registry->evictable = NULL, orphaning any entries after it; it now
 * correctly promotes info->next to the new head.
 */
void mca_bmi_ib_mem_registry_clean_evictables(
    mca_bmi_ib_mem_registry_t *registry,
    mca_bmi_ib_mem_registry_info_t *info)
{
    mca_bmi_ib_mem_registry_info_t *tmp = registry->evictable;
    mca_bmi_ib_mem_registry_info_t *prev = NULL;
    while (NULL != tmp) {
        if (tmp == info) {
            if (NULL == prev) {
                /* removing the head: rest of the list becomes the list */
                registry->evictable = tmp->next;
            }
            else {
                /* unlink this entry from the middle/end of the list */
                prev->next = tmp->next;
            }
            /* clear this entry's evictable link */
            tmp->next = NULL;
            break;
        }
        prev = tmp;
        tmp = tmp->next;
    }
    return;
}
/*
 * Register a memory region, reusing an existing registration when one
 * covering the requested range/permissions is already tracked.
 *
 * @return the info record (ref count incremented), or NULL on failure.
 *
 * NOTE(review): if VAPI_register_mr keeps returning VAPI_EAGAIN and
 * the evictable list is empty, the do/while below never terminates
 * (info is never set to NULL on that path) -- confirm and add a retry
 * bound.  Also, the rb-tree insert result is not checked (see the
 * "aargh" comment).
 */
mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_mem_registry_register(
    mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *mr)
{
    mca_bmi_ib_mem_registry_info_t *info = mca_bmi_ib_mem_registry_find(registry, mr);
    mca_bmi_ib_mem_registry_info_t *next_to_evict;
    ompi_list_item_t *item;
    VAPI_ret_t vapi_result;
    int rc;
    if (info == (mca_bmi_ib_mem_registry_info_t *)NULL) {
        /* create new entry and register memory region */
        item = (ompi_list_item_t *)info;
        OMPI_FREE_LIST_GET(&(registry->info_free_list), item, rc);
        info = (mca_bmi_ib_mem_registry_info_t *)item;
        if (OMPI_SUCCESS != rc) {
            /* error - return null pointer */
            return info;
        }
        memcpy(&(info->request),mr,sizeof(VAPI_mr_t));
        info->ref_cnt = 1;
        do {
            vapi_result = VAPI_register_mr(registry->ib_bmi->nic, mr,
                &(info->hndl), &(info->reply));
            if (VAPI_OK != vapi_result) {
                if (VAPI_EAGAIN == vapi_result) {
                    /* evict an unused memory region, if at all possible,
                     * to free HCA resources, then retry */
                    if (NULL != registry->evictable) {
                        next_to_evict = registry->evictable->next;
                        mca_bmi_ib_mem_registry_real_deregister(registry, registry->evictable);
                        registry->evictable = next_to_evict;
                    }
                }
                else {
                    /* fatal error */
                    item = (ompi_list_item_t *)info;
                    OMPI_FREE_LIST_RETURN(&(registry->info_free_list), item);
                    info = NULL;
                    return info;
                }
            }
        } while ((VAPI_OK != vapi_result) && (NULL != info));
        /* insert a reference to this information into the red/black tree */
        rc = ompi_rb_tree_insert(&(registry->rb_tree), &(info->reply), info);
        /* aargh! what do we do if the tree insert fails... */
        mca_bmi_ib_mem_registry_insert_hint(registry, &(info->reply), info);
    }
    else {
        if (0 == info->ref_cnt) {
            /* make sure we're not on the evictable list */
            mca_bmi_ib_mem_registry_clean_evictables(registry, info);
        }
        (info->ref_cnt)++;
    }
    return info;
}
/*
 * Convenience wrapper: build a region descriptor for [addr, addr+len)
 * with local+remote write access on this module's protection domain
 * and register it with the module's registry.
 *
 * @return the registration info record, or NULL on failure.
 */
mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_register_mem_with_registry(
    mca_bmi_ib_module_t *ib_module,
    void *addr, size_t len)
{
    VAPI_mr_t region;

    region.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE;
    region.l_key = 0;                 /* keys are assigned by the HCA */
    region.r_key = 0;
    region.pd_hndl = ib_module->ptag;
    region.size = len;
    region.start = (VAPI_virt_addr_t) (MT_virt_addr_t) addr;
    region.type = VAPI_MR;

    return mca_bmi_ib_mem_registry_register(&(ib_module->mem_registry), &region);
}
/*
 * Convenience wrapper: build the same region descriptor used at
 * registration time for [addr, addr+len) and drop one reference to it
 * in the module's registry.
 *
 * @return OMPI_SUCCESS, or OMPI_ERR_NOT_FOUND if the region is unknown.
 */
int mca_bmi_ib_deregister_mem_with_registry(
    mca_bmi_ib_module_t *ib_module,
    void *addr, size_t len)
{
    VAPI_mr_t region;

    region.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE;
    region.l_key = 0;
    region.r_key = 0;
    region.pd_hndl = ib_module->ptag;
    region.size = len;
    region.start = (VAPI_virt_addr_t) (MT_virt_addr_t) addr;
    region.type = VAPI_MR;

    return mca_bmi_ib_mem_registry_deregister(&(ib_module->mem_registry), &region);
}
/*
 * Actually release a registration: purge it from the hints cache and
 * the red/black tree, deregister the region with VAPI, and return the
 * info record to the free list.
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR if VAPI_deregister_mr failed
 *         (bookkeeping has already been cleaned up in that case).
 */
static int mca_bmi_ib_mem_registry_real_deregister(
    mca_bmi_ib_mem_registry_t *registry,
    mca_bmi_ib_mem_registry_info_t *info)
{
    ompi_list_item_t *item;
    VAPI_ret_t vapi_result;
    int i;
    /* clear hints array of references to this info object */
    for (i = 0; i < registry->hints_size; i++) {
        if (registry->hints[i].pval == info) {
            registry->hints[i].pval = (void *)NULL;
        }
    }
    /* delete the info object from the red/black tree */
    ompi_rb_tree_delete(&(registry->rb_tree), &(info->reply));
    /* do the real deregistration */
    vapi_result = VAPI_deregister_mr(registry->ib_bmi->nic, info->hndl);
    /* return the info object to the free list */
    item = (ompi_list_item_t *)info;
    OMPI_FREE_LIST_RETURN(&(registry->info_free_list), item);
    /* return an error if we could not successfully deregister memory region */
    if (VAPI_OK != vapi_result) {
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/*
 * Drop one reference to the registration covering `mr`.  The region is
 * not deregistered immediately; when the last reference goes away it
 * is pushed onto the evictable list so it can be recycled lazily under
 * registration pressure.
 *
 * @return OMPI_SUCCESS, or OMPI_ERR_NOT_FOUND if no matching region.
 */
int mca_bmi_ib_mem_registry_deregister(
    mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *mr)
{
    mca_bmi_ib_mem_registry_info_t *info =
        mca_bmi_ib_mem_registry_find(registry, mr);

    if (NULL == info) {
        return OMPI_ERR_NOT_FOUND;
    }
    if (info->ref_cnt > 0) {
        (info->ref_cnt)--;
        if (0 == info->ref_cnt) {
            /* last reference: push onto the evictable list head */
            info->next = registry->evictable;
            registry->evictable = info;
        }
    }
    return OMPI_SUCCESS;
}
/*
 * Late-bind the registry to its owning IB module (needed because the
 * OBJ constructor has no way to receive the module pointer).
 *
 * @return OMPI_SUCCESS always
 */
int mca_bmi_ib_mem_registry_init(
    mca_bmi_ib_mem_registry_t *registry,
    struct mca_bmi_ib_module_t *ib_bmi)
{
    registry->ib_bmi = ib_bmi;
    return OMPI_SUCCESS;
}

157
src/mca/bmi/ib/bmi_ib_memory.h Обычный файл
Просмотреть файл

@ -0,0 +1,157 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
/**
* @file
*/
#ifndef MCA_PTL_IB_MEMORY_H
#define MCA_PTL_IB_MEMORY_H
/* Standard system includes */
#include <sys/types.h>
/* Open MPI includes */
#include "include/types.h"
#include "include/constants.h"
#include "class/ompi_object.h"
#include "class/ompi_list.h"
#include "class/ompi_rb_tree.h"
#include "class/ompi_free_list.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* vapi.h is not a C++ safe header file */
#include <vapi.h>
#include <vapi_common.h>
struct mca_bmi_ib_module_t;
/** Bookkeeping record for one VAPI memory registration. */
typedef struct mca_bmi_ib_mem_registry_info_t mca_bmi_ib_mem_registry_info_t;
struct mca_bmi_ib_mem_registry_info_t {
    ompi_list_item_t super;               /**< free-list linkage */
    mca_bmi_ib_mem_registry_info_t *next; /**< link on the registry's evictable list */
    int ref_cnt;                          /**< outstanding users of this registration */
    VAPI_mr_hndl_t hndl;                  /**< VAPI memory-region handle */
    VAPI_mr_t request;                    /**< region as requested by the caller */
    VAPI_mr_t reply;                      /**< region as actually registered by the HCA */
};
OBJ_CLASS_DECLARATION(mca_bmi_ib_mem_registry_info_t);
/** Registry of pinned memory regions for one IB module. */
typedef struct mca_bmi_ib_mem_registry_t mca_bmi_ib_mem_registry_t;
struct mca_bmi_ib_mem_registry_t {
    ompi_rb_tree_t rb_tree;                   /**< regions ordered by address/overlap */
    ompi_free_list_t info_free_list;          /**< pool of info records */
    ompi_ptr_t *hints;                        /**< O(1) lookup cache indexed by address hash */
    mca_bmi_ib_mem_registry_info_t *evictable; /**< unreferenced regions, recyclable on demand */
    struct mca_bmi_ib_module_t *ib_bmi;       /**< owning module (set by ..._init) */
    int hints_log_size;                       /**< log2 of the hints array length */
    int hints_size;                           /**< hints length; 0 disables the cache */
};
OBJ_CLASS_DECLARATION(mca_bmi_ib_mem_registry_t);
/*
 * Record `info` in the hints-cache slot derived from the region's
 * start address so later lookups can skip the red/black tree.
 * The hash extracts hints_log_size bits of the address starting at its
 * lowest set bit.
 *
 * Bug fix: for a zero start address the lowest-set-bit divisor is 0,
 * so the original divided by zero; such addresses now use slot 0.
 */
static inline void mca_bmi_ib_mem_registry_insert_hint(
    mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *key,
    mca_bmi_ib_mem_registry_info_t *info)
{
    uint64_t hints_hash = 0, addrll;
    if (registry->hints_size) {
        addrll = (uint64_t)(key->start);
        if (0 == addrll) {
            /* cannot hash a zero address; fall back to slot 0 */
            registry->hints[0].pval = info;
            return;
        }
        /* calculate hash index for hints array - hash is (hints_log_size - 1) bits of key
         * from first non-zero least significant bit
         */
        hints_hash = addrll & (-addrll);
        hints_hash = (((hints_hash << registry->hints_log_size) - hints_hash) & addrll) /
            hints_hash;
        registry->hints[hints_hash].pval = info;
    }
    return;
}
/* find information on a registered memory region for a given address,
 * region size, and access permissions
 *
 * Checks the O(1) hints cache first; on a miss falls back to the
 * red/black tree, and a successful tree lookup refreshes the cache.
 *
 * Bug fix: for a zero start address the lowest-set-bit divisor in the
 * hash is 0, so the original divided by zero; such addresses now map
 * to slot 0 instead.
 */
static inline mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_mem_registry_find(
    mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *key)
{
    mca_bmi_ib_mem_registry_info_t *info = (mca_bmi_ib_mem_registry_info_t *)NULL;
    uint64_t hints_hash = 0, addrll = 0;
    if (registry->hints_size) {
        addrll = (uint64_t)(key->start);
        if (0 != addrll) {
            /* calculate hash index for hints array - hash is (hints_log_size - 1) bits of key
             * from first non-zero least significant bit
             */
            hints_hash = addrll & (-addrll);
            hints_hash = (((hints_hash << registry->hints_log_size) - hints_hash) & addrll) /
                hints_hash;
        }
        if ((info = registry->hints[hints_hash].pval) != (void *)NULL) {
            /* cache hit only counts if the cached region covers the
             * request with sufficient permissions */
            if ((info->reply.start <= key->start) &&
                ((info->reply.start + info->reply.size) >= (key->start + key->size)) &&
                ((info->reply.acl & key->acl) == key->acl)) {
                return info;
            }
        }
    }
    /* search the red/black tree */
    info = ompi_rb_tree_find(&(registry->rb_tree), key);
    /* store a pointer to this info in the hints array for later lookups */
    if ((info != NULL) && registry->hints_size) {
        registry->hints[hints_hash].pval = info;
    }
    return info;
}
/* prototypes */
mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_mem_registry_register(
mca_bmi_ib_mem_registry_t *registry,
VAPI_mr_t *mr);
mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_register_mem_with_registry(
struct mca_bmi_ib_module_t *ib_bmi,
void *addr, size_t len);
int mca_bmi_ib_deregister_mem_with_registry(
struct mca_bmi_ib_module_t *ib_bmi,
void *addr, size_t len);
int mca_bmi_ib_mem_registry_deregister(
mca_bmi_ib_mem_registry_t *registry,
VAPI_mr_t *mr);
int mca_bmi_ib_mem_registry_init(
mca_bmi_ib_mem_registry_t* registry,
struct mca_bmi_ib_module_t *ib_bmi);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

534
src/mca/bmi/ib/bmi_ib_peer.c Обычный файл
Просмотреть файл

@ -0,0 +1,534 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <sys/time.h>
#include <time.h>
#include "include/types.h"
#include "mca/pml/base/pml_base_sendreq.h"
#include "mca/ns/base/base.h"
#include "mca/oob/base/base.h"
#include "mca/rml/rml.h"
#include "mca/errmgr/errmgr.h"
#include "dps/dps.h"
#include "bmi_ib.h"
#include "bmi_ib_addr.h"
#include "bmi_ib_peer.h"
#include "bmi_ib_proc.h"
#include "bmi_ib_priv.h"
#include "bmi_ib_sendfrag.h"
static void mca_bmi_ib_peer_construct(mca_bmi_base_endpoint_t* peer);
static void mca_bmi_ib_peer_destruct(mca_bmi_base_endpoint_t* peer);
OBJ_CLASS_INSTANCE(mca_bmi_ib_endpoint_t,
ompi_list_item_t, mca_bmi_ib_peer_construct,
mca_bmi_ib_peer_destruct);
/*
 * Initialize state of the peer instance: no connection, no retries,
 * empty pending-send queue.
 *
 * Bug fix: the initial state used the stale ptl-era constant
 * MCA_PTL_IB_CLOSED; the state enum in this component is
 * MCA_BMI_IB_* (see bmi_ib_peer.h / bmi_ib_endpoint.h).
 */
static void mca_bmi_ib_peer_construct(mca_bmi_base_endpoint_t* peer)
{
    peer->peer_bmi = 0;
    peer->peer_proc = 0;
    peer->peer_tstamp = 0.0;
    peer->peer_state = MCA_BMI_IB_CLOSED;
    peer->peer_retries = 0;
    OBJ_CONSTRUCT(&peer->peer_send_lock, ompi_mutex_t);
    OBJ_CONSTRUCT(&peer->peer_recv_lock, ompi_mutex_t);
    OBJ_CONSTRUCT(&peer->pending_send_frags, ompi_list_t);
}
/*
 * Destroy a peer.
 *
 * Bug fix: the members OBJ_CONSTRUCTed in mca_bmi_ib_peer_construct
 * (two mutexes and the pending-send list) were never destructed,
 * leaking their internal resources.
 */
static void mca_bmi_ib_peer_destruct(mca_bmi_base_endpoint_t* peer)
{
    OBJ_DESTRUCT(&peer->pending_send_frags);
    OBJ_DESTRUCT(&peer->peer_send_lock);
    OBJ_DESTRUCT(&peer->peer_recv_lock);
}
/*
 * Completion callback for the non-blocking OOB sends used during
 * connection setup: the packed buffer is no longer needed, release it.
 */
static void mca_bmi_ib_peer_send_cb(
    int status,
    orte_process_name_t* peer,
    orte_buffer_t* buffer,
    orte_rml_tag_t tag,
    void* cbdata)
{
    /* status is intentionally ignored; there is no retry path here */
    OBJ_RELEASE(buffer);
}
/*
 * Send our connection information (local QP number and LID) to the
 * remote peer over the OOB/RML channel.  The completion callback
 * releases the buffer after a successful non-blocking send.
 *
 * Bug fixes: the result of the second pack (the LID) was ignored, and
 * the buffer leaked on pack-failure paths.
 *
 * @return OMPI_SUCCESS or an ORTE error code
 */
static int mca_bmi_ib_peer_send_connect_req(mca_bmi_base_endpoint_t* peer)
{
    orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
    int rc;
    if(NULL == buffer) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    /* pack the info in the send buffer */
    rc = orte_dps.pack(buffer, &peer->lcl_qp_prop.qp_num, 1, ORTE_UINT32);
    if(rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buffer);
        return rc;
    }
    rc = orte_dps.pack(buffer, &peer->peer_bmi->port.lid, 1, ORTE_UINT32);
    if(rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buffer);
        return rc;
    }
    /* send to peer; on success the callback owns (and releases) the buffer */
    rc = orte_rml.send_buffer_nb(&peer->peer_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
        mca_bmi_ib_peer_send_cb, NULL);
    if(rc < 0) {
        /* NOTE(review): unclear whether the callback still fires on a
         * failed post -- confirm whether the buffer must be released here */
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    return OMPI_SUCCESS;
}
/*
 * Send connect ACK to remote peer: a message of two zero UINT32s,
 * distinguishing an ack from a real QP-number/LID exchange.
 *
 * Bug fixes: the function could fall off the end of a non-void
 * function (undefined return value) and never checked the buffer
 * allocation.
 *
 * @return OMPI_SUCCESS or an ORTE error code
 */
static int mca_bmi_ib_peer_send_connect_ack(mca_bmi_base_endpoint_t* peer)
{
    orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
    int rc;
    uint32_t zero = 0;
    if(NULL == buffer) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    /* pack the info in the send buffer */
    if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buffer);
        return rc;
    }
    if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buffer);
        return rc;
    }
    /* send to peer; the completion callback releases the buffer */
    rc = orte_rml.send_buffer_nb(&peer->peer_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
        mca_bmi_ib_peer_send_cb, NULL);
    if(rc < 0) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    return OMPI_SUCCESS;
}
/*
 * Set remote connection info: unpack the remote QP number and LID
 * from the OOB buffer into the endpoint.
 *
 * XXX: Currently size is unutilized, this shall change
 * as soon as we add more info to be exchanged at connection
 * setup.
 *
 * @return ORTE_SUCCESS or the failing unpack's error code
 */
static int mca_bmi_ib_peer_set_remote_info(mca_bmi_base_endpoint_t* peer, orte_buffer_t* buffer)
{
    int rc;
    size_t cnt = 1;   /* in: max items to unpack; out: items unpacked */
    rc = orte_dps.unpack(buffer, &peer->rem_qp_num, &cnt, ORTE_UINT32);
    if(ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    rc = orte_dps.unpack(buffer, &peer->rem_lid, &cnt, ORTE_UINT32);
    if(ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    D_PRINT("Received QP num = %d, LID = %d",
            peer->rem_qp_num,
            peer->rem_lid);
    return ORTE_SUCCESS;
}
/* Per-endpoint initialization hook -- currently a no-op placeholder. */
static int mca_bmi_ib_peer_init(
    mca_bmi_ib_endpoint_t *peer)
{
    return OMPI_SUCCESS;
}
/*
 * Start to connect to the peer. We send our Queue Pair
 * information over the TCP OOB communication mechanism.
 * On completion of our send, a send completion handler
 * is called.
 *
 * Side effect: moves the endpoint to MCA_BMI_IB_CONNECTING.
 * @return OMPI_SUCCESS or the failing step's error code
 */
static int mca_bmi_ib_peer_start_connect(mca_bmi_base_endpoint_t* peer)
{
    mca_bmi_ib_module_t* ib_bmi = peer->peer_bmi;
    int rc;
    /* Create the Queue Pair (send and recv share one CQ) */
    if(OMPI_SUCCESS != (rc = mca_bmi_ib_create_qp(ib_bmi->nic,
                    ib_bmi->ptag,
                    ib_bmi->cq_hndl,
                    ib_bmi->cq_hndl,
                    &peer->lcl_qp_hndl,
                    &peer->lcl_qp_prop,
                    VAPI_TS_RC))) {
        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
        return rc;
    }
    /* Send connection info over to remote peer */
    peer->peer_state = MCA_BMI_IB_CONNECTING;
    if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_send_connect_req(peer))) {
        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
        return rc;
    }
    return OMPI_SUCCESS;
}
/*
 * Reply to a `start - connect' message: create our QP, absorb the
 * remote side's QP/LID, bring the RC connection up, and send our own
 * connection info back.
 *
 * Bug fix: the result of mca_bmi_ib_peer_set_remote_info() was
 * ignored; a failed unpack now aborts the handshake.
 *
 * @return OMPI_SUCCESS or the failing step's error code
 */
static int mca_bmi_ib_peer_reply_start_connect(mca_bmi_ib_endpoint_t *peer, orte_buffer_t* buffer)
{
    mca_bmi_ib_module_t* ib_bmi = peer->peer_bmi;
    int rc;
    /* Create the Queue Pair */
    if(OMPI_SUCCESS != (rc = mca_bmi_ib_create_qp(ib_bmi->nic,
                    ib_bmi->ptag,
                    ib_bmi->cq_hndl,
                    ib_bmi->cq_hndl,
                    &peer->lcl_qp_hndl,
                    &peer->lcl_qp_prop,
                    VAPI_TS_RC))) {
        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
        return rc;
    }
    /* Set the remote side info */
    rc = mca_bmi_ib_peer_set_remote_info(peer, buffer);
    if(rc != ORTE_SUCCESS) {
        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
        return rc;
    }
    /* Connect to peer */
    rc = mca_bmi_ib_peer_connect(peer);
    if(rc != OMPI_SUCCESS) {
        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
        return rc;
    }
    /* Send connection info over to remote peer */
    if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_send_connect_req(peer))) {
        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
        return rc;
    }
    return OMPI_SUCCESS;
}
/*
 * Mark the endpoint fully connected and flush any sends that were
 * queued while the connection was being established.
 */
static void mca_bmi_ib_peer_connected(mca_bmi_ib_endpoint_t *peer)
{
    peer->peer_state = MCA_BMI_IB_CONNECTED;
    mca_bmi_ib_progress_send_frags(peer);
}
/*
 * Non blocking OOB recv callback.
 * Read incoming QP and other info, and if this peer
 * is trying to connect, reply with our QP info,
 * otherwise try to modify QP's and establish
 * reliable connection
 *
 * Drives the connection state machine:
 *   CLOSED      -> reply with our QP info, go to CONNECT_ACK
 *   CONNECTING  -> absorb remote info, bring QP up, ack, CONNECTED
 *   CONNECT_ACK -> remote acked, CONNECTED
 */
static void mca_bmi_ib_peer_recv(
    int status,
    orte_process_name_t* peer,
    orte_buffer_t* buffer,
    orte_rml_tag_t tag,
    void* cbdata)
{
    mca_bmi_ib_proc_t *ib_proc;
    mca_bmi_ib_endpoint_t *ib_peer;
    int peer_state;
    int rc;
    /* linear search of known procs for the sender's vpid */
    for(ib_proc = (mca_bmi_ib_proc_t*)
            ompi_list_get_first(&mca_bmi_ib_component.ib_procs);
            ib_proc != (mca_bmi_ib_proc_t*)
            ompi_list_get_end(&mca_bmi_ib_component.ib_procs);
            ib_proc = (mca_bmi_ib_proc_t*)ompi_list_get_next(ib_proc)) {
        if(ib_proc->proc_guid.vpid == peer->vpid) {
            /* Try to get the peer instance of this proc */
            /* Limitation: Right now, we have only 1 peer
             * for every process. Need several changes, some
             * in PML/BMI interface to set this right */
            ib_peer = ib_proc->proc_peers[0];
            peer_state = ib_peer->peer_state;
            /* Update status */
            switch(peer_state) {
                case MCA_BMI_IB_CLOSED :
                    /* We had this connection closed before.
                     * The peer is trying to connect. Move the
                     * status of this connection to CONNECTING,
                     * and then reply with our QP information */
                    if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_reply_start_connect(ib_peer, buffer))) {
                        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
                            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
                        break;
                    }
                    /* Setup state as connected */
                    ib_peer->peer_state = MCA_BMI_IB_CONNECT_ACK;
                    break;
                case MCA_BMI_IB_CONNECTING :
                    /* NOTE(review): return value ignored -- a failed
                     * unpack still falls through to peer_connect */
                    mca_bmi_ib_peer_set_remote_info(ib_peer, buffer);
                    if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_connect(ib_peer))) {
                        ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
                            ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
                        break;
                    }
                    /* Setup state as connected */
                    mca_bmi_ib_peer_connected(ib_peer);
                    /* Send him an ack */
                    mca_bmi_ib_peer_send_connect_ack(ib_peer);
                    break;
                case MCA_BMI_IB_CONNECT_ACK:
                    mca_bmi_ib_peer_connected(ib_peer);
                    break;
                case MCA_BMI_IB_CONNECTED :
                    break;
                default :
                    ompi_output(0, "Connected -> Connecting not possible.\n");
            }
            break;
        }
    }
    /* Okay, now that we are done receiving,
     * re-post the buffer */
    mca_bmi_ib_post_recv();
}
/*
 * (Re)post the persistent non-blocking OOB receive used for dynamic
 * connection setup; each received message is handled (and the receive
 * re-posted) by mca_bmi_ib_peer_recv.
 */
void mca_bmi_ib_post_recv()
{
    D_PRINT("");
    /* NOTE(review): the return code of recv_buffer_nb is ignored --
     * a failed post would silently stop connection setup */
    orte_rml.recv_buffer_nb(
        ORTE_RML_NAME_ANY,
        ORTE_RML_TAG_DYNAMIC-1,
        0,
        mca_bmi_ib_peer_recv,
        NULL);
}
/*
 * Attempt to send a fragment using a given peer. If the peer is not
 * connected, queue the fragment and start the connection as required.
 *
 * @return OMPI_SUCCESS (sent or queued), OMPI_ERR_UNREACH for a
 *         failed/unknown peer, or the post/connect error code.
 */
int mca_bmi_ib_peer_send(mca_bmi_base_endpoint_t* peer,
        mca_bmi_ib_send_frag_t* frag)
{
    int rc;
    OMPI_THREAD_LOCK(&peer->peer_send_lock);
    switch(peer->peer_state) {
        case MCA_BMI_IB_CONNECTING:
            D_PRINT("Queing because state is connecting");
            ompi_list_append(&peer->pending_send_frags,
                    (ompi_list_item_t *)frag);
            rc = OMPI_SUCCESS;
            break;
        case MCA_BMI_IB_CONNECT_ACK:
            D_PRINT("Queuing because waiting for ack");
            ompi_list_append(&peer->pending_send_frags,
                    (ompi_list_item_t *)frag);
            rc = OMPI_SUCCESS;
            break;
        case MCA_BMI_IB_CLOSED:
            /* queue first, then kick off the OOB handshake; the frag
             * is flushed by mca_bmi_ib_progress_send_frags later */
            D_PRINT("Connection to peer closed ... connecting ...");
            ompi_list_append(&peer->pending_send_frags,
                    (ompi_list_item_t *)frag);
            rc = mca_bmi_ib_peer_start_connect(peer);
            break;
        case MCA_BMI_IB_FAILED:
            rc = OMPI_ERR_UNREACH;
            break;
        case MCA_BMI_IB_CONNECTED:
            {
                mca_bmi_ib_module_t* ib_bmi = peer->peer_bmi;
                ompi_list_item_t* item;
                A_PRINT("Send to : %d, len : %d, frag : %p",
                        peer->peer_proc->proc_guid.vpid,
                        frag->ib_buf.desc.sg_entry.len,
                        frag);
                rc = mca_bmi_ib_post_send(peer->peer_bmi, peer,
                        &frag->ib_buf, (void*) frag);
                /* flush any receive buffers batched for reposting */
                while(NULL != (item = ompi_list_remove_first(&ib_bmi->repost))) {
                    mca_bmi_ib_buffer_repost(ib_bmi->nic, item);
                }
                break;
            }
        default:
            rc = OMPI_ERR_UNREACH;
    }
    OMPI_THREAD_UNLOCK(&peer->peer_send_lock);
    return rc;
}
/*
 * Flush every send fragment queued on the endpoint while its
 * connection was being established.  A no-op unless the endpoint has
 * reached the CONNECTED state.
 */
void mca_bmi_ib_progress_send_frags(mca_bmi_ib_endpoint_t* peer)
{
    ompi_list_item_t *item;
    mca_bmi_ib_send_frag_t *frag;

    /* nothing can be posted until the RC connection is up */
    if (MCA_BMI_IB_CONNECTED != peer->peer_state) {
        return;
    }
    /* drain the pending queue, posting each fragment */
    while (!ompi_list_is_empty(&(peer->pending_send_frags))) {
        item = ompi_list_remove_first(&(peer->pending_send_frags));
        frag = (mca_bmi_ib_send_frag_t *) item;
        if (OMPI_SUCCESS != mca_bmi_ib_post_send(peer->peer_bmi, peer,
                &frag->ib_buf, (void *) frag)) {
            ompi_output(0, "Error in posting send");
        }
    }
}
/*
 * Complete connection to peer: transition the QP to a reliable
 * connection using the remote QP number/LID, then allocate, register,
 * and post the per-connection receive buffers.
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE, or OMPI_ERROR
 */
int mca_bmi_ib_peer_connect(
    mca_bmi_ib_endpoint_t *peer)
{
    int rc, i;
    VAPI_ret_t ret;
    ib_buffer_t *ib_buf_ptr;
    mca_bmi_ib_module_t *ib_bmi = peer->peer_bmi;
    /* Establish Reliable Connection */
    rc = mca_bmi_ib_qp_init(ib_bmi->nic,
            peer->lcl_qp_hndl,
            peer->rem_qp_num,
            peer->rem_lid);
    if(rc != OMPI_SUCCESS) {
        return rc;
    }
    /* Allocate resources to this connection */
    peer->lcl_recv = (ib_buffer_t*)
        malloc(sizeof(ib_buffer_t) * NUM_IB_RECV_BUF);
    if(NULL == peer->lcl_recv) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* Register the buffers */
    for(i = 0; i < NUM_IB_RECV_BUF; i++) {
        rc = mca_bmi_ib_register_mem(ib_bmi->nic, ib_bmi->ptag,
                (void*) peer->lcl_recv[i].buf,
                MCA_BMI_IB_FIRST_FRAG_SIZE,
                &peer->lcl_recv[i].hndl);
        if(rc != OMPI_SUCCESS) {
            /* NOTE(review): buffers registered in earlier iterations
             * are not unregistered on this error path */
            return OMPI_ERROR;
        }
        ib_buf_ptr = &peer->lcl_recv[i];
        ib_buf_ptr->qp_hndl = peer->lcl_qp_hndl;
        IB_PREPARE_RECV_DESC(ib_buf_ptr);
    }
    /* Post receives */
    for(i = 0; i < NUM_IB_RECV_BUF; i++) {
        ret = VAPI_post_rr(ib_bmi->nic,
                peer->lcl_qp_hndl,
                &peer->lcl_recv[i].desc.rr);
        if(VAPI_OK != ret) {
            /* NOTE(review): a failed post is only logged; the function
             * still returns success -- confirm this is intentional */
            MCA_BMI_IB_VAPI_RET(ret, "VAPI_post_rr");
        }
    }
    return OMPI_SUCCESS;
}

84
src/mca/bmi/ib/bmi_ib_peer.h Обычный файл
Просмотреть файл

@ -0,0 +1,84 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_PEER_H
#define MCA_BMI_IB_PEER_H
#include "class/ompi_list.h"
#include "event/event.h"
#include "mca/pml/pml.h"
#include "mca/bmi/bmi.h"
#include "bmi_ib_recvfrag.h"
#include "bmi_ib_sendfrag.h"
#include "bmi_ib_priv.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_bmi_ib_endpoint_t);
/**
* State of IB peer connection.
*/
/* Connection state machine for an IB peer endpoint. */
typedef enum {
    /* Defines the state in which this BMI instance
     * has started the process of connection */
    MCA_BMI_IB_CONNECTING,

    /* Waiting for ack from peer */
    MCA_BMI_IB_CONNECT_ACK,

    /* Connected ... both sender & receiver have
     * buffers associated with this connection */
    MCA_BMI_IB_CONNECTED,

    /* Connection is closed, there are no resources
     * associated with this */
    MCA_BMI_IB_CLOSED,

    /* Maximum number of retries have been used.
     * Report failure on send to upper layer */
    MCA_BMI_IB_FAILED
} mca_bmi_ib_peer_state_t;
/* Queue (or immediately post) a send fragment to the peer. */
int mca_bmi_ib_peer_send(mca_bmi_base_endpoint_t*, mca_bmi_ib_send_frag_t*);
/* Bring up the RC connection and pre-post receive buffers. */
int mca_bmi_ib_peer_connect(mca_bmi_base_endpoint_t*);
void mca_bmi_ib_post_recv(void);
/* Post sends queued while the connection was being established. */
void mca_bmi_ib_progress_send_frags(mca_bmi_ib_endpoint_t*);

/*
 * Debugging helper: dump the connection state of peer_ptr.
 * Fix: the original body mixed 'peer' and 'peer_ptr', so the macro
 * only compiled when the argument happened to be named 'peer'; it is
 * also now wrapped in do/while(0) so it behaves as one statement.
 */
#define DUMP_PEER(peer_ptr) do { \
    ompi_output(0, "[%s:%d] ", __FILE__, __LINE__); \
    ompi_output(0, "Dumping peer %d state", \
            (peer_ptr)->peer_proc->proc_guid.vpid); \
    ompi_output(0, "Local QP hndl : %d", \
            (peer_ptr)->peer_conn->lres->qp_hndl); \
    ompi_output(0, "Local QP num : %d", \
            (peer_ptr)->peer_conn->lres->qp_prop.qp_num); \
    ompi_output(0, "Remote QP num : %d", \
            (peer_ptr)->peer_conn->rres->qp_num); \
    ompi_output(0, "Remote LID : %d", \
            (peer_ptr)->peer_conn->rres->lid); \
} while(0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

474
src/mca/bmi/ib/bmi_ib_priv.c Обычный файл
Просмотреть файл

@ -0,0 +1,474 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "bmi_ib_vapi.h"
#include "bmi_ib_priv.h"
#include "bmi_ib.h"
#include "bmi_ib_memory.h"
/*
 * Asynchronous event handler to detect unforseen
 * events. Usually, such events are catastrophic.
 * Should have a robust mechanism to handle these
 * events and abort the OMPI application if necessary.
 *
 */
static void async_event_handler(VAPI_hca_hndl_t hca_hndl,
        VAPI_event_record_t * event_p,
        void *priv_data)
{
    switch (event_p->type) {
    /* Benign / informational events: log at debug level only. */
    case VAPI_QP_PATH_MIGRATED:
    case VAPI_EEC_PATH_MIGRATED:
    case VAPI_QP_COMM_ESTABLISHED:
    case VAPI_EEC_COMM_ESTABLISHED:
    case VAPI_SEND_QUEUE_DRAINED:
    case VAPI_PORT_ACTIVE:
        {
            D_PRINT("Got an asynchronous event: %s\n",
                    VAPI_event_record_sym(event_p->type));
            break;
        }
    /* Error events: reported loudly; no recovery is attempted here. */
    case VAPI_CQ_ERROR:
    case VAPI_LOCAL_WQ_INV_REQUEST_ERROR:
    case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR:
    case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR:
    case VAPI_PATH_MIG_REQ_ERROR:
    case VAPI_LOCAL_EEC_CATASTROPHIC_ERROR:
    case VAPI_LOCAL_CATASTROPHIC_ERROR:
    case VAPI_PORT_ERROR:
        {
            ompi_output(0, "Got an asynchronous event: %s (%s)",
                    VAPI_event_record_sym(event_p->type),
                    VAPI_event_syndrome_sym(event_p->
                        syndrome));
            break;
        }
    default:
        ompi_output(0, "Warning!! Got an undefined "
                "asynchronous event\n");
    }
}
/*
 * Open the HCA identified by hca_id and return its handle in
 * *hca_hndl.
 *
 * @return OMPI_SUCCESS on success, OMPI_ERROR if the VAPI open fails.
 */
static int mca_bmi_ib_get_hca_hndl(VAPI_hca_id_t hca_id,
        VAPI_hca_hndl_t* hca_hndl)
{
    VAPI_ret_t vret = EVAPI_get_hca_hndl(hca_id, hca_hndl);
    if(VAPI_OK == vret) {
        return OMPI_SUCCESS;
    }
    MCA_BMI_IB_VAPI_RET(vret, "EVAPI_get_hca_hndl");
    return OMPI_ERROR;
}
/*
 * Query the properties (LID etc.) of the HCA's default port into *port.
 */
static int mca_bmi_ib_query_hca_prop(VAPI_hca_hndl_t nic,
        VAPI_hca_port_t* port)
{
    VAPI_ret_t vret = VAPI_query_hca_port_prop(nic,
            (IB_port_t)DEFAULT_PORT, port);
    if(VAPI_OK == vret) {
        return OMPI_SUCCESS;
    }
    MCA_BMI_IB_VAPI_RET(vret, "VAPI_query_hca_port_prop");
    return OMPI_ERROR;
}
/*
 * Allocate a protection domain on the HCA, returned in *ptag.
 */
static int mca_bmi_ib_alloc_pd(VAPI_hca_hndl_t nic,
        VAPI_pd_hndl_t* ptag)
{
    VAPI_ret_t vret = VAPI_alloc_pd(nic, ptag);
    if(VAPI_OK == vret) {
        return OMPI_SUCCESS;
    }
    MCA_BMI_IB_VAPI_RET(vret, "VAPI_alloc_pd");
    return OMPI_ERROR;
}
/*
 * Create a completion queue of DEFAULT_CQ_SIZE entries.  A CQ that
 * comes back with zero actual entries is treated as a failure too.
 */
static int mca_bmi_ib_create_cq(VAPI_hca_hndl_t nic,
        VAPI_cq_hndl_t* cq_hndl)
{
    uint32_t actual_cqe = 0;
    VAPI_ret_t vret = VAPI_create_cq(nic, DEFAULT_CQ_SIZE,
            cq_hndl, &actual_cqe);
    if(VAPI_OK != vret || 0 == actual_cqe) {
        MCA_BMI_IB_VAPI_RET(vret, "VAPI_create_cq");
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/*
 * Install async_event_handler() as the asynchronous event callback for
 * this HCA; the handler handle is returned in *async_handler.
 */
static int mca_bmi_ib_set_async_handler(VAPI_hca_hndl_t nic,
        EVAPI_async_handler_hndl_t *async_handler)
{
    VAPI_ret_t vret = EVAPI_set_async_event_handler(nic,
            async_event_handler, 0, async_handler);
    if(VAPI_OK == vret) {
        return OMPI_SUCCESS;
    }
    MCA_BMI_IB_VAPI_RET(vret, "EVAPI_set_async_event_handler");
    return OMPI_ERROR;
}
/*
 * Create a queue pair on the given HCA / protection domain, attached
 * to the given send and receive completion queues.  Only the Reliable
 * Connection (RC) transport is implemented; anything else returns
 * OMPI_ERR_NOT_IMPLEMENTED.
 */
int mca_bmi_ib_create_qp(VAPI_hca_hndl_t nic,
        VAPI_pd_hndl_t ptag,
        VAPI_cq_hndl_t recv_cq,
        VAPI_cq_hndl_t send_cq,
        VAPI_qp_hndl_t* qp_hndl,
        VAPI_qp_prop_t* qp_prop,
        int transport_type)
{
    VAPI_ret_t ret;
    VAPI_qp_init_attr_t qp_init_attr;
    switch(transport_type) {
    case VAPI_TS_RC: /* Set up RC qp parameters */
        qp_init_attr.cap.max_oust_wr_rq = DEFAULT_WQ_SIZE;
        qp_init_attr.cap.max_oust_wr_sq = DEFAULT_WQ_SIZE;
        qp_init_attr.cap.max_sg_size_rq = DEFAULT_SG_LIST;
        qp_init_attr.cap.max_sg_size_sq = DEFAULT_SG_LIST;
        qp_init_attr.pd_hndl = ptag;
        /* We don't have Reliable Datagram Handle right now */
        qp_init_attr.rdd_hndl = 0;
        /* Set Send and Recv completion queues */
        qp_init_attr.rq_cq_hndl = recv_cq;
        qp_init_attr.sq_cq_hndl = send_cq;
        /* Signal all work requests on this queue pair */
        qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR;
        qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR;
        /* Use Reliable Connection transport service
         * (the original comment incorrectly said Unreliable Datagram) */
        qp_init_attr.ts_type = VAPI_TS_RC;
        break;
    case VAPI_TS_UD: /* Set up UD qp parameters */
    default:
        return OMPI_ERR_NOT_IMPLEMENTED;
    }
    ret = VAPI_create_qp(nic, &qp_init_attr,
            qp_hndl, qp_prop);
    if(VAPI_OK != ret) {
        MCA_BMI_IB_VAPI_RET(ret, "VAPI_create_qp");
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/*
 * One-time initialization of an IB BMI module: open the HCA, allocate
 * a protection domain, query port properties (LID etc.), create a
 * single completion queue shared by sends and receives, install the
 * asynchronous event handler, and construct the memory registration
 * registry.
 *
 * NOTE(review): on a mid-sequence failure the resources acquired by
 * earlier steps (HCA handle, PD, CQ) are not released here -- confirm
 * whether module finalization cleans them up.
 */
int mca_bmi_ib_module_init(mca_bmi_ib_module_t *ib_bmi)
{
    /* Get HCA handle */
    if(mca_bmi_ib_get_hca_hndl(ib_bmi->hca_id, &ib_bmi->nic)
            != OMPI_SUCCESS) {
        return OMPI_ERROR;
    }
    /* Allocate a protection domain for this NIC */
    if(mca_bmi_ib_alloc_pd(ib_bmi->nic, &ib_bmi->ptag)
            != OMPI_SUCCESS) {
        return OMPI_ERROR;
    }
    /* Get the properties of the HCA,
     * LID etc. are part of the properties */
    if(mca_bmi_ib_query_hca_prop(ib_bmi->nic, &ib_bmi->port)
            != OMPI_SUCCESS) {
        return OMPI_ERROR;
    }
    /* Create Completion Q */
    /* We use a single completion Q for sends & recvs
     * This saves us overhead of polling 2 separate Qs */
    if(mca_bmi_ib_create_cq(ib_bmi->nic, &ib_bmi->cq_hndl)
            != OMPI_SUCCESS) {
        return OMPI_ERROR;
    }
    /* Attach asynchronous handler */
    if(mca_bmi_ib_set_async_handler(ib_bmi->nic,
                &ib_bmi->async_handler)
            != OMPI_SUCCESS) {
        return OMPI_ERROR;
    }
    /* initialize memory region registry */
    OBJ_CONSTRUCT(&ib_bmi->mem_registry, mca_bmi_ib_mem_registry_t);
    mca_bmi_ib_mem_registry_init(&ib_bmi->mem_registry, ib_bmi);
    return OMPI_SUCCESS;
}
/*
 * Drive a freshly-created RC queue pair through the required
 * INIT -> RTR -> RTS state transitions, binding it to the remote QP
 * number and LID.  Each transition builds a fresh attribute mask; the
 * attribute/mask pairs must stay in sync, so the statement order here
 * is significant.
 */
int mca_bmi_ib_qp_init(VAPI_hca_hndl_t nic,
        VAPI_qp_hndl_t qp_hndl,
        VAPI_qp_num_t remote_qp,
        IB_lid_t remote_lid)
{
    VAPI_ret_t ret;
    VAPI_qp_attr_t qp_attr;
    VAPI_qp_attr_mask_t qp_attr_mask;
    VAPI_qp_cap_t qp_cap;
    /* Modifying QP to INIT */
    QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
    qp_attr.qp_state = VAPI_INIT;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
    qp_attr.pkey_ix = DEFAULT_PKEY_IX;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX);
    qp_attr.port = DEFAULT_PORT;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PORT);
    /* allow incoming RDMA read/write on this QP */
    qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_REMOTE_ATOMIC_FLAGS);
    ret = VAPI_modify_qp(nic, qp_hndl,
            &qp_attr, &qp_attr_mask, &qp_cap);
    if(VAPI_OK != ret) {
        MCA_BMI_IB_VAPI_RET(ret, "VAPI_modify_qp");
        return OMPI_ERROR;
    }
    D_PRINT("Modified to init..Qp %d", qp_hndl);
    /********************** INIT --> RTR ************************/
    QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
    qp_attr.qp_state = VAPI_RTR;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
    qp_attr.qp_ous_rd_atom = DEFAULT_QP_OUS_RD_ATOM;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_OUS_RD_ATOM);
    qp_attr.path_mtu = DEFAULT_MTU;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PATH_MTU);
    qp_attr.rq_psn = DEFAULT_PSN;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RQ_PSN);
    qp_attr.pkey_ix = DEFAULT_PKEY_IX;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX);
    qp_attr.min_rnr_timer = DEFAULT_MIN_RNR_TIMER;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_MIN_RNR_TIMER);
    /* Address vector: default SL/rate, no GRH, destination = remote LID */
    qp_attr.av.sl = DEFAULT_SERVICE_LEVEL;
    qp_attr.av.grh_flag = FALSE;
    qp_attr.av.static_rate = DEFAULT_STATIC_RATE;
    qp_attr.av.src_path_bits = DEFAULT_SRC_PATH_BITS;
    qp_attr.dest_qp_num = remote_qp;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_DEST_QP_NUM);
    qp_attr.av.dlid = remote_lid;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_AV);
    ret = VAPI_modify_qp(nic, qp_hndl,
            &qp_attr, &qp_attr_mask, &qp_cap);
    if(VAPI_OK != ret) {
        MCA_BMI_IB_VAPI_RET(ret, "VAPI_modify_qp");
        return OMPI_ERROR;
    }
    D_PRINT("Modified to RTR..Qp %d", qp_hndl);
    /************** RTS *******************/
    QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
    qp_attr.qp_state = VAPI_RTS;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
    qp_attr.sq_psn = DEFAULT_PSN;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_SQ_PSN);
    qp_attr.timeout = DEFAULT_TIME_OUT;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_TIMEOUT);
    qp_attr.retry_count = DEFAULT_RETRY_COUNT;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RETRY_COUNT);
    qp_attr.rnr_retry = DEFAULT_RNR_RETRY;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RNR_RETRY);
    qp_attr.ous_dst_rd_atom = DEFAULT_MAX_RDMA_DST_OPS;
    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_OUS_DST_RD_ATOM);
    ret = VAPI_modify_qp(nic, qp_hndl,
            &qp_attr, &qp_attr_mask, &qp_cap);
    if(VAPI_OK != ret) {
        MCA_BMI_IB_VAPI_RET(ret, "VAPI_modify_qp");
        return OMPI_ERROR;
    }
    D_PRINT("Modified to RTS..Qp %d", qp_hndl);
    return OMPI_SUCCESS;
}
/*
 * Register a region of 'len' bytes starting at 'buf' with the HCA
 * under protection domain 'ptag'.  On success the memory-region
 * handle plus local/remote keys are stored in *memhandle; on failure
 * *memhandle is left untouched and OMPI_ERROR is returned.
 */
int mca_bmi_ib_register_mem(VAPI_hca_hndl_t nic, VAPI_pd_hndl_t ptag,
        void* buf, int len, vapi_memhandle_t* memhandle)
{
    VAPI_mrw_t request, result;
    VAPI_mr_hndl_t mr_hndl;
    VAPI_ret_t vret;

    /* Local + remote write access; let VAPI generate the keys. */
    request.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE;
    request.l_key = 0;
    request.r_key = 0;
    request.pd_hndl = ptag;
    request.size = len;
    request.start = (VAPI_virt_addr_t) (MT_virt_addr_t) buf;
    request.type = VAPI_MR;

    vret = VAPI_register_mr(nic, &request, &mr_hndl, &result);
    if(VAPI_OK != vret) {
        MCA_BMI_IB_VAPI_RET(vret, "VAPI_register_mr");
        return OMPI_ERROR;
    }

    /* Hand the generated handle and keys back to the caller. */
    memhandle->hndl = mr_hndl;
    memhandle->lkey = result.l_key;
    memhandle->rkey = result.r_key;
    return OMPI_SUCCESS;
}
/*
 * Post a send descriptor for 'ib_buf' on the peer's queue pair, with
 * 'addr' as the work-request id.  Messages below the inline threshold
 * are posted inline to avoid the DMA setup cost.
 */
int mca_bmi_ib_post_send(mca_bmi_ib_module_t *ib_bmi,
        mca_bmi_ib_endpoint_t *peer,
        ib_buffer_t *ib_buf, void* addr)
{
    /* Replaces the bare magic number 128.
     * TODO: query this from VAPI_query_qp (max_inline_data_sq)
     * instead of hard-coding it. */
    enum { MCA_BMI_IB_MAX_INLINE_SIZE = 128 };
    VAPI_ret_t ret;
    int msg_len = ib_buf->desc.sg_entry.len;

    IB_PREPARE_SEND_DESC(ib_buf, (peer->rem_qp_num),
            msg_len, addr);

    if(msg_len < MCA_BMI_IB_MAX_INLINE_SIZE) {
        ret = EVAPI_post_inline_sr(ib_bmi->nic,
                peer->lcl_qp_hndl,
                &ib_buf->desc.sr);
    } else {
        ret = VAPI_post_sr(ib_bmi->nic,
                peer->lcl_qp_hndl,
                &ib_buf->desc.sr);
    }

    if(VAPI_OK != ret) {
        MCA_BMI_IB_VAPI_RET(ret, "VAPI_post_sr");
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/*
 * Re-arm a receive buffer: rebuild its receive descriptor and post it
 * back on the queue pair it belongs to.  Failures are logged but not
 * reported to the caller (the function returns void).
 */
void mca_bmi_ib_buffer_repost(VAPI_hca_hndl_t nic, void* addr)
{
    ib_buffer_t *buf = (ib_buffer_t*) addr;
    VAPI_ret_t vret;

    IB_PREPARE_RECV_DESC(buf);

    vret = VAPI_post_rr(nic, buf->qp_hndl, &buf->desc.rr);
    if(VAPI_OK != vret) {
        MCA_BMI_IB_VAPI_RET(vret, "VAPI_post_rr");
        ompi_output(0, "Error in buffer reposting");
    }
}
/*
 * Register [addr_to_reg, addr_to_reg + len_to_reg) with the memory
 * registry and append the resulting remote key to the ACK payload in
 * 'ack_buf'.  On success *len_added is sizeof(VAPI_rkey_t); on a
 * registration failure nothing is appended and *len_added is 0.
 */
void mca_bmi_ib_prepare_ack(mca_bmi_ib_module_t *ib_bmi,
        void* addr_to_reg, int len_to_reg,
        void* ack_buf, int* len_added)
{
    mca_bmi_ib_mem_registry_info_t *info =
        mca_bmi_ib_register_mem_with_registry(ib_bmi,
            addr_to_reg, (size_t)len_to_reg);

    if(NULL == info) {
        /* Fix: the original logged the failure but then dereferenced
         * 'info' anyway, crashing on a NULL pointer.  Report and leave
         * the ACK payload empty instead. */
        ompi_output(0, "Error in registering");
        *len_added = 0;
        return;
    }

    A_PRINT("Sending Remote key : %d", info->reply.r_key);

    memcpy(ack_buf, (void*) &(info->reply.r_key), sizeof(VAPI_rkey_t));
    *len_added = sizeof(VAPI_rkey_t);
}
/*
 * RDMA-write 'send_len' bytes from 'send_buf' into 'remote_buf' on the
 * peer, using 'remote_key' for the remote region and 'id_buf' as the
 * work-request id.  The local buffer is registered on the fly through
 * the memory registry.
 */
int mca_bmi_ib_rdma_write(mca_bmi_ib_module_t *ib_bmi,
        mca_bmi_ib_endpoint_t *peer, ib_buffer_t *ib_buf,
        void* send_buf, size_t send_len, void* remote_buf,
        VAPI_rkey_t remote_key, void* id_buf)
{
    VAPI_ret_t vret;
    mca_bmi_ib_mem_registry_info_t *reg_info;

    /* Register the local source buffer so the HCA can DMA from it. */
    reg_info = mca_bmi_ib_register_mem_with_registry(ib_bmi,
            send_buf, send_len);
    if(NULL == reg_info) {
        return OMPI_ERROR;
    }

    /* Prepare descriptor */
    IB_PREPARE_RDMA_W_DESC(ib_buf, (peer->rem_qp_num),
            send_len, send_buf, (reg_info->reply.l_key), remote_key,
            id_buf, remote_buf);

    vret = VAPI_post_sr(ib_bmi->nic,
            peer->lcl_qp_hndl,
            &ib_buf->desc.sr);
    if(VAPI_OK != vret) {
        MCA_BMI_IB_VAPI_RET(vret, "VAPI_post_sr");
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}

217
src/mca/bmi/ib/bmi_ib_priv.h Обычный файл
Просмотреть файл

@ -0,0 +1,217 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_PRIV_H
#define MCA_BMI_IB_PRIV_H
#include <stdint.h>
#include "class/ompi_free_list.h"
#include "bmi_ib_vapi.h"
#include "bmi_ib_memory.h"
/* Number of pre-allocated send / receive buffers per connection. */
#define NUM_IB_SEND_BUF (1)
#define NUM_IB_RECV_BUF (4)
/* Size of an eager ("first fragment") wire buffer. */
#define MCA_BMI_IB_FIRST_FRAG_SIZE (65536)
/* Direction of a work request. */
typedef enum {
    IB_RECV,
    IB_SEND
} IB_wr_t;
/* Classification of a polled completion. */
typedef enum {
    IB_COMP_ERROR,
    IB_COMP_RECV,
    IB_COMP_SEND,
    IB_COMP_RDMA_W,
    IB_COMP_NOTHING
} IB_comp_t;
struct vapi_memhandle_t {
    VAPI_mr_hndl_t hndl;
    /* Memory region handle */
    VAPI_lkey_t lkey;
    /* Local key to registered memory, needed for
     * posting send/recv requests */
    VAPI_rkey_t rkey;
    /* Remote key to registered memory, need to send this
     * to remote processes for incoming RDMA ops */
};
typedef struct vapi_memhandle_t vapi_memhandle_t;
struct vapi_descriptor_t {
    /* A descriptor is either a receive or a send request, never both,
     * so the two VAPI descriptor types share storage. */
    union {
        VAPI_rr_desc_t rr;
        /* Receive descriptor */
        VAPI_sr_desc_t sr;
        /* Send descriptor */
    };
    VAPI_sg_lst_entry_t sg_entry;
    /* Scatter/Gather entry */
};
typedef struct vapi_descriptor_t vapi_descriptor_t;
/* A registered wire buffer tied to one queue pair. */
struct ib_buffer_t {
    ompi_list_item_t super;
    vapi_descriptor_t desc;
    /* Descriptor of the buffer */
    vapi_memhandle_t hndl;
    /* Buffer handle */
    char buf[MCA_BMI_IB_FIRST_FRAG_SIZE];
    /* Buffer space */
    VAPI_qp_hndl_t qp_hndl;
    /* Queue pair used for this IB buffer */
};
typedef struct ib_buffer_t ib_buffer_t;
/* Debug dump of the per-module IB state.  All multi-statement macros
 * below are wrapped in do/while(0) so they behave as single statements
 * inside un-braced if/else bodies (the original used bare braces). */
#define DUMP_IB_STATE(ib_bmi) do { \
    ompi_output(0, "[%s:%d] ", __FILE__, __LINE__); \
    ompi_output(0, "Dumping IB state"); \
    ompi_output(0, "HCA ID : %s", ib_bmi->hca_id); \
    ompi_output(0, "LID : %d", ib_bmi->port.lid); \
    ompi_output(0, "HCA handle : %d", ib_bmi->nic); \
    ompi_output(0, "Protection Domain: %d", ib_bmi->ptag); \
    ompi_output(0, "Comp Q handle : %d", ib_bmi->cq_hndl); \
    ompi_output(0, "Async hndl : %d", ib_bmi->async_handler); \
} while(0)

/* Build the receive descriptor for a registered buffer: one SG entry
 * covering the whole eager buffer, wr id = the buffer itself. */
#define IB_PREPARE_RECV_DESC(ib_buf_ptr) do { \
    ib_buf_ptr->desc.rr.comp_type = VAPI_SIGNALED; \
    ib_buf_ptr->desc.rr.opcode = VAPI_RECEIVE; \
    ib_buf_ptr->desc.rr.id = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) ib_buf_ptr; \
    ib_buf_ptr->desc.rr.sg_lst_len = 1; \
    ib_buf_ptr->desc.rr.sg_lst_p = &ib_buf_ptr->desc.sg_entry; \
    ib_buf_ptr->desc.sg_entry.len = MCA_BMI_IB_FIRST_FRAG_SIZE; \
    ib_buf_ptr->desc.sg_entry.lkey = ib_buf_ptr->hndl.lkey; \
    ib_buf_ptr->desc.sg_entry.addr = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) ib_buf_ptr->buf; \
} while(0)

/* Build a send descriptor addressed to remote QP 'qp', with 'id_buf'
 * as the work-request id reported on completion. */
#define IB_PREPARE_SEND_DESC(ib_buf_ptr, qp, msg_len, \
        id_buf) do { \
    ib_buf_ptr->desc.sr.comp_type = VAPI_SIGNALED; \
    ib_buf_ptr->desc.sr.opcode = VAPI_SEND; \
    ib_buf_ptr->desc.sr.remote_qkey = 0; \
    ib_buf_ptr->desc.sr.remote_qp = qp; \
    ib_buf_ptr->desc.sr.id = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) id_buf; \
    ib_buf_ptr->desc.sr.sg_lst_len = 1; \
    ib_buf_ptr->desc.sr.sg_lst_p = &ib_buf_ptr->desc.sg_entry; \
    ib_buf_ptr->desc.sg_entry.len = msg_len; \
    ib_buf_ptr->desc.sg_entry.lkey = ib_buf_ptr->hndl.lkey; \
    ib_buf_ptr->desc.sg_entry.addr = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) ib_buf_ptr->buf; \
} while(0)

#define IB_SET_REMOTE_QP_NUM(ib_buf_ptr, qp) do { \
    ib_buf_ptr->desc.sr.remote_qp = qp; \
} while(0)

#define IB_SET_SEND_DESC_ID(ib_buf_ptr, addr) do { \
    ib_buf_ptr->desc.sr.id = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) addr; \
} while(0)

#define IB_SET_SEND_DESC_LEN(ib_buf_ptr, msg_len) do { \
    ib_buf_ptr->desc.sg_entry.len = msg_len; \
} while(0)

/* Build an RDMA-write descriptor: local [user_buf, +msg_len) with
 * 'local_key' is written to 'remote_buf' under 'remote_key'. */
#define IB_PREPARE_RDMA_W_DESC(ib_buf_ptr, qp, \
        msg_len, user_buf, local_key, remote_key, \
        id_buf, remote_buf) do { \
    ib_buf_ptr->desc.sr.comp_type = VAPI_SIGNALED; \
    ib_buf_ptr->desc.sr.opcode = VAPI_RDMA_WRITE; \
    ib_buf_ptr->desc.sr.remote_qkey = 0; \
    ib_buf_ptr->desc.sr.remote_qp = qp; \
    ib_buf_ptr->desc.sr.id = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) id_buf; \
    ib_buf_ptr->desc.sr.sg_lst_len = 1; \
    ib_buf_ptr->desc.sr.sg_lst_p = &ib_buf_ptr->desc.sg_entry; \
    ib_buf_ptr->desc.sg_entry.len = msg_len; \
    ib_buf_ptr->desc.sg_entry.lkey = local_key; \
    ib_buf_ptr->desc.sg_entry.addr = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) user_buf; \
    ib_buf_ptr->desc.sr.remote_addr = (VAPI_virt_addr_t) \
        (MT_virt_addr_t) remote_buf; \
    ib_buf_ptr->desc.sr.r_key = remote_key; \
} while(0)

struct mca_bmi_ib_module_t;
struct mca_bmi_base_endpoint_t;

/* One-time module setup: HCA, PD, CQ, async handler, memory registry. */
int mca_bmi_ib_module_init(struct mca_bmi_ib_module_t*);

/* Register 'len' bytes at 'buf'; handle and keys returned in *memhandle. */
int mca_bmi_ib_register_mem(
        VAPI_hca_hndl_t nic,
        VAPI_pd_hndl_t ptag,
        void* buf,
        int len,
        vapi_memhandle_t* memhandle);

/* Post a send (inline for small payloads) on the peer's QP. */
int mca_bmi_ib_post_send(
        struct mca_bmi_ib_module_t *ib_module,
        struct mca_bmi_base_endpoint_t *peer,
        ib_buffer_t *ib_buf, void*);

/* Re-arm and repost a receive buffer on its QP. */
void mca_bmi_ib_buffer_repost(
        VAPI_hca_hndl_t nic,
        void* addr);

/* Register a region and append its remote key to an ACK payload. */
void mca_bmi_ib_prepare_ack(
        struct mca_bmi_ib_module_t *ib_module,
        void* addr_to_reg, int len_to_reg,
        void* ack_buf, int* len_added);

/* RDMA-write send_buf into remote_buf on the peer. */
int mca_bmi_ib_rdma_write(
        struct mca_bmi_ib_module_t *ib_module,
        struct mca_bmi_base_endpoint_t *peer,
        ib_buffer_t *ib_buf,
        void* send_buf,
        size_t send_len,
        void* remote_buf,
        VAPI_rkey_t remote_key, void*);

/* Create an RC queue pair bound to the given completion queues. */
int mca_bmi_ib_create_qp(VAPI_hca_hndl_t nic,
        VAPI_pd_hndl_t ptag,
        VAPI_cq_hndl_t recv_cq,
        VAPI_cq_hndl_t send_cq,
        VAPI_qp_hndl_t* qp_hndl,
        VAPI_qp_prop_t* qp_prop,
        int transport_type);

/* Transition a QP through INIT/RTR/RTS toward the given remote QP/LID. */
int mca_bmi_ib_qp_init(
        VAPI_hca_hndl_t nic,
        VAPI_qp_hndl_t qp_hndl,
        VAPI_qp_num_t remote_qp,
        IB_lid_t remote_lid);

#endif  /* MCA_BMI_IB_PRIV_H */

164
src/mca/bmi/ib/bmi_ib_proc.c Обычный файл
Просмотреть файл

@ -0,0 +1,164 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "class/ompi_hash_table.h"
#include "mca/base/mca_base_module_exchange.h"
#include "bmi_ib.h"
#include "bmi_ib_vapi.h"
#include "bmi_ib_proc.h"
static void mca_bmi_ib_proc_construct(mca_bmi_ib_proc_t* proc);
static void mca_bmi_ib_proc_destruct(mca_bmi_ib_proc_t* proc);
OBJ_CLASS_INSTANCE(mca_bmi_ib_proc_t,
ompi_list_item_t, mca_bmi_ib_proc_construct,
mca_bmi_ib_proc_destruct);
/* Constructor: zero all proc fields and register the new instance on
 * the component's global list of IB procs (under the component lock). */
void mca_bmi_ib_proc_construct(mca_bmi_ib_proc_t* proc)
{
    proc->proc_ompi = 0;
    proc->proc_addr_count = 0;
    proc->proc_peers = 0;
    proc->proc_peer_count = 0;
    OBJ_CONSTRUCT(&proc->proc_lock, ompi_mutex_t);
    /* add to list of all proc instance */
    OMPI_THREAD_LOCK(&mca_bmi_ib_component.ib_lock);
    ompi_list_append(&mca_bmi_ib_component.ib_procs, &proc->super);
    OMPI_THREAD_UNLOCK(&mca_bmi_ib_component.ib_lock);
}
/*
 * Cleanup ib proc instance: remove it from the component's global
 * list and free the peer-endpoint array (the endpoints themselves
 * are not owned by this structure).
 */
void mca_bmi_ib_proc_destruct(mca_bmi_ib_proc_t* proc)
{
    /* remove from list of all proc instances */
    OMPI_THREAD_LOCK(&mca_bmi_ib_component.ib_lock);
    ompi_list_remove_item(&mca_bmi_ib_component.ib_procs, &proc->super);
    OMPI_THREAD_UNLOCK(&mca_bmi_ib_component.ib_lock);
    /* release resources */
    if(NULL != proc->proc_peers) {
        free(proc->proc_peers);
    }
}
/*
 * Look up the mca_bmi_ib_proc_t (if any) already created for the given
 * ompi_proc_t.  Walks the component's global proc list under the
 * component lock; returns NULL when no instance exists.
 */
static mca_bmi_ib_proc_t* mca_bmi_ib_proc_lookup_ompi(ompi_proc_t* ompi_proc)
{
    mca_bmi_ib_proc_t* found = NULL;
    ompi_list_item_t* item;

    OMPI_THREAD_LOCK(&mca_bmi_ib_component.ib_lock);
    for(item = ompi_list_get_first(&mca_bmi_ib_component.ib_procs);
        NULL == found &&
            item != (ompi_list_item_t*)
                ompi_list_get_end(&mca_bmi_ib_component.ib_procs);
        item = (ompi_list_item_t*) ompi_list_get_next(item)) {
        mca_bmi_ib_proc_t* ib_proc = (mca_bmi_ib_proc_t*) item;
        if(ib_proc->proc_ompi == ompi_proc) {
            found = ib_proc;
        }
    }
    OMPI_THREAD_UNLOCK(&mca_bmi_ib_component.ib_lock);
    return found;
}
/*
 * Create a IB process structure. There is a one-to-one correspondence
 * between a ompi_proc_t and a mca_bmi_ib_proc_t instance. We cache
 * additional data (specifically the list of mca_bmi_ib_endpoint_t
 * instances, and published addresses) associated w/ a given
 * destination on this datastructure.
 *
 * @return the (possibly cached) proc instance, or NULL on resource
 *         exhaustion.
 */
mca_bmi_ib_proc_t* mca_bmi_ib_proc_create(ompi_proc_t* ompi_proc)
{
    mca_bmi_ib_proc_t* module_proc = NULL;

    /* Check if we have already created a IB proc
     * structure for this ompi process */
    module_proc = mca_bmi_ib_proc_lookup_ompi(ompi_proc);
    if(module_proc != NULL) {
        /* Gotcha! */
        return module_proc;
    }

    /* First time, gotta create a new IB proc
     * out of the ompi_proc ... */
    module_proc = OBJ_NEW(mca_bmi_ib_proc_t);
    if(NULL == module_proc) {
        /* Fix: the original dereferenced the result of OBJ_NEW
         * without checking for allocation failure. */
        return NULL;
    }

    /* Initialize number of peer */
    module_proc->proc_peer_count = 0;
    module_proc->proc_ompi = ompi_proc;

    /* build a unique identifier (of arbitrary
     * size) to represent the proc */
    module_proc->proc_guid = ompi_proc->proc_name;

    /* IB module doesn't have addresses exported at
     * initialization, so the addr_count is set to one. */
    module_proc->proc_addr_count = 1;

    /* XXX: Right now, there can be only 1 peer associated
     * with a proc. Needs a little bit change in
     * mca_bmi_ib_proc_t to allow on demand increasing of
     * number of peers for this proc */
    module_proc->proc_peers = (mca_bmi_base_endpoint_t**)
        malloc(module_proc->proc_addr_count * sizeof(mca_bmi_base_endpoint_t*));
    if(NULL == module_proc->proc_peers) {
        OBJ_RELEASE(module_proc);
        return NULL;
    }
    return module_proc;
}
/*
 * Note that this routine must be called with the lock on the process
 * already held. Insert a bmi instance into the proc array and assign
 * it an address.
 *
 * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when the
 *         fixed-size peer array is already full.
 */
int mca_bmi_ib_proc_insert(mca_bmi_ib_proc_t* module_proc,
        mca_bmi_base_endpoint_t* module_peer)
{
    /* Fix: the array holds proc_addr_count entries (currently 1, see
     * mca_bmi_ib_proc_create); guard against writing past its end. */
    if(module_proc->proc_peer_count >= module_proc->proc_addr_count) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* insert into peer array */
    module_peer->peer_proc = module_proc;
    module_proc->proc_peers[module_proc->proc_peer_count++] = module_peer;
    return OMPI_SUCCESS;
}

71
src/mca/bmi/ib/bmi_ib_proc.h Обычный файл
Просмотреть файл

@ -0,0 +1,71 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_PROC_H
#define MCA_BMI_IB_PROC_H
#include "mca/ns/ns.h"
#include "class/ompi_object.h"
#include "proc/proc.h"
#include "bmi_ib.h"
#include "bmi_ib_vapi.h"
#include "bmi_ib_addr.h"
#include "bmi_ib_peer.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_bmi_ib_proc_t);
/**
* Represents the state of a remote process and the set of addresses
* that it exports. Also cache an instance of mca_bmi_base_endpoint_t for
* each
* BMI instance that attempts to open a connection to the process.
*/
struct mca_bmi_ib_proc_t {
    ompi_list_item_t super;
    /**< allow proc to be placed on a list */
    ompi_proc_t *proc_ompi;
    /**< pointer to corresponding ompi_proc_t */
    orte_process_name_t proc_guid;
    /**< globally unique identifier for the process */
    size_t proc_addr_count;
    /**< number of addresses published by peer */
    struct mca_bmi_base_endpoint_t **proc_peers;
    /**< array of peers that have been created to access this proc */
    size_t proc_peer_count;
    /**< number of peers (never exceeds proc_addr_count) */
    ompi_mutex_t proc_lock;
    /**< lock to protect against concurrent access to proc state */
};
typedef struct mca_bmi_ib_proc_t mca_bmi_ib_proc_t;
/** Create (or return the cached) IB proc for an ompi_proc_t; NULL on failure. */
mca_bmi_ib_proc_t* mca_bmi_ib_proc_create(ompi_proc_t* ompi_proc);
/** Insert a peer endpoint into the proc's peer array; caller holds proc_lock. */
int mca_bmi_ib_proc_insert(mca_bmi_ib_proc_t*, mca_bmi_base_endpoint_t*);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

199
src/mca/bmi/ib/bmi_ib_recvfrag.c Обычный файл
Просмотреть файл

@ -0,0 +1,199 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mca/pml/base/pml_base_sendreq.h"
#include "bmi_ib.h"
#include "bmi_ib_peer.h"
#include "bmi_ib_recvfrag.h"
#include "bmi_ib_sendfrag.h"
#include "bmi_ib_memory.h"
static void mca_bmi_ib_recv_frag_construct(mca_bmi_ib_recv_frag_t* frag);
static void mca_bmi_ib_recv_frag_destruct(mca_bmi_ib_recv_frag_t* frag);
OBJ_CLASS_INSTANCE(mca_bmi_ib_recv_frag_t,
mca_bmi_base_recv_frag_t,
mca_bmi_ib_recv_frag_construct,
mca_bmi_ib_recv_frag_destruct);
/*
 * IB fragment constructor
 */
static void mca_bmi_ib_recv_frag_construct(mca_bmi_ib_recv_frag_t* frag)
{
    /* intentionally empty: all fragment state is (re)initialized when
     * the fragment is pulled from the free list */
}
/*
 * IB fragment destructor
 */
static void mca_bmi_ib_recv_frag_destruct(mca_bmi_ib_recv_frag_t* frag)
{
    /* intentionally empty */
}
/*
 * Completion path for a matched receive fragment: advance the receive
 * request by the fragment size and recycle the fragment onto the
 * component's free list.
 */
void
mca_bmi_ib_recv_frag_done (
        mca_bmi_base_header_t *header,
        mca_bmi_base_recv_frag_t* frag,
        mca_bmi_base_recv_request_t *request)
{
    D_PRINT("");
    frag->frag_base.frag_owner->bmi_recv_progress (
            frag->frag_base.frag_owner,
            request,
            frag->frag_base.frag_size,
            frag->frag_base.frag_size);
    /* Return recv frag to free list */
    OMPI_FREE_LIST_RETURN(&mca_bmi_ib_component.ib_recv_frags,
            (ompi_list_item_t*)frag);
}
/*
 * Handle an incoming data-carrying fragment: pull a recv fragment off
 * the free list, initialize it from the wire header, and hand it to
 * the matching logic.  Unmatched fragments are copied into the
 * fragment's unexpected buffer.
 */
static void mca_bmi_ib_data_frag(
        mca_bmi_ib_module_t *ib_bmi,
        mca_bmi_base_header_t *hdr)
{
    bool matched;
    int rc;
    ompi_list_item_t *item;
    mca_bmi_ib_recv_frag_t *recv_frag;
    size_t hdr_length;

    OMPI_FREE_LIST_WAIT (&mca_bmi_ib_component.ib_recv_frags, item, rc);
    recv_frag = (mca_bmi_ib_recv_frag_t *) item;

    recv_frag->super.frag_base.frag_owner = &ib_bmi->super;
    recv_frag->super.frag_base.frag_peer = NULL;
    recv_frag->super.frag_request = NULL;
    recv_frag->super.frag_is_buffered = false;

    /* Copy the header, mca_bmi_base_match() */
    recv_frag->super.frag_base.frag_header = *hdr;

    switch(hdr->hdr_common.hdr_type) {
    case MCA_BMI_HDR_TYPE_MATCH:
        hdr_length = sizeof(mca_bmi_base_match_header_t);
        recv_frag->super.frag_base.frag_size = hdr->hdr_match.hdr_msg_length;
        break;
    case MCA_BMI_HDR_TYPE_RNDV:
        hdr_length = sizeof(mca_bmi_base_rendezvous_header_t);
        recv_frag->super.frag_base.frag_size = hdr->hdr_rndv.hdr_frag_length;
        break;
    default:
        /* Fix: the original switch had no default, so any other header
         * type (e.g. MCA_BMI_HDR_TYPE_FRAG, which mca_bmi_ib_process_recv
         * routes here) used hdr_length/frag_size uninitialized (UB).
         * Log, recycle the fragment, and bail out instead. */
        ompi_output(0, "mca_bmi_ib_data_frag: unexpected header type %d",
                hdr->hdr_common.hdr_type);
        OMPI_FREE_LIST_RETURN(&mca_bmi_ib_component.ib_recv_frags,
                (ompi_list_item_t*)recv_frag);
        return;
    }

    /* Taking the data starting point be default */
    recv_frag->super.frag_base.frag_addr = (char *) hdr + hdr_length;

    /* match against preposted requests */
    matched = ib_bmi->super.bmi_match(
            recv_frag->super.frag_base.frag_owner,
            &recv_frag->super,
            &recv_frag->super.frag_base.frag_header.hdr_match);

    if (!matched) {
        /* NOTE(review): unex_buf is MCA_BMI_IB_UNEX_BUF_SIZE (4096)
         * bytes while frag_size can be larger -- possible overflow;
         * confirm that unexpected fragments are bounded upstream. */
        memcpy (recv_frag->unex_buf, (char *) hdr + hdr_length, recv_frag->super.frag_base.frag_size);
        recv_frag->super.frag_is_buffered = true;
        recv_frag->super.frag_base.frag_addr = recv_frag->unex_buf;
    }
}
/*
 * Handle an incoming ACK: recover the originating send fragment from
 * the pointer echoed back in the header, record the peer's matched
 * request / destination address / size on the send request, copy the
 * RDMA remote key out of the ACK payload, then complete the fragment.
 */
static void mca_bmi_ib_ctrl_frag(
        mca_bmi_ib_module_t *ib_bmi,
        mca_bmi_base_header_t *header)
{
    mca_bmi_ib_send_frag_t *send_frag;
    mca_bmi_base_send_request_t *req;
    void *data_ptr;
    /* The ACK carries back the sender-side fragment pointer. */
    send_frag = (mca_bmi_ib_send_frag_t *)
        header->hdr_ack.hdr_src_ptr.pval;
    req = (mca_bmi_base_send_request_t *)
        send_frag->frag_send.frag_request;
    req->req_peer_match = header->hdr_ack.hdr_dst_match;
    req->req_peer_addr = header->hdr_ack.hdr_dst_addr;
    req->req_peer_size = header->hdr_ack.hdr_dst_size;
    /* Locate data in the ACK buffer */
    data_ptr = (void*)
        ((char*) header + sizeof(mca_bmi_base_ack_header_t));
    /* Copy over data to request buffer */
    memcpy(&((mca_bmi_ib_send_request_t *) req)->req_key,
            data_ptr, sizeof(VAPI_rkey_t));
    /* Progress & release fragments */
    mca_bmi_ib_send_frag_send_complete(ib_bmi, send_frag);
}
/*
 * Handle a FIN fragment terminating an RDMA transfer: deregister the
 * RDMA region once the whole message has arrived, then progress the
 * receive request by this fragment's length.
 */
static void mca_bmi_ib_last_frag(mca_bmi_ib_module_t *ib_bmi,
        mca_bmi_base_header_t *hdr)
{
    mca_bmi_ib_fin_header_t *fin_hdr = (mca_bmi_ib_fin_header_t *)hdr;
    mca_bmi_base_recv_request_t *request;
    request = (mca_bmi_base_recv_request_t*) hdr->hdr_frag.hdr_dst_ptr.pval;
    /* deregister memory if this is the last fragment */
    if ((request->req_bytes_received + hdr->hdr_frag.hdr_frag_length) >=
            request->req_recv.req_bytes_packed) {
        mca_bmi_ib_deregister_mem_with_registry(ib_bmi,
                fin_hdr->mr_addr.pval, (size_t)fin_hdr->mr_size);
    }
    ib_bmi->super.bmi_recv_progress (
            &ib_bmi->super,
            request,
            hdr->hdr_frag.hdr_frag_length,
            hdr->hdr_frag.hdr_frag_length);
}
/*
 * Process incoming receive fragments
 *
 * Dispatch a completed wire buffer by header type: data-carrying
 * fragments (MATCH/RNDV/FRAG) go to the matching path, ACKs complete
 * pending sends, and FINs finish RDMA transfers.
 */
void mca_bmi_ib_process_recv(mca_bmi_ib_module_t *ib_bmi, void* addr)
{
    ib_buffer_t *ib_buf;
    mca_bmi_base_header_t *header;
    ib_buf = (ib_buffer_t *) addr;
    /* The wire header sits at the start of the buffer. */
    header = (mca_bmi_base_header_t *) &ib_buf->buf[0];
    switch(header->hdr_common.hdr_type) {
    case MCA_BMI_HDR_TYPE_MATCH :
    case MCA_BMI_HDR_TYPE_RNDV :
    case MCA_BMI_HDR_TYPE_FRAG :
        mca_bmi_ib_data_frag(ib_bmi, header);
        break;
    case MCA_BMI_HDR_TYPE_ACK :
        mca_bmi_ib_ctrl_frag(ib_bmi, header);
        break;
    case MCA_BMI_HDR_TYPE_FIN :
        A_PRINT("Fin");
        mca_bmi_ib_last_frag(ib_bmi, header);
        break;
    default :
        ompi_output(0, "Unknown fragment type");
        break;
    }
}

53
src/mca/bmi/ib/bmi_ib_recvfrag.h Обычный файл
Просмотреть файл

@ -0,0 +1,53 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_RECV_FRAG_H
#define MCA_BMI_IB_RECV_FRAG_H

#include "mca/bmi/bmi.h"
#include "mca/bmi/base/bmi_base_recvfrag.h"

/* Size of the per-fragment staging buffer for unexpected message data. */
#define MCA_BMI_IB_UNEX_BUF_SIZE (4096)

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

OBJ_CLASS_DECLARATION(mca_bmi_ib_recv_frag_t);

/**
 * IB received fragment derived type.
 */
struct mca_bmi_ib_recv_frag_t {
    mca_bmi_base_recv_frag_t super;
    /**< base receive fragment descriptor */
    char unex_buf[MCA_BMI_IB_UNEX_BUF_SIZE];
    /**< Unexpected buffer */
};
typedef struct mca_bmi_ib_recv_frag_t mca_bmi_ib_recv_frag_t;

struct mca_bmi_ib_module_t;

/* NOTE(review): presumably invoked once a fragment's data has been
 * delivered to the matched receive request -- confirm against the
 * implementation in bmi_ib_recvfrag.c. */
void mca_bmi_ib_recv_frag_done (mca_bmi_base_header_t*,
    mca_bmi_base_recv_frag_t*, mca_bmi_base_recv_request_t*);

/* Dispatch an incoming IB receive buffer based on its header type. */
void mca_bmi_ib_process_recv(struct mca_bmi_ib_module_t* , void*);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

159
src/mca/bmi/ib/bmi_ib_sendfrag.c Обычный файл
Просмотреть файл

@ -0,0 +1,159 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "include/types.h"
#include "datatype/datatype.h"
#include "mca/pml/base/pml_base_sendreq.h"
#include "bmi_ib.h"
#include "bmi_ib_peer.h"
#include "bmi_ib_proc.h"
#include "bmi_ib_sendfrag.h"
#include "bmi_ib_priv.h"
#include "bmi_ib_memory.h"
/* Constructor/destructor prototypes for the send fragment class. */
static void mca_bmi_ib_send_frag_construct(mca_bmi_ib_send_frag_t* frag);
static void mca_bmi_ib_send_frag_destruct(mca_bmi_ib_send_frag_t* frag);

/* Register mca_bmi_ib_send_frag_t with the OMPI object system,
 * deriving from mca_bmi_base_send_frag_t. */
OBJ_CLASS_INSTANCE(mca_bmi_ib_send_frag_t,
                   mca_bmi_base_send_frag_t,
                   mca_bmi_ib_send_frag_construct,
                   mca_bmi_ib_send_frag_destruct);
/*
 * Send fragment constructor: clear the progress and ack-pending flags
 * so a freshly constructed fragment starts in a known state.
 */
static void mca_bmi_ib_send_frag_construct(mca_bmi_ib_send_frag_t* frag)
{
    frag->frag_ack_pending = 0;
    frag->frag_progressed = 0;
}
/* Send fragment destructor: no per-fragment resources to release. */
static void mca_bmi_ib_send_frag_destruct(mca_bmi_ib_send_frag_t* frag)
{
}
/*
 * Allocate an IB send fragment descriptor from the module's free list.
 *
 * If the list is empty, drive component progress and retry until a
 * descriptor is returned by a completion -- i.e. this spins and never
 * fails, so it must only be called where completions can be made.
 *
 * Fix: removed the unused local `mca_bmi_tstamp_t tstamp = 0;` that was
 * declared inside the retry loop and never read (the type is not
 * declared in this component).
 *
 * @param ib_bmi   IB BMI module owning the send free list
 * @param request  send request the fragment is for (unused here; kept
 *                 for interface compatibility)
 * @return a send fragment descriptor (never NULL)
 */
mca_bmi_ib_send_frag_t* mca_bmi_ib_alloc_send_frag(
    mca_bmi_ib_module_t* ib_bmi,
    mca_bmi_base_send_request_t* request)
{
    ompi_free_list_t *flist = &ib_bmi->send_free;
    ompi_list_item_t *item;

    item = ompi_list_remove_first(&((flist)->super));
    while(NULL == item) {
        /* Free list exhausted: poll for completions, then retry. */
        D_PRINT("Gone one NULL descriptor ... trying again");
        mca_bmi_ib_component_progress(0);
        item = ompi_list_remove_first (&((flist)->super));
    }
    return (mca_bmi_ib_send_frag_t *) item;
}
/*
 * Register the memory backing every pre-allocated send fragment with
 * the IB NIC and pre-build each fragment's send descriptor.
 *
 * Walks the send free list in place (items are not removed).
 *
 * @return OMPI_SUCCESS, or OMPI_ERROR if any registration fails.
 */
int mca_bmi_ib_send_frag_register(mca_bmi_ib_module_t *ib_bmi)
{
    ompi_free_list_t *free_list = &ib_bmi->send_free;
    int frag_count = ompi_list_get_size(&free_list->super);
    ompi_list_item_t *cursor = ompi_list_get_first(&free_list->super);
    int idx;

    for (idx = 0; idx < frag_count;
         idx++, cursor = ompi_list_get_next(cursor)) {
        mca_bmi_ib_send_frag_t *frag = (mca_bmi_ib_send_frag_t *) cursor;
        ib_buffer_t *buf = (ib_buffer_t *) &frag->ib_buf;
        int status;

        frag->frag_progressed = 0;
        status = mca_bmi_ib_register_mem(ib_bmi->nic, ib_bmi->ptag,
                                         (void*) buf->buf,
                                         MCA_BMI_IB_FIRST_FRAG_SIZE,
                                         &buf->hndl);
        if (OMPI_SUCCESS != status) {
            return OMPI_ERROR;
        }
        IB_PREPARE_SEND_DESC(buf, 0, MCA_BMI_IB_FIRST_FRAG_SIZE, buf);
    }
    return OMPI_SUCCESS;
}
/*
 * Process a local send completion for the given fragment.
 *
 * Based on the completed fragment's header type:
 *  - MATCH: if no ACK was requested, or the request has been matched,
 *    report progress to the upper layer and return the fragment to the
 *    free list unless the request caches it.
 *  - ACK:   return the fragment to the free list.
 *  - FIN:   report progress and return the fragment.
 *
 * Fix: added a default case -- previously an unexpected header type was
 * silently ignored (and the fragment never returned to the free list).
 */
void mca_bmi_ib_send_frag_send_complete(mca_bmi_ib_module_t *ib_bmi, mca_bmi_ib_send_frag_t* sendfrag)
{
    mca_bmi_base_header_t *hdr;
    mca_bmi_base_send_request_t* req = sendfrag->frag_send.frag_request;
    hdr = (mca_bmi_base_header_t *) sendfrag->ib_buf.buf;

    switch(hdr->hdr_common.hdr_type) {
    case MCA_BMI_HDR_TYPE_MATCH:
        if (0 == (hdr->hdr_common.hdr_flags & MCA_BMI_FLAGS_ACK)
            || mca_bmi_base_send_request_matched(req)) {
            ib_bmi->super.bmi_send_progress(&ib_bmi->super,
                sendfrag->frag_send.frag_request,
                hdr->hdr_rndv.hdr_frag_length);
            if(req->req_cached == false) {
                OMPI_FREE_LIST_RETURN(&ib_bmi->send_free,
                    ((ompi_list_item_t *) sendfrag));
            }
        }
        break;
    case MCA_BMI_HDR_TYPE_ACK:
        OMPI_FREE_LIST_RETURN(&ib_bmi->send_free,
            ((ompi_list_item_t *) sendfrag));
        break;
    case MCA_BMI_HDR_TYPE_FIN:
        ib_bmi->super.bmi_send_progress(&ib_bmi->super,
            sendfrag->frag_send.frag_request,
            hdr->hdr_frag.hdr_frag_length);
        OMPI_FREE_LIST_RETURN(&ib_bmi->send_free,
            ((ompi_list_item_t *) sendfrag));
        break;
    default:
        /* Unknown completion: log it.  The fragment is deliberately not
         * returned to the free list since its ownership is unclear. */
        ompi_output(0, "mca_bmi_ib_send_frag_send_complete: "
            "unexpected header type %d", (int) hdr->hdr_common.hdr_type);
        break;
    }
}

124
src/mca/bmi/ib/bmi_ib_sendfrag.h Обычный файл
Просмотреть файл

@ -0,0 +1,124 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_SEND_FRAG_H
#define MCA_BMI_IB_SEND_FRAG_H
#include "ompi_config.h"
#include "mca/bmi/base/bmi_base_sendreq.h"
#include "mca/bmi/base/bmi_base_sendfrag.h"
#include "bmi_ib_priv.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
OBJ_CLASS_DECLARATION(mca_bmi_ib_send_frag_t);

/* Kind of operation an IB fragment descriptor represents. */
typedef enum {
    MCA_BMI_IB_FRAG_SEND,
    MCA_BMI_IB_FRAG_PUT,
    MCA_BMI_IB_FRAG_GET,
    MCA_BMI_IB_FRAG_ACK
} mca_bmi_ib_frag_type_t;
/**
 * IB send fragment derived type.
 *
 * NOTE(review): this struct is named mca_bmi_ib_frag_t while the class
 * declaration above and the prototypes below use mca_bmi_ib_send_frag_t,
 * which is not defined in this header -- confirm which type is intended.
 */
struct mca_bmi_ib_frag_t {
    mca_bmi_base_descriptor_t base;           /**< base descriptor */
    mca_bmi_base_segment_t segment;           /**< single data segment */
    struct mca_bmi_base_endpoint_t *endpoint; /**< peer endpoint this fragment targets */
    mca_bmi_ib_frag_type_t type;              /**< send/put/get/ack -- see enum above */
    mca_bmi_base_tag_t tag;                   /**< BMI-level tag */
    size_t size;                              /**< fragment size -- units not shown here; TODO confirm */
    int rc;                                   /**< status/return code */
    bool frag_ack_pending;                    /**< presumably set while an ack is outstanding -- confirm */
};
typedef struct mca_bmi_ib_frag_t mca_bmi_ib_frag_t;
/*
 * Allocate an IB send descriptor from the module's ib_frags1 free list,
 * blocking until one is available.
 *
 * Fix: the multi-line macro body lacked line-continuation backslashes
 * (only the #define line had one), so everything after the first line
 * was stray file-scope code and could not compile.  Compare the
 * MCA_BMI_IB_VAPI_RET macro in bmi_ib_vapi.h, which continues each line.
 *
 * frag : (out) set to the allocated mca_bmi_ib_frag_t*
 * rc   : (out) status from OMPI_FREE_LIST_WAIT
 */
#define MCA_BMI_IB_FRAG_ALLOC1(frag, rc)                                \
{                                                                       \
    ompi_list_item_t *item;                                             \
    OMPI_FREE_LIST_WAIT(&mca_bmi_ib_module.ib_frags1, item, rc);        \
    frag = (mca_bmi_ib_frag_t*) item;                                   \
}
/*
 * Fix: removed a stray function *definition* of
 * mca_bmi_ib_send_frag_register() that had been pasted into this header.
 * It duplicated the implementation in bmi_ib_sendfrag.c (a non-static
 * definition in a header causes multiple-definition link errors) and
 * referenced undeclared identifiers (ib_send_frag was used but only
 * ib_frag was declared), so it could not compile.  The function is
 * declared below and defined in bmi_ib_sendfrag.c.
 */
struct mca_bmi_ib_module_t;

/* Allocate a send fragment from the module's free list; spins (driving
 * component progress) until one is available -- see bmi_ib_sendfrag.c.
 * NOTE(review): mca_bmi_ib_send_frag_t is not defined in this header;
 * confirm it is brought in by another include. */
mca_bmi_ib_send_frag_t* mca_bmi_ib_alloc_send_frag(
    struct mca_bmi_ib_module_t* ib_bmi,
    mca_bmi_base_send_request_t* request);

/* Register all pre-allocated send fragment buffers with the IB NIC. */
int mca_bmi_ib_send_frag_register(struct mca_bmi_ib_module_t *bmi);

/* Handle a local send completion for the given fragment. */
void mca_bmi_ib_send_frag_send_complete(struct mca_bmi_ib_module_t *bmi, mca_bmi_ib_send_frag_t*);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

90
src/mca/bmi/ib/bmi_ib_vapi.h Обычный файл
Просмотреть файл

@ -0,0 +1,90 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004 The Ohio State University.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BMI_IB_VAPI_H
#define MCA_BMI_IB_VAPI_H

#include <vapi.h>
#include <mtl_common.h>
#include <vapi_common.h>

/* HACK: Alert, these are dumb defines,
 * all this stuff should be runtime. Ignoring for now.
 */
#define DEFAULT_PORT (1)
#define DEFAULT_CQ_SIZE (40000)
#define DEFAULT_WQ_SIZE (10000)
#define DEFAULT_SG_LIST (1)
#define DEFAULT_PKEY_IX (0)
#define DEFAULT_PSN (0)
#define DEFAULT_QP_OUS_RD_ATOM (1)
#define DEFAULT_MTU (MTU1024)
#define DEFAULT_MIN_RNR_TIMER (5)
#define DEFAULT_TIME_OUT (10)
#define DEFAULT_RETRY_COUNT (7)
#define DEFAULT_RNR_RETRY (7)
#define DEFAULT_MAX_RDMA_DST_OPS (16)
#define DEFAULT_TRAFFIC_CLASS (0)
#define DEFAULT_HOP_LIMIT (63)
#define DEFAULT_FLOW_LABEL (0)
#define DEFAULT_SERVICE_LEVEL (0)
#define DEFAULT_STATIC_RATE (0)
#define DEFAULT_SRC_PATH_BITS (0)

/* This is a convenience macro for logging a failed VAPI call.
 * (The stale "ret" parameter description was removed: the macro takes
 * only the two arguments below and does not return a value.)
 *
 * vapi_ret : The value which was returned from the last VAPI call
 * func_name : The VAPI function which was called
 */
#define MCA_BMI_IB_VAPI_RET(vapi_ret, func_name) { \
    ompi_output(0,"[%s:%d] ", __FILE__, __LINE__); \
    ompi_output(0,"%s : %s",func_name,VAPI_strerror(vapi_ret)); \
}

/* Debug Print -- compiled out by default; flip the #if 0 to enable. */
#if 0
#define D_PRINT(fmt, args...) { \
    ompi_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, \
    ##args); \
}
#else
#define D_PRINT(fmt, args...)
#endif

#if 0
#define A_PRINT(fmt, args...) { \
    ompi_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, \
    ##args); \
}
#else
#define A_PRINT(fmt, args...)
#endif

#if 0
#define B_PRINT(fmt, args...) { \
    ompi_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, \
    ##args); \
}
#else
#define B_PRINT(fmt, args...)
#endif

#endif

22
src/mca/bmi/ib/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,22 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
# NOTE(review): these PARAM_* variables are presumably consumed by the
# MCA component configure framework -- confirm against config/ scripts.
PARAM_INIT_FILE=bmi_ib.c
PARAM_CONFIG_HEADER_FILE="ib_config.h"
PARAM_CONFIG_FILES="Makefile"

148
src/mca/bmi/ib/configure.stub Обычный файл
Просмотреть файл

@ -0,0 +1,148 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004 The Ohio State University.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#
# Main function. This will be invoked in the middle of the templated
# configure script.
#
AC_DEFUN([MCA_CONFIGURE_STUB],[
# Additional --with flags that can be specified
AC_ARG_WITH(ptl-ib,
AC_HELP_STRING([--with-ptl-ib=IBDIR],
[Specify the installation directory of IB (should enable the correct automatic determination of using the 32 or 64 bit library, if both are present under IBDIR/lib and IBDIR/lib64)]))
AC_ARG_WITH(ptl-ib-libdir,
AC_HELP_STRING([--with-ptl-ib-libdir=IBLIBDIR],
[directory where the IB library can be found, if it is not in $IBDIR/lib or $IBDIR/lib64]))
# Quick sanity check
if test "$with_ptl_ib" = "no"; then
AC_MSG_WARN([*** --without-ptl-ib specified -- aborting])
AC_MSG_ERROR([Will not continue])
fi
# Find the right IBDIR
if test "$with_ptl_ib" != "" -a "$with_ptl_ib" != "yes" ; then
IBDIR="$with_ptl_ib"
IBLIBDIR="$with_ptl_ib"
fi
# An explicit --with-ptl-ib-libdir overrides the lib dir derived above
if test "$with_ptl_ib_libdir" != "" -a "$with_ptl_ib_libdir" != "yes" -a \
"$with_ptl_ib_libdir" != "no"; then
IBLIBDIR="$with_ptl_ib_libdir"
fi
# Add to CPPFLAGS if necessary
EXTRA_CPPFLAGS=
if test "$IBDIR" != ""; then
if test -d "$IBDIR/include"; then
EXTRA_CPPFLAGS="-I$IBDIR/include"
else
AC_MSG_WARN([*** Warning: cannot find $IBDIR/include])
AC_MSG_WARN([*** Will still try to configure ib ptl anyway...])
fi
if test "$IBDIR" != "" -a -d "$IBDIR/wrap"; then
EXTRA_CPPFLAGS="-I$IBDIR/wrap $EXTRA_CPPFLAGS"
else
AC_MSG_WARN([*** Warning: cannot find $IBDIR/wrap])
AC_MSG_WARN([*** Will still try to configure ib ptl anyway...])
fi
fi
# See if we can find vapi.h
CPPFLAGS="$CPPFLAGS $EXTRA_CPPFLAGS"
AC_CHECK_HEADERS(vapi.h,,
AC_MSG_ERROR([*** Cannot find working vapi.h]))
# Note that it is possible to find the library even if -L is not
# specified, if the LD_LIBRARY_PATH includes the directory where
# the shared ib library is kept. Hence, we unset LD_LIBRARY_PATH
# before running this test.
LD_LIBRARY_PATH_save="$LD_LIBRARY_PATH"
unset LD_LIBRARY_PATH
# Helper function to try to find libvapi (called from below). In
# some versions of Mellanox (v3.1), we need to explicitly link in
# the thread libraries. #$%#@$%@%#$!!!
mca_ptl_ib_try_find_libvapi() {
func1=[$]1
func2=[$]2
LDFLAGS="$LDFLAGS $EXTRA_LDFLAGS"
vapi_badness=
AC_CHECK_LIB([vapi], [$func1], [], [vapi_badness=true],
[-lmtl_common -lmpga -lmosal])
if test "$vapi_badness" != ""; then
AC_CHECK_LIB([pthread], [pthread_create],
[pthread=yes LIBS="$LIBS -lpthread"],
[pthread=no])
if test "$pthread" = "yes"; then
AC_CHECK_LIB([vapi], [$func2], [], [],
[-lmtl_common -lmpga -lmosal])
fi
fi
}
# The libraries may be in $IBDIR/lib or $IBDIR/lib64. Try them
# both.
LIBS_save="$LIBS"
LDFLAGS_save="$LDFLAGS"
LIBS="$LIBS -lmosal -lmpga -lmtl_common"
LIBS_orig="$LIBS"
EXTRA_LDFLAGS=
if test -d "$IBLIBDIR/lib"; then
EXTRA_LDFLAGS="-L$IBLIBDIR/lib"
LDFLAGS="$LDFLAGS $EXTRA_LDFLAGS"
mca_ptl_ib_try_find_libvapi VAPI_open_hca VAPI_query_hca_cap
if test "$LIBS" != "$LIBS_orig"; then
echo "--> found libvapi in $IBLIBDIR/lib"
fi
fi
if test "$LIBS" = "$LIBS_orig" -a -d "$IBLIBDIR/lib64"; then
EXTRA_LDFLAGS="-L$IBLIBDIR/lib64"
LDFLAGS="$LDFLAGS_save $EXTRA_LDFLAGS"
mca_ptl_ib_try_find_libvapi EVAPI_list_hcas EVAPI_open_hca
if test "$LIBS" != "$LIBS_orig"; then
echo "--> found libvapi in $IBLIBDIR/lib64"
fi
fi
if test "$LIBS" = "$LIBS_orig"; then
AC_MSG_ERROR([*** Cannot find working libvapi.])
fi
# Restore the LD_LIBRARY_PATH that was unset for the link tests above
LD_LIBRARY_PATH="$LD_LIBRARY_PATH_save"
LIBS="$LIBS -lmtl_common -lmpga"
#
# Save extra compiler/linker flags so that they can be added in
# the wrapper compilers, if necessary
#
WRAPPER_EXTRA_LDFLAGS="$EXTRA_LDFLAGS"
WRAPPER_EXTRA_LIBS="-lvapi -lmtl_common -lmpga -lmosal"
])dnl