diff --git a/src/mca/bmi/ib/.ompi_ignore b/src/mca/bmi/ib/.ompi_ignore
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/mca/bmi/ib/.ompi_unignore b/src/mca/bmi/ib/.ompi_unignore
new file mode 100644
index 0000000000..cef743e658
--- /dev/null
+++ b/src/mca/bmi/ib/.ompi_unignore
@@ -0,0 +1,2 @@
+twoodall
+gshipman
diff --git a/src/mca/bmi/ib/Makefile.am b/src/mca/bmi/ib/Makefile.am
new file mode 100644
index 0000000000..41fd7adcb6
--- /dev/null
+++ b/src/mca/bmi/ib/Makefile.am
@@ -0,0 +1,60 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University.
+#                         All rights reserved.
+# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+#                         All rights reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Use the top-level Makefile.options
+
+include $(top_ompi_srcdir)/config/Makefile.options
+
+libmca_bmi_ib_la_SOURCES = \
+    bmi_ib.c \
+    bmi_ib.h \
+    bmi_ib_addr.h \
+    bmi_ib_component.c \
+    bmi_ib_endpoint.h \
+    bmi_ib_frag.c \
+    bmi_ib_frag.h \
+    bmi_ib_memory.c \
+    bmi_ib_peer.c \
+    bmi_ib_peer.h \
+    bmi_ib_priv.c \
+    bmi_ib_priv.h \
+    bmi_ib_proc.c \
+    bmi_ib_proc.h \
+    bmi_ib_recvfrag.c \
+    bmi_ib_recvfrag.h \
+    bmi_ib_sendfrag.c \
+    bmi_ib_sendfrag.h \
+    bmi_ib_vapi.h
+
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if OMPI_BUILD_bmi_ib_DSO
+component_noinst =
+component_install = mca_bmi_ib.la
+else
+component_noinst = libmca_bmi_ib.la
+component_install =
+endif
+
+mcacomponentdir = $(libdir)/openmpi
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_bmi_ib_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_bmi_ib_la_LDFLAGS = -module -avoid-version
diff --git a/src/mca/bmi/ib/bmi_ib.c b/src/mca/bmi/ib/bmi_ib.c
new file mode 100644
index 0000000000..8399c9ae24
--- /dev/null
+++ b/src/mca/bmi/ib/bmi_ib.c
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ *                         All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ *                         All rights reserved.
+ * Copyright (c) 2004 The Ohio State University.
+ *                    All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include <string.h>
+#include "util/output.h"
+#include "util/if.h"
+#include "mca/pml/pml.h"
+#include "mca/bmi/bmi.h"
+
+#include "bmi_ib.h"
+#include "bmi_ib_frag.h"
+
+mca_bmi_ib_module_t mca_bmi_ib_module = {
+    {
+        &mca_bmi_ib_component.super,
+        0, /* max size of first fragment */
+        0, /* min fragment size */
+        0, /* max fragment size */
+        0, /* exclusivity */
+        0, /* latency */
+        0, /* bandwidth */
+        MCA_PTL_PUT, /* bmi flags */
+        mca_bmi_ib_add_procs,
+        mca_bmi_ib_del_procs,
+        mca_bmi_ib_register,
+        mca_bmi_ib_finalize,
+        /* we need alloc, free, pack */
+        mca_bmi_ib_alloc,
+        mca_bmi_ib_free,
+        mca_bmi_ib_pack,
+        mca_bmi_ib_send,
+        mca_bmi_ib_put,
+        NULL /* get */
+    }
+};
+
+int mca_bmi_ib_add_procs(
+    struct mca_bmi_base_module_t* bmi,
+    size_t nprocs,
+    struct ompi_proc_t **ompi_procs,
+    struct mca_bmi_base_endpoint_t** peers,
+    ompi_bitmap_t* reachable)
+{
+    mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi;
+    int i, rc;
+
+    for(i = 0; i < nprocs; i++) {
+
+        struct ompi_proc_t* ompi_proc = ompi_procs[i];
+        mca_bmi_ib_proc_t* ib_proc;
+        mca_bmi_base_endpoint_t* ib_peer;
+
+        if(NULL == (ib_proc = mca_bmi_ib_proc_create(ompi_proc))) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+
+        /*
+         * Check to make sure that the peer has at least as many interface
+         * addresses exported as we are trying to use. If not, then
+         * don't bind this PTL instance to the proc.
+         */
+
+        OMPI_THREAD_LOCK(&ib_proc->proc_lock);
+
+        /* The bmi_proc datastructure is shared by all IB PTL
+         * instances that are trying to reach this destination.
+         * Cache the peer instance on the bmi_proc.
+         */
+        ib_peer = OBJ_NEW(mca_bmi_ib_endpoint_t);
+        if(NULL == ib_peer) {
+            OMPI_THREAD_UNLOCK(&ib_proc->proc_lock);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+
+        ib_peer->peer_bmi = ib_bmi;
+        rc = mca_bmi_ib_proc_insert(ib_proc, ib_peer);
+        if(rc != OMPI_SUCCESS) {
+            OBJ_RELEASE(ib_peer);
+            OMPI_THREAD_UNLOCK(&ib_proc->proc_lock);
+            continue;
+        }
+
+        ompi_bitmap_set_bit(reachable, i);
+        OMPI_THREAD_UNLOCK(&ib_proc->proc_lock);
+        peers[i] = ib_peer;
+    }
+
+    return OMPI_SUCCESS;
+}
+
+int mca_bmi_ib_del_procs(struct mca_bmi_base_module_t* bmi,
+        size_t nprocs,
+        struct ompi_proc_t **procs,
+        struct mca_bmi_base_endpoint_t ** peers)
+{
+    /* Stub */
+    D_PRINT("Stub\n");
+    return OMPI_SUCCESS;
+}
+
+int mca_bmi_ib_register(
+    struct mca_bmi_base_module_t* bmi,
+    mca_bmi_base_tag_t tag,
+    mca_bmi_base_module_recv_cb_fn_t cbfunc,
+    void* cbdata)
+{
+    /* TODO add register stuff here... */
+    mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*) bmi;
+    ib_bmi->ib_reg[tag].cbfunc = cbfunc;
+    ib_bmi->ib_reg[tag].cbdata = cbdata;
+    return OMPI_SUCCESS;
+}
+
+
+/**
+ * Allocate a segment.
+ *
+ * @param bmi (IN)      BMI module
+ * @param size (IN)     Request segment size.
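+ * @return             Descriptor drawn from the module's pre-registered
+ *                     ib_frags1 free list (only the first-fragment size
+ *                     class is handled below; larger requests are TODO).
+ *
+ * A rough sketch of the intended caller-side sequence (hypothetical
+ * variable names, functions as defined in this file):
+ *
+ *   mca_bmi_ib_frag_t* frag =
+ *       (mca_bmi_ib_frag_t*)mca_bmi_ib_alloc(bmi, size);
+ *   ... pack the payload into frag->segment ...
+ *   mca_bmi_ib_send(bmi, peer, &frag->base, tag);
+ *   ... once complete, return it with mca_bmi_ib_free(bmi, &frag->base) ...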
+ */
+extern mca_bmi_base_descriptor_t* mca_bmi_ib_alloc(
+    struct mca_bmi_base_module_t* bmi,
+    size_t size)
+{
+    mca_bmi_ib_frag_t* frag;
+    int rc;
+    if(size <= mca_bmi_ib_component.first_fragment_size) {
+        MCA_BMI_IB_FRAG_ALLOC1(frag,rc);
+    } else {
+        frag = NULL;  /* larger fragments are not yet implemented */
+    }
+    return (mca_bmi_base_descriptor_t*)frag;
+}
+
+extern int mca_bmi_ib_free(
+    struct mca_bmi_base_module_t* bmi,
+    mca_bmi_base_descriptor_t* des)
+{
+    mca_bmi_ib_frag_t* frag = (mca_bmi_ib_frag_t*)des;
+    MCA_BMI_IB_FRAG_RETURN1(frag);
+    return OMPI_SUCCESS;
+}
+
+/**
+ * Pack data
+ *
+ * @param bmi (IN)      BMI module
+ * @param peer (IN)     BMI peer addressing
+ */
+struct mca_bmi_base_descriptor_t* mca_bmi_ib_pack(
+    struct mca_bmi_base_module_t* bmi,
+    struct mca_bmi_base_endpoint_t* peer,
+    struct ompi_convertor_t* convertor,
+    size_t reserve,
+    size_t* size)
+{
+    return NULL;
+}
+
+int mca_bmi_ib_finalize(struct mca_bmi_base_module_t* bmi)
+{
+    /* Stub */
+    D_PRINT("Stub\n");
+    return OMPI_SUCCESS;
+}
+
+int mca_bmi_ib_request_init( struct mca_bmi_base_module_t* bmi,
+        struct mca_bmi_base_send_request_t* request)
+{
+    mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi;
+    mca_bmi_ib_send_frag_t* sendfrag;
+    ompi_list_item_t* item;
+    int rc;
+
+    OMPI_FREE_LIST_GET(&ib_bmi->send_free, item, rc);
+    if(NULL == (sendfrag = (mca_bmi_ib_send_frag_t*)item)) {
+        return rc;
+    }
+    ((mca_bmi_ib_send_request_t*) request)->req_frag = sendfrag;
+    return OMPI_SUCCESS;
+}
+
+
+void mca_bmi_ib_request_fini( struct mca_bmi_base_module_t* bmi,
+        struct mca_bmi_base_send_request_t* request)
+{
+    mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi;
+    mca_bmi_ib_send_request_t* sendreq = (mca_bmi_ib_send_request_t*)request;
+    OMPI_FREE_LIST_RETURN(&ib_bmi->send_free, (ompi_list_item_t*)sendreq->req_frag);
+}
+
+/*
+ * Initiate a send. If this is the first fragment, use the fragment
+ * descriptor allocated with the send request, otherwise obtain
+ * one from the free list. Initialize the fragment and forward
+ * on to the peer.
+ */
+
+int mca_bmi_ib_send(
+    struct mca_bmi_base_module_t* bmi,
+    struct mca_bmi_base_endpoint_t* bmi_peer,
+    struct mca_bmi_base_descriptor_t* descriptor,
+    mca_bmi_base_tag_t tag)
+
+{
+    mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi;
+    mca_bmi_ib_frag_t* frag = (mca_bmi_ib_frag_t*)descriptor;
+    frag->tag = tag;
+    frag->type = MCA_BMI_IB_FRAG_SEND;
+
+
+    int rc = OMPI_SUCCESS;
+    frag->rc = rc;
+
+
+    ompi_convertor_t *convertor;
+    int freeAfter;
+    unsigned int iov_count, max_data;
+    struct iovec iov;
+
+    /* first fragment (eager send) and first fragment of long
+     * protocol can use the convertor initialized on the request,
+     * remaining fragments must copy/reinit the convertor as the
+     * transfer could be in parallel.
+ */ + if( offset <= mca_bmi_ib_module.super.bmi_first_frag_size ) { + convertor = &sendreq->req_send.req_convertor; + } else { + convertor = &sendfrag->frag_send.frag_base.frag_convertor; + ompi_convertor_copy(&sendreq->req_send.req_convertor, convertor); + ompi_convertor_init_for_send( convertor, + 0, + sendreq->req_send.req_base.req_datatype, + sendreq->req_send.req_base.req_count, + sendreq->req_send.req_base.req_addr, + offset, + NULL ); + } + + /* if data is contigous, convertor will return an offset + * into users buffer - otherwise will return an allocated buffer + * that holds the packed data + */ + if((flags & MCA_PTL_FLAGS_ACK) == 0) { + iov.iov_base = &sendfrag->ib_buf.buf[sizeof(mca_bmi_base_match_header_t)]; + } else { + iov.iov_base = &sendfrag->ib_buf.buf[sizeof(mca_bmi_base_rendezvous_header_t)]; + } + iov.iov_len = size; + iov_count = 1; + max_data = size; + + if((rc = ompi_convertor_pack(convertor,&iov, &iov_count, &max_data, &freeAfter)) < 0) { + ompi_output(0, "Unable to pack data"); + return rc; + } + + /* adjust size to reflect actual number of bytes packed by convertor */ + size = iov.iov_len; + sendfrag->frag_send.frag_base.frag_addr = iov.iov_base; + sendfrag->frag_send.frag_base.frag_size = iov.iov_len; + } else { + sendfrag->frag_send.frag_base.frag_addr = NULL; + sendfrag->frag_send.frag_base.frag_size = 0; + } + + /* fragment state */ + sendfrag->frag_send.frag_base.frag_owner = &bmi_peer->peer_bmi->super; + sendfrag->frag_send.frag_request = sendreq; + sendfrag->frag_send.frag_base.frag_peer = bmi_peer; + sendfrag->frag_progressed = 0; + + /* Initialize header */ + hdr = (mca_bmi_base_header_t *) &sendfrag->ib_buf.buf[0]; + hdr->hdr_common.hdr_flags = flags; + hdr->hdr_match.hdr_contextid = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_match.hdr_msg_seq = sendreq->req_send.req_base.req_sequence; + if((flags & MCA_PTL_FLAGS_ACK) == 0) { + hdr->hdr_common.hdr_type = MCA_PTL_HDR_TYPE_MATCH; + hdr_length = sizeof(mca_bmi_base_match_header_t); + } else { + hdr->hdr_common.hdr_type = MCA_PTL_HDR_TYPE_MATCH; + hdr->hdr_rndv.hdr_frag_length = sendfrag->frag_send.frag_base.frag_size; + hdr->hdr_rndv.hdr_src_ptr.lval = 0; /* for VALGRIND/PURIFY - REPLACE WITH MACRO */ + hdr->hdr_rndv.hdr_src_ptr.pval = sendfrag; + hdr_length = sizeof(mca_bmi_base_rendezvous_header_t); + } + + /* Update the offset after actual fragment size is determined, + * and before attempting to send the fragment */ + sendreq->req_offset += size; + + IB_SET_SEND_DESC_LEN((&sendfrag->ib_buf), (hdr_length + size)); + if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_send(bmi_peer, sendfrag))) { + return rc; + } + + /* if this is the entire message - signal request is complete */ + if(sendreq->req_send.req_bytes_packed == size) { + ompi_request_complete( &(sendreq->req_send.req_base.req_ompi) ); + } + return OMPI_SUCCESS; +} + +/* + * RDMA local buffer to remote buffer address. + */ + +int mca_bmi_ib_put( struct mca_bmi_base_module_t* bmi, + struct mca_bmi_base_endpoint_t* bmi_peer, + struct mca_bmi_base_send_request_t* req, size_t offset, + size_t size, int flags) +{ + return OMPI_ERR_NOT_IMPLEMENTED; +} + + +/* + * On a match send an ack to the peer. 
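+ *
+ * The ack reuses a registered send fragment: it echoes the sender's
+ * fragment pointer (hdr_src_ptr), names the matched receive request
+ * (hdr_dst_match), and advertises a freshly registered destination
+ * region (hdr_dst_addr / hdr_dst_size) so the remote side can complete
+ * the transfer with an RDMA write into the user buffer.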
+ */ + +static void mca_bmi_ib_ack( + mca_bmi_ib_module_t *ib_bmi, + mca_bmi_ib_send_frag_t *send_frag, + mca_bmi_ib_recv_frag_t *recv_frag) +{ + mca_bmi_base_header_t *hdr; + mca_bmi_base_recv_request_t *request; + mca_bmi_ib_endpoint_t *ib_peer; + ib_buffer_t *ib_buf; + int recv_len; + int len_to_reg, len_added = 0; + void *addr_to_reg, *ack_buf; + + /* Header starts at beginning of registered + * buffer space */ + + hdr = (mca_bmi_base_header_t *) + &send_frag->ib_buf.buf[0]; + + request = recv_frag->super.frag_request; + + /* Amount of data we have already received */ + recv_len = + recv_frag->super.frag_base.frag_header.hdr_rndv.hdr_frag_length; + + hdr->hdr_common.hdr_type = MCA_PTL_HDR_TYPE_ACK; + hdr->hdr_common.hdr_flags = 0; + + /* Remote side send descriptor */ + hdr->hdr_ack.hdr_src_ptr = + recv_frag->super.frag_base.frag_header.hdr_rndv.hdr_src_ptr; + + /* Matched request from recv side */ + hdr->hdr_ack.hdr_dst_match.lval = 0; + hdr->hdr_ack.hdr_dst_match.pval = request; + + hdr->hdr_ack.hdr_dst_addr.lval = 0; + + addr_to_reg = (void*)((char*)request->req_recv.req_base.req_addr + recv_len); + hdr->hdr_ack.hdr_dst_addr.pval = addr_to_reg; + + len_to_reg = request->req_recv.req_bytes_packed - recv_len; + hdr->hdr_ack.hdr_dst_size = len_to_reg; + + A_PRINT("Dest addr : %p, RDMA Len : %d", + hdr->hdr_ack.hdr_dst_addr.pval, + hdr->hdr_ack.hdr_dst_size); + + ack_buf = (void*) ((char*) (&send_frag->ib_buf.buf[0]) + + sizeof(mca_bmi_base_ack_header_t)); + + /* Prepare ACK packet with IB specific stuff */ + mca_bmi_ib_prepare_ack(ib_bmi, addr_to_reg, len_to_reg, + ack_buf, &len_added); + + /* Send it right away! */ + ib_peer = (mca_bmi_ib_endpoint_t *) + recv_frag->super.frag_base.frag_peer; + + ib_buf = &send_frag->ib_buf; + + IB_SET_SEND_DESC_LEN(ib_buf, + (sizeof(mca_bmi_base_ack_header_t) + len_added)); + + mca_bmi_ib_post_send(ib_bmi, ib_peer, &send_frag->ib_buf, send_frag); + + /* fragment state */ + send_frag->frag_send.frag_base.frag_owner = &ib_bmi->super; + send_frag->frag_send.frag_base.frag_peer = recv_frag->super.frag_base.frag_peer; + send_frag->frag_send.frag_base.frag_addr = NULL; + send_frag->frag_send.frag_base.frag_size = 0; +} + +/* + * A posted receive has been matched - if required send an + * ack back to the peer and process the fragment. Copy the + * data to user buffer + */ + +void mca_bmi_ib_matched( + mca_bmi_base_module_t* bmi, + mca_bmi_base_recv_frag_t* frag) +{ + mca_bmi_ib_module_t* ib_bmi = (mca_bmi_ib_module_t*)bmi; + mca_bmi_base_recv_request_t *request; + mca_bmi_base_header_t *header; + mca_bmi_ib_recv_frag_t *recv_frag; + + header = &frag->frag_base.frag_header; + request = frag->frag_request; + recv_frag = (mca_bmi_ib_recv_frag_t*) frag; + + D_PRINT("Matched frag\n"); + + if (header->hdr_common.hdr_flags & MCA_PTL_FLAGS_ACK) { + mca_bmi_ib_send_frag_t *send_frag; + send_frag = mca_bmi_ib_alloc_send_frag(ib_bmi, NULL); + if(NULL == send_frag) { + ompi_output(0, "Cannot get send descriptor"); + } else { + mca_bmi_ib_ack(ib_bmi, send_frag, recv_frag); + } + } + + /* Process the fragment */ + + /* IN TCP case, IO_VEC is first allocated. + * then recv the data, and copy if needed, + * But in ELAN cases, we save the data into an + * unex buffer if the recv descriptor is not posted + * (for too long) (TODO). 
+ * We then need to copy from + * unex_buffer to application buffer */ + + if ((header->hdr_common.hdr_type & MCA_PTL_HDR_TYPE_MATCH) && + (header->hdr_match.hdr_msg_length > 0)) { + struct iovec iov; + ompi_proc_t *proc; + unsigned int iov_count, max_data; + int freeAfter; + + iov.iov_base = frag->frag_base.frag_addr; + iov.iov_len = frag->frag_base.frag_size; + + proc = ompi_comm_peer_lookup(request->req_recv.req_base.req_comm, + request->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); + + ompi_convertor_copy(proc->proc_convertor, &frag->frag_base.frag_convertor); + + ompi_convertor_init_for_recv( &frag->frag_base.frag_convertor, + 0, + request->req_recv.req_base.req_datatype, + request->req_recv.req_base.req_count, + request->req_recv.req_base.req_addr, + 0, /* fragment offset */ + NULL ); + ompi_convertor_unpack(&frag->frag_base.frag_convertor, &iov, &iov_count, &max_data, &freeAfter); + } + mca_bmi_ib_recv_frag_done(header, frag, request); +} diff --git a/src/mca/bmi/ib/bmi_ib.h b/src/mca/bmi/ib/bmi_ib.h new file mode 100644 index 0000000000..f126c4650d --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib.h @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_PTL_IB_H +#define MCA_PTL_IB_H + +/* Standard system includes */ +#include +#include + +/* Open MPI includes */ +#include "class/ompi_free_list.h" +#include "class/ompi_bitmap.h" +#include "event/event.h" +#include "mca/pml/pml.h" +#include "mca/bmi/bmi.h" +#include "util/output.h" + +/* InfiniBand VAPI includes */ +#include "bmi_ib_vapi.h" +#include "bmi_ib_addr.h" +#include "bmi_ib_proc.h" +#include "bmi_ib_peer.h" +#include "bmi_ib_priv.h" + +/* Other IB bmi includes */ +#include "bmi_ib_sendreq.h" +#include "bmi_ib_recvfrag.h" +#include "bmi_ib_sendfrag.h" +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/** + * Infiniband (IB) BMI component. 
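+ *
+ * The component holds the per-process state (free lists, the list of
+ * known procs, locks); each mca_bmi_ib_module_t below wraps one HCA
+ * (NIC handle, protection domain, completion queue, memory registry).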
+ */
+
+struct mca_bmi_ib_registration_t {
+    mca_bmi_base_module_recv_cb_fn_t cbfunc;
+    void *cbdata;
+};
+typedef struct mca_bmi_ib_registration_t mca_bmi_ib_registration_t;
+
+struct mca_bmi_ib_component_t {
+    mca_bmi_base_component_1_0_0_t super;  /**< base BMI component */
+
+    uint32_t ib_num_bmis;
+    /**< number of hcas available to the IB component */
+
+    struct mca_bmi_ib_module_t *ib_bmis;
+    /**< array of available PTLs */
+
+    int ib_free_list_num;
+    /**< initial size of free lists */
+
+    int ib_free_list_max;
+    /**< maximum size of free lists */
+
+    int ib_free_list_inc;
+    /**< number of elements to alloc when growing free lists */
+
+    size_t first_fragment_size;
+    /**< eager-send limit; referenced by mca_bmi_ib_alloc() */
+
+    ompi_free_list_t ib_send_requests;
+    /**< free list of ib send requests -- sendreq + IB */
+
+    ompi_free_list_t ib_send_frags;
+    /**< free list of ib send fragments */
+
+    ompi_free_list_t ib_recv_frags;
+    /**< free list of ib recv fragments */
+
+    ompi_list_t ib_procs;
+    /**< list of ib proc structures */
+
+    ompi_event_t ib_send_event;
+    /**< event structure for sends */
+
+    ompi_event_t ib_recv_event;
+    /**< event structure for recvs */
+
+    ompi_mutex_t ib_lock;
+    /**< lock for accessing module state */
+
+    int ib_mem_registry_hints_log_size;
+    /**< log2 size of hints hash array used by memory registry */
+};
+typedef struct mca_bmi_ib_component_t mca_bmi_ib_component_t;
+struct mca_bmi_ib_recv_frag_t;
+
+extern mca_bmi_ib_component_t mca_bmi_ib_component;
+
+/**
+ * IB PTL Interface
+ */
+struct mca_bmi_ib_module_t {
+    mca_bmi_base_module_t super;  /**< base PTL interface */
+    bool bmi_inited;
+    mca_bmi_ib_registration_t ib_reg[256];
+    VAPI_hca_id_t   hca_id;       /**< ID of HCA */
+    VAPI_hca_port_t port;         /**< IB port of this PTL */
+    VAPI_hca_hndl_t nic;          /**< NIC handle */
+    VAPI_pd_hndl_t  ptag;         /**< Protection Domain tag */
+    VAPI_cq_hndl_t  cq_hndl;      /**< Completion Queue handle */
+
+    EVAPI_async_handler_hndl_t async_handler;
+    /**< Async event handler used to detect weird/unknown events */
+
+    mca_bmi_ib_mem_registry_t mem_registry;  /**< registry of memory regions */
+    ompi_free_list_t ib_frags1;   /**< free list of buffer descriptors */
+    ompi_free_list_t send_free;   /**< free list of send fragments;
+                                       initialized in component_init */
+    ompi_list_t repost;           /**< list of buffers to repost */
+};
+
+typedef struct mca_bmi_ib_module_t mca_bmi_ib_module_t;
+
+extern mca_bmi_ib_module_t mca_bmi_ib_module;
+
+/**
+ * IB FIN header
+ */
+typedef struct mca_bmi_ib_fin_header_t mca_bmi_ib_fin_header_t;
+
+struct mca_bmi_ib_fin_header_t {
+    mca_bmi_base_frag_header_t frag_hdr;
+    ompi_ptr_t mr_addr;
+    uint64_t mr_size;
+};
+
+/**
+ * Register IB component parameters with the MCA framework
+ */
+extern int mca_bmi_ib_component_open(void);
+
+/**
+ * Any final cleanup before being unloaded.
+ */
+extern int mca_bmi_ib_component_close(void);
+
+/**
+ * IB component initialization.
+ *
+ * @param num_bmi_modules (OUT)           Number of BMIs returned in BMI array.
+ * @param allow_multi_user_threads (OUT)  Flag indicating whether BMI supports user threads (TRUE)
+ * @param have_hidden_threads (OUT)       Flag indicating whether BMI uses threads (TRUE)
+ *
+ * (1) read interface list from kernel and compare against component parameters
+ *     then create a BMI instance for selected interfaces
+ * (2) setup IB listen socket for incoming connection attempts
+ * (3) publish BMI addressing info
+ *
+ */
+extern mca_bmi_base_module_t** mca_bmi_ib_component_init(
+    int *num_bmi_modules,
+    bool allow_multi_user_threads,
+    bool have_hidden_threads
+);
+
+/**
+ * IB component control.
+ */
+extern int mca_bmi_ib_component_control(
+    int param,
+    void* value,
+    size_t size
+);
+
+/**
+ * IB component progress.
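+ *
+ * Polls the completion queue of every IB module once, dispatches any
+ * completed send/recv descriptors, and returns the number of
+ * completions handled (see bmi_ib_component.c).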
+ */ +extern int mca_bmi_ib_component_progress( + mca_bmi_tstamp_t tstamp +); + + + +/** + * Cleanup any resources held by the BMI. + * + * @param bmi BMI instance. + * @return OMPI_SUCCESS or error status on failure. + */ + +extern int mca_bmi_ib_finalize( + struct mca_bmi_base_module_t* bmi +); + + +/** + * PML->BMI notification of change in the process list. + * + * @param bmi (IN) + * @param nprocs (IN) Number of processes + * @param procs (IN) Set of processes + * @param peers (OUT) Set of (optional) peer addressing info. + * @param peers (IN/OUT) Set of processes that are reachable via this BMI. + * @return OMPI_SUCCESS or error status on failure. + * + */ + +extern int mca_bmi_ib_add_procs( + struct mca_bmi_base_module_t* bmi, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_bmi_base_endpoint_t** peers, + ompi_bitmap_t* reachable +); + +/** + * PML->BMI notification of change in the process list. + * + * @param bmi (IN) BMI instance + * @param nproc (IN) Number of processes. + * @param procs (IN) Set of processes. + * @param peers (IN) Set of peer data structures. + * @return Status indicating if cleanup was successful + * + */ +extern int mca_bmi_ib_del_procs( + struct mca_bmi_base_module_t* bmi, + size_t nprocs, + struct ompi_proc_t **procs, + struct mca_bmi_base_endpoint_t** peers +); + +/** + * PML->BMI Initialize a send request for TCP cache. + * + * @param bmi (IN) BMI instance + * @param request (IN) Pointer to allocated request. + * + **/ +extern int mca_bmi_ib_request_init( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_base_send_request_t* + ); + +/** + * PML->BMI Cleanup a send request that is being removed from the cache. + * + * @param bmi (IN) BMI instance + * @param request (IN) Pointer to allocated request. + * + **/ +extern void mca_bmi_ib_request_fini( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_base_send_request_t* + ); + +/** + * PML->BMI Return a send request to the BMI modules free list. + * + * @param bmi (IN) BMI instance + * @param request (IN) Pointer to allocated request. + * + */ +extern void mca_bmi_ib_request_return( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_base_send_request_t* +); + +/** + * PML->BMI Notification that a receive fragment has been matched. + * + * @param bmi (IN) BMI instance + * @param recv_frag (IN) Receive fragment + * + */ +extern void mca_bmi_ib_matched( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_base_recv_frag_t* frag +); + +/** + * PML->BMI Initiate a send of the specified size. + * + * @param bmi (IN) BMI instance + * @param bmi_base_peer (IN) BMI peer addressing + * @param send_request (IN/OUT) Send request (allocated by PML via mca_bmi_base_request_alloc_fn_t) + * @param size (IN) Number of bytes PML is requesting BMI to deliver + * @param flags (IN) Flags that should be passed to the peer via the message header. + * @param request (OUT) OMPI_SUCCESS if the BMI was able to queue one or more fragments + */ +extern int mca_bmi_ib_send( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_base_endpoint_t* bmi_peer, + struct mca_bmi_base_send_request_t*, + size_t offset, + size_t size, + int flags +); + +/** + * PML->BMI Initiate a put of the specified size. 
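+ * (The implementation in bmi_ib.c is currently a stub that returns
+ * OMPI_ERR_NOT_IMPLEMENTED.)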
+ * + * @param bmi (IN) BMI instance + * @param bmi_base_peer (IN) BMI peer addressing + * @param send_request (IN/OUT) Send request (allocated by PML via mca_bmi_base_request_alloc_fn_t) + * @param size (IN) Number of bytes PML is requesting BMI to deliver + * @param flags (IN) Flags that should be passed to the peer via the message header. + * @param request (OUT) OMPI_SUCCESS if the BMI was able to queue one or more fragments + */ +extern int mca_bmi_ib_put( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_base_endpoint_t* bmi_peer, + struct mca_bmi_base_send_request_t*, + size_t offset, + size_t size, + int flags +); + +/** + * Return a recv fragment to the modules free list. + * + * @param bmi (IN) BMI instance + * @param frag (IN) IB receive fragment + * + */ +extern void mca_bmi_ib_recv_frag_return( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_ib_recv_frag_t* frag +); + + +/** + * Return a send fragment to the modules free list. + * + * @param bmi (IN) BMI instance + * @param frag (IN) IB send fragment + * + */ +extern void mca_bmi_ib_send_frag_return( + struct mca_bmi_base_module_t* bmi, + struct mca_bmi_ib_send_frag_t* +); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/src/mca/bmi/ib/bmi_ib_addr.h b/src/mca/bmi/ib/bmi_ib_addr.h new file mode 100644 index 0000000000..066ed1fdfc --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_addr.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BMI_IB_ADDR_H +#define MCA_BMI_IB_ADDR_H + +#include "bmi_ib.h" + +#endif diff --git a/src/mca/bmi/ib/bmi_ib_component.c b/src/mca/bmi/ib/bmi_ib_component.c new file mode 100644 index 0000000000..8faa7894c4 --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_component.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* #include */ + +/* Open MPI includes */ +#include "ompi_config.h" +#include "include/constants.h" +#include "event/event.h" +#include "util/if.h" +#include "util/argv.h" +#include "util/output.h" +#include "mca/pml/pml.h" +#include "mca/bmi/bmi.h" +#include "mca/base/mca_base_param.h" +#include "mca/base/mca_base_module_exchange.h" +#include "mca/errmgr/errmgr.h" + +/* IB bmi includes */ +#include "bmi_ib.h" + + +mca_bmi_ib_component_t mca_bmi_ib_component = { + { + /* First, the mca_base_component_t struct containing meta information + about the component itself */ + + { + /* Indicate that we are a pml v1.0.0 component (which also implies a + specific MCA version) */ + + MCA_BMI_BASE_VERSION_1_0_0, + + "ib", /* MCA component name */ + 1, /* MCA component major version */ + 0, /* MCA component minor version */ + 0, /* MCA component release version */ + mca_bmi_ib_component_open, /* component open */ + mca_bmi_ib_component_close /* component close */ + }, + + /* Next the MCA v1.0.0 component meta data */ + + { + /* Whether the component is checkpointable or not */ + + false + }, + + mca_bmi_ib_component_init, + mca_bmi_ib_component_control, + mca_bmi_ib_component_progress, + } +}; + + +/* + * utility routines for parameter registration + */ + +static inline char* mca_bmi_ib_param_register_string( + const char* param_name, + const char* default_value) +{ + char *param_value; + int id = mca_base_param_register_string("bmi","ib",param_name,NULL,default_value); + mca_base_param_lookup_string(id, ¶m_value); + return param_value; +} + +static inline int mca_bmi_ib_param_register_int( + const char* param_name, + int default_value) +{ + int id = mca_base_param_register_int("bmi","ib",param_name,NULL,default_value); + int param_value = default_value; + mca_base_param_lookup_int(id,¶m_value); + return param_value; +} + +/* + * Called by MCA framework to open the component, registers + * component parameters. 
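+ *
+ * Parameters registered here surface under the "bmi_ib_" prefix, so the
+ * free-list sizing, for example, can be tuned at run time with something
+ * like (illustrative values; the defaults below are 8/1024/32):
+ *
+ *   mpirun -mca bmi_ib_free_list_num 16 -mca bmi_ib_free_list_max 2048 ...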
+ */ + +int mca_bmi_ib_component_open(void) +{ + /* register component parameters */ + mca_bmi_ib_module.super.bmi_exclusivity = + mca_bmi_ib_param_register_int ("exclusivity", 0); + + mca_bmi_ib_module.super.bmi_first_frag_size = + mca_bmi_ib_param_register_int ("first_frag_size", + (MCA_BMI_IB_FIRST_FRAG_SIZE + - sizeof(mca_bmi_base_header_t))); + + mca_bmi_ib_module.super.bmi_min_frag_size = + mca_bmi_ib_param_register_int ("min_frag_size", + (MCA_BMI_IB_FIRST_FRAG_SIZE + - sizeof(mca_bmi_base_header_t))); + + mca_bmi_ib_module.super.bmi_max_frag_size = + mca_bmi_ib_param_register_int ("max_frag_size", 2<<30); + + /* register IB component parameters */ + mca_bmi_ib_component.ib_free_list_num = + mca_bmi_ib_param_register_int ("free_list_num", 8); + mca_bmi_ib_component.ib_free_list_max = + mca_bmi_ib_param_register_int ("free_list_max", 1024); + mca_bmi_ib_component.ib_free_list_inc = + mca_bmi_ib_param_register_int ("free_list_inc", 32); + mca_bmi_ib_component.ib_mem_registry_hints_log_size = + mca_bmi_ib_param_register_int ("hints_log_size", 8); + + /* initialize global state */ + mca_bmi_ib_component.ib_num_bmis=0; + mca_bmi_ib_component.ib_bmis=NULL; + OBJ_CONSTRUCT(&mca_bmi_ib_component.ib_procs, ompi_list_t); + OBJ_CONSTRUCT (&mca_bmi_ib_component.ib_recv_frags, ompi_free_list_t); + + return OMPI_SUCCESS; +} + +/* + * component cleanup - sanity checking of queue lengths + */ + +int mca_bmi_ib_component_close(void) +{ + D_PRINT(""); + /* Stub */ + return OMPI_SUCCESS; +} + +/* + * IB component initialization: + * (1) read interface list from kernel and compare against component parameters + * then create a BMI instance for selected interfaces + * (2) setup IB listen socket for incoming connection attempts + * (3) register BMI parameters with the MCA + */ +mca_bmi_base_module_t** mca_bmi_ib_component_init(int *num_bmi_modules, + bool enable_progress_threads, + bool enable_mpi_threads) +{ + VAPI_ret_t vapi_ret; + VAPI_hca_id_t* hca_ids; + mca_bmi_base_module_t** bmis; + int i, ret; + + /* initialization */ + *num_bmi_modules = 0; + + /* query the list of available hcas */ + vapi_ret=EVAPI_list_hcas(0, &(mca_bmi_ib_component.ib_num_bmis), NULL); + if( VAPI_EAGAIN != vapi_ret || 0 == mca_bmi_ib_component.ib_num_bmis ) { + ompi_output(0,"Warning: no IB HCAs found\n"); + return NULL; + } + + hca_ids = (VAPI_hca_id_t*) malloc(mca_bmi_ib_component.ib_num_bmis * sizeof(VAPI_hca_id_t)); + if(NULL == hca_ids) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } + vapi_ret=EVAPI_list_hcas(mca_bmi_ib_component.ib_num_bmis, &mca_bmi_ib_component.ib_num_bmis, hca_ids); + if( VAPI_OK != vapi_ret ) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } + + /* Allocate space for bmi modules */ + mca_bmi_ib_component.ib_bmis = (mca_bmi_ib_module_t*) malloc(sizeof(mca_bmi_ib_module_t) * + mca_bmi_ib_component.ib_num_bmis); + if(NULL == mca_bmi_ib_component.ib_bmis) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } + bmis = (struct mca_bmi_base_module_t**) + malloc(mca_bmi_ib_component.ib_num_bmis * sizeof(struct mca_bmi_ib_module_t*)); + if(NULL == bmis) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return NULL; + } + + /* Initialize pool of receive fragments */ + ompi_free_list_init (&(mca_bmi_ib_component.ib_recv_frags), + sizeof (mca_bmi_ib_recv_frag_t), + OBJ_CLASS (mca_bmi_ib_recv_frag_t), + mca_bmi_ib_component.ib_free_list_num, + mca_bmi_ib_component.ib_free_list_max, + mca_bmi_ib_component.ib_free_list_inc, NULL); + + /* Initialize each module */ + for(i 
= 0; i < mca_bmi_ib_component.ib_num_bmis; i++) { + mca_bmi_ib_module_t* ib_bmi = &mca_bmi_ib_component.ib_bmis[i]; + + /* Initialize the modules function pointers */ + memcpy(ib_bmi, &mca_bmi_ib_module, sizeof(mca_bmi_ib_module)); + + /* Initialize module state */ + OBJ_CONSTRUCT(&ib_bmi->send_free, ompi_free_list_t); + OBJ_CONSTRUCT(&ib_bmi->repost, ompi_list_t); + + ompi_free_list_init(&ib_bmi->send_free, + sizeof(mca_bmi_ib_send_frag_t), + OBJ_CLASS(mca_bmi_ib_send_frag_t), + mca_bmi_ib_component.ib_free_list_num, + mca_bmi_ib_component.ib_free_list_max, + mca_bmi_ib_component.ib_free_list_inc, + NULL); + + + memcpy(ib_bmi->hca_id, hca_ids[i], sizeof(ib_bmi->hca_id)); + if(mca_bmi_ib_module_init(ib_bmi) != OMPI_SUCCESS) { + free(hca_ids); + return NULL; + } + + /* Initialize the send descriptors */ + if(mca_bmi_ib_send_frag_register(ib_bmi) != OMPI_SUCCESS) { + free(hca_ids); + return NULL; + } + bmis[i] = &ib_bmi->super; + } + + /* Post OOB receive to support dynamic connection setup */ + mca_bmi_ib_post_recv(); + + *num_bmi_modules = mca_bmi_ib_component.ib_num_bmis; + free(hca_ids); + return bmis; +} + +/* + * IB component control + */ + +int mca_bmi_ib_component_control(int param, void* value, size_t size) +{ + return OMPI_SUCCESS; +} + + +/* + * IB component progress. + */ + +#define MCA_BMI_IB_DRAIN_NETWORK(nic, cq_hndl, comp_type, comp_addr) \ +{ \ + VAPI_ret_t ret; \ + VAPI_wc_desc_t comp; \ + \ + ret = VAPI_poll_cq(nic, cq_hndl, &comp); \ + if(VAPI_OK == ret) { \ + if(comp.status != VAPI_SUCCESS) { \ + ompi_output(0, "Got error : %s, Vendor code : %d Frag : %p", \ + VAPI_wc_status_sym(comp.status), \ + comp.vendor_err_syndrome, comp.id); \ + *comp_type = IB_COMP_ERROR; \ + *comp_addr = NULL; \ + } else { \ + if(VAPI_CQE_SQ_SEND_DATA == comp.opcode) { \ + *comp_type = IB_COMP_SEND; \ + *comp_addr = (void*) (unsigned long) comp.id; \ + } else if(VAPI_CQE_RQ_SEND_DATA == comp.opcode) { \ + *comp_type = IB_COMP_RECV; \ + *comp_addr = (void*) (unsigned long) comp.id; \ + } else if(VAPI_CQE_SQ_RDMA_WRITE == comp.opcode) { \ + *comp_type = IB_COMP_RDMA_W; \ + *comp_addr = (void*) (unsigned long) comp.id; \ + } else { \ + ompi_output(0, "VAPI_poll_cq: returned unknown opcode : %d\n", \ + comp.opcode); \ + *comp_type = IB_COMP_ERROR; \ + *comp_addr = NULL; \ + } \ + } \ + } else { \ + /* No completions from the network */ \ + *comp_type = IB_COMP_NOTHING; \ + *comp_addr = NULL; \ + } \ +} + + +int mca_bmi_ib_component_progress(mca_bmi_tstamp_t tstamp) +{ + int i; + int count = 0; + + /* Poll for completions */ + for(i = 0; i < mca_bmi_ib_component.ib_num_bmis; i++) { + mca_bmi_ib_module_t* ib_bmi = &mca_bmi_ib_component.ib_bmis[i]; + int comp_type = IB_COMP_NOTHING; + void* comp_addr; + + MCA_BMI_IB_DRAIN_NETWORK(ib_bmi->nic, ib_bmi->cq_hndl, &comp_type, &comp_addr); + + /* Handle n/w completions */ + switch(comp_type) { + case IB_COMP_SEND : + + /* Process a completed send */ + mca_bmi_ib_send_frag_send_complete(ib_bmi, (mca_bmi_ib_send_frag_t*)comp_addr); + count++; + break; + + case IB_COMP_RECV : + + /* Process incoming receives */ + mca_bmi_ib_process_recv(ib_bmi, comp_addr); + /* Re post recv buffers */ + if(ompi_list_get_size(&ib_bmi->repost) <= 1) { + ompi_list_append(&ib_bmi->repost, (ompi_list_item_t*)comp_addr); + } else { + ompi_list_item_t* item; + while(NULL != (item = ompi_list_remove_first(&ib_bmi->repost))) { + mca_bmi_ib_buffer_repost(ib_bmi->nic, item); + } + mca_bmi_ib_buffer_repost(ib_bmi->nic, comp_addr); + } + count++; + break; + + case IB_COMP_RDMA_W : + + 
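+            /* local completion of an RDMA write -- the put path is not
+             * wired up yet, so just log it */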
ompi_output(0, "%s:%d RDMA not implemented\n", __FILE__,__LINE__); + count++; + break; + + case IB_COMP_NOTHING: + break; + default: + ompi_output(0, "Errorneous network completion"); + break; + } + } + return count; +} + diff --git a/src/mca/bmi/ib/bmi_ib_endpoint.h b/src/mca/bmi/ib/bmi_ib_endpoint.h new file mode 100644 index 0000000000..ce2719c3c8 --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_endpoint.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BMI_IB_PEER_H +#define MCA_BMI_IB_PEER_H + +#include "class/ompi_list.h" +#include "event/event.h" +#include "mca/pml/pml.h" +#include "mca/bmi/bmi.h" +#include "bmi_ib_recvfrag.h" +#include "bmi_ib_sendfrag.h" +#include "bmi_ib_priv.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OBJ_CLASS_DECLARATION(mca_bmi_ib_endpoint_t); + +/** + * State of IB peer connection. + */ + +typedef enum { + /* Defines the state in which this BMI instance + * has started the process of connection */ + MCA_BMI_IB_CONNECTING, + + /* Waiting for ack from peer */ + MCA_BMI_IB_CONNECT_ACK, + + /* Connected ... both sender & receiver have + * buffers associated with this connection */ + MCA_BMI_IB_CONNECTED, + + /* Connection is closed, there are no resources + * associated with this */ + MCA_BMI_IB_CLOSED, + + /* Maximum number of retries have been used. + * Report failure on send to upper layer */ + MCA_BMI_IB_FAILED +} mca_bmi_ib_endpoint_state_t; + +/** + * An abstraction that represents a connection to a peer process. + * An instance of mca_bmi_base_endpoint_t is associated w/ each process + * and BMI pair at startup. 
However, connections to the peer + * are established dynamically on an as-needed basis: + */ + +struct mca_bmi_base_endpoint_t { + ompi_list_item_t super; + + struct mca_bmi_ib_module_t* peer_bmi; + /**< BMI instance that created this connection */ + + struct mca_bmi_ib_proc_t* peer_proc; + /**< proc structure corresponding to peer */ + + mca_bmi_ib_endpoint_state_t peer_state; + /**< current state of the connection */ + + size_t peer_retries; + /**< number of connection retries attempted */ + + double peer_tstamp; + /**< timestamp of when the first connection was attempted */ + + ompi_mutex_t peer_send_lock; + /**< lock for concurrent access to peer state */ + + ompi_mutex_t peer_recv_lock; + /**< lock for concurrent access to peer state */ + + ompi_list_t pending_send_frags; + /**< list of pending send frags for this peer */ + + VAPI_qp_num_t rem_qp_num; + /* Remote side QP number */ + + IB_lid_t rem_lid; + /* Local identifier of the remote process */ + + VAPI_qp_hndl_t lcl_qp_hndl; + /* Local QP handle */ + + VAPI_qp_prop_t lcl_qp_prop; + /* Local QP properties */ + + ib_buffer_t *lcl_recv; + /* Remote resources associated with this connection */ +}; + +typedef struct mca_bmi_base_endpoint_t mca_bmi_base_endpoint_t; +typedef struct mca_bmi_base_endpoint_t mca_bmi_ib_endpoint_t; + +int mca_bmi_ib_peer_send(mca_bmi_base_endpoint_t*, mca_bmi_ib_send_frag_t*); +int mca_bmi_ib_peer_connect(mca_bmi_base_endpoint_t*); +void mca_bmi_ib_post_recv(void); + +void mca_bmi_ib_progress_send_frags(mca_bmi_ib_endpoint_t*); + +#define DUMP_PEER(peer_ptr) { \ + ompi_output(0, "[%s:%d] ", __FILE__, __LINE__); \ + ompi_output(0, "Dumping peer %d state", \ + peer->peer_proc->proc_guid.vpid); \ + ompi_output(0, "Local QP hndl : %d", \ + peer_ptr->peer_conn->lres->qp_hndl); \ + ompi_output(0, "Local QP num : %d", \ + peer_ptr->peer_conn->lres->qp_prop.qp_num); \ + ompi_output(0, "Remote QP num : %d", \ + peer_ptr->peer_conn->rres->qp_num); \ + ompi_output(0, "Remote LID : %d", \ + peer_ptr->peer_conn->rres->lid); \ +} + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/src/mca/bmi/ib/bmi_ib_frag.h b/src/mca/bmi/ib/bmi_ib_frag.h new file mode 100644 index 0000000000..3ad917fd1d --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_frag.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BMI_IB_SEND_FRAG_H +#define MCA_BMI_IB_SEND_FRAG_H + +#include "ompi_config.h" +#include "mca/bmi/base/bmi_base_sendreq.h" +#include "mca/bmi/base/bmi_base_sendfrag.h" + +#include "bmi_ib_priv.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OBJ_CLASS_DECLARATION(mca_bmi_ib_send_frag_t); + +typedef enum { + MCA_BMI_IB_FRAG_SEND, + MCA_BMI_IB_FRAG_PUT, + MCA_BMI_IB_FRAG_GET, + MCA_BMI_IB_FRAG_ACK +} mca_bmi_ib_frag_type_t; + +/** + * IB send fragment derived type. 
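+ *
+ * Extends mca_bmi_base_descriptor_t so the PML can treat it as a generic
+ * descriptor; 'segment' describes the registered buffer handed to the
+ * HCA, and 'type' selects send/put/get/ack handling on completion.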
+ */
+struct mca_bmi_ib_frag_t {
+    mca_bmi_base_descriptor_t base;
+    mca_bmi_base_segment_t segment;
+    struct mca_bmi_base_endpoint_t *endpoint;
+    mca_bmi_ib_frag_type_t type;
+    mca_bmi_base_tag_t tag;
+
+    size_t size;
+    int rc;
+    bool frag_ack_pending;
+};
+typedef struct mca_bmi_ib_frag_t mca_bmi_ib_frag_t;
+
+
+
+/*
+ * Allocate an IB send descriptor
+ *
+ */
+#define MCA_BMI_IB_FRAG_ALLOC1(frag, rc)                             \
+{                                                                    \
+                                                                     \
+    ompi_list_item_t *item;                                          \
+    OMPI_FREE_LIST_WAIT(&mca_bmi_ib_module.ib_frags1, item, rc);     \
+    frag = (mca_bmi_ib_frag_t*) item;                                \
+}
+
+#define MCA_BMI_IB_FRAG_RETURN1(frag)                                \
+{                                                                    \
+    OMPI_FREE_LIST_RETURN(&mca_bmi_ib_module.ib_frags1,              \
+                          (ompi_list_item_t*)(frag));                \
+}
+
+int mca_bmi_ib_send_frag_register(mca_bmi_ib_module_t *ib_bmi)
+{
+    int i, rc, num_send_frags;
+    ompi_list_item_t *item;
+    ompi_free_list_t *flist = &ib_bmi->ib_frags1;
+    ib_buffer_t *ib_buf_ptr;
+    mca_bmi_ib_send_frag_t *ib_send_frag;
+
+    num_send_frags = ompi_list_get_size(&(flist->super));
+    item = ompi_list_get_first(&((flist)->super));
+
+    /* Register the buffers */
+    for(i = 0; i < num_send_frags;
+            item = ompi_list_get_next(item), i++) {
+
+        ib_send_frag = (mca_bmi_ib_send_frag_t *) item;
+
+        ib_send_frag->frag_progressed = 0;
+
+        ib_buf_ptr = (ib_buffer_t *) &ib_send_frag->ib_buf;
+
+        rc = mca_bmi_ib_register_mem(ib_bmi->nic, ib_bmi->ptag,
+                (void*) ib_buf_ptr->buf,
+                MCA_BMI_IB_FIRST_FRAG_SIZE,
+                &ib_buf_ptr->hndl);
+        if(rc != OMPI_SUCCESS) {
+            return OMPI_ERROR;
+        }
+
+        IB_PREPARE_SEND_DESC(ib_buf_ptr, 0,
+                MCA_BMI_IB_FIRST_FRAG_SIZE, ib_buf_ptr);
+    }
+
+    return OMPI_SUCCESS;
+}
+
+
+
+
+struct mca_bmi_ib_module_t;
+
+mca_bmi_ib_send_frag_t* mca_bmi_ib_alloc_send_frag(
+    struct mca_bmi_ib_module_t* ib_bmi,
+    mca_bmi_base_send_request_t* request);
+
+int mca_bmi_ib_send_frag_register(struct mca_bmi_ib_module_t *bmi);
+void mca_bmi_ib_send_frag_send_complete(struct mca_bmi_ib_module_t *bmi,
+    mca_bmi_ib_send_frag_t*);
+
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/src/mca/bmi/ib/bmi_ib_memory.c b/src/mca/bmi/ib/bmi_ib_memory.c
new file mode 100644
index 0000000000..59f6f2f5d2
--- /dev/null
+++ b/src/mca/bmi/ib/bmi_ib_memory.c
@@ -0,0 +1,311 @@
+/* Standard system includes */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+/* Open MPI includes */
+
+/* Other IB BMI includes */
+#include "bmi_ib.h"
+#include "bmi_ib_memory.h"
+#include "bmi_ib_priv.h"
+
+static void mca_bmi_ib_mem_registry_construct(ompi_object_t *object);
+static void mca_bmi_ib_mem_registry_destruct(ompi_object_t *object);
+static void mca_bmi_ib_mem_registry_info_construct(ompi_object_t *object);
+static void mca_bmi_ib_mem_registry_info_destruct(ompi_object_t *object);
+
+static int mca_bmi_ib_mem_registry_info_compare(void *key1, void *key2);
+
+static int mca_bmi_ib_mem_registry_real_deregister(
+    mca_bmi_ib_mem_registry_t *registry,
+    mca_bmi_ib_mem_registry_info_t *info);
+
+OBJ_CLASS_INSTANCE(mca_bmi_ib_mem_registry_info_t, ompi_list_item_t,
+    mca_bmi_ib_mem_registry_info_construct, mca_bmi_ib_mem_registry_info_destruct);
+
+OBJ_CLASS_INSTANCE(mca_bmi_ib_mem_registry_t, ompi_rb_tree_t,
+    mca_bmi_ib_mem_registry_construct, mca_bmi_ib_mem_registry_destruct);
+
+static void mca_bmi_ib_mem_registry_construct(ompi_object_t *object)
+{
+    mca_bmi_ib_mem_registry_t *registry = (mca_bmi_ib_mem_registry_t *)object;
+    int i;
+
+    ompi_rb_tree_init(&(registry->rb_tree), mca_bmi_ib_mem_registry_info_compare);
+
+    OBJ_CONSTRUCT(&(registry->info_free_list), ompi_free_list_t);
+    ompi_free_list_init(&registry->info_free_list,
sizeof(mca_bmi_ib_mem_registry_info_t), + OBJ_CLASS(mca_bmi_ib_mem_registry_info_t), 32, -1, 32, NULL); + + registry->hints_log_size = mca_bmi_ib_component.ib_mem_registry_hints_log_size; + /* sanity check -- enforce lower bound for hash calculation */ + if (registry->hints_log_size < 1) { + registry->hints_log_size = 1; + } + + registry->hints = (ompi_ptr_t *)malloc((1 << registry->hints_log_size) * + sizeof(ompi_ptr_t)); + + registry->hints_log_size = mca_bmi_ib_component.ib_mem_registry_hints_log_size; + registry->hints_size = (registry->hints) ? (1 << registry->hints_log_size) : 0; + for (i = 0; i < registry->hints_size; i++) { + registry->hints[i].pval = (void *)NULL; + } + + registry->ib_bmi = NULL; + registry->evictable = NULL; + + return; +} + +static void mca_bmi_ib_mem_registry_destruct(ompi_object_t *object) +{ + /* memory regions that are being tracked are not deregistered here */ + mca_bmi_ib_mem_registry_t *registry = (mca_bmi_ib_mem_registry_t *)object; + OBJ_DESTRUCT(&(registry->info_free_list)); + if (registry->hints_size != 0) { + free(registry->hints); + registry->hints = (ompi_ptr_t *)NULL; + registry->hints_size = 0; + } + return; +} + +static void mca_bmi_ib_mem_registry_info_construct(ompi_object_t *object) +{ + mca_bmi_ib_mem_registry_info_t *info = (mca_bmi_ib_mem_registry_info_t *)object; + info->next = NULL; + info->ref_cnt = 0; + info->hndl = VAPI_INVAL_HNDL; + memset(&(info->request), 0, sizeof(VAPI_mr_t)); + memset(&(info->reply), 0, sizeof(VAPI_mr_t)); + return; +} + +static void mca_bmi_ib_mem_registry_info_destruct(ompi_object_t *object) +{ + return; +} + +static int mca_bmi_ib_mem_registry_info_compare(void *request, void *treenode) +{ + int result; + VAPI_mr_t *mr1 = (VAPI_mr_t *)request; + VAPI_mr_t *mr2 = (VAPI_mr_t *)treenode; + uint64_t start1 = mr1->start; + uint64_t start2 = mr2->start; + uint64_t end1 = start1 + mr1->size; + uint64_t end2 = start2 + mr2->size; + + if (end1 < start2) { + /* non-overlapping mr1 < mr2 */ + result = -1; + } + else if (start1 > end2) { + /* non-overlapping mr1 > mr2 */ + result = 1; + } + else if ((end1 <= end2) && (start1 >= start2)) { + /* completely overlapping mr1 and mr2 (mr2 may be bigger) */ + if ((mr1->acl & mr2->acl) == mr1->acl) { + /* minimum access permissions met */ + result = 0; + } + else { + /* oops -- access permissions not good enough */ + result = 1; + } + } + else if (start1 < start2) { + /* partially overlapping mr1 < mr2 */ + result = -1; + } + else { + /* partially overlapping mr1 > mr2 */ + result = 1; + } + + return result; +} + +void mca_bmi_ib_mem_registry_clean_evictables( + mca_bmi_ib_mem_registry_t *registry, + mca_bmi_ib_mem_registry_info_t *info) +{ + mca_bmi_ib_mem_registry_info_t *tmp = registry->evictable; + mca_bmi_ib_mem_registry_info_t *prev = NULL; + + while (NULL != tmp) { + if (tmp == info) { + if (NULL == prev) { + /* no more entries left -- no evictable list */ + registry->evictable = NULL; + } + else { + /* remove this entry from the evictable list */ + prev->next = tmp->next; + } + /* clear this entry's evictable link */ + tmp->next = NULL; + break; + } + prev = tmp; + tmp = tmp->next; + } + return; +} + +mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_mem_registry_register( + mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *mr) +{ + mca_bmi_ib_mem_registry_info_t *info = mca_bmi_ib_mem_registry_find(registry, mr); + mca_bmi_ib_mem_registry_info_t *next_to_evict; + ompi_list_item_t *item; + VAPI_ret_t vapi_result; + int rc; + + if (info == (mca_bmi_ib_mem_registry_info_t *)NULL) { 
+ /* create new entry and register memory region */ + item = (ompi_list_item_t *)info; + OMPI_FREE_LIST_GET(&(registry->info_free_list), item, rc); + info = (mca_bmi_ib_mem_registry_info_t *)item; + if (OMPI_SUCCESS != rc) { + /* error - return null pointer */ + return info; + } + memcpy(&(info->request),mr,sizeof(VAPI_mr_t)); + info->ref_cnt = 1; + do { + vapi_result = VAPI_register_mr(registry->ib_bmi->nic, mr, + &(info->hndl), &(info->reply)); + if (VAPI_OK != vapi_result) { + if (VAPI_EAGAIN == vapi_result) { + /* evict an unused memory region, if at all possible */ + if (NULL != registry->evictable) { + next_to_evict = registry->evictable->next; + mca_bmi_ib_mem_registry_real_deregister(registry, registry->evictable); + registry->evictable = next_to_evict; + } + } + else { + /* fatal error */ + item = (ompi_list_item_t *)info; + OMPI_FREE_LIST_RETURN(&(registry->info_free_list), item); + info = NULL; + return info; + } + } + } while ((VAPI_OK != vapi_result) && (NULL != info)); + /* insert a reference to this information into the red/black tree */ + rc = ompi_rb_tree_insert(&(registry->rb_tree), &(info->reply), info); + /* aargh! what do we do if the tree insert fails... */ + mca_bmi_ib_mem_registry_insert_hint(registry, &(info->reply), info); + } + else { + if (0 == info->ref_cnt) { + /* make sure we're not on the evictable list */ + mca_bmi_ib_mem_registry_clean_evictables(registry, info); + } + (info->ref_cnt)++; + } + + return info; +} + +mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_register_mem_with_registry( + mca_bmi_ib_module_t *ib_module, + void *addr, size_t len) +{ + mca_bmi_ib_mem_registry_info_t *info; + VAPI_mr_t mr; + + mr.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE; + mr.l_key = 0; + mr.r_key = 0; + mr.pd_hndl = ib_module->ptag; + mr.size = len; + mr.start = (VAPI_virt_addr_t) (MT_virt_addr_t) addr; + mr.type = VAPI_MR; + + info = mca_bmi_ib_mem_registry_register(&(ib_module->mem_registry),&mr); + return info; +} + +int mca_bmi_ib_deregister_mem_with_registry( + mca_bmi_ib_module_t *ib_module, + void *addr, size_t len) +{ + VAPI_mr_t mr; + int rc; + + mr.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE; + mr.l_key = 0; + mr.r_key = 0; + mr.pd_hndl = ib_module->ptag; + mr.size = len; + mr.start = (VAPI_virt_addr_t) (MT_virt_addr_t) addr; + mr.type = VAPI_MR; + + rc = mca_bmi_ib_mem_registry_deregister(&(ib_module->mem_registry),&mr); + return rc; +} + +static int mca_bmi_ib_mem_registry_real_deregister( + mca_bmi_ib_mem_registry_t *registry, + mca_bmi_ib_mem_registry_info_t *info) +{ + ompi_list_item_t *item; + VAPI_ret_t vapi_result; + int i; + + /* clear hints array of references to this info object */ + for (i = 0; i < registry->hints_size; i++) { + if (registry->hints[i].pval == info) { + registry->hints[i].pval = (void *)NULL; + } + } + /* delete the info object from the red/black tree */ + ompi_rb_tree_delete(&(registry->rb_tree), &(info->reply)); + /* do the real deregistration */ + vapi_result = VAPI_deregister_mr(registry->ib_bmi->nic, info->hndl); + /* return the info object to the free list */ + item = (ompi_list_item_t *)info; + OMPI_FREE_LIST_RETURN(&(registry->info_free_list), item); + /* return an error if we could not successfully deregister memory region */ + if (VAPI_OK != vapi_result) { + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + +int mca_bmi_ib_mem_registry_deregister( + mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *mr) +{ + mca_bmi_ib_mem_registry_info_t *info = mca_bmi_ib_mem_registry_find(registry, mr); + + if (info != NULL) { + if 
(info->ref_cnt > 0) { + (info->ref_cnt)--; + if (0 == info->ref_cnt) { + info->next = registry->evictable; + registry->evictable = info; + } + } + } + else { + return OMPI_ERR_NOT_FOUND; + } + + return OMPI_SUCCESS; +} + + +int mca_bmi_ib_mem_registry_init( + mca_bmi_ib_mem_registry_t *registry, + struct mca_bmi_ib_module_t *ib_bmi) +{ + registry->ib_bmi = ib_bmi; + return OMPI_SUCCESS; +} + diff --git a/src/mca/bmi/ib/bmi_ib_memory.h b/src/mca/bmi/ib/bmi_ib_memory.h new file mode 100644 index 0000000000..2ed3774b58 --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_memory.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_PTL_IB_MEMORY_H +#define MCA_PTL_IB_MEMORY_H + +/* Standard system includes */ +#include + +/* Open MPI includes */ +#include "include/types.h" +#include "include/constants.h" +#include "class/ompi_object.h" +#include "class/ompi_list.h" +#include "class/ompi_rb_tree.h" +#include "class/ompi_free_list.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +/* vapi.h is not a C++ safe header file */ +#include +#include + +struct mca_bmi_ib_module_t; +typedef struct mca_bmi_ib_mem_registry_info_t mca_bmi_ib_mem_registry_info_t; + +struct mca_bmi_ib_mem_registry_info_t { + ompi_list_item_t super; + mca_bmi_ib_mem_registry_info_t *next; + int ref_cnt; + VAPI_mr_hndl_t hndl; + VAPI_mr_t request; + VAPI_mr_t reply; +}; + +OBJ_CLASS_DECLARATION(mca_bmi_ib_mem_registry_info_t); + +typedef struct mca_bmi_ib_mem_registry_t mca_bmi_ib_mem_registry_t; + +struct mca_bmi_ib_mem_registry_t { + ompi_rb_tree_t rb_tree; + ompi_free_list_t info_free_list; + ompi_ptr_t *hints; + mca_bmi_ib_mem_registry_info_t *evictable; + struct mca_bmi_ib_module_t *ib_bmi; + int hints_log_size; + int hints_size; +}; + +OBJ_CLASS_DECLARATION(mca_bmi_ib_mem_registry_t); + +static inline void mca_bmi_ib_mem_registry_insert_hint( + mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *key, + mca_bmi_ib_mem_registry_info_t *info) +{ + uint64_t hints_hash = 0, addrll; + + if (registry->hints_size) { + addrll = (uint64_t)(key->start); + + /* calculate hash index for hints array - hash is (hints_log_size - 1) bits of key + * from first non-zero least significant bit + */ + hints_hash = addrll & (-addrll); + hints_hash = (((hints_hash << registry->hints_log_size) - hints_hash) & addrll) / + hints_hash; + + registry->hints[hints_hash].pval = info; + } + return; +} + +/* find information on a registered memory region for a given address, + * region size, and access permissions + * + */ +static inline mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_mem_registry_find( + mca_bmi_ib_mem_registry_t *registry, VAPI_mr_t *key) +{ + mca_bmi_ib_mem_registry_info_t *info = (mca_bmi_ib_mem_registry_info_t *)NULL; + uint64_t hints_hash = 0, addrll; + + if (registry->hints_size) { + addrll = (uint64_t)(key->start); + + /* calculate hash index for hints array - hash is (hints_log_size - 1) bits of key + * from first non-zero least significant bit + */ + hints_hash = 
addrll & (-addrll); + hints_hash = (((hints_hash << registry->hints_log_size) - hints_hash) & addrll) / + hints_hash; + + if ((info = registry->hints[hints_hash].pval) != (void *)NULL) { + if ((info->reply.start <= key->start) && + ((info->reply.start + info->reply.size) >= (key->start + key->size)) && + ((info->reply.acl & key->acl) == key->acl)) { + return info; + } + } + } + + /* search the red/black tree */ + info = ompi_rb_tree_find(&(registry->rb_tree), key); + + /* store a pointer to this info in the hints array for later lookups */ + if ((info != NULL) && registry->hints_size) { + registry->hints[hints_hash].pval = info; + } + + return info; +} + +/* prototypes */ + +mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_mem_registry_register( + mca_bmi_ib_mem_registry_t *registry, + VAPI_mr_t *mr); + +mca_bmi_ib_mem_registry_info_t *mca_bmi_ib_register_mem_with_registry( + struct mca_bmi_ib_module_t *ib_bmi, + void *addr, size_t len); + +int mca_bmi_ib_deregister_mem_with_registry( + struct mca_bmi_ib_module_t *ib_bmi, + void *addr, size_t len); + +int mca_bmi_ib_mem_registry_deregister( + mca_bmi_ib_mem_registry_t *registry, + VAPI_mr_t *mr); + +int mca_bmi_ib_mem_registry_init( + mca_bmi_ib_mem_registry_t* registry, + struct mca_bmi_ib_module_t *ib_bmi); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/src/mca/bmi/ib/bmi_ib_peer.c b/src/mca/bmi/ib/bmi_ib_peer.c new file mode 100644 index 0000000000..23d815b03d --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_peer.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "ompi_config.h" +#include +#include +#include "include/types.h" +#include "mca/pml/base/pml_base_sendreq.h" +#include "mca/ns/base/base.h" +#include "mca/oob/base/base.h" +#include "mca/rml/rml.h" +#include "mca/errmgr/errmgr.h" +#include "dps/dps.h" +#include "bmi_ib.h" +#include "bmi_ib_addr.h" +#include "bmi_ib_peer.h" +#include "bmi_ib_proc.h" +#include "bmi_ib_priv.h" +#include "bmi_ib_sendfrag.h" + +static void mca_bmi_ib_peer_construct(mca_bmi_base_endpoint_t* peer); +static void mca_bmi_ib_peer_destruct(mca_bmi_base_endpoint_t* peer); + +OBJ_CLASS_INSTANCE(mca_bmi_ib_endpoint_t, + ompi_list_item_t, mca_bmi_ib_peer_construct, + mca_bmi_ib_peer_destruct); + +/* + * Initialize state of the peer instance. 
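+ *
+ * Peers start out in MCA_BMI_IB_CLOSED with empty pending-send queues;
+ * the queue pair is created and exchanged lazily, on the first send to
+ * the peer.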
+ *
+ */

+static void mca_bmi_ib_peer_construct(mca_bmi_base_endpoint_t* peer)
+{
+    peer->peer_bmi = 0;
+    peer->peer_proc = 0;
+    peer->peer_tstamp = 0.0;
+    peer->peer_state = MCA_BMI_IB_CLOSED;
+    peer->peer_retries = 0;
+    OBJ_CONSTRUCT(&peer->peer_send_lock, ompi_mutex_t);
+    OBJ_CONSTRUCT(&peer->peer_recv_lock, ompi_mutex_t);
+    OBJ_CONSTRUCT(&peer->pending_send_frags, ompi_list_t);
+}
+
+/*
+ * Destroy a peer
+ *
+ */
+
+static void mca_bmi_ib_peer_destruct(mca_bmi_base_endpoint_t* peer)
+{
+}
+
+/*
+ * Send connection information to the remote peer using OOB
+ *
+ */
+
+static void mca_bmi_ib_peer_send_cb(
+    int status,
+    orte_process_name_t* peer,
+    orte_buffer_t* buffer,
+    orte_rml_tag_t tag,
+    void* cbdata)
+{
+    OBJ_RELEASE(buffer);
+}
+
+
+static int mca_bmi_ib_peer_send_connect_req(mca_bmi_base_endpoint_t* peer)
+{
+    orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
+    int rc;
+    if(NULL == buffer) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* pack the info in the send buffer */
+    rc = orte_dps.pack(buffer, &peer->lcl_qp_prop.qp_num, 1, ORTE_UINT32);
+    if(rc != ORTE_SUCCESS) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    rc = orte_dps.pack(buffer, &peer->peer_bmi->port.lid, 1, ORTE_UINT32);
+    if(rc != ORTE_SUCCESS) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* send to peer */
+    rc = orte_rml.send_buffer_nb(&peer->peer_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
+        mca_bmi_ib_peer_send_cb, NULL);
+    if(rc < 0) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Send connect ACK to remote peer
+ *
+ */
+
+static int mca_bmi_ib_peer_send_connect_ack(mca_bmi_base_endpoint_t* peer)
+{
+    orte_buffer_t* buffer = OBJ_NEW(orte_buffer_t);
+    int rc;
+    uint32_t zero = 0;
+
+    if(NULL == buffer) {
+        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+        return ORTE_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* pack the info in the send buffer */
+    if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* send to peer */
+    rc = orte_rml.send_buffer_nb(&peer->peer_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0,
+        mca_bmi_ib_peer_send_cb, NULL);
+    if(rc < 0) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Set remote connection info
+ *
+ * XXX: Currently size is unused; this will change as soon as
+ * more info is exchanged at connection setup.
+ *
+ */
+static int mca_bmi_ib_peer_set_remote_info(mca_bmi_base_endpoint_t* peer, orte_buffer_t* buffer)
+{
+    int rc;
+    size_t cnt = 1;
+    rc = orte_dps.unpack(buffer, &peer->rem_qp_num, &cnt, ORTE_UINT32);
+    if(ORTE_SUCCESS != rc) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    rc = orte_dps.unpack(buffer, &peer->rem_lid, &cnt, ORTE_UINT32);
+    if(ORTE_SUCCESS != rc) {
+        ORTE_ERROR_LOG(rc);
+        return rc;
+    }
+    D_PRINT("Received QP num = %d, LID = %d",
+            peer->rem_qp_num,
+            peer->rem_lid);
+    return ORTE_SUCCESS;
+}
+
+
+static int mca_bmi_ib_peer_init(
+    mca_bmi_ib_endpoint_t *peer)
+{
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Start to connect to the peer. We send our Queue Pair
+ * information over the TCP OOB communication mechanism.
+ * On completion of our send, a send completion handler
+ * is called.
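+ *
+ * The request itself is tiny: two ORTE_UINT32 values (our queue pair
+ * number, then our port LID) packed with orte_dps and sent on the
+ * reserved OOB tag, as in mca_bmi_ib_peer_send_connect_req() above:
+ *
+ *   orte_dps.pack(buffer, &peer->lcl_qp_prop.qp_num, 1, ORTE_UINT32);
+ *   orte_dps.pack(buffer, &peer->peer_bmi->port.lid, 1, ORTE_UINT32);
+ *   orte_rml.send_buffer_nb(&peer->peer_proc->proc_guid, buffer,
+ *       ORTE_RML_TAG_DYNAMIC-1, 0, mca_bmi_ib_peer_send_cb, NULL);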
+ * + */ + +static int mca_bmi_ib_peer_start_connect(mca_bmi_base_endpoint_t* peer) +{ + mca_bmi_ib_module_t* ib_bmi = peer->peer_bmi; + int rc; + + /* Create the Queue Pair */ + if(OMPI_SUCCESS != (rc = mca_bmi_ib_create_qp(ib_bmi->nic, + ib_bmi->ptag, + ib_bmi->cq_hndl, + ib_bmi->cq_hndl, + &peer->lcl_qp_hndl, + &peer->lcl_qp_prop, + VAPI_TS_RC))) { + ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); + return rc; + } + + /* Send connection info over to remote peer */ + peer->peer_state = MCA_BMI_IB_CONNECTING; + if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_send_connect_req(peer))) { + ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); + return rc; + } + return OMPI_SUCCESS; +} + +/* + * Reply to a `start - connect' message + * + */ +static int mca_bmi_ib_peer_reply_start_connect(mca_bmi_ib_endpoint_t *peer, orte_buffer_t* buffer) +{ + mca_bmi_ib_module_t* ib_bmi = peer->peer_bmi; + int rc; + + /* Create the Queue Pair */ + if(OMPI_SUCCESS != (rc = mca_bmi_ib_create_qp(ib_bmi->nic, + ib_bmi->ptag, + ib_bmi->cq_hndl, + ib_bmi->cq_hndl, + &peer->lcl_qp_hndl, + &peer->lcl_qp_prop, + VAPI_TS_RC))) { + ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); + return rc; + } + + /* Set the remote side info */ + mca_bmi_ib_peer_set_remote_info(peer, buffer); + + /* Connect to peer */ + rc = mca_bmi_ib_peer_connect(peer); + if(rc != OMPI_SUCCESS) { + ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); + return rc; + } + + /* Send connection info over to remote peer */ + if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_send_connect_req(peer))) { + ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); + return rc; + } + return OMPI_SUCCESS; +} + +/* + * + */ + +static void mca_bmi_ib_peer_connected(mca_bmi_ib_endpoint_t *peer) +{ + peer->peer_state = MCA_BMI_IB_CONNECTED; + mca_bmi_ib_progress_send_frags(peer); +} + +/* + * Non blocking OOB recv callback. + * Read incoming QP and other info, and if this peer + * is trying to connect, reply with our QP info, + * otherwise try to modify QP's and establish + * reliable connection + * + */ + +static void mca_bmi_ib_peer_recv( + int status, + orte_process_name_t* peer, + orte_buffer_t* buffer, + orte_rml_tag_t tag, + void* cbdata) +{ + mca_bmi_ib_proc_t *ib_proc; + mca_bmi_ib_endpoint_t *ib_peer; + int peer_state; + int rc; + + for(ib_proc = (mca_bmi_ib_proc_t*) + ompi_list_get_first(&mca_bmi_ib_component.ib_procs); + ib_proc != (mca_bmi_ib_proc_t*) + ompi_list_get_end(&mca_bmi_ib_component.ib_procs); + ib_proc = (mca_bmi_ib_proc_t*)ompi_list_get_next(ib_proc)) { + + if(ib_proc->proc_guid.vpid == peer->vpid) { + + /* Try to get the peer instance of this proc */ + + /* Limitation: Right now, we have only 1 peer + * for every process. Need several changes, some + * in PML/BMI interface to set this right */ + ib_peer = ib_proc->proc_peers[0]; + + peer_state = ib_peer->peer_state; + + /* Update status */ + switch(peer_state) { + case MCA_BMI_IB_CLOSED : + /* We had this connection closed before. + * The peer is trying to connect. 
Move the
+                 * status of this connection to CONNECTING,
+                 * and then reply with our QP information */
+
+                if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_reply_start_connect(ib_peer, buffer))) {
+                    ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
+                        ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
+                    break;
+                }
+
+                /* Now wait for the connect ACK before marking the
+                 * connection as established */
+                ib_peer->peer_state = MCA_BMI_IB_CONNECT_ACK;
+                break;
+
+            case MCA_BMI_IB_CONNECTING :
+
+                mca_bmi_ib_peer_set_remote_info(ib_peer, buffer);
+                if(OMPI_SUCCESS != (rc = mca_bmi_ib_peer_connect(ib_peer))) {
+                    ompi_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n",
+                        ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
+                    break;
+                }
+
+                /* Setup state as connected */
+                mca_bmi_ib_peer_connected(ib_peer);
+
+                /* Send the peer an ACK */
+                mca_bmi_ib_peer_send_connect_ack(ib_peer);
+                break;
+
+            case MCA_BMI_IB_CONNECT_ACK:
+
+                mca_bmi_ib_peer_connected(ib_peer);
+
+                break;
+
+            case MCA_BMI_IB_CONNECTED :
+                break;
+            default :
+                ompi_output(0, "Unexpected peer state during connection setup.\n");
+            }
+
+            break;
+        }
+    }
+
+    /* Okay, now that we are done receiving,
+     * re-post the buffer */
+    mca_bmi_ib_post_recv();
+}
+
+void mca_bmi_ib_post_recv()
+{
+    D_PRINT("");
+
+    orte_rml.recv_buffer_nb(
+        ORTE_RML_NAME_ANY,
+        ORTE_RML_TAG_DYNAMIC-1,
+        0,
+        mca_bmi_ib_peer_recv,
+        NULL);
+}
+
+
+/*
+ * Attempt to send a fragment using a given peer. If the peer is not
+ * connected, queue the fragment and start the connection as required.
+ */
+
+int mca_bmi_ib_peer_send(mca_bmi_base_endpoint_t* peer,
+    mca_bmi_ib_send_frag_t* frag)
+{
+    int rc;
+
+
+    OMPI_THREAD_LOCK(&peer->peer_send_lock);
+
+    switch(peer->peer_state) {
+        case MCA_BMI_IB_CONNECTING:
+
+            D_PRINT("Queuing because state is connecting");
+
+            ompi_list_append(&peer->pending_send_frags,
+                    (ompi_list_item_t *)frag);
+
+            rc = OMPI_SUCCESS;
+            break;
+
+        case MCA_BMI_IB_CONNECT_ACK:
+
+            D_PRINT("Queuing because waiting for ack");
+
+            ompi_list_append(&peer->pending_send_frags,
+                    (ompi_list_item_t *)frag);
+
+            rc = OMPI_SUCCESS;
+            break;
+
+        case MCA_BMI_IB_CLOSED:
+
+            D_PRINT("Connection to peer closed ...
connecting ..."); + + ompi_list_append(&peer->pending_send_frags, + (ompi_list_item_t *)frag); + + rc = mca_bmi_ib_peer_start_connect(peer); + + break; + + case MCA_BMI_IB_FAILED: + + rc = OMPI_ERR_UNREACH; + break; + + case MCA_BMI_IB_CONNECTED: + { + mca_bmi_ib_module_t* ib_bmi = peer->peer_bmi; + ompi_list_item_t* item; + + A_PRINT("Send to : %d, len : %d, frag : %p", + peer->peer_proc->proc_guid.vpid, + frag->ib_buf.desc.sg_entry.len, + frag); + + rc = mca_bmi_ib_post_send(peer->peer_bmi, peer, + &frag->ib_buf, (void*) frag); + while(NULL != (item = ompi_list_remove_first(&ib_bmi->repost))) { + mca_bmi_ib_buffer_repost(ib_bmi->nic, item); + } + break; + } + default: + rc = OMPI_ERR_UNREACH; + } + + OMPI_THREAD_UNLOCK(&peer->peer_send_lock); + + return rc; +} + +void mca_bmi_ib_progress_send_frags(mca_bmi_ib_endpoint_t* peer) +{ + ompi_list_item_t *frag_item; + mca_bmi_ib_send_frag_t *sendfrag; + + /*Check if peer is connected */ + if(peer->peer_state != MCA_BMI_IB_CONNECTED) { + + return; + } + + /* While there are frags in the list, + * process them */ + + while(!ompi_list_is_empty(&(peer->pending_send_frags))) { + + frag_item = ompi_list_remove_first(&(peer->pending_send_frags)); + sendfrag = (mca_bmi_ib_send_frag_t *) frag_item; + + /* We need to post this one */ + if(mca_bmi_ib_post_send(peer->peer_bmi, peer, &sendfrag->ib_buf, + (void*) sendfrag) + != OMPI_SUCCESS) { + ompi_output(0, "Error in posting send"); + } + } +} + + +/* + * Complete connection to peer. + */ + +int mca_bmi_ib_peer_connect( + mca_bmi_ib_endpoint_t *peer) +{ + int rc, i; + VAPI_ret_t ret; + ib_buffer_t *ib_buf_ptr; + mca_bmi_ib_module_t *ib_bmi = peer->peer_bmi; + + /* Establish Reliable Connection */ + rc = mca_bmi_ib_qp_init(ib_bmi->nic, + peer->lcl_qp_hndl, + peer->rem_qp_num, + peer->rem_lid); + + if(rc != OMPI_SUCCESS) { + return rc; + } + + /* Allocate resources to this connection */ + peer->lcl_recv = (ib_buffer_t*) + malloc(sizeof(ib_buffer_t) * NUM_IB_RECV_BUF); + if(NULL == peer->lcl_recv) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Register the buffers */ + for(i = 0; i < NUM_IB_RECV_BUF; i++) { + + rc = mca_bmi_ib_register_mem(ib_bmi->nic, ib_bmi->ptag, + (void*) peer->lcl_recv[i].buf, + MCA_BMI_IB_FIRST_FRAG_SIZE, + &peer->lcl_recv[i].hndl); + if(rc != OMPI_SUCCESS) { + return OMPI_ERROR; + } + + ib_buf_ptr = &peer->lcl_recv[i]; + ib_buf_ptr->qp_hndl = peer->lcl_qp_hndl; + + IB_PREPARE_RECV_DESC(ib_buf_ptr); + } + + /* Post receives */ + for(i = 0; i < NUM_IB_RECV_BUF; i++) { + + ret = VAPI_post_rr(ib_bmi->nic, + peer->lcl_qp_hndl, + &peer->lcl_recv[i].desc.rr); + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_post_rr"); + } + } + return OMPI_SUCCESS; +} diff --git a/src/mca/bmi/ib/bmi_ib_peer.h b/src/mca/bmi/ib/bmi_ib_peer.h new file mode 100644 index 0000000000..0c2c0b8ec9 --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_peer.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_BMI_IB_PEER_H
+#define MCA_BMI_IB_PEER_H
+
+#include "class/ompi_list.h"
+#include "event/event.h"
+#include "mca/pml/pml.h"
+#include "mca/bmi/bmi.h"
+#include "bmi_ib_recvfrag.h"
+#include "bmi_ib_sendfrag.h"
+#include "bmi_ib_priv.h"
+
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C" {
+#endif
+OBJ_CLASS_DECLARATION(mca_bmi_ib_endpoint_t);
+
+/**
+ * State of IB peer connection.
+ */
+
+typedef enum {
+    /* Defines the state in which this BMI instance
+     * has started the process of connection */
+    MCA_BMI_IB_CONNECTING,
+
+    /* Waiting for ack from peer */
+    MCA_BMI_IB_CONNECT_ACK,
+
+    /* Connected ... both sender & receiver have
+     * buffers associated with this connection */
+    MCA_BMI_IB_CONNECTED,
+
+    /* Connection is closed, there are no resources
+     * associated with this */
+    MCA_BMI_IB_CLOSED,
+
+    /* Maximum number of retries have been used.
+     * Report failure on send to upper layer */
+    MCA_BMI_IB_FAILED
+} mca_bmi_ib_peer_state_t;
+
+
+int mca_bmi_ib_peer_send(mca_bmi_base_endpoint_t*, mca_bmi_ib_send_frag_t*);
+int mca_bmi_ib_peer_connect(mca_bmi_base_endpoint_t*);
+void mca_bmi_ib_post_recv(void);
+
+void mca_bmi_ib_progress_send_frags(mca_bmi_ib_endpoint_t*);
+
+#define DUMP_PEER(peer_ptr) { \
+    ompi_output(0, "[%s:%d] ", __FILE__, __LINE__); \
+    ompi_output(0, "Dumping peer %d state", \
+        peer_ptr->peer_proc->proc_guid.vpid); \
+    ompi_output(0, "Local QP hndl : %d", \
+        peer_ptr->peer_conn->lres->qp_hndl); \
+    ompi_output(0, "Local QP num : %d", \
+        peer_ptr->peer_conn->lres->qp_prop.qp_num); \
+    ompi_output(0, "Remote QP num : %d", \
+        peer_ptr->peer_conn->rres->qp_num); \
+    ompi_output(0, "Remote LID : %d", \
+        peer_ptr->peer_conn->rres->lid); \
+}
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/src/mca/bmi/ib/bmi_ib_priv.c b/src/mca/bmi/ib/bmi_ib_priv.c
new file mode 100644
index 0000000000..b3d7bf7e06
--- /dev/null
+++ b/src/mca/bmi/ib/bmi_ib_priv.c
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ * All rights reserved.
+ * Copyright (c) 2004 The Ohio State University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "bmi_ib_vapi.h"
+#include "bmi_ib_priv.h"
+#include "bmi_ib.h"
+#include "bmi_ib_memory.h"
+
+/*
+ * Asynchronous event handler to detect unforeseen
+ * events. Usually, such events are catastrophic.
+ * A robust mechanism to handle these events, aborting
+ * the OMPI application if necessary, is still needed.
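+ *
+ * Path-migration, comm-established, queue-drained, and port-active
+ * events are merely logged below; CQ errors, work queue errors, and
+ * local/port catastrophic errors are reported via ompi_output()
+ * together with their VAPI syndrome.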
+ * + */ +static void async_event_handler(VAPI_hca_hndl_t hca_hndl, + VAPI_event_record_t * event_p, + void *priv_data) +{ + switch (event_p->type) { + case VAPI_QP_PATH_MIGRATED: + case VAPI_EEC_PATH_MIGRATED: + case VAPI_QP_COMM_ESTABLISHED: + case VAPI_EEC_COMM_ESTABLISHED: + case VAPI_SEND_QUEUE_DRAINED: + case VAPI_PORT_ACTIVE: + { + D_PRINT("Got an asynchronous event: %s\n", + VAPI_event_record_sym(event_p->type)); + break; + } + case VAPI_CQ_ERROR: + case VAPI_LOCAL_WQ_INV_REQUEST_ERROR: + case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR: + case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR: + case VAPI_PATH_MIG_REQ_ERROR: + case VAPI_LOCAL_EEC_CATASTROPHIC_ERROR: + case VAPI_LOCAL_CATASTROPHIC_ERROR: + case VAPI_PORT_ERROR: + { + ompi_output(0, "Got an asynchronous event: %s (%s)", + VAPI_event_record_sym(event_p->type), + VAPI_event_syndrome_sym(event_p-> + syndrome)); + break; + } + default: + ompi_output(0, "Warning!! Got an undefined " + "asynchronous event\n"); + } + +} + +/* + * This function returns the hca_id for each BMI + * in a round robin manner. Each BMI gets a different + * HCA id ... + * + * If num BMIs > num HCAs, then those bmis will be + * assigned HCA ids beginning from 0 again. + * + */ + + +static int mca_bmi_ib_get_hca_hndl(VAPI_hca_id_t hca_id, + VAPI_hca_hndl_t* hca_hndl) +{ + VAPI_ret_t ret; + + /* Open the HCA */ + ret = EVAPI_get_hca_hndl(hca_id, hca_hndl); + + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "EVAPI_get_hca_hndl"); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int mca_bmi_ib_query_hca_prop(VAPI_hca_hndl_t nic, + VAPI_hca_port_t* port) +{ + VAPI_ret_t ret; + + /* Querying for port properties */ + ret = VAPI_query_hca_port_prop(nic, + (IB_port_t)DEFAULT_PORT, + port); + + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_query_hca_port_prop"); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int mca_bmi_ib_alloc_pd(VAPI_hca_hndl_t nic, + VAPI_pd_hndl_t* ptag) +{ + VAPI_ret_t ret; + + ret = VAPI_alloc_pd(nic, ptag); + + if(ret != VAPI_OK) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_alloc_pd"); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int mca_bmi_ib_create_cq(VAPI_hca_hndl_t nic, + VAPI_cq_hndl_t* cq_hndl) +{ + uint32_t act_num_cqe = 0; + VAPI_ret_t ret; + + ret = VAPI_create_cq(nic, DEFAULT_CQ_SIZE, + cq_hndl, &act_num_cqe); + + if( (VAPI_OK != ret) || (0 == act_num_cqe)) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_create_cq"); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int mca_bmi_ib_set_async_handler(VAPI_hca_hndl_t nic, + EVAPI_async_handler_hndl_t *async_handler) +{ + VAPI_ret_t ret; + + ret = EVAPI_set_async_event_handler(nic, + async_event_handler, 0, async_handler); + + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "EVAPI_set_async_event_handler"); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +int mca_bmi_ib_create_qp(VAPI_hca_hndl_t nic, + VAPI_pd_hndl_t ptag, + VAPI_cq_hndl_t recv_cq, + VAPI_cq_hndl_t send_cq, + VAPI_qp_hndl_t* qp_hndl, + VAPI_qp_prop_t* qp_prop, + int transport_type) +{ + VAPI_ret_t ret; + VAPI_qp_init_attr_t qp_init_attr; + + switch(transport_type) { + + case VAPI_TS_RC: /* Set up RC qp parameters */ + qp_init_attr.cap.max_oust_wr_rq = DEFAULT_WQ_SIZE; + qp_init_attr.cap.max_oust_wr_sq = DEFAULT_WQ_SIZE; + qp_init_attr.cap.max_sg_size_rq = DEFAULT_SG_LIST; + qp_init_attr.cap.max_sg_size_sq = DEFAULT_SG_LIST; + qp_init_attr.pd_hndl = ptag; + /* We don't have Reliable Datagram Handle right now */ + qp_init_attr.rdd_hndl = 0; + + /* Set Send and Recv completion queues */ + 
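+        /* (callers pass the same CQ handle for both recv_cq and
+         * send_cq: mca_bmi_ib_module_init() creates a single
+         * completion queue that is polled for send and recv
+         * completions alike) */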
qp_init_attr.rq_cq_hndl = recv_cq;
+        qp_init_attr.sq_cq_hndl = send_cq;
+
+        /* Signal all work requests on this queue pair */
+        qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR;
+        qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR;
+
+        /* Use the Reliable Connection transport service */
+        qp_init_attr.ts_type = VAPI_TS_RC;
+        break;
+    case VAPI_TS_UD: /* Set up UD qp parameters */
+    default:
+        return OMPI_ERR_NOT_IMPLEMENTED;
+    }
+
+    ret = VAPI_create_qp(nic, &qp_init_attr,
+            qp_hndl, qp_prop);
+
+    if(VAPI_OK != ret) {
+        MCA_BMI_IB_VAPI_RET(ret, "VAPI_create_qp");
+        return OMPI_ERROR;
+    }
+    return OMPI_SUCCESS;
+}
+
+int mca_bmi_ib_module_init(mca_bmi_ib_module_t *ib_bmi)
+{
+    /* Get HCA handle */
+    if(mca_bmi_ib_get_hca_hndl(ib_bmi->hca_id, &ib_bmi->nic)
+            != OMPI_SUCCESS) {
+        return OMPI_ERROR;
+    }
+
+    /* Allocate a protection domain for this NIC */
+    if(mca_bmi_ib_alloc_pd(ib_bmi->nic, &ib_bmi->ptag)
+            != OMPI_SUCCESS) {
+        return OMPI_ERROR;
+    }
+
+    /* Get the properties of the HCA,
+     * LID etc. are part of the properties */
+    if(mca_bmi_ib_query_hca_prop(ib_bmi->nic, &ib_bmi->port)
+            != OMPI_SUCCESS) {
+        return OMPI_ERROR;
+    }
+
+    /* Create Completion Q */
+    /* We use a single completion Q for sends & recvs
+     * This saves us overhead of polling 2 separate Qs */
+    if(mca_bmi_ib_create_cq(ib_bmi->nic, &ib_bmi->cq_hndl)
+            != OMPI_SUCCESS) {
+        return OMPI_ERROR;
+    }
+
+    /* Attach asynchronous handler */
+    if(mca_bmi_ib_set_async_handler(ib_bmi->nic,
+            &ib_bmi->async_handler)
+            != OMPI_SUCCESS) {
+        return OMPI_ERROR;
+    }
+
+    /* initialize memory region registry */
+    OBJ_CONSTRUCT(&ib_bmi->mem_registry, mca_bmi_ib_mem_registry_t);
+    mca_bmi_ib_mem_registry_init(&ib_bmi->mem_registry, ib_bmi);
+    return OMPI_SUCCESS;
+}
+
+
+int mca_bmi_ib_qp_init(VAPI_hca_hndl_t nic,
+        VAPI_qp_hndl_t qp_hndl,
+        VAPI_qp_num_t remote_qp,
+        IB_lid_t remote_lid)
+{
+    VAPI_ret_t ret;
+    VAPI_qp_attr_t qp_attr;
+    VAPI_qp_attr_mask_t qp_attr_mask;
+    VAPI_qp_cap_t qp_cap;
+
+    /* Modify QP to INIT */
+    QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
+    qp_attr.qp_state = VAPI_INIT;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
+    qp_attr.pkey_ix = DEFAULT_PKEY_IX;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX);
+    qp_attr.port = DEFAULT_PORT;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PORT);
+    qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_REMOTE_ATOMIC_FLAGS);
+
+    ret = VAPI_modify_qp(nic, qp_hndl,
+            &qp_attr, &qp_attr_mask, &qp_cap);
+
+    if(VAPI_OK != ret) {
+        MCA_BMI_IB_VAPI_RET(ret, "VAPI_modify_qp");
+        return OMPI_ERROR;
+    }
+
+    D_PRINT("Modified to init..Qp %d", qp_hndl);
+
+    /********************** INIT --> RTR ************************/
+    QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
+    qp_attr.qp_state = VAPI_RTR;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE);
+    qp_attr.qp_ous_rd_atom = DEFAULT_QP_OUS_RD_ATOM;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_OUS_RD_ATOM);
+    qp_attr.path_mtu = DEFAULT_MTU;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PATH_MTU);
+    qp_attr.rq_psn = DEFAULT_PSN;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RQ_PSN);
+    qp_attr.pkey_ix = DEFAULT_PKEY_IX;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX);
+    qp_attr.min_rnr_timer = DEFAULT_MIN_RNR_TIMER;
+    QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_MIN_RNR_TIMER);
+
+    qp_attr.av.sl = DEFAULT_SERVICE_LEVEL;
+    qp_attr.av.grh_flag = FALSE;
+    qp_attr.av.static_rate = DEFAULT_STATIC_RATE;
+    qp_attr.av.src_path_bits = DEFAULT_SRC_PATH_BITS;
+
+    qp_attr.dest_qp_num = remote_qp;
+    QP_ATTR_MASK_SET(qp_attr_mask,
QP_ATTR_DEST_QP_NUM); + qp_attr.av.dlid = remote_lid; + QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_AV); + + ret = VAPI_modify_qp(nic, qp_hndl, + &qp_attr, &qp_attr_mask, &qp_cap); + + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_modify_qp"); + return OMPI_ERROR; + } + + D_PRINT("Modified to RTR..Qp %d", qp_hndl); + + /************** RTS *******************/ + QP_ATTR_MASK_CLR_ALL(qp_attr_mask); + qp_attr.qp_state = VAPI_RTS; + QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE); + qp_attr.sq_psn = DEFAULT_PSN; + QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_SQ_PSN); + qp_attr.timeout = DEFAULT_TIME_OUT; + QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_TIMEOUT); + qp_attr.retry_count = DEFAULT_RETRY_COUNT; + QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RETRY_COUNT); + qp_attr.rnr_retry = DEFAULT_RNR_RETRY; + QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RNR_RETRY); + qp_attr.ous_dst_rd_atom = DEFAULT_MAX_RDMA_DST_OPS; + QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_OUS_DST_RD_ATOM); + + ret = VAPI_modify_qp(nic, qp_hndl, + &qp_attr, &qp_attr_mask, &qp_cap); + + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_modify_qp"); + return OMPI_ERROR; + } + D_PRINT("Modified to RTS..Qp %d", qp_hndl); + + return OMPI_SUCCESS; +} + +int mca_bmi_ib_register_mem(VAPI_hca_hndl_t nic, VAPI_pd_hndl_t ptag, + void* buf, int len, vapi_memhandle_t* memhandle) +{ + VAPI_ret_t ret; + VAPI_mrw_t mr_in, mr_out; + vapi_memhandle_t mem_handle; + + mr_in.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE; + mr_in.l_key = 0; + mr_in.r_key = 0; + mr_in.pd_hndl = ptag; + mr_in.size = len; + mr_in.start = (VAPI_virt_addr_t) (MT_virt_addr_t) buf; + mr_in.type = VAPI_MR; + + ret = VAPI_register_mr(nic, &mr_in, &mem_handle.hndl, &mr_out); + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_register_mr"); + return OMPI_ERROR; + } + + mem_handle.lkey = mr_out.l_key; + mem_handle.rkey = mr_out.r_key; + + memhandle->lkey = mem_handle.lkey; + memhandle->rkey = mem_handle.rkey; + + /* D_PRINT("addr = %p, lkey = %d\n", buf, memhandle->lkey); */ + + memhandle->hndl = mem_handle.hndl; + + return OMPI_SUCCESS; +} + + +int mca_bmi_ib_post_send(mca_bmi_ib_module_t *ib_bmi, + mca_bmi_ib_endpoint_t *peer, + ib_buffer_t *ib_buf, void* addr) +{ + VAPI_ret_t ret; + int msg_len = ib_buf->desc.sg_entry.len; + + IB_PREPARE_SEND_DESC(ib_buf, (peer->rem_qp_num), + msg_len, addr); + + /* TODO - get this from NIC properties */ + if(msg_len < 128) { /* query this information from VAPI_query_qp(property max_inline_data_sq) */ + ret = EVAPI_post_inline_sr(ib_bmi->nic, + peer->lcl_qp_hndl, + &ib_buf->desc.sr); + } else { + ret = VAPI_post_sr(ib_bmi->nic, + peer->lcl_qp_hndl, + &ib_buf->desc.sr); + } + + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_post_sr"); + return OMPI_ERROR; + } + return OMPI_SUCCESS; +} + + +void mca_bmi_ib_buffer_repost(VAPI_hca_hndl_t nic, void* addr) +{ + VAPI_ret_t ret; + ib_buffer_t *ib_buf = (ib_buffer_t*)addr; + + IB_PREPARE_RECV_DESC(ib_buf); + + ret = VAPI_post_rr(nic, ib_buf->qp_hndl, &(ib_buf->desc.rr)); + + if(VAPI_OK != ret) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_post_rr"); + ompi_output(0, "Error in buffer reposting"); + } +} + +void mca_bmi_ib_prepare_ack(mca_bmi_ib_module_t *ib_bmi, + void* addr_to_reg, int len_to_reg, + void* ack_buf, int* len_added) +{ + mca_bmi_ib_mem_registry_info_t *info = + mca_bmi_ib_register_mem_with_registry(ib_bmi, + addr_to_reg, (size_t)len_to_reg); + + if(NULL == info) { + ompi_output(0, "Error in registering"); + } + + A_PRINT("Sending Remote key : %d", info->reply.r_key); + + memcpy(ack_buf,(void*) 
&(info->reply.r_key), sizeof(VAPI_rkey_t)); + + *len_added = sizeof(VAPI_rkey_t); +} + +int mca_bmi_ib_rdma_write(mca_bmi_ib_module_t *ib_bmi, + mca_bmi_ib_endpoint_t *peer, ib_buffer_t *ib_buf, + void* send_buf, size_t send_len, void* remote_buf, + VAPI_rkey_t remote_key, void* id_buf) +{ + VAPI_ret_t ret; + + mca_bmi_ib_mem_registry_info_t *info = + mca_bmi_ib_register_mem_with_registry(ib_bmi, + send_buf, send_len); + + if (NULL == info) { + return OMPI_ERROR; + } + + /* Prepare descriptor */ + IB_PREPARE_RDMA_W_DESC(ib_buf, (peer->rem_qp_num), + send_len, send_buf, (info->reply.l_key), remote_key, + id_buf, remote_buf); + + ret = VAPI_post_sr(ib_bmi->nic, + peer->lcl_qp_hndl, + &ib_buf->desc.sr); + if(ret != VAPI_OK) { + MCA_BMI_IB_VAPI_RET(ret, "VAPI_post_sr"); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} diff --git a/src/mca/bmi/ib/bmi_ib_priv.h b/src/mca/bmi/ib/bmi_ib_priv.h new file mode 100644 index 0000000000..406fdc8074 --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_priv.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BMI_IB_PRIV_H +#define MCA_BMI_IB_PRIV_H + +#include +#include "class/ompi_free_list.h" +#include "bmi_ib_vapi.h" +#include "bmi_ib_memory.h" + +#define NUM_IB_SEND_BUF (1) +#define NUM_IB_RECV_BUF (4) + +#define MCA_BMI_IB_FIRST_FRAG_SIZE (65536) + +typedef enum { + IB_RECV, + IB_SEND +} IB_wr_t; + +typedef enum { + IB_COMP_ERROR, + IB_COMP_RECV, + IB_COMP_SEND, + IB_COMP_RDMA_W, + IB_COMP_NOTHING +} IB_comp_t; + +struct vapi_memhandle_t { + VAPI_mr_hndl_t hndl; + /* Memory region handle */ + + VAPI_lkey_t lkey; + /* Local key to registered memory, needed for + * posting send/recv requests */ + + VAPI_rkey_t rkey; + /* Remote key to registered memory, need to send this + * to remote processes for incoming RDMA ops */ +}; + +typedef struct vapi_memhandle_t vapi_memhandle_t; + +struct vapi_descriptor_t { + union { + VAPI_rr_desc_t rr; + /* Receive descriptor */ + + VAPI_sr_desc_t sr; + /* Send descriptor */ + }; + + VAPI_sg_lst_entry_t sg_entry; + /* Scatter/Gather entry */ +}; + +typedef struct vapi_descriptor_t vapi_descriptor_t; + +struct ib_buffer_t { + ompi_list_item_t super; + vapi_descriptor_t desc; + /* Descriptor of the buffer */ + + vapi_memhandle_t hndl; + /* Buffer handle */ + + char buf[MCA_BMI_IB_FIRST_FRAG_SIZE]; + /* Buffer space */ + + VAPI_qp_hndl_t qp_hndl; + /* Queue pair used for this IB buffer */ +}; + +typedef struct ib_buffer_t ib_buffer_t; + + +#define DUMP_IB_STATE(ib_bmi) { \ + ompi_output(0, "[%s:%d] ", __FILE__, __LINE__); \ + ompi_output(0, "Dumping IB state"); \ + ompi_output(0, "HCA ID : %s", ib_bmi->hca_id); \ + ompi_output(0, "LID : %d", ib_bmi->port.lid); \ + ompi_output(0, "HCA handle : %d", ib_bmi->nic); \ + ompi_output(0, "Protection Domain: %d", ib_bmi->ptag); \ + ompi_output(0, "Comp Q handle : %d", ib_bmi->cq_hndl); \ + ompi_output(0, "Async hndl : %d", ib_bmi->async_handler); \ +} + +#define IB_PREPARE_RECV_DESC(ib_buf_ptr) { \ + ib_buf_ptr->desc.rr.comp_type = VAPI_SIGNALED; \ + 
ib_buf_ptr->desc.rr.opcode = VAPI_RECEIVE; \ + ib_buf_ptr->desc.rr.id = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) ib_buf_ptr; \ + ib_buf_ptr->desc.rr.sg_lst_len = 1; \ + ib_buf_ptr->desc.rr.sg_lst_p = &ib_buf_ptr->desc.sg_entry; \ + ib_buf_ptr->desc.sg_entry.len = MCA_BMI_IB_FIRST_FRAG_SIZE; \ + ib_buf_ptr->desc.sg_entry.lkey = ib_buf_ptr->hndl.lkey; \ + ib_buf_ptr->desc.sg_entry.addr = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) ib_buf_ptr->buf; \ +} + +#define IB_PREPARE_SEND_DESC(ib_buf_ptr, qp, msg_len, \ + id_buf) { \ + ib_buf_ptr->desc.sr.comp_type = VAPI_SIGNALED; \ + ib_buf_ptr->desc.sr.opcode = VAPI_SEND; \ + ib_buf_ptr->desc.sr.remote_qkey = 0; \ + ib_buf_ptr->desc.sr.remote_qp = qp; \ + ib_buf_ptr->desc.sr.id = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) id_buf; \ + ib_buf_ptr->desc.sr.sg_lst_len = 1; \ + ib_buf_ptr->desc.sr.sg_lst_p = &ib_buf_ptr->desc.sg_entry; \ + ib_buf_ptr->desc.sg_entry.len = msg_len; \ + ib_buf_ptr->desc.sg_entry.lkey = ib_buf_ptr->hndl.lkey; \ + ib_buf_ptr->desc.sg_entry.addr = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) ib_buf_ptr->buf; \ +} + +#define IB_SET_REMOTE_QP_NUM(ib_buf_ptr, qp) { \ + ib_buf_ptr->desc.sr.remote_qp = qp; \ +} + +#define IB_SET_SEND_DESC_ID(ib_buf_ptr, addr) { \ + ib_buf_ptr->desc.sr.id = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) addr; \ +} + +#define IB_SET_SEND_DESC_LEN(ib_buf_ptr, msg_len) { \ + ib_buf_ptr->desc.sg_entry.len = msg_len; \ +} + +#define IB_PREPARE_RDMA_W_DESC(ib_buf_ptr, qp, \ + msg_len, user_buf, local_key, remote_key, \ + id_buf, remote_buf) { \ + ib_buf_ptr->desc.sr.comp_type = VAPI_SIGNALED; \ + ib_buf_ptr->desc.sr.opcode = VAPI_RDMA_WRITE; \ + ib_buf_ptr->desc.sr.remote_qkey = 0; \ + ib_buf_ptr->desc.sr.remote_qp = qp; \ + ib_buf_ptr->desc.sr.id = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) id_buf; \ + ib_buf_ptr->desc.sr.sg_lst_len = 1; \ + ib_buf_ptr->desc.sr.sg_lst_p = &ib_buf_ptr->desc.sg_entry; \ + ib_buf_ptr->desc.sg_entry.len = msg_len; \ + ib_buf_ptr->desc.sg_entry.lkey = local_key; \ + ib_buf_ptr->desc.sg_entry.addr = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) user_buf; \ + ib_buf_ptr->desc.sr.remote_addr = (VAPI_virt_addr_t) \ + (MT_virt_addr_t) remote_buf; \ + ib_buf_ptr->desc.sr.r_key = remote_key; \ +} + + +struct mca_bmi_ib_module_t; +struct mca_bmi_base_endpoint_t; + + +int mca_bmi_ib_module_init(struct mca_bmi_ib_module_t*); + +int mca_bmi_ib_register_mem( + VAPI_hca_hndl_t nic, + VAPI_pd_hndl_t ptag, + void* buf, + int len, + vapi_memhandle_t* memhandle); + +int mca_bmi_ib_post_send( + struct mca_bmi_ib_module_t *ib_module, + struct mca_bmi_base_endpoint_t *peer, + ib_buffer_t *ib_buf, void*); + +void mca_bmi_ib_buffer_repost( + VAPI_hca_hndl_t nic, + void* addr); + +void mca_bmi_ib_prepare_ack( + struct mca_bmi_ib_module_t *ib_module, + void* addr_to_reg, int len_to_reg, + void* ack_buf, int* len_added); + +int mca_bmi_ib_rdma_write( + struct mca_bmi_ib_module_t *ib_module, + struct mca_bmi_base_endpoint_t *peer, + ib_buffer_t *ib_buf, + void* send_buf, + size_t send_len, + void* remote_buf, + VAPI_rkey_t remote_key, void*); + +int mca_bmi_ib_create_qp(VAPI_hca_hndl_t nic, + VAPI_pd_hndl_t ptag, + VAPI_cq_hndl_t recv_cq, + VAPI_cq_hndl_t send_cq, + VAPI_qp_hndl_t* qp_hndl, + VAPI_qp_prop_t* qp_prop, + int transport_type); + +int mca_bmi_ib_qp_init( + VAPI_hca_hndl_t nic, + VAPI_qp_hndl_t qp_hndl, + VAPI_qp_num_t remote_qp, + IB_lid_t remote_lid); + +#endif /* MCA_BMI_IB_PRIV_H */ diff --git a/src/mca/bmi/ib/bmi_ib_proc.c b/src/mca/bmi/ib/bmi_ib_proc.c new file mode 100644 index 0000000000..beee38595d --- 
/dev/null +++ b/src/mca/bmi/ib/bmi_ib_proc.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "class/ompi_hash_table.h" +#include "mca/base/mca_base_module_exchange.h" + +#include "bmi_ib.h" +#include "bmi_ib_vapi.h" +#include "bmi_ib_proc.h" + +static void mca_bmi_ib_proc_construct(mca_bmi_ib_proc_t* proc); +static void mca_bmi_ib_proc_destruct(mca_bmi_ib_proc_t* proc); + +OBJ_CLASS_INSTANCE(mca_bmi_ib_proc_t, + ompi_list_item_t, mca_bmi_ib_proc_construct, + mca_bmi_ib_proc_destruct); + +void mca_bmi_ib_proc_construct(mca_bmi_ib_proc_t* proc) +{ + proc->proc_ompi = 0; + proc->proc_addr_count = 0; + proc->proc_peers = 0; + proc->proc_peer_count = 0; + OBJ_CONSTRUCT(&proc->proc_lock, ompi_mutex_t); + /* add to list of all proc instance */ + OMPI_THREAD_LOCK(&mca_bmi_ib_component.ib_lock); + ompi_list_append(&mca_bmi_ib_component.ib_procs, &proc->super); + OMPI_THREAD_UNLOCK(&mca_bmi_ib_component.ib_lock); +} + +/* + * Cleanup ib proc instance + */ + +void mca_bmi_ib_proc_destruct(mca_bmi_ib_proc_t* proc) +{ + /* remove from list of all proc instances */ + OMPI_THREAD_LOCK(&mca_bmi_ib_component.ib_lock); + ompi_list_remove_item(&mca_bmi_ib_component.ib_procs, &proc->super); + OMPI_THREAD_UNLOCK(&mca_bmi_ib_component.ib_lock); + + /* release resources */ + if(NULL != proc->proc_peers) { + free(proc->proc_peers); + } +} + + +/* + * Look for an existing IB process instances based on the associated + * ompi_proc_t instance. + */ +static mca_bmi_ib_proc_t* mca_bmi_ib_proc_lookup_ompi(ompi_proc_t* ompi_proc) +{ + mca_bmi_ib_proc_t* ib_proc; + + OMPI_THREAD_LOCK(&mca_bmi_ib_component.ib_lock); + + for(ib_proc = (mca_bmi_ib_proc_t*) + ompi_list_get_first(&mca_bmi_ib_component.ib_procs); + ib_proc != (mca_bmi_ib_proc_t*) + ompi_list_get_end(&mca_bmi_ib_component.ib_procs); + ib_proc = (mca_bmi_ib_proc_t*)ompi_list_get_next(ib_proc)) { + + if(ib_proc->proc_ompi == ompi_proc) { + OMPI_THREAD_UNLOCK(&mca_bmi_ib_component.ib_lock); + return ib_proc; + } + + } + + OMPI_THREAD_UNLOCK(&mca_bmi_ib_component.ib_lock); + + return NULL; +} + +/* + * Create a IB process structure. There is a one-to-one correspondence + * between a ompi_proc_t and a mca_bmi_ib_proc_t instance. We cache + * additional data (specifically the list of mca_bmi_ib_endpoint_t instances, + * and published addresses) associated w/ a given destination on this + * datastructure. + */ + +mca_bmi_ib_proc_t* mca_bmi_ib_proc_create(ompi_proc_t* ompi_proc) +{ + mca_bmi_ib_proc_t* module_proc = NULL; + + /* Check if we have already created a IB proc + * structure for this ompi process */ + module_proc = mca_bmi_ib_proc_lookup_ompi(ompi_proc); + + if(module_proc != NULL) { + + /* Gotcha! */ + return module_proc; + } + + /* Oops! First time, gotta create a new IB proc + * out of the ompi_proc ... 
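+     * (exactly one mca_bmi_ib_proc_t is created per ompi_proc_t;
+     * later calls will find it via mca_bmi_ib_proc_lookup_ompi())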
*/
+
+    module_proc = OBJ_NEW(mca_bmi_ib_proc_t);
+    if(NULL == module_proc) {
+        return NULL;
+    }
+
+    /* Initialize the number of peers */
+    module_proc->proc_peer_count = 0;
+
+    module_proc->proc_ompi = ompi_proc;
+
+    /* build a unique identifier (of arbitrary
+     * size) to represent the proc */
+    module_proc->proc_guid = ompi_proc->proc_name;
+
+    /* IB module doesn't have addresses exported at
+     * initialization, so the addr_count is set to one. */
+    module_proc->proc_addr_count = 1;
+
+
+    /* XXX: Right now, there can be only 1 peer associated
+     * with a proc. Needs a small change in
+     * mca_bmi_ib_proc_t to allow on-demand increasing of
+     * the number of peers for this proc */
+
+    module_proc->proc_peers = (mca_bmi_base_endpoint_t**)
+        malloc(module_proc->proc_addr_count * sizeof(mca_bmi_base_endpoint_t*));
+
+    if(NULL == module_proc->proc_peers) {
+        OBJ_RELEASE(module_proc);
+        return NULL;
+    }
+    return module_proc;
+}
+
+
+/*
+ * Note that this routine must be called with the lock on the process
+ * already held. Insert a bmi instance into the proc array and assign
+ * it an address.
+ */
+int mca_bmi_ib_proc_insert(mca_bmi_ib_proc_t* module_proc,
+        mca_bmi_base_endpoint_t* module_peer)
+{
+    /* insert into peer array */
+    module_peer->peer_proc = module_proc;
+    module_proc->proc_peers[module_proc->proc_peer_count++] = module_peer;
+
+    return OMPI_SUCCESS;
+}
diff --git a/src/mca/bmi/ib/bmi_ib_proc.h b/src/mca/bmi/ib/bmi_ib_proc.h
new file mode 100644
index 0000000000..668e6cb306
--- /dev/null
+++ b/src/mca/bmi/ib/bmi_ib_proc.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ * All rights reserved.
+ * Copyright (c) 2004 The Ohio State University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_BMI_IB_PROC_H
+#define MCA_BMI_IB_PROC_H
+
+#include "mca/ns/ns.h"
+#include "class/ompi_object.h"
+#include "proc/proc.h"
+#include "bmi_ib.h"
+#include "bmi_ib_vapi.h"
+#include "bmi_ib_addr.h"
+#include "bmi_ib_peer.h"
+
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C" {
+#endif
+OBJ_CLASS_DECLARATION(mca_bmi_ib_proc_t);
+
+/**
+ * Represents the state of a remote process and the set of addresses
+ * that it exports. Also caches an instance of mca_bmi_base_endpoint_t
+ * for each BMI instance that attempts to open a connection to the
+ * process.
+ */ +struct mca_bmi_ib_proc_t { + ompi_list_item_t super; + /**< allow proc to be placed on a list */ + + ompi_proc_t *proc_ompi; + /**< pointer to corresponding ompi_proc_t */ + + orte_process_name_t proc_guid; + /**< globally unique identifier for the process */ + + size_t proc_addr_count; + /**< number of addresses published by peer */ + + struct mca_bmi_base_endpoint_t **proc_peers; + /**< array of peers that have been created to access this proc */ + + size_t proc_peer_count; + /**< number of peers */ + + ompi_mutex_t proc_lock; + /**< lock to protect against concurrent access to proc state */ +}; +typedef struct mca_bmi_ib_proc_t mca_bmi_ib_proc_t; + +mca_bmi_ib_proc_t* mca_bmi_ib_proc_create(ompi_proc_t* ompi_proc); +int mca_bmi_ib_proc_insert(mca_bmi_ib_proc_t*, mca_bmi_base_endpoint_t*); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/src/mca/bmi/ib/bmi_ib_recvfrag.c b/src/mca/bmi/ib/bmi_ib_recvfrag.c new file mode 100644 index 0000000000..4d7d9f1697 --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_recvfrag.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mca/pml/base/pml_base_sendreq.h" +#include "bmi_ib.h" +#include "bmi_ib_peer.h" +#include "bmi_ib_recvfrag.h" +#include "bmi_ib_sendfrag.h" +#include "bmi_ib_memory.h" + +static void mca_bmi_ib_recv_frag_construct(mca_bmi_ib_recv_frag_t* frag); +static void mca_bmi_ib_recv_frag_destruct(mca_bmi_ib_recv_frag_t* frag); + +OBJ_CLASS_INSTANCE(mca_bmi_ib_recv_frag_t, + mca_bmi_base_recv_frag_t, + mca_bmi_ib_recv_frag_construct, + mca_bmi_ib_recv_frag_destruct); + +/* + * IB fragment constructor + */ + +static void mca_bmi_ib_recv_frag_construct(mca_bmi_ib_recv_frag_t* frag) +{ +} + + +/* + * IB fragment destructor + */ + +static void mca_bmi_ib_recv_frag_destruct(mca_bmi_ib_recv_frag_t* frag) +{ +} + +void +mca_bmi_ib_recv_frag_done ( + mca_bmi_base_header_t *header, + mca_bmi_base_recv_frag_t* frag, + mca_bmi_base_recv_request_t *request) +{ + D_PRINT(""); + frag->frag_base.frag_owner->bmi_recv_progress ( + frag->frag_base.frag_owner, + request, + frag->frag_base.frag_size, + frag->frag_base.frag_size); + + /* Return recv frag to free list */ + OMPI_FREE_LIST_RETURN(&mca_bmi_ib_component.ib_recv_frags, + (ompi_list_item_t*)frag); +} + +static void mca_bmi_ib_data_frag( + mca_bmi_ib_module_t *ib_bmi, + mca_bmi_base_header_t *hdr) +{ + bool matched; + int rc; + ompi_list_item_t *item; + mca_bmi_ib_recv_frag_t *recv_frag; + size_t hdr_length; + + OMPI_FREE_LIST_WAIT (&mca_bmi_ib_component.ib_recv_frags, item, rc); + + recv_frag = (mca_bmi_ib_recv_frag_t *) item; + recv_frag->super.frag_base.frag_owner = &ib_bmi->super; + recv_frag->super.frag_base.frag_peer = NULL; + recv_frag->super.frag_request = NULL; + recv_frag->super.frag_is_buffered = false; + + /* Copy the header, mca_bmi_base_match() */ + recv_frag->super.frag_base.frag_header = *hdr; + + switch(hdr->hdr_common.hdr_type) { + case MCA_BMI_HDR_TYPE_MATCH: + hdr_length = 
sizeof(mca_bmi_base_match_header_t);
+        recv_frag->super.frag_base.frag_size = hdr->hdr_match.hdr_msg_length;
+        break;
+    case MCA_BMI_HDR_TYPE_RNDV:
+        hdr_length = sizeof(mca_bmi_base_rendezvous_header_t);
+        recv_frag->super.frag_base.frag_size = hdr->hdr_rndv.hdr_frag_length;
+        break;
+    case MCA_BMI_HDR_TYPE_FRAG:
+        hdr_length = sizeof(mca_bmi_base_frag_header_t);
+        recv_frag->super.frag_base.frag_size = hdr->hdr_frag.hdr_frag_length;
+        break;
+    default:
+        /* mca_bmi_ib_process_recv() only routes MATCH, RNDV and FRAG
+         * headers here; drop anything else and recycle the fragment */
+        OMPI_FREE_LIST_RETURN(&mca_bmi_ib_component.ib_recv_frags,
+            (ompi_list_item_t*)recv_frag);
+        return;
+    }
+
+    /* By default the data starts right after the header */
+    recv_frag->super.frag_base.frag_addr = (char *) hdr + hdr_length;
+
+    /* match against preposted requests */
+    matched = ib_bmi->super.bmi_match(
+        recv_frag->super.frag_base.frag_owner,
+        &recv_frag->super,
+        &recv_frag->super.frag_base.frag_header.hdr_match);
+
+    if (!matched) {
+        memcpy (recv_frag->unex_buf, (char *) hdr + hdr_length, recv_frag->super.frag_base.frag_size);
+        recv_frag->super.frag_is_buffered = true;
+        recv_frag->super.frag_base.frag_addr = recv_frag->unex_buf;
+    }
+}
+
+static void mca_bmi_ib_ctrl_frag(
+        mca_bmi_ib_module_t *ib_bmi,
+        mca_bmi_base_header_t *header)
+{
+    mca_bmi_ib_send_frag_t *send_frag;
+    mca_bmi_base_send_request_t *req;
+    void *data_ptr;
+
+    send_frag = (mca_bmi_ib_send_frag_t *)
+        header->hdr_ack.hdr_src_ptr.pval;
+    req = (mca_bmi_base_send_request_t *)
+        send_frag->frag_send.frag_request;
+
+    req->req_peer_match = header->hdr_ack.hdr_dst_match;
+    req->req_peer_addr = header->hdr_ack.hdr_dst_addr;
+    req->req_peer_size = header->hdr_ack.hdr_dst_size;
+
+    /* Locate data in the ACK buffer */
+    data_ptr = (void*)
+        ((char*) header + sizeof(mca_bmi_base_ack_header_t));
+
+    /* Copy over data to request buffer */
+    memcpy(&((mca_bmi_ib_send_request_t *) req)->req_key,
+            data_ptr, sizeof(VAPI_rkey_t));
+
+    /* Progress & release fragments */
+    mca_bmi_ib_send_frag_send_complete(ib_bmi, send_frag);
+}
+
+static void mca_bmi_ib_last_frag(mca_bmi_ib_module_t *ib_bmi,
+        mca_bmi_base_header_t *hdr)
+{
+    mca_bmi_ib_fin_header_t *fin_hdr = (mca_bmi_ib_fin_header_t *)hdr;
+    mca_bmi_base_recv_request_t *request;
+    request = (mca_bmi_base_recv_request_t*) hdr->hdr_frag.hdr_dst_ptr.pval;
+
+    /* deregister memory if this is the last fragment */
+    if ((request->req_bytes_received + hdr->hdr_frag.hdr_frag_length) >=
+            request->req_recv.req_bytes_packed) {
+        mca_bmi_ib_deregister_mem_with_registry(ib_bmi,
+            fin_hdr->mr_addr.pval, (size_t)fin_hdr->mr_size);
+    }
+
+    ib_bmi->super.bmi_recv_progress (
+        &ib_bmi->super,
+        request,
+        hdr->hdr_frag.hdr_frag_length,
+        hdr->hdr_frag.hdr_frag_length);
+
+}
+
+/*
+ * Process incoming receive fragments
+ *
+ */
+
+void mca_bmi_ib_process_recv(mca_bmi_ib_module_t *ib_bmi, void* addr)
+{
+    ib_buffer_t *ib_buf;
+    mca_bmi_base_header_t *header;
+
+    ib_buf = (ib_buffer_t *) addr;
+    header = (mca_bmi_base_header_t *) &ib_buf->buf[0];
+
+    switch(header->hdr_common.hdr_type) {
+        case MCA_BMI_HDR_TYPE_MATCH :
+        case MCA_BMI_HDR_TYPE_RNDV :
+        case MCA_BMI_HDR_TYPE_FRAG :
+            mca_bmi_ib_data_frag(ib_bmi, header);
+            break;
+        case MCA_BMI_HDR_TYPE_ACK :
+            mca_bmi_ib_ctrl_frag(ib_bmi, header);
+            break;
+        case MCA_BMI_HDR_TYPE_FIN :
+            A_PRINT("Fin");
+            mca_bmi_ib_last_frag(ib_bmi, header);
+            break;
+        default :
+            ompi_output(0, "Unknown fragment type");
+            break;
+    }
+}
diff --git a/src/mca/bmi/ib/bmi_ib_recvfrag.h b/src/mca/bmi/ib/bmi_ib_recvfrag.h
new file mode 100644
index 0000000000..08c2438227
--- /dev/null
+++ b/src/mca/bmi/ib/bmi_ib_recvfrag.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ * All rights reserved.
+ * Copyright (c) 2004 The Ohio State University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_BMI_IB_RECV_FRAG_H
+#define MCA_BMI_IB_RECV_FRAG_H
+
+#include "mca/bmi/bmi.h"
+#include "mca/bmi/base/bmi_base_recvfrag.h"
+
+#define MCA_BMI_IB_UNEX_BUF_SIZE (4096)
+
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C" {
+#endif
+OBJ_CLASS_DECLARATION(mca_bmi_ib_recv_frag_t);
+
+/**
+ * IB received fragment derived type.
+ */
+struct mca_bmi_ib_recv_frag_t {
+    mca_bmi_base_recv_frag_t super;
+    /**< base receive fragment descriptor */
+    char unex_buf[MCA_BMI_IB_UNEX_BUF_SIZE];
+    /**< Unexpected buffer */
+};
+typedef struct mca_bmi_ib_recv_frag_t mca_bmi_ib_recv_frag_t;
+
+struct mca_bmi_ib_module_t;
+
+
+void mca_bmi_ib_recv_frag_done (mca_bmi_base_header_t*,
+    mca_bmi_base_recv_frag_t*, mca_bmi_base_recv_request_t*);
+
+void mca_bmi_ib_process_recv(struct mca_bmi_ib_module_t* , void*);
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/src/mca/bmi/ib/bmi_ib_sendfrag.c b/src/mca/bmi/ib/bmi_ib_sendfrag.c
new file mode 100644
index 0000000000..0e59ac15b7
--- /dev/null
+++ b/src/mca/bmi/ib/bmi_ib_sendfrag.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ * All rights reserved.
+ * Copyright (c) 2004 The Ohio State University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+#include "include/types.h"
+#include "datatype/datatype.h"
+#include "mca/pml/base/pml_base_sendreq.h"
+#include "bmi_ib.h"
+#include "bmi_ib_peer.h"
+#include "bmi_ib_proc.h"
+#include "bmi_ib_sendfrag.h"
+#include "bmi_ib_priv.h"
+#include "bmi_ib_memory.h"
+
+static void mca_bmi_ib_send_frag_construct(mca_bmi_ib_send_frag_t* frag);
+static void mca_bmi_ib_send_frag_destruct(mca_bmi_ib_send_frag_t* frag);
+
+OBJ_CLASS_INSTANCE(mca_bmi_ib_send_frag_t,
+    mca_bmi_base_send_frag_t,
+    mca_bmi_ib_send_frag_construct,
+    mca_bmi_ib_send_frag_destruct);
+
+/*
+ * Placeholders for send fragment constructor/destructors.
+ */
+
+static void mca_bmi_ib_send_frag_construct(mca_bmi_ib_send_frag_t* frag)
+{
+    frag->frag_progressed = 0;
+    frag->frag_ack_pending = 0;
+}
+
+static void mca_bmi_ib_send_frag_destruct(mca_bmi_ib_send_frag_t* frag)
+{
+}
+
+/*
+ * Allocate an IB send descriptor
+ *
+ */
+mca_bmi_ib_send_frag_t* mca_bmi_ib_alloc_send_frag(
+    mca_bmi_ib_module_t* ib_bmi,
+    mca_bmi_base_send_request_t* request)
+{
+    ompi_free_list_t *flist = &ib_bmi->send_free;
+    ompi_list_item_t *item;
+    mca_bmi_ib_send_frag_t *ib_send_frag;
+
+    item = ompi_list_remove_first(&((flist)->super));
+    while(NULL == item) {
+
+        D_PRINT("Got a NULL descriptor ...
trying again"); + + mca_bmi_ib_component_progress(0); + item = ompi_list_remove_first (&((flist)->super)); + } + + ib_send_frag = (mca_bmi_ib_send_frag_t *)item; + return ib_send_frag; +} + + +int mca_bmi_ib_send_frag_register(mca_bmi_ib_module_t *ib_bmi) +{ + int i, rc, num_send_frags; + ompi_list_item_t *item; + ompi_free_list_t *flist = &ib_bmi->send_free; + ib_buffer_t *ib_buf_ptr; + mca_bmi_ib_send_frag_t *ib_send_frag; + + num_send_frags = ompi_list_get_size(&(flist->super)); + item = ompi_list_get_first(&((flist)->super)); + + /* Register the buffers */ + for(i = 0; i < num_send_frags; + item = ompi_list_get_next(item), i++) { + + ib_send_frag = (mca_bmi_ib_send_frag_t *) item; + + ib_send_frag->frag_progressed = 0; + + ib_buf_ptr = (ib_buffer_t *) &ib_send_frag->ib_buf; + + rc = mca_bmi_ib_register_mem(ib_bmi->nic, ib_bmi->ptag, + (void*) ib_buf_ptr->buf, + MCA_BMI_IB_FIRST_FRAG_SIZE, + &ib_buf_ptr->hndl); + if(rc != OMPI_SUCCESS) { + return OMPI_ERROR; + } + + IB_PREPARE_SEND_DESC(ib_buf_ptr, 0, + MCA_BMI_IB_FIRST_FRAG_SIZE, ib_buf_ptr); + } + + return OMPI_SUCCESS; +} + + +/* + * Process send completions + * + */ + +void mca_bmi_ib_send_frag_send_complete(mca_bmi_ib_module_t *ib_bmi, mca_bmi_ib_send_frag_t* sendfrag) +{ + mca_bmi_base_header_t *hdr; + mca_bmi_base_send_request_t* req = sendfrag->frag_send.frag_request; + hdr = (mca_bmi_base_header_t *) sendfrag->ib_buf.buf; + + switch(hdr->hdr_common.hdr_type) { + case MCA_BMI_HDR_TYPE_MATCH: + if (0 == (hdr->hdr_common.hdr_flags & MCA_BMI_FLAGS_ACK) + || mca_bmi_base_send_request_matched(req)) { + + ib_bmi->super.bmi_send_progress(&ib_bmi->super, + sendfrag->frag_send.frag_request, + hdr->hdr_rndv.hdr_frag_length); + if(req->req_cached == false) { + OMPI_FREE_LIST_RETURN(&ib_bmi->send_free, + ((ompi_list_item_t *) sendfrag)); + } + } + break; + + case MCA_BMI_HDR_TYPE_ACK: + + OMPI_FREE_LIST_RETURN(&ib_bmi->send_free, + ((ompi_list_item_t *) sendfrag)); + break; + + case MCA_BMI_HDR_TYPE_FIN: + + ib_bmi->super.bmi_send_progress(&ib_bmi->super, + sendfrag->frag_send.frag_request, + hdr->hdr_frag.hdr_frag_length); + OMPI_FREE_LIST_RETURN(&ib_bmi->send_free, + ((ompi_list_item_t *) sendfrag)); + break; + } +} + diff --git a/src/mca/bmi/ib/bmi_ib_sendfrag.h b/src/mca/bmi/ib/bmi_ib_sendfrag.h new file mode 100644 index 0000000000..00f8b532b9 --- /dev/null +++ b/src/mca/bmi/ib/bmi_ib_sendfrag.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004 The Ohio State University. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BMI_IB_SEND_FRAG_H +#define MCA_BMI_IB_SEND_FRAG_H + +#include "ompi_config.h" +#include "mca/bmi/base/bmi_base_sendreq.h" +#include "mca/bmi/base/bmi_base_sendfrag.h" + +#include "bmi_ib_priv.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif +OBJ_CLASS_DECLARATION(mca_bmi_ib_send_frag_t); + +typedef enum { + MCA_BMI_IB_FRAG_SEND, + MCA_BMI_IB_FRAG_PUT, + MCA_BMI_IB_FRAG_GET, + MCA_BMI_IB_FRAG_ACK +} mca_bmi_ib_frag_type_t; + +/** + * IB send fragment derived type. 
+ */
+struct mca_bmi_ib_frag_t {
+    mca_bmi_base_descriptor_t base;
+    mca_bmi_base_segment_t segment;
+    struct mca_bmi_base_endpoint_t *endpoint;
+    mca_bmi_ib_frag_type_t type;
+    mca_bmi_base_tag_t tag;
+
+    size_t size;
+    int rc;
+    bool frag_ack_pending;
+};
+typedef struct mca_bmi_ib_frag_t mca_bmi_ib_frag_t;
+
+
+
+/*
+ * Allocate an IB send descriptor
+ *
+ */
+#define MCA_BMI_IB_FRAG_ALLOC1(frag, rc) \
+{ \
+    ompi_list_item_t *item; \
+    OMPI_FREE_LIST_WAIT(&mca_bmi_ib_module.ib_frags1, item, rc); \
+    frag = (mca_bmi_ib_frag_t*) item; \
+}
+
+
+
+
+struct mca_bmi_ib_module_t;
+
+mca_bmi_ib_send_frag_t* mca_bmi_ib_alloc_send_frag(
+    struct mca_bmi_ib_module_t* ib_bmi,
+    mca_bmi_base_send_request_t* request);
+
+int mca_bmi_ib_send_frag_register(struct mca_bmi_ib_module_t *bmi);
+void mca_bmi_ib_send_frag_send_complete(struct mca_bmi_ib_module_t *bmi, mca_bmi_ib_send_frag_t*);
+
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+#endif
diff --git a/src/mca/bmi/ib/bmi_ib_vapi.h b/src/mca/bmi/ib/bmi_ib_vapi.h
new file mode 100644
index 0000000000..138361d165
--- /dev/null
+++ b/src/mca/bmi/ib/bmi_ib_vapi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ * All rights reserved.
+ * Copyright (c) 2004 The Ohio State University.
+ * All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ * University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ * All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_BMI_IB_VAPI_H
+#define MCA_BMI_IB_VAPI_H
+
+#include <vapi.h>
+#include <mtl_common.h>
+#include <vapi_common.h>
+
+/* HACK: Alert, these are dumb defines,
+ * all this stuff should be runtime. Ignoring for now.
+ */
+
+#define DEFAULT_PORT (1)
+#define DEFAULT_CQ_SIZE (40000)
+#define DEFAULT_WQ_SIZE (10000)
+#define DEFAULT_SG_LIST (1)
+#define DEFAULT_PKEY_IX (0)
+#define DEFAULT_PSN (0)
+#define DEFAULT_QP_OUS_RD_ATOM (1)
+#define DEFAULT_MTU (MTU1024)
+#define DEFAULT_MIN_RNR_TIMER (5)
+#define DEFAULT_TIME_OUT (10)
+#define DEFAULT_RETRY_COUNT (7)
+#define DEFAULT_RNR_RETRY (7)
+#define DEFAULT_MAX_RDMA_DST_OPS (16)
+
+#define DEFAULT_TRAFFIC_CLASS (0)
+#define DEFAULT_HOP_LIMIT (63)
+#define DEFAULT_FLOW_LABEL (0)
+#define DEFAULT_SERVICE_LEVEL (0)
+#define DEFAULT_STATIC_RATE (0)
+#define DEFAULT_SRC_PATH_BITS (0)
+
+/* This is a convenience macro.
+ *
+ * vapi_ret  : The value which was returned from the last VAPI call
+ * func_name : The VAPI function which was called
+ */
+#define MCA_BMI_IB_VAPI_RET(vapi_ret, func_name) { \
+    ompi_output(0,"[%s:%d] ", __FILE__, __LINE__); \
+    ompi_output(0,"%s : %s",func_name,VAPI_strerror(vapi_ret)); \
+}
+
+/* Debug Print */
+#if 0
+#define D_PRINT(fmt, args...) { \
+    ompi_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, \
+        ##args); \
+}
+#else
+#define D_PRINT(fmt, args...)
+#endif
+
+#if 0
+#define A_PRINT(fmt, args...) { \
+    ompi_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, \
+        ##args); \
+}
+#else
+#define A_PRINT(fmt, args...)
+#endif
+
+#if 0
+#define B_PRINT(fmt, args...) { \
+    ompi_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, \
+        ##args); \
+}
+#else
+#define B_PRINT(fmt, args...)
#endif
+
+#endif
diff --git a/src/mca/bmi/ib/configure.params b/src/mca/bmi/ib/configure.params
new file mode 100644
index 0000000000..229078a273
--- /dev/null
+++ b/src/mca/bmi/ib/configure.params
@@ -0,0 +1,22 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University.
+# All rights reserved.
+# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+# All rights reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+# University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#

+# Specific to this module
+
+PARAM_INIT_FILE=bmi_ib.c
+PARAM_CONFIG_HEADER_FILE="ib_config.h"
+PARAM_CONFIG_FILES="Makefile"
diff --git a/src/mca/bmi/ib/configure.stub b/src/mca/bmi/ib/configure.stub
new file mode 100644
index 0000000000..12a5827bf2
--- /dev/null
+++ b/src/mca/bmi/ib/configure.stub
@@ -0,0 +1,148 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University.
+# All rights reserved.
+# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+# All rights reserved.
+# Copyright (c) 2004 The Ohio State University.
+# All rights reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+# University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+# All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# Main function. This will be invoked in the middle of the templated
+# configure script.
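+# It adds the --with-ptl-ib / --with-ptl-ib-libdir options, looks for
+# the VAPI headers and libraries (in either lib or lib64), and exports
+# the extra flags needed by the wrapper compilers.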
diff --git a/src/mca/bmi/ib/configure.params b/src/mca/bmi/ib/configure.params
new file mode 100644
index 0000000000..229078a273
--- /dev/null
+++ b/src/mca/bmi/ib/configure.params
@@ -0,0 +1,22 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University.
+#                         All rights reserved.
+# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+#                         All rights reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Specific to this module
+
+PARAM_INIT_FILE=bmi_ib.c
+PARAM_CONFIG_HEADER_FILE="ib_config.h"
+PARAM_CONFIG_FILES="Makefile"
diff --git a/src/mca/bmi/ib/configure.stub b/src/mca/bmi/ib/configure.stub
new file mode 100644
index 0000000000..12a5827bf2
--- /dev/null
+++ b/src/mca/bmi/ib/configure.stub
@@ -0,0 +1,148 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University.
+#                         All rights reserved.
+# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+#                         All rights reserved.
+# Copyright (c) 2004 The Ohio State University.
+#                    All rights reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart. All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+#
+# Main function. This will be invoked in the middle of the templated
+# configure script.
+#
+AC_DEFUN([MCA_CONFIGURE_STUB],[
+
+    # Additional --with flags that can be specified
+
+    AC_ARG_WITH(ptl-ib,
+        AC_HELP_STRING([--with-ptl-ib=IBDIR],
+                       [Specify the installation directory of IB (enables automatic selection of the 32- or 64-bit library if both are present under IBDIR/lib and IBDIR/lib64)]))
+    AC_ARG_WITH(ptl-ib-libdir,
+        AC_HELP_STRING([--with-ptl-ib-libdir=IBLIBDIR],
+                       [Directory where the IB library can be found, if it is not in $IBDIR/lib or $IBDIR/lib64]))
+
+    # Quick sanity check
+
+    if test "$with_ptl_ib" = "no"; then
+        AC_MSG_WARN([*** --without-ptl-ib specified -- aborting])
+        AC_MSG_ERROR([Will not continue])
+    fi
+
+    # Find the right IBDIR
+
+    if test "$with_ptl_ib" != "" -a "$with_ptl_ib" != "yes" ; then
+        IBDIR="$with_ptl_ib"
+        IBLIBDIR="$with_ptl_ib"
+    fi
+    if test "$with_ptl_ib_libdir" != "" -a "$with_ptl_ib_libdir" != "yes" -a \
+        "$with_ptl_ib_libdir" != "no"; then
+        IBLIBDIR="$with_ptl_ib_libdir"
+    fi
+
+    # Add to CPPFLAGS if necessary
+
+    EXTRA_CPPFLAGS=
+    if test "$IBDIR" != ""; then
+        if test -d "$IBDIR/include"; then
+            EXTRA_CPPFLAGS="-I$IBDIR/include"
+        else
+            AC_MSG_WARN([*** Warning: cannot find $IBDIR/include])
+            AC_MSG_WARN([*** Will still try to configure ib ptl anyway...])
+        fi
+        if test "$IBDIR" != "" -a -d "$IBDIR/wrap"; then
+            EXTRA_CPPFLAGS="-I$IBDIR/wrap $EXTRA_CPPFLAGS"
+        else
+            AC_MSG_WARN([*** Warning: cannot find $IBDIR/wrap])
+            AC_MSG_WARN([*** Will still try to configure ib ptl anyway...])
+        fi
+    fi
+
+    # See if we can find vapi.h
+
+    CPPFLAGS="$CPPFLAGS $EXTRA_CPPFLAGS"
+    AC_CHECK_HEADERS(vapi.h,,
+        AC_MSG_ERROR([*** Cannot find working vapi.h]))
+
+    # Note that it is possible to find the library even if -L is not
+    # specified, if the LD_LIBRARY_PATH includes the directory where
+    # the shared ib library is kept. Hence, we unset LD_LIBRARY_PATH
+    # before running this test.
+
+    LD_LIBRARY_PATH_save="$LD_LIBRARY_PATH"
+    unset LD_LIBRARY_PATH
+
+    # Helper function to try to find libvapi (called from below). In
+    # some versions of Mellanox (v3.1), we need to explicitly link in
+    # the thread libraries. #$%#@$%@%#$!!!
+
+mca_ptl_ib_try_find_libvapi() {
+    func1=[$]1
+    func2=[$]2
+
+    LDFLAGS="$LDFLAGS $EXTRA_LDFLAGS"
+    vapi_badness=
+    AC_CHECK_LIB([vapi], [$func1], [], [vapi_badness=true],
+        [-lmtl_common -lmpga -lmosal])
+    if test "$vapi_badness" != ""; then
+        AC_CHECK_LIB([pthread], [pthread_create],
+            [pthread=yes LIBS="$LIBS -lpthread"],
+            [pthread=no])
+        if test "$pthread" = "yes"; then
+            AC_CHECK_LIB([vapi], [$func2], [], [],
+                [-lmtl_common -lmpga -lmosal])
+        fi
+    fi
+}
+
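    # [Usage sketch, not part of this commit. A user would typically
    # point configure at a Mellanox install with something like the
    # following (paths hypothetical):
    #
    #     ./configure --with-ptl-ib=/usr/local/ibgd \
    #                 --with-ptl-ib-libdir=/usr/local/ibgd/lib64
    #
    # The helper above then probes libvapi twice: once plain, and once
    # more with -lpthread added if the first AC_CHECK_LIB fails.]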
+    # The libraries may be in $IBDIR/lib or $IBDIR/lib64. Try them
+    # both.
+
+    LIBS_save="$LIBS"
+    LDFLAGS_save="$LDFLAGS"
+    LIBS="$LIBS -lmosal -lmpga -lmtl_common"
+    LIBS_orig="$LIBS"
+
+    EXTRA_LDFLAGS=
+    if test -d "$IBLIBDIR/lib"; then
+        EXTRA_LDFLAGS="-L$IBLIBDIR/lib"
+        LDFLAGS="$LDFLAGS $EXTRA_LDFLAGS"
+        mca_ptl_ib_try_find_libvapi VAPI_open_hca VAPI_query_hca_cap
+        if test "$LIBS" != "$LIBS_orig"; then
+            echo "--> found libvapi in $IBLIBDIR/lib"
+        fi
+    fi
+
+    if test "$LIBS" = "$LIBS_orig" -a -d "$IBLIBDIR/lib64"; then
+        EXTRA_LDFLAGS="-L$IBLIBDIR/lib64"
+        LDFLAGS="$LDFLAGS_save $EXTRA_LDFLAGS"
+        mca_ptl_ib_try_find_libvapi EVAPI_list_hcas EVAPI_open_hca
+        if test "$LIBS" != "$LIBS_orig"; then
+            echo "--> found libvapi in $IBLIBDIR/lib64"
+        fi
+    fi
+
+    if test "$LIBS" = "$LIBS_orig"; then
+        AC_MSG_ERROR([*** Cannot find working libvapi.])
+    fi
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH_save"
+    LIBS="$LIBS -lmtl_common -lmpga"
+
+    #
+    # Save extra compiler/linker flags so that they can be added in
+    # the wrapper compilers, if necessary
+    #
+
+    WRAPPER_EXTRA_LDFLAGS="$EXTRA_LDFLAGS"
+    WRAPPER_EXTRA_LIBS="-lvapi -lmtl_common -lmpga -lmosal"
+])dnl
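# [Illustrative note, not part of this commit. WRAPPER_EXTRA_LDFLAGS
# and WRAPPER_EXTRA_LIBS are picked up by the wrapper compilers, so
# with a hypothetical IBLIBDIR of /usr/local/ibgd a user link would
# expand to roughly:
#
#     mpicc app.c -L/usr/local/ibgd/lib64 \
#         -lvapi -lmtl_common -lmpga -lmosal
# ]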