From afdfa70f73709f84a9c8ebe452c1dbdd57ed525c Mon Sep 17 00:00:00 2001 From: Galen Shipman Date: Thu, 18 Aug 2005 17:08:27 +0000 Subject: [PATCH] Added support for openib RDMA READ.. note that performance is currently an issue so PUT is default.. We are determining if this is an openib issue or a btl issue as we have seen performance increases on mvapi. This commit was SVN r6928. --- ompi/mca/btl/openib/btl_openib.c | 44 ++++++- ompi/mca/btl/openib/btl_openib.h | 19 ++- ompi/mca/btl/openib/btl_openib_component.c | 117 +++++++++--------- ompi/mca/btl/openib/btl_openib_endpoint.c | 3 +- ompi/mca/btl/openib/btl_openib_endpoint.h | 1 - .../mca/mpool/openib/mpool_openib_component.c | 10 -- ompi/mca/mpool/openib/mpool_openib_module.c | 10 +- 7 files changed, 120 insertions(+), 84 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index e542e75469..10bceaee02 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -32,7 +32,6 @@ #include "mca/mpool/openib/mpool_openib.h" #include #include -extern int errno; mca_btl_openib_module_t mca_btl_openib_module = { { @@ -57,7 +56,7 @@ mca_btl_openib_module_t mca_btl_openib_module = { mca_btl_openib_prepare_dst, mca_btl_openib_send, mca_btl_openib_put, - NULL /* get */ + mca_btl_openib_get /* get */ } }; @@ -775,7 +774,7 @@ int mca_btl_openib_send( } /* - * RDMA local buffer to remote buffer address. + * RDMA WRITE local buffer to remote buffer address. */ int mca_btl_openib_put( mca_btl_base_module_t* btl, @@ -812,6 +811,45 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, } + +/* + * RDMA READ remote buffer to local buffer address. + */ + +int mca_btl_openib_get( mca_btl_base_module_t* btl, + mca_btl_base_endpoint_t* endpoint, + mca_btl_base_descriptor_t* descriptor) +{ + struct ibv_send_wr* bad_wr; + mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor; + frag->endpoint = endpoint; + frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_READ; + frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; + frag->wr_desc.sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_src->seg_addr.pval; + frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_src->seg_key.key32[0]; + frag->sg_entry.addr = (uintptr_t) frag->base.des_dst->seg_addr.pval; + frag->sg_entry.length = frag->base.des_dst->seg_len; + + BTL_VERBOSE(("frag->wr_desc.sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu" + , frag->wr_desc.sr_desc.wr.rdma.remote_addr + , frag->wr_desc.sr_desc.wr.rdma.rkey + , frag->sg_entry.addr + , frag->sg_entry.length)); + + if(ibv_post_send(endpoint->lcl_qp_low, + &frag->wr_desc.sr_desc, + &bad_wr)){ + BTL_ERROR(("error posting send request errno says %s", strerror(errno))); + return OMPI_ERROR; + } + + MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(endpoint, 1); + MCA_BTL_OPENIB_ENDPOINT_POST_RR_LOW(endpoint, 1); + + return OMPI_SUCCESS; + +} + /* * Initialize the btl module by allocating a protection domain * and creating both the high and low priority completion queues diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 856b699bc3..5578508ab2 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -306,7 +306,24 @@ extern int mca_btl_openib_put( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* btl_peer, struct mca_btl_base_descriptor_t* decriptor -); + ); + +/** + * PML->BTL Initiate a get of the specified size. 
+ * + * @param btl (IN) BTL instance + * @param btl_base_peer (IN) BTL peer addressing + * @param send_request (IN/OUT) Send request (allocated by PML via mca_btl_base_request_alloc_fn_t) + * @param size (IN) Number of bytes PML is requesting BTL to deliver + * @param flags (IN) Flags that should be passed to the peer via the message header. + * @param request (OUT) OMPI_SUCCESS if the BTL was able to queue one or more fragments + */ +extern int mca_btl_openib_get( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* decriptor + ); + /** * Allocate a descriptor. diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 5b43112646..898105e40a 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -41,7 +41,6 @@ #include #include /* for strerror()*/ -extern int errno; mca_btl_openib_component_t mca_btl_openib_component = { { /* First, the mca_base_component_t struct containing meta information @@ -507,71 +506,67 @@ int mca_btl_openib_component_progress() * we will check the high priority and process them until there are none left. * note that low priority messages are only processed one per progress call. */ - do{ - ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc ); - if(ne < 0 ){ - BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno))); + ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc ); + if(ne < 0 ){ + BTL_ERROR(("error polling CQ with %d errno says %s\n", ne, strerror(errno))); + return OMPI_ERROR; + } + else if(wc.status != IBV_WC_SUCCESS) { + BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n", + wc.status, wc.wr_id)); + return OMPI_ERROR; + } + else if(1 == ne) { + BTL_VERBOSE(("completion queue event says opcode is %d\n", wc.opcode)); + + /* Handle work completions */ + switch(wc.opcode) { + case IBV_WC_RECV_RDMA_WITH_IMM: + BTL_ERROR(("Got an RDMA with Immediate data Not supported!")); return OMPI_ERROR; - } - else if(wc.status != IBV_WC_SUCCESS) { - BTL_ERROR(("error polling CQ with status %d for wr_id %llu\n", - wc.status, wc.wr_id)); - return OMPI_ERROR; - } - else if(1 == ne) { - BTL_VERBOSE(("completion queue event says opcode is %d\n", wc.opcode)); + + case IBV_WC_RECV: + /* Process a RECV */ + + BTL_VERBOSE(("Got an recv on the completion queue")); + frag = (mca_btl_openib_frag_t*) wc.wr_id; + endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint; + frag->rc=OMPI_SUCCESS; + frag->segment.seg_len = + wc.byte_len- + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); + + + + /* advance the segment address past the header and subtract from the length..*/ + openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, + frag->hdr->tag, + &frag->base, + openib_btl->ib_reg[frag->hdr->tag].cbdata); + + OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1); + MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0); + OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag); + count++; + break; - /* Handle work completions */ - switch(wc.opcode) { - case IBV_WC_RECV_RDMA_WITH_IMM: - BTL_ERROR(("Got an RDMA with Immediate data Not supported!")); - return OMPI_ERROR; - - case IBV_WC_RECV: - /* Process a RECV */ + case IBV_WC_RDMA_READ: + case IBV_WC_RDMA_WRITE: + case IBV_WC_SEND : - BTL_VERBOSE(("Got an recv on the completion queue")); - frag = (mca_btl_openib_frag_t*) wc.wr_id; - endpoint = (mca_btl_openib_endpoint_t*) 
frag->endpoint; - frag->rc=OMPI_SUCCESS; - frag->segment.seg_len = - wc.byte_len- - ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); - - + /* Process a completed send or rdma write*/ + frag = (mca_btl_openib_frag_t*) wc.wr_id; + frag->rc = OMPI_SUCCESS; + frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); + count++; + break; - /* advance the segment address past the header and subtract from the length..*/ - openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, - frag->hdr->tag, - &frag->base, - openib_btl->ib_reg[frag->hdr->tag].cbdata); - - OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1); - MCA_BTL_OPENIB_ENDPOINT_POST_RR_HIGH(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0); - OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag); - count++; - break; - - case IBV_WC_RDMA_WRITE: - case IBV_WC_SEND : - - /* Process a completed send or rdma write*/ - frag = (mca_btl_openib_frag_t*) wc.wr_id; - frag->rc = OMPI_SUCCESS; - frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); - count++; - break; - - - break; - - default: - BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode)); - break; - } + + default: + BTL_ERROR(("Unhandled work completion opcode is %d", wc.opcode)); + break; } } - while(ne > 0); ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc ); if(ne < 0){ @@ -614,8 +609,10 @@ int mca_btl_openib_component_progress() count++; break; + case IBV_WC_RDMA_READ: case IBV_WC_RDMA_WRITE: case IBV_WC_SEND : + /* Process a completed send */ frag = (mca_btl_openib_frag_t*) wc.wr_id; frag->rc = OMPI_SUCCESS; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 59364ed733..d9478b88fd 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -32,7 +32,6 @@ #include "class/ompi_free_list.h" #include #include -extern int errno; static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint); static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); @@ -728,7 +727,7 @@ int mca_btl_openib_endpoint_create_qp( qp_attr->qp_state = IBV_QPS_INIT; qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix; qp_attr->port_num = openib_btl->port_num; - qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE; + qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; if(ibv_modify_qp((*qp), qp_attr, IBV_QP_STATE | diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 973f70b688..7d8cc1302d 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -27,7 +27,6 @@ #include #include #include "mca/btl/base/btl_base_error.h" -extern int errno; #if defined(c_plusplus) || defined(__cplusplus) extern "C" { diff --git a/ompi/mca/mpool/openib/mpool_openib_component.c b/ompi/mca/mpool/openib/mpool_openib_component.c index 3129bfbfe1..fdbf20f532 100644 --- a/ompi/mca/mpool/openib/mpool_openib_component.c +++ b/ompi/mca/mpool/openib/mpool_openib_component.c @@ -87,16 +87,6 @@ OBJ_CLASS_INSTANCE( -static char* mca_mpool_openib_param_register_string( - const char* param_name, - const char* default_value) -{ - char *param_value; - int id = mca_base_param_register_string("mpool","openib",param_name,NULL,default_value); - mca_base_param_lookup_string(id, ¶m_value); - return param_value; -} - /** * component open/close/init function diff --git 
a/ompi/mca/mpool/openib/mpool_openib_module.c b/ompi/mca/mpool/openib/mpool_openib_module.c index 349068dee6..145c70f4e8 100644 --- a/ompi/mca/mpool/openib/mpool_openib_module.c +++ b/ompi/mca/mpool/openib/mpool_openib_module.c @@ -46,10 +46,7 @@ void* mca_mpool_openib_alloc( size_t align, mca_mpool_base_registration_t** registration) { - mca_mpool_openib_module_t* mpool_openib = (mca_mpool_openib_module_t*)mpool; - /* void* addr_malloc = (void*)malloc((*size) + mca_mpool_openib_component.page_size); */ - /* void* addr = (void*) ALIGN_ADDR(addr_malloc, mca_mpool_openib_component.page_size_log); */ - + void* addr_malloc = (void*)memalign(mca_mpool_openib_component.page_size, size); void* addr = addr_malloc; @@ -81,7 +78,7 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool, mpool_module->resources.ib_pd, addr, size, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ /* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */ ); @@ -104,7 +101,6 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool, int mca_mpool_openib_deregister(mca_mpool_base_module_t* mpool, void *addr, size_t size, mca_mpool_base_registration_t* registration){ - mca_mpool_openib_module_t * mpool_openib = (mca_mpool_openib_module_t*) mpool; mca_mpool_openib_registration_t * openib_reg; openib_reg = (mca_mpool_openib_registration_t*) registration; if(ibv_dereg_mr(openib_reg->mr)){ @@ -127,7 +123,7 @@ void* mca_mpool_openib_realloc( mca_mpool_base_registration_t* old_reg = *registration; void* new_mem = mpool->mpool_alloc(mpool, size, 0, registration); memcpy(new_mem, addr, old_reg->bound - old_reg->base); - mpool->mpool_free(mpool, addr, &old_reg); + mpool->mpool_free(mpool, addr, old_reg); return new_mem; }
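
For readers unfamiliar with the verbs API, the heart of the new mca_btl_openib_get() path above is a single IBV_WR_RDMA_READ work request posted with ibv_post_send(). Below is a minimal, self-contained sketch of that pattern; it is not Open MPI code, and the function and variable names (post_rdma_read, qp, local_mr, remote_addr, rkey) are hypothetical. It assumes a connected RC queue pair already in the RTS state, a local buffer registered with ibv_reg_mr(), and a remote virtual address / rkey pair exchanged out of band (in the BTL this information arrives in the descriptor's source segment).

#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Sketch only: post one RDMA READ that pulls `len` bytes from the peer's
 * registered buffer (remote_addr/rkey) into the locally registered buffer
 * described by local_mr/local_addr. */
static int post_rdma_read(struct ibv_qp *qp, struct ibv_mr *local_mr,
                          void *local_addr, uint32_t len,
                          uint64_t remote_addr, uint32_t rkey)
{
    struct ibv_sge sge;
    struct ibv_send_wr wr;
    struct ibv_send_wr *bad_wr = NULL;

    memset(&sge, 0, sizeof(sge));
    sge.addr   = (uintptr_t) local_addr;   /* where the data lands locally */
    sge.length = len;
    sge.lkey   = local_mr->lkey;           /* lkey of the local registration */

    memset(&wr, 0, sizeof(wr));
    wr.wr_id               = (uintptr_t) local_addr; /* echoed back in the completion */
    wr.sg_list             = &sge;
    wr.num_sge             = 1;
    wr.opcode              = IBV_WR_RDMA_READ;       /* pull data from the peer */
    wr.send_flags          = IBV_SEND_SIGNALED;      /* request a CQ entry on completion */
    wr.wr.rdma.remote_addr = remote_addr;            /* peer's registered buffer address */
    wr.wr.rdma.rkey        = rkey;                    /* peer's rkey for that region */

    /* Returns 0 on success; the read completes asynchronously and shows up
     * as an IBV_WC_RDMA_READ entry on the initiator's completion queue. */
    return ibv_post_send(qp, &wr, &bad_wr);
}

An RDMA READ only succeeds if the target memory region allows remote reads and the queue pair permits them, which is why the patch adds IBV_ACCESS_REMOTE_READ both to the QP access flags in mca_btl_openib_endpoint_create_qp() and to the ibv_reg_mr() flags in mca_mpool_openib_register(). The completed read is then reaped as the new IBV_WC_RDMA_READ case in mca_btl_openib_component_progress(). As the commit message notes, RDMA READ currently performs worse than RDMA WRITE on this stack, so PUT remains the default protocol while the regression is investigated.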