From 2f67ab82bb03e45beeba04baf4992bcc7cedfa68 Mon Sep 17 00:00:00 2001 From: Galen Shipman Date: Tue, 19 Jul 2005 21:04:22 +0000 Subject: [PATCH] Working version of openib btl ;-) Fixed receive descriptor counts that limited mvapi and openib to 2 procs. Begin porting error messages to use the BTL_ERROR macro. This commit was SVN r6554. --- ompi/mca/btl/base/btl_base_error.h | 24 ++ ompi/mca/btl/mvapi/btl_mvapi.c | 1 + ompi/mca/btl/mvapi/btl_mvapi.h | 5 - ompi/mca/btl/mvapi/btl_mvapi_component.c | 25 +- ompi/mca/btl/mvapi/btl_mvapi_endpoint.c | 4 + ompi/mca/btl/mvapi/btl_mvapi_endpoint.h | 18 +- ompi/mca/btl/openib/btl_openib.c | 55 +++-- ompi/mca/btl/openib/btl_openib.h | 6 +- ompi/mca/btl/openib/btl_openib_component.c | 220 ++++++++---------- ompi/mca/btl/openib/btl_openib_endpoint.c | 63 +++-- ompi/mca/btl/openib/btl_openib_endpoint.h | 24 +- ompi/mca/btl/openib/btl_openib_frag.c | 8 +- ompi/mca/mpool/openib/mpool_openib.h | 4 +- .../mca/mpool/openib/mpool_openib_component.c | 7 - ompi/mca/mpool/openib/mpool_openib_module.c | 5 +- 15 files changed, 262 insertions(+), 207 deletions(-) create mode 100644 ompi/mca/btl/base/btl_base_error.h diff --git a/ompi/mca/btl/base/btl_base_error.h b/ompi/mca/btl/base/btl_base_error.h new file mode 100644 index 0000000000..d03dad8c60 --- /dev/null +++ b/ompi/mca/btl/base/btl_base_error.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_BASE_ERROR_H +#define MCA_BTL_BASE_ERROR_H + +#define BTL_ERROR(fmt, args...) 
{ \ + opal_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, ##args); \ +} + +#endif diff --git a/ompi/mca/btl/mvapi/btl_mvapi.c b/ompi/mca/btl/mvapi/btl_mvapi.c index c7dfe770af..df1f4772fc 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.c +++ b/ompi/mca/btl/mvapi/btl_mvapi.c @@ -828,5 +828,6 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl) return OMPI_ERROR; } + return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/mvapi/btl_mvapi.h b/ompi/mca/btl/mvapi/btl_mvapi.h index 9f7f40d558..169be06edb 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.h +++ b/ompi/mca/btl/mvapi/btl_mvapi.h @@ -157,11 +157,6 @@ struct mca_btl_mvapi_module_t { mca_mpool_base_module_t* ib_pool; /**< ib memory pool */ - - uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/ - uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/ - - VAPI_rr_desc_t* rr_desc_post; /**< an array to allow posting of rr in one swoop */ diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index 4322d75bcc..34e8f7e62e 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -487,13 +487,19 @@ int mca_btl_mvapi_component_progress() { uint32_t i; int count = 0; - mca_btl_mvapi_frag_t* frag; + mca_btl_mvapi_frag_t* frag; + mca_btl_mvapi_endpoint_t* endpoint; /* Poll for completions */ for(i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) { VAPI_ret_t ret; VAPI_wc_desc_t comp; mca_btl_mvapi_module_t* mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i]; + + /* we have two completion queues, one for "high" priority and one for "low". + * we will check the high priority and process them until there are none left. + * note that low priority messages are only processed one per progress call. 
+ */ do{ ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_high, &comp); if(VAPI_OK == ret) { @@ -504,7 +510,7 @@ int mca_btl_mvapi_component_progress() return OMPI_ERROR; } - /* Handle n/w completions */ + /* Handle work completions */ switch(comp.opcode) { case VAPI_CQE_RQ_RDMA_WITH_IMM: if(comp.imm_data_valid){ @@ -515,7 +521,7 @@ int mca_btl_mvapi_component_progress() case VAPI_CQE_SQ_RDMA_WRITE: case VAPI_CQE_SQ_SEND_DATA : - /* Process a completed send */ + /* Process a completed send or an rdma write */ frag = (mca_btl_mvapi_frag_t*) comp.id; frag->rc = OMPI_SUCCESS; frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc); @@ -524,15 +530,18 @@ int mca_btl_mvapi_component_progress() case VAPI_CQE_RQ_SEND_DATA: - DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__); + /* Process a RECV */ + DEBUG_OUT("Got an recv completion" ); frag = (mca_btl_mvapi_frag_t*) comp.id; + endpoint = (mca_btl_endpoint_t*) frag->endpoint; + frag->rc=OMPI_SUCCESS; frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* advance the segment address past the header and subtract from the length..*/ mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_high, -1); + OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1); mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); @@ -540,7 +549,7 @@ int mca_btl_mvapi_component_progress() break; default: - opal_output(0, "Errorneous network completion"); + opal_output(0, "Unhandled work completion opcode is %d", comp.opcode); break; } } @@ -570,15 +579,17 @@ int mca_btl_mvapi_component_progress() case VAPI_CQE_RQ_SEND_DATA: + DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__); frag = (mca_btl_mvapi_frag_t*) 
comp.id; + endpoint = (mca_btl_endpoint_t*) frag->endpoint; frag->rc=OMPI_SUCCESS; frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* advance the segment address past the header and subtract from the length..*/ mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_max), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_low, -1); + OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1); mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index 34af110f93..a2504b5875 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -114,6 +114,10 @@ static void mca_btl_mvapi_endpoint_construct(mca_btl_base_endpoint_t* endpoint) OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t); + + endpoint->rr_posted_high = 0; + endpoint->rr_posted_low = 0; + } /* diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h index 59d5aa6807..0413c1d0c6 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.h @@ -109,6 +109,10 @@ struct mca_btl_base_endpoint_t { VAPI_qp_prop_t lcl_qp_prop_low; /* Low priority local QP properties */ + + uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/ + uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/ + }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; @@ -161,14 +165,14 @@ static inline int mca_btl_mvapi_endpoint_post_rr_sub(int cnt, static inline int mca_btl_mvapi_endpoint_post_rr( mca_btl_mvapi_endpoint_t * endpoint, int 
additional){ mca_btl_mvapi_module_t * mvapi_btl = endpoint->endpoint_btl; int rc; - OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); + OPAL_THREAD_LOCK(&endpoint->ib_lock); - if(mvapi_btl->rr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){ + if(endpoint->rr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+additional && endpoint->rr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){ - rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - mvapi_btl->rr_posted_high, + rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - endpoint->rr_posted_high, endpoint, &mvapi_btl->recv_free_eager, - &mvapi_btl->rr_posted_high, + &endpoint->rr_posted_high, mvapi_btl->nic, endpoint->lcl_qp_hndl_high ); @@ -177,12 +181,12 @@ static inline int mca_btl_mvapi_endpoint_post_rr( mca_btl_mvapi_endpoint_t * end return rc; } } - if(mvapi_btl->rr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){ + if(endpoint->rr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+additional && endpoint->rr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){ - rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - mvapi_btl->rr_posted_low, + rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - endpoint->rr_posted_low, endpoint, &mvapi_btl->recv_free_max, - &mvapi_btl->rr_posted_low, + &endpoint->rr_posted_low, mvapi_btl->nic, endpoint->lcl_qp_hndl_low ); diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 5305522311..7bac7ed13a 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -21,7 +21,7 @@ #include "opal/util/if.h" #include "mca/pml/pml.h" #include "mca/btl/btl.h" - +#include "mca/btl/base/btl_base_error.h" #include "btl_openib.h" #include "btl_openib_frag.h" #include 
"btl_openib_proc.h" @@ -127,6 +127,10 @@ int mca_btl_openib_del_procs(struct mca_btl_base_module_t* btl, return OMPI_SUCCESS; } +/* + *Register callback function to support send/recv semantics + */ + int mca_btl_openib_register( struct mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, @@ -179,7 +183,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc( } /** - * + * Return a segment * */ int mca_btl_openib_free( @@ -189,16 +193,16 @@ int mca_btl_openib_free( mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des; if(frag->size == 0) { - MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag); - OBJ_RELEASE(frag->openib_reg); - - + MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag); + } else if(frag->size == mca_btl_openib_component.max_send_size){ MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag); } else if(frag->size == mca_btl_openib_component.eager_limit){ MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag); + } else { + BTL_ERROR("invalid descriptor"); } return OMPI_SUCCESS; @@ -265,13 +269,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( rc = mca_mpool_base_remove((void*) openib_reg->base_reg.base); if(OMPI_SUCCESS != rc) { - opal_output(0, "%s:%d:%s error removing memory region from memory pool tree", __FILE__, __LINE__, __func__); + BTL_ERROR("error removing memory region from memory pool tree"); return NULL; } if(is_leave_pinned) { if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)){ - opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); + BTL_ERROR("error removing item from reg_mru_list"); return NULL; } } @@ -290,7 +294,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( if(rc != OMPI_SUCCESS) { - opal_output(0,"%s:%d:%s error inserting memory region into memory pool tree", __FILE__, __LINE__, __func__); + BTL_ERROR("error inserting memory region into memory pool tree"); return NULL; } @@ -302,7 +306,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( } else if(is_leave_pinned) { if(NULL == 
opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)) { - opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); + BTL_ERROR("error removing item from reg_mru_list"); return NULL; } opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg); @@ -399,13 +403,16 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( frag->sg_entry.lkey = openib_reg->mr->lkey; frag->sg_entry.addr = (uintptr_t) iov.iov_base; - frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey; + frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->rkey; frag->base.des_src = &frag->segment; frag->base.des_src_cnt = 1; frag->base.des_dst = NULL; frag->base.des_dst_cnt = 0; frag->openib_reg = openib_reg; + DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu", frag->sg_entry.lkey, frag->sg_entry.addr); + + return &frag->base; } else if (max_data+reserve <= btl->btl_eager_limit) { @@ -416,7 +423,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( } iov.iov_len = max_data; - iov.iov_base = frag->segment.seg_addr.pval + reserve; + iov.iov_base = frag->segment.seg_addr.lval + reserve; rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); *size = max_data; @@ -507,8 +514,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( frag->base.des_flags = 0; if(NULL!= openib_reg){ - reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)frag->segment.seg_addr.pval + 1; bool is_leave_pinned = openib_reg->base_reg.is_leave_pinned; + reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)frag->segment.seg_addr.pval + 1; + if(frag->segment.seg_len > reg_len ) { size_t new_len = openib_reg->base_reg.bound - openib_reg->base_reg.base + 1 @@ -560,6 +568,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( } opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg); } + OBJ_RETAIN(openib_reg); } else { 
if(mca_btl_openib_component.leave_pinned) { @@ -601,7 +610,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( opal_output(0,"%s:%d:%s error inserting memory region into memory pool", __FILE__, __LINE__, __func__); return NULL; } - + OBJ_RETAIN(openib_reg); opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg); @@ -622,14 +631,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( frag->sg_entry.lkey = openib_reg->mr->lkey; frag->sg_entry.addr = (uintptr_t) frag->segment.seg_addr.pval; - frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey; + frag->segment.seg_key.key32[0] = frag->mr->rkey; frag->base.des_dst = &frag->segment; frag->base.des_dst_cnt = 1; frag->base.des_src = NULL; frag->base.des_src_cnt = 0; frag->openib_reg = openib_reg; - OBJ_RETAIN(openib_reg); + DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu frag->segment.seg_key.key32[0] = %lu" , frag->sg_entry.lkey, frag->sg_entry.addr, frag->segment.seg_key.key32[0]); + return &frag->base; } @@ -712,10 +722,17 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor; frag->endpoint = endpoint; frag->sr_desc.opcode = IBV_WR_RDMA_WRITE; - frag->sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_src->seg_addr.pval; + frag->sr_desc.send_flags = IBV_SEND_SIGNALED; + frag->sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_dst->seg_addr.pval; frag->sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0]; frag->sg_entry.addr = (uintptr_t) frag->base.des_src->seg_addr.pval; frag->sg_entry.length = frag->base.des_src->seg_len; + + DEBUG_OUT("frag->sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu" + , frag->sr_desc.wr.rdma.remote_addr + , frag->sr_desc.wr.rdma.rkey + , frag->sg_entry.addr + , frag->sg_entry.length); if(ibv_post_send(endpoint->lcl_qp_low, &frag->sr_desc, @@ -785,7 +802,7 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl) 
/* Allocate Protection Domain */ struct ibv_context *ctx; - + openib_btl->poll_cq = false; ctx = openib_btl->ib_dev_context; openib_btl->ib_pd = ibv_alloc_pd(ctx); @@ -821,5 +838,7 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl) /* TODO: EVAPI_set_qsync_event_handler? */ + + return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 24a3ba4f7b..778d3aea92 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -155,14 +155,10 @@ struct mca_btl_openib_module_t { mca_mpool_base_module_t* ib_pool; /**< ib memory pool */ - uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/ - uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/ - - - /**< an array to allow posting of rr in one swoop */ size_t ib_inline_max; /**< max size of inline send*/ + bool poll_cq; diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 5d682714c4..dbc093512b 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -31,6 +31,7 @@ #include "btl_openib_frag.h" #include "btl_openib_endpoint.h" #include "mca/btl/base/base.h" +#include "mca/btl/base/btl_base_error.h" #include "datatype/convertor.h" @@ -253,12 +254,15 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, mca_btl_openib_module_t * openib_btl; mca_btl_base_selected_module_t* ib_selected; opal_list_item_t* item; + struct dlist *dev_list; + + struct ibv_device* ib_dev; + + /* initialization */ *num_btl_modules = 0; num_devs = 0; - struct dlist *dev_list; - struct ibv_device* ib_dev; /* Determine the number of hca's available on the host */ dev_list = ibv_get_devices(); @@ -268,7 +272,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, num_devs++; if(0 == num_devs) { - opal_output(0, "No hca's found on this host! 
\n"); + BTL_ERROR("No hca's found on this host!"); return NULL; } @@ -297,51 +301,18 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, for(i = 0; i < num_devs; i++){ struct ibv_device_attr ib_dev_attr; - struct ibv_context* ib_dev_context; - struct ibv_pd *my_pd; - struct ibv_mr *mr; - void* my_addr; - uint32_t my_size; - uint32_t my_indx; - uint32_t my_mult; - my_mult = 4096; - + struct ibv_context* ib_dev_context; + ib_dev = ib_devs[i]; ib_dev_context = ibv_open_device(ib_dev); if(!ib_dev_context) { - opal_output(0, "%s: error obtaining device context for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno)); + BTL_ERROR(" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)); return NULL; } - - my_pd = ibv_alloc_pd(ib_dev_context); - for(my_indx = 1; my_indx <= 8192; my_indx++){ - my_size = my_mult * my_indx; - my_addr = memalign(4096, my_size); - - memset(my_addr, 0, my_size); - mr = ibv_reg_mr( - my_pd, - my_addr, - my_size, - IBV_ACCESS_REMOTE_WRITE - /* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */ - ); - - - if(NULL == mr){ - opal_output(0, "%s: error on mr test! 
can't register %lu bytes, errno says %s \n", __func__, my_size, strerror(errno)); - break; - } - else { - opal_output(0, "%s: successfully registerted %lu bytes", __func__, my_size); - ibv_dereg_mr(mr); - } - } - - + if(ibv_query_device(ib_dev_context, &ib_dev_attr)){ - opal_output(0, "%s: error obtaining device attributes for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno)); + BTL_ERROR("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno)); return NULL; } @@ -352,8 +323,8 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, struct ibv_port_attr* ib_port_attr; ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr)); if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){ - opal_output(0, "%s: error getting port attributes for device %s port number %d errno says %s", - __func__, ibv_get_device_name(ib_dev), j, strerror(errno)); + BTL_ERROR("error getting port attributes for device %s port number %d errno says %s", + ibv_get_device_name(ib_dev), j, strerror(errno)); return NULL; } @@ -438,7 +409,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, &mpool_resources); if(NULL == openib_btl->ib_pool) { - opal_output(0, "%s: error creating vapi memory pool! aborting ib btl initialization", __func__); + BTL_ERROR("error creating vapi memory pool! aborting ib btl initialization"); return NULL; } @@ -531,68 +502,80 @@ int mca_btl_openib_component_progress() uint32_t i, ne; int count = 0; mca_btl_openib_frag_t* frag; + mca_btl_openib_endpoint_t* endpoint; /* Poll for completions */ for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { struct ibv_wc wc; + memset(&wc, 0, sizeof(struct ibv_wc)); + mca_btl_openib_module_t* openib_btl = &mca_btl_openib_component.openib_btls[i]; - + + /* we have two completion queues, one for "high" priority and one for "low". 
+ * we will check the high priority and process them until there are none left. + * note that low priority messages are only processed one per progress call. + */ do{ ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc ); if(ne < 0 ){ - opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno)); + BTL_ERROR("error polling CQ with %d errno says %s\n", ne, strerror(errno)); return OMPI_ERROR; } else if(wc.status != IBV_WC_SUCCESS) { - opal_output(0, "%s: error polling CQ with status %d for wr_id %d\n", - __func__, - wc.status, wc.wr_id); + BTL_ERROR("error polling CQ with status %d for wr_id %d\n", + wc.status, wc.wr_id); return OMPI_ERROR; } - else if(1 == ne) { - /* Handle n/w completions */ + else if(1 == ne) { + DEBUG_OUT("completion queue event says opcode is %d\n", wc.opcode); + + /* Handle work completions */ switch(wc.opcode) { case IBV_WC_RECV_RDMA_WITH_IMM: - opal_output(0, "Got an RDMA with Immediate data Not supported!\n"); + BTL_ERROR("Got an RDMA with Immediate data Not supported!"); return OMPI_ERROR; + case IBV_WC_RECV: + /* Process a RECV */ + + DEBUG_OUT("Got an recv on the completion queue"); + frag = (mca_btl_openib_frag_t*) wc.wr_id; + endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint; + frag->rc=OMPI_SUCCESS; + frag->segment.seg_len = + wc.byte_len- + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); + + + OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1); + + mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0); + + /* advance the segment address past the header and subtract from the length..*/ + openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, + frag->hdr->tag, + &frag->base, + openib_btl->ib_reg[frag->hdr->tag].cbdata); + + OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag); + count++; + break; + case IBV_WC_RDMA_WRITE: case IBV_WC_SEND : - if(wc.opcode & IBV_WC_RECV){ - /* process a recv completion (this should 
only occur for a send not an rdma) */ - DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__); - frag = (mca_btl_openib_frag_t*) wc.wr_id; - frag->rc=OMPI_SUCCESS; - frag->segment.seg_len = - wc.byte_len- - ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); - - /* advance the segment address past the header and subtract from the length..*/ - openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, - frag->hdr->tag, - &frag->base, - openib_btl->ib_reg[frag->hdr->tag].cbdata); - - OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&openib_btl->rr_posted_high, -1); - - mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0); - - count++; - } - else { - /* Process a completed send */ - frag = (mca_btl_openib_frag_t*) wc.wr_id; - frag->rc = OMPI_SUCCESS; - frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); - count++; - - } + + /* Process a completed send or rdma write*/ + frag = (mca_btl_openib_frag_t*) wc.wr_id; + frag->rc = OMPI_SUCCESS; + frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); + count++; + break; + break; default: - opal_output(0, "Errorneous network completion"); + BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode); break; } } @@ -601,12 +584,11 @@ int mca_btl_openib_component_progress() ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc ); if(ne < 0){ - opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno)); + BTL_ERROR("error polling CQ with %d errno says %s", ne, strerror(errno)); return OMPI_ERROR; } else if(wc.status != IBV_WC_SUCCESS) { - opal_output(0, "%s: error polling CQ with status %d for wr_id %d\n", - __func__, + BTL_ERROR("error polling CQ with status %d for wr_id %d", wc.status, wc.wr_id); return OMPI_ERROR; } @@ -614,46 +596,46 @@ int mca_btl_openib_component_progress() /* Handle n/w completions */ switch(wc.opcode) 
{ case IBV_WC_RECV_RDMA_WITH_IMM: - opal_output(0, "Got an RDMA with Immediate data Not supported!\n"); + BTL_ERROR("Got an RDMA with Immediate data Not supported!"); return OMPI_ERROR; + case IBV_WC_RECV: + /* process a recv completion (this should only occur for a send not an rdma) */ + DEBUG_OUT( "%s:%d ib recv under redesign\n", __FILE__, __LINE__); + frag = (mca_btl_openib_frag_t*) wc.wr_id; + endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint; + frag->rc=OMPI_SUCCESS; + + /* advance the segment address past the header and subtract from the length..*/ + frag->segment.seg_len = + wc.byte_len- + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); + + OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1); + + mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0); + + openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, + frag->hdr->tag, + &frag->base, + openib_btl->ib_reg[frag->hdr->tag].cbdata); + + OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_max), (opal_list_item_t*) frag); + + count++; + break; + case IBV_WC_RDMA_WRITE: case IBV_WC_SEND : - if(wc.opcode & IBV_WC_RECV){ - /* process a recv completion (this should only occur for a send not an rdma) */ - DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__); - frag = (mca_btl_openib_frag_t*) wc.wr_id; - frag->rc=OMPI_SUCCESS; - frag->segment.seg_len = - wc.byte_len- - ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); - - /* advance the segment address past the header and subtract from the length..*/ - openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, - frag->hdr->tag, - &frag->base, - openib_btl->ib_reg[frag->hdr->tag].cbdata); - - OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&openib_btl->rr_posted_high, -1); - - mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0); - - count++; - } - else { - /* Process a completed 
send */ - frag = (mca_btl_openib_frag_t*) wc.wr_id; - frag->rc = OMPI_SUCCESS; - frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); - count++; - - } - + /* Process a completed send */ + frag = (mca_btl_openib_frag_t*) wc.wr_id; + frag->rc = OMPI_SUCCESS; + frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); + count++; break; default: - opal_output(0, "Errorneous network completion"); + BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode); break; } } diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index b71def9490..2c38c234b9 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -74,14 +74,17 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope } frag->sr_desc.opcode = IBV_WR_SEND; - frag->sr_desc.send_flags = IBV_SEND_SIGNALED; + + frag->sg_entry.length = + frag->segment.seg_len + + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); - frag->sg_entry.length = frag->segment.seg_len + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* sizeof(mca_btl_openib_header_t); */ + + if(frag->sg_entry.length <= openib_btl->ib_inline_max) { + /* frag->sr_desc.send_flags |= IBV_SEND_INLINE; */ + } - /* TODO: should check if we can inline send,, but can't find - * inline send defined in openib verbs api. 
- * if(frag->sg_entry.len <= openib_btl->ib_inline_max) { - */ + if(ibv_post_send(ib_qp, &frag->sr_desc, &bad_wr)) { @@ -90,7 +93,7 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope } mca_btl_openib_endpoint_post_rr(endpoint, 1); - return OMPI_ERROR; + return OMPI_SUCCESS; } @@ -114,6 +117,14 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t); + endpoint->lcl_qp_attr_high = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr)); + endpoint->lcl_qp_attr_low = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr)); + memset(endpoint->lcl_qp_attr_high, 0, sizeof(struct ibv_qp_attr)); + memset(endpoint->lcl_qp_attr_low, 0, sizeof(struct ibv_qp_attr)); + endpoint->rr_posted_high = 0; + endpoint->rr_posted_low = 0; + + } /* @@ -190,9 +201,9 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end DEBUG_OUT("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", - endpoint->lcl_qp_prop_high.qp_num, - endpoint->lcl_qp_prop_low.qp_num, - endpoint->endpoint_btl->port.lid); + endpoint->lcl_qp_high->qp_num, + endpoint->lcl_qp_low->qp_num, + endpoint->endpoint_btl->ib_port_attr->lid); if(rc < 0) { ORTE_ERROR_LOG(rc); @@ -318,6 +329,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); return rc; } + srand48(getpid() * time(NULL)); endpoint->lcl_psn_high = lrand48() & 0xffffff; /* Create the Low Priority Queue Pair */ @@ -334,7 +346,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_high->qp_num, - endpoint->lcl_qp_low.qp_num, + 
endpoint->lcl_qp_low->qp_num, openib_btl->ib_port_attr->lid); /* Send connection info over to remote endpoint */ @@ -367,6 +379,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); return rc; } + srand48(getpid() * time(NULL)); endpoint->lcl_psn_high = lrand48() & 0xffffff; /* Create the Low Priority Queue Pair */ @@ -383,7 +396,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", endpoint->lcl_qp_high->qp_num, - endpoint->lcl_qp_low.qp_num, + endpoint->lcl_qp_low->qp_num, openib_btl->ib_port_attr->lid); @@ -415,6 +428,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) { endpoint->endpoint_state = MCA_BTL_IB_CONNECTED; + endpoint->endpoint_btl->poll_cq = true; mca_btl_openib_progress_send_frags(endpoint); } @@ -491,12 +505,13 @@ static void mca_btl_openib_endpoint_recv( break; case MCA_BTL_IB_CONNECT_ACK: + DEBUG_OUT("Got a connect ack from %d\n", endpoint->vpid); mca_btl_openib_endpoint_connected(ib_endpoint); - break; case MCA_BTL_IB_CONNECTED : + break; default : opal_output(0, "Connected -> Connecting not possible.\n"); @@ -581,9 +596,9 @@ int mca_btl_openib_endpoint_send( DEBUG_OUT("Send to : %d, len : %d, frag : %p", - endpoint->endpoint_proc->proc_guid.vpid, - frag->ib_buf.desc.sg_entry.len, - frag); + endpoint->endpoint_proc->proc_guid.vpid, + frag->sg_entry.length, + frag); rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag); @@ -686,23 +701,27 @@ int mca_btl_openib_endpoint_create_qp( ) { { + struct ibv_qp* my_qp; struct ibv_qp_init_attr qp_init_attr; + + memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); + qp_init_attr.send_cq = cq; qp_init_attr.recv_cq = cq; - qp_init_attr.cap.max_send_wr = 
mca_btl_openib_component.ib_wq_size; + qp_init_attr.cap.max_send_wr = mca_btl_openib_component.ib_wq_size; qp_init_attr.cap.max_recv_wr = mca_btl_openib_component.ib_wq_size; - qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size; + qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size; qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size; qp_init_attr.qp_type = IBV_QPT_RC; - (*qp) = ibv_create_qp(pd, &qp_init_attr); + my_qp = ibv_create_qp(pd, &qp_init_attr); - if(NULL == (*qp)) { + if(NULL == my_qp) { opal_output(0, "%s: error creating qp errno says %s\n", __func__, strerror(errno)); return OMPI_ERROR; } - + (*qp) = my_qp; openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data; } @@ -711,7 +730,7 @@ int mca_btl_openib_endpoint_create_qp( qp_attr->qp_state = IBV_QPS_INIT; qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix; qp_attr->port_num = openib_btl->port_num; - qp_attr->qp_access_flags = 0; + qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE; if(ibv_modify_qp((*qp), qp_attr, IBV_QP_STATE | diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 395c222872..62f62b2a5b 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -1,3 +1,4 @@ + /* * Copyright (c) 2004-2005 The Trustees of Indiana University. * All rights reserved. 
@@ -90,7 +91,7 @@ struct mca_btl_base_endpoint_t { /**< lock for concurrent access to endpoint state */ opal_list_t pending_send_frags; - /**< list of pending send frags for this endpoint */ + /**< list of pending send frags for this endpotint */ uint32_t rem_qp_num_high; uint32_t rem_qp_num_low; @@ -115,7 +116,11 @@ struct mca_btl_base_endpoint_t { struct ibv_qp_attr* lcl_qp_attr_high; struct ibv_qp_attr* lcl_qp_attr_low; /* Local QP attributes (Low and High) */ + + uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/ + uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/ + }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; @@ -160,16 +165,15 @@ static inline int mca_btl_openib_endpoint_post_rr_sub(int cnt, } for(i=0; i< cnt; i++){ - if(ibv_post_recv(qp, &rr_desc_post[i], &bad_wr)) { opal_output(0, "%s: error posting receive errno says %s\n", __func__, strerror(errno)); return OMPI_ERROR; + } + } - return OMPI_SUCCESS; - } OPAL_THREAD_ADD32(rr_posted, cnt); return OMPI_SUCCESS; } @@ -179,12 +183,12 @@ static inline int mca_btl_openib_endpoint_post_rr( mca_btl_openib_endpoint_t * e int rc; OPAL_THREAD_LOCK(&openib_btl->ib_lock); - if(openib_btl->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){ + if(endpoint->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && endpoint->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){ - rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_high, + rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - endpoint->rr_posted_high, endpoint, &openib_btl->recv_free_eager, - &openib_btl->rr_posted_high, + &endpoint->rr_posted_high, endpoint->lcl_qp_high ); if(rc != OMPI_SUCCESS){ @@ -192,12 +196,12 @@ static inline int mca_btl_openib_endpoint_post_rr( mca_btl_openib_endpoint_t * e 
return rc; } } - if(openib_btl->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){ + if(endpoint->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && endpoint->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){ - rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_low, + rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - endpoint->rr_posted_low, endpoint, &openib_btl->recv_free_max, - &openib_btl->rr_posted_low, + &endpoint->rr_posted_low, endpoint->lcl_qp_low ); if(rc != OMPI_SUCCESS) { diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index 96362aeaa9..a7855aafc8 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -45,11 +45,12 @@ static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* f frag->base.des_dst = NULL; frag->base.des_dst_cnt = 0; - frag->sr_desc.wr_id = frag; + frag->sr_desc.wr_id = (uint64_t) frag; frag->sr_desc.sg_list = &frag->sg_entry; frag->sr_desc.num_sge = 1; frag->sr_desc.opcode = IBV_WR_SEND; frag->sr_desc.send_flags = IBV_SEND_SIGNALED; + frag->sr_desc.next = NULL; } static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag) @@ -60,10 +61,11 @@ static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* f frag->base.des_dst_cnt = 1; frag->base.des_src = NULL; frag->base.des_src_cnt = 0; - - frag->rr_desc.wr_id = frag; + + frag->rr_desc.wr_id = (uint64_t) frag; frag->rr_desc.sg_list = &frag->sg_entry; frag->rr_desc.num_sge = 1; + frag->rr_desc.next = NULL; } static void mca_btl_openib_send_frag_eager_constructor(mca_btl_openib_frag_t* frag) diff --git a/ompi/mca/mpool/openib/mpool_openib.h b/ompi/mca/mpool/openib/mpool_openib.h index f9de8aaf82..6e7380cebb 100644 --- 
a/ompi/mca/mpool/openib/mpool_openib.h +++ b/ompi/mca/mpool/openib/mpool_openib.h @@ -30,11 +30,11 @@ extern "C" { static inline void * DOWN_ALIGN_ADDR(void * addr, uint32_t cnt) { - return (void*)((uintptr_t)(addr) & (~((uintptr_t)0) << (cnt))); + return (void*)((uintptr_t)((unsigned char*) addr) & (~((uintptr_t)0) << (cnt))); } static inline void* ALIGN_ADDR(void* addr, uint32_t cnt ) { - DOWN_ALIGN_ADDR(((addr) + ~(~((uintptr_t)0) << (cnt))), (cnt)); + DOWN_ALIGN_ADDR((((unsigned char*) addr) + ~(~((uintptr_t)0) << (cnt))), (cnt)); return addr; } diff --git a/ompi/mca/mpool/openib/mpool_openib_component.c b/ompi/mca/mpool/openib/mpool_openib_component.c index b4e41e58cd..3129bfbfe1 100644 --- a/ompi/mca/mpool/openib/mpool_openib_component.c +++ b/ompi/mca/mpool/openib/mpool_openib_component.c @@ -70,13 +70,6 @@ static void mca_mpool_openib_registration_constructor( mca_mpool_openib_registra static void mca_mpool_openib_registration_destructor( mca_mpool_openib_registration_t * registration ) { - mca_mpool_base_remove((void*) registration); - registration->base_reg.mpool->mpool_deregister( - registration->base_reg.mpool, - registration->base_reg.base, - 0, - (mca_mpool_base_registration_t*) registration); - registration->base_reg.base = NULL; registration->base_reg.bound = NULL; registration->base_reg.is_leave_pinned=false; diff --git a/ompi/mca/mpool/openib/mpool_openib_module.c b/ompi/mca/mpool/openib/mpool_openib_module.c index a041763cda..349068dee6 100644 --- a/ompi/mca/mpool/openib/mpool_openib_module.c +++ b/ompi/mca/mpool/openib/mpool_openib_module.c @@ -57,6 +57,7 @@ void* mca_mpool_openib_alloc( free(addr_malloc); return NULL; } + (*registration)->alloc_base = addr_malloc; return addr; } @@ -80,7 +81,7 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool, mpool_module->resources.ib_pd, addr, size, - IBV_ACCESS_REMOTE_WRITE + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE /* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */ ); @@ -106,7 
+107,7 @@ int mca_mpool_openib_deregister(mca_mpool_base_module_t* mpool, void *addr, size mca_mpool_openib_module_t * mpool_openib = (mca_mpool_openib_module_t*) mpool; mca_mpool_openib_registration_t * openib_reg; openib_reg = (mca_mpool_openib_registration_t*) registration; - if(! ibv_dereg_mr(openib_reg->mr)){ + if(ibv_dereg_mr(openib_reg->mr)){ opal_output(0, "%s: error unpinning openib memory errno says %s\n", __func__, strerror(errno)); return OMPI_ERROR; }