diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 84fdf2868e..36a1ca4f1b 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -26,10 +26,9 @@ #include "btl_openib_proc.h" #include "btl_openib_endpoint.h" #include "datatype/convertor.h" -#include "mca/common/vapi/vapi_mem_reg.h" #include "mca/mpool/base/base.h" #include "mca/mpool/mpool.h" -#include "mca/mpool/mvapi/mpool_mvapi.h" +#include "mca/mpool/openib/mpool_openib.h" mca_btl_openib_module_t mca_btl_openib_module = { { @@ -67,7 +66,7 @@ int mca_btl_openib_add_procs( struct mca_btl_base_endpoint_t** peers, ompi_bitmap_t* reachable) { - mca_btl_openib_module_t* mvapi_btl = (mca_btl_openib_module_t*)btl; + mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*)btl; int i, rc; for(i = 0; i < (int) nprocs; i++) { @@ -98,7 +97,7 @@ int mca_btl_openib_add_procs( return OMPI_ERR_OUT_OF_RESOURCE; } - ib_peer->endpoint_btl = mvapi_btl; + ib_peer->endpoint_btl = openib_btl; rc = mca_btl_openib_proc_insert(ib_proc, ib_peer); if(rc != OMPI_SUCCESS) { OBJ_RELEASE(ib_peer); @@ -131,12 +130,12 @@ int mca_btl_openib_register( void* cbdata) { /* TODO add register stuff here... */ - mca_btl_openib_module_t* mvapi_btl = (mca_btl_openib_module_t*) btl; + mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; OPAL_THREAD_LOCK(&ib->btl.ib_lock); - mvapi_btl->ib_reg[tag].cbfunc = cbfunc; - mvapi_btl->ib_reg[tag].cbdata = cbdata; + openib_btl->ib_reg[tag].cbfunc = cbfunc; + openib_btl->ib_reg[tag].cbdata = cbdata; OPAL_THREAD_UNLOCK(&ib->btl.ib_lock); return OMPI_SUCCESS; } @@ -153,9 +152,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc( size_t size) { mca_btl_openib_frag_t* frag; - mca_btl_openib_module_t* mvapi_btl; + mca_btl_openib_module_t* openib_btl; int rc; - mvapi_btl = (mca_btl_openib_module_t*) btl; + openib_btl = (mca_btl_openib_module_t*) btl; if(size <= mca_btl_openib_component.eager_limit){ MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc); @@ -169,7 +168,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc( size: mca_btl_openib_component.max_send_size ; } - frag->segment.seg_len = size <= mvapi_btl->super.btl_eager_limit ? size : mvapi_btl->super.btl_eager_limit; + frag->segment.seg_len = size <= openib_btl->super.btl_eager_limit ? size : openib_btl->super.btl_eager_limit; frag->base.des_flags = 0; return (mca_btl_base_descriptor_t*)frag; @@ -218,9 +217,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( size_t* size ) { - mca_btl_openib_module_t* mvapi_btl; + mca_btl_openib_module_t* openib_btl; mca_btl_openib_frag_t* frag; - mca_mpool_mvapi_registration_t * vapi_reg; + mca_mpool_openib_registration_t * vapi_reg; struct iovec iov; int32_t iov_count = 1; size_t max_data = *size; @@ -228,8 +227,8 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( int rc; - mvapi_btl = (mca_btl_openib_module_t*) btl; - vapi_reg = (mca_mpool_mvapi_registration_t*) registration; + openib_btl = (mca_btl_openib_module_t*) btl; + vapi_reg = (mca_mpool_openib_registration_t*) registration; /** if the data fits in the eager limit and we aren't told to pinn then we simply pack, if the data fits in the eager limit and the data is non contiguous @@ -267,14 +266,14 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( } if(is_leave_pinned) { - if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)){ + if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)){ opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); return NULL; } } OBJ_RELEASE(vapi_reg); - mvapi_btl->ib_pool->mpool_register(mvapi_btl->ib_pool, + openib_btl->ib_pool->mpool_register(openib_btl->ib_pool, base_addr, new_len, (mca_mpool_base_registration_t**) &vapi_reg); @@ -283,8 +282,8 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( rc = mca_mpool_base_insert(vapi_reg->base_reg.base, vapi_reg->base_reg.bound - vapi_reg->base_reg.base + 1, - mvapi_btl->ib_pool, - (void*) (&mvapi_btl->super), + openib_btl->ib_pool, + (void*) (&openib_btl->super), (mca_mpool_base_registration_t*) vapi_reg); @@ -296,15 +295,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( OBJ_RETAIN(vapi_reg); if(is_leave_pinned) { vapi_reg->is_leave_pinned = is_leave_pinned; - opal_list_append(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); + opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); } } else if(is_leave_pinned) { - if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { + if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); return NULL; } - opal_list_append(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); + opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); } frag->mem_hndl = vapi_reg->hndl; @@ -345,11 +344,11 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( if(mca_btl_openib_component.leave_pinned) { - if(mca_btl_openib_component.reg_mru_len <= mvapi_btl->reg_mru_list.opal_list_length ) { + if(mca_btl_openib_component.reg_mru_len <= openib_btl->reg_mru_list.opal_list_length ) { - mca_mpool_mvapi_registration_t* old_reg = - (mca_mpool_mvapi_registration_t*) - opal_list_remove_last(&mvapi_btl->reg_mru_list); + mca_mpool_openib_registration_t* old_reg = + (mca_mpool_openib_registration_t*) + opal_list_remove_last(&openib_btl->reg_mru_list); if( NULL == old_reg) { opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); @@ -367,15 +366,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( OBJ_RELEASE(old_reg); } - mvapi_btl->ib_pool->mpool_register(mvapi_btl->ib_pool, + openib_btl->ib_pool->mpool_register(openib_btl->ib_pool, iov.iov_base, max_data, (mca_mpool_base_registration_t**) &vapi_reg); rc = mca_mpool_base_insert(vapi_reg->base_reg.base, vapi_reg->base_reg.bound - vapi_reg->base_reg.base + 1, - mvapi_btl->ib_pool, - (void*) (&mvapi_btl->super), + openib_btl->ib_pool, + (void*) (&openib_btl->super), (mca_mpool_base_registration_t*) vapi_reg); if(rc != OMPI_SUCCESS) return NULL; @@ -383,10 +382,10 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( vapi_reg->is_leave_pinned = true; - opal_list_append(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); + opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); } else { - mvapi_btl->ib_pool->mpool_register(mvapi_btl->ib_pool, + openib_btl->ib_pool->mpool_register(openib_btl->ib_pool, iov.iov_base, max_data, (mca_mpool_base_registration_t**) &vapi_reg); @@ -439,7 +438,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( /** if the data fits in the max limit and we aren't told to pinn then we simply pack, if the data is non contiguous then we pack **/ - else if(max_data + reserve <= mvapi_btl->super.btl_max_send_size) { + else if(max_data + reserve <= openib_btl->super.btl_max_send_size) { MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc); if(NULL == frag) { @@ -486,14 +485,14 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( size_t reserve, size_t* size) { - mca_btl_openib_module_t* mvapi_btl; + mca_btl_openib_module_t* openib_btl; mca_btl_openib_frag_t* frag; - mca_mpool_mvapi_registration_t * vapi_reg; + mca_mpool_openib_registration_t * vapi_reg; int rc; size_t reg_len; - mvapi_btl = (mca_btl_openib_module_t*) btl; - vapi_reg = (mca_mpool_mvapi_registration_t*) registration; + openib_btl = (mca_btl_openib_module_t*) btl; + vapi_reg = (mca_mpool_openib_registration_t*) registration; MCA_BTL_IB_FRAG_ALLOC_FRAG(btl, frag, rc); @@ -522,14 +521,14 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( } if(is_leave_pinned) { - if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { + if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); return NULL; } } OBJ_RELEASE(vapi_reg); - mvapi_btl->ib_pool->mpool_register(mvapi_btl->ib_pool, + openib_btl->ib_pool->mpool_register(openib_btl->ib_pool, base_addr, new_len, (mca_mpool_base_registration_t**) &vapi_reg); @@ -537,8 +536,8 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( rc = mca_mpool_base_insert(vapi_reg->base_reg.base, vapi_reg->base_reg.bound - vapi_reg->base_reg.base + 1, - mvapi_btl->ib_pool, - (void*) (&mvapi_btl->super), + openib_btl->ib_pool, + (void*) (&openib_btl->super), (mca_mpool_base_registration_t*) vapi_reg); if(OMPI_SUCCESS != rc) { @@ -549,27 +548,27 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( if(is_leave_pinned) { vapi_reg->is_leave_pinned = is_leave_pinned; - opal_list_append(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); + opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); } } else if(is_leave_pinned){ - if(NULL == opal_list_remove_item(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { + if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg)) { opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); return NULL; } - opal_list_append(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); + opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); } } else { if(mca_btl_openib_component.leave_pinned) { - if( mca_btl_openib_component.reg_mru_len <= mvapi_btl->reg_mru_list.opal_list_length ) { + if( mca_btl_openib_component.reg_mru_len <= openib_btl->reg_mru_list.opal_list_length ) { - mca_mpool_mvapi_registration_t* old_reg = - (mca_mpool_mvapi_registration_t*) - opal_list_remove_last(&mvapi_btl->reg_mru_list); + mca_mpool_openib_registration_t* old_reg = + (mca_mpool_openib_registration_t*) + opal_list_remove_last(&openib_btl->reg_mru_list); if( NULL == old_reg) { opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); @@ -585,7 +584,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( OBJ_RELEASE(old_reg); } - mvapi_btl->ib_pool->mpool_register(mvapi_btl->ib_pool, + openib_btl->ib_pool->mpool_register(openib_btl->ib_pool, frag->segment.seg_addr.pval, *size, (mca_mpool_base_registration_t**) &vapi_reg); @@ -594,8 +593,8 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( rc = mca_mpool_base_insert(vapi_reg->base_reg.base, vapi_reg->base_reg.bound - vapi_reg->base_reg.base + 1, - mvapi_btl->ib_pool, - (void*) (&mvapi_btl->super), + openib_btl->ib_pool, + (void*) (&openib_btl->super), (mca_mpool_base_registration_t*) vapi_reg); if(OMPI_SUCCESS != rc){ opal_output(0,"%s:%d:%s error inserting memory region into memory pool", __FILE__, __LINE__, __func__); @@ -603,10 +602,10 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( } OBJ_RETAIN(vapi_reg); - opal_list_append(&mvapi_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); + opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) vapi_reg); } else { - mvapi_btl->ib_pool->mpool_register(mvapi_btl->ib_pool, + openib_btl->ib_pool->mpool_register(openib_btl->ib_pool, frag->segment.seg_addr.pval, *size, (mca_mpool_base_registration_t**) &vapi_reg); @@ -637,40 +636,40 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl) { - mca_btl_openib_module_t* mvapi_btl; - mvapi_btl = (mca_btl_openib_module_t*) btl; + mca_btl_openib_module_t* openib_btl; + openib_btl = (mca_btl_openib_module_t*) btl; - if(mvapi_btl->send_free_eager.fl_num_allocated != - mvapi_btl->send_free_eager.super.opal_list_length){ + if(openib_btl->send_free_eager.fl_num_allocated != + openib_btl->send_free_eager.super.opal_list_length){ opal_output(0, "btl ib send_free_eager frags: %d allocated %d returned \n", - mvapi_btl->send_free_eager.fl_num_allocated, - mvapi_btl->send_free_eager.super.opal_list_length); + openib_btl->send_free_eager.fl_num_allocated, + openib_btl->send_free_eager.super.opal_list_length); } - if(mvapi_btl->send_free_max.fl_num_allocated != - mvapi_btl->send_free_max.super.opal_list_length){ + if(openib_btl->send_free_max.fl_num_allocated != + openib_btl->send_free_max.super.opal_list_length){ opal_output(0, "btl ib send_free_max frags: %d allocated %d returned \n", - mvapi_btl->send_free_max.fl_num_allocated, - mvapi_btl->send_free_max.super.opal_list_length); + openib_btl->send_free_max.fl_num_allocated, + openib_btl->send_free_max.super.opal_list_length); } - if(mvapi_btl->send_free_frag.fl_num_allocated != - mvapi_btl->send_free_frag.super.opal_list_length){ + if(openib_btl->send_free_frag.fl_num_allocated != + openib_btl->send_free_frag.super.opal_list_length){ opal_output(0, "btl ib send_free_frag frags: %d allocated %d returned \n", - mvapi_btl->send_free_frag.fl_num_allocated, - mvapi_btl->send_free_frag.super.opal_list_length); + openib_btl->send_free_frag.fl_num_allocated, + openib_btl->send_free_frag.super.opal_list_length); } - if(mvapi_btl->recv_free_eager.fl_num_allocated != - mvapi_btl->recv_free_eager.super.opal_list_length){ + if(openib_btl->recv_free_eager.fl_num_allocated != + openib_btl->recv_free_eager.super.opal_list_length){ opal_output(0, "btl ib recv_free_eager frags: %d allocated %d returned \n", - mvapi_btl->recv_free_eager.fl_num_allocated, - mvapi_btl->recv_free_eager.super.opal_list_length); + openib_btl->recv_free_eager.fl_num_allocated, + openib_btl->recv_free_eager.super.opal_list_length); } - if(mvapi_btl->recv_free_max.fl_num_allocated != - mvapi_btl->recv_free_max.super.opal_list_length){ + if(openib_btl->recv_free_max.fl_num_allocated != + openib_btl->recv_free_max.super.opal_list_length){ opal_output(0, "btl ib recv_free_max frags: %d allocated %d returned \n", - mvapi_btl->recv_free_max.fl_num_allocated, - mvapi_btl->recv_free_max.super.opal_list_length); + openib_btl->recv_free_max.fl_num_allocated, + openib_btl->recv_free_max.super.opal_list_length); } return OMPI_SUCCESS; @@ -709,23 +708,23 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, mca_btl_base_endpoint_t* endpoint, mca_btl_base_descriptor_t* descriptor) { - mca_btl_openib_module_t* mvapi_btl = (mca_btl_openib_module_t*) btl; + struct ibv_send_wr* bad_wr; + mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor; frag->endpoint = endpoint; - frag->sr_desc.opcode = VAPI_RDMA_WRITE; - - frag->sr_desc.remote_qp = endpoint->rem_qp_num_low; - frag->sr_desc.remote_addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_dst->seg_addr.pval; - frag->sr_desc.r_key = frag->base.des_dst->seg_key.key32[0]; - frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_src->seg_addr.pval; - frag->sg_entry.len = frag->base.des_src->seg_len; + frag->sr_desc.opcode = IB_WC_RDMA_WRITE; + frag->sr_desc.rdma.remote_addr = (uintptr_t) frag->base.des_src->seg_addr.pval; + frag->sr_desc.rdma.rkey = frag->base.des_dst->seg_key.key32[0]; + frag->sg_entry.addr = (uintptr_t) frag->base.des_src->seg_addr.pval; + frag->sg_entry.length = frag->base.des_src->seg_len; - frag->ret = VAPI_post_sr(mvapi_btl->nic, - endpoint->lcl_qp_hndl_low, - &frag->sr_desc); - if(VAPI_OK != frag->ret){ + if(ibv_post_send(endpoint->lcl_qp_low, + frag->sr_desc, + &bad_wr)){ + opal_output(0, "%s: error posting send request\n", __func__); return OMPI_ERROR; - } + } + mca_btl_openib_endpoint_post_rr(endpoint, 1); return OMPI_SUCCESS; @@ -741,92 +740,72 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, * events and abort the OMPI application if necessary. * */ -static void async_event_handler(VAPI_hca_hndl_t hca_hndl, - VAPI_event_record_t * event_p, - void *priv_data) -{ - switch (event_p->type) { - case VAPI_QP_PATH_MIGRATED: - case VAPI_EEC_PATH_MIGRATED: - case VAPI_QP_COMM_ESTABLISHED: - case VAPI_EEC_COMM_ESTABLISHED: - case VAPI_SEND_QUEUE_DRAINED: - case VAPI_PORT_ACTIVE: - { - DEBUG_OUT("Got an asynchronous event: %s\n", - VAPI_event_record_sym(event_p->type)); - break; - } - case VAPI_CQ_ERROR: - case VAPI_LOCAL_WQ_INV_REQUEST_ERROR: - case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR: - case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR: - case VAPI_PATH_MIG_REQ_ERROR: - case VAPI_LOCAL_EEC_CATASTROPHIC_ERROR: - case VAPI_LOCAL_CATASTROPHIC_ERROR: - case VAPI_PORT_ERROR: - { - opal_output(0, "Got an asynchronous event: %s (%s)", - VAPI_event_record_sym(event_p->type), - VAPI_event_syndrome_sym(event_p-> - syndrome)); - break; - } - default: - opal_output(0, "Warning!! Got an undefined " - "asynchronous event\n"); - } +/* static void async_event_handler(VAPI_hca_hndl_t hca_hndl, */ +/* VAPI_event_record_t * event_p, */ +/* void *priv_data) */ +/* { */ +/* switch (event_p->type) { */ +/* case VAPI_QP_PATH_MIGRATED: */ +/* case VAPI_EEC_PATH_MIGRATED: */ +/* case VAPI_QP_COMM_ESTABLISHED: */ +/* case VAPI_EEC_COMM_ESTABLISHED: */ +/* case VAPI_SEND_QUEUE_DRAINED: */ +/* case VAPI_PORT_ACTIVE: */ +/* { */ +/* DEBUG_OUT("Got an asynchronous event: %s\n", */ +/* VAPI_event_record_sym(event_p->type)); */ +/* break; */ +/* } */ +/* case VAPI_CQ_ERROR: */ +/* case VAPI_LOCAL_WQ_INV_REQUEST_ERROR: */ +/* case VAPI_LOCAL_WQ_ACCESS_VIOL_ERROR: */ +/* case VAPI_LOCAL_WQ_CATASTROPHIC_ERROR: */ +/* case VAPI_PATH_MIG_REQ_ERROR: */ +/* case VAPI_LOCAL_EEC_CATASTROPHIC_ERROR: */ +/* case VAPI_LOCAL_CATASTROPHIC_ERROR: */ +/* case VAPI_PORT_ERROR: */ +/* { */ +/* opal_output(0, "Got an asynchronous event: %s (%s)", */ +/* VAPI_event_record_sym(event_p->type), */ +/* VAPI_event_syndrome_sym(event_p-> */ +/* syndrome)); */ +/* break; */ +/* } */ +/* default: */ +/* opal_output(0, "Warning!! Got an undefined " */ +/* "asynchronous event\n"); */ +/* } */ -} +/* } */ -int mca_btl_openib_module_init(mca_btl_openib_module_t *mvapi_btl) +int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl) { /* Allocate Protection Domain */ - VAPI_ret_t ret; + struct ibv_context *ctx; + ctx = openib_btl->ib_dev_context; + uint32_t cqe_cnt = 0; - - ret = VAPI_alloc_pd(mvapi_btl->nic, &mvapi_btl->ptag); - if(ret != VAPI_OK) { - MCA_BTL_IB_VAPI_ERROR(ret, "VAPI_alloc_pd"); + openib_btl->ib_pd = ibv_alloc_pd(ctx); + + + if(NULL == openib->ib_pd) { + ompi_output(0, "%s: error allocating pd for %s\n", __func__, ibv_get_device_name(openib_btl->ib_dev)); return OMPI_ERROR; } - ret = VAPI_create_cq(mvapi_btl->nic, mvapi_btl->ib_cq_size, - &mvapi_btl->cq_hndl_low, &cqe_cnt); - + openib_btl->ib_cq = ibv_create_cq(ctx, openib_btl->ib_cq_size, NULL); - if( VAPI_OK != ret) { - MCA_BTL_IB_VAPI_ERROR(ret, "VAPI_create_cq"); + if(NULL == openib_btl->ib_cq) { + ompi_output(0, "%s: error creating cq for %s\n", __func__, ibv_get_device_name(openib_btl->ib_dev)); return OMPI_ERROR; } - - ret = VAPI_create_cq(mvapi_btl->nic, mvapi_btl->ib_cq_size, - &mvapi_btl->cq_hndl_high, &cqe_cnt); + + /* TODO: EVAPI_set_qsync_event_handler? */ - - if( VAPI_OK != ret) { - MCA_BTL_IB_VAPI_ERROR(ret, "VAPI_create_cq"); - return OMPI_ERROR; - } - - - if(cqe_cnt <= 0) { - opal_output(0, "%s: error creating completion queue ", __func__); - return OMPI_ERROR; - } - - ret = EVAPI_set_async_event_handler(mvapi_btl->nic, - async_event_handler, 0, &mvapi_btl->async_handler); - - if(VAPI_OK != ret) { - MCA_BTL_IB_VAPI_ERROR(ret, "EVAPI_set_async_event_handler"); - return OMPI_ERROR; - } - return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 7be38fc4dc..537172b394 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -53,7 +53,7 @@ struct mca_btl_openib_component_t { uint32_t ib_num_btls; /**< number of hcas available to the IB component */ - struct mca_btl_openib_module_t *mvapi_btls; + struct mca_btl_openib_module_t *openib_btls; /**< array of available PTLs */ int ib_free_list_num; @@ -111,19 +111,16 @@ struct mca_btl_openib_module_t { mca_btl_base_module_t super; /**< base PTL interface */ bool btl_inited; mca_btl_openib_recv_reg_t ib_reg[256]; - VAPI_hca_id_t hca_id; /**< ID of HCA */ - - int port; /**< ID of the PORT */ - struct ibv_device; /* the ib device */ - - VAPI_hca_hndl_t nic; /**< NIC handle */ - VAPI_pd_hndl_t ptag; /**< Protection Domain tag */ - - VAPI_cq_hndl_t cq_hndl_high; /**< High Priority Completion Queue handle */ - VAPI_cq_hndl_t cq_hndl_low; /**< Low Priority Completion Queue handle */ - - EVAPI_async_handler_hndl_t async_handler; - /**< Async event handler used to detect weird/unknown events */ + + uint8_t port_num; /**< ID of the PORT */ + struct ibv_device *ib_dev; /* the ib device */ + struct ibv_context *ib_dev_context; + struct ibv_pd *ib_pd; + struct ibv_cq *ib_cq_high; + struct ibv_cq *ib_cq_low; + struct ibv_port_attr* ib_port_attr; + struct ibv_recv_wr* rr_desc_post; + ompi_free_list_t send_free_eager; /**< free list of eager buffer descriptors */ ompi_free_list_t send_free_max; /**< free list of max buffer descriptors */ @@ -132,7 +129,7 @@ struct mca_btl_openib_module_t { ompi_free_list_t recv_free_eager; /**< High priority free list of buffer descriptors */ ompi_free_list_t recv_free_max; /**< Low priority free list of buffer descriptors */ - opal_list_t reg_mru_list; /**< a most recently used list of mca_mpool_mvapi_registration_t + opal_list_t reg_mru_list; /**< a most recently used list of mca_mpool_openib_registration_t entries, this allows us to keep a working set of memory pinned */ opal_list_t repost; /**< list of buffers to repost */ @@ -146,7 +143,7 @@ struct mca_btl_openib_module_t { uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/ - VAPI_rr_desc_t* rr_desc_post; + /**< an array to allow posting of rr in one swoop */ size_t ib_inline_max; /**< max size of inline send*/ @@ -171,7 +168,7 @@ struct mca_btl_openib_module_t { }; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; - struct mca_btl_openib_frag_t; +struct mca_btl_openib_frag_t; extern mca_btl_openib_module_t mca_btl_openib_module; /** @@ -208,8 +205,8 @@ extern mca_btl_base_module_t** mca_btl_openib_component_init( * IB component progress. */ extern int mca_btl_openib_component_progress( - void -); + void + ); /** @@ -322,9 +319,9 @@ extern int mca_btl_openib_put( * @param size (IN) Requested descriptor size. */ extern mca_btl_base_descriptor_t* mca_btl_openib_alloc( - struct mca_btl_base_module_t* btl, - size_t size); - + struct mca_btl_base_module_t* btl, + size_t size); + /** * Return a segment allocated by this BTL. @@ -333,8 +330,8 @@ extern mca_btl_base_descriptor_t* mca_btl_openib_alloc( * @param descriptor (IN) Allocated descriptor. */ extern int mca_btl_openib_free( - struct mca_btl_base_module_t* btl, - mca_btl_base_descriptor_t* des); + struct mca_btl_base_module_t* btl, + mca_btl_base_descriptor_t* des); /** @@ -345,13 +342,13 @@ extern int mca_btl_openib_free( * @param peer (IN) BTL peer addressing */ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, - struct ompi_convertor_t* convertor, - size_t reserve, - size_t* size -); + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + mca_mpool_base_registration_t* registration, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size + ); /** * Allocate a descriptor initialized for RDMA write. @@ -360,12 +357,12 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( * @param peer (IN) BTL peer addressing */ extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, - struct ompi_convertor_t* convertor, - size_t reserve, - size_t* size); + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + mca_mpool_base_registration_t* registration, + struct ompi_convertor_t* convertor, + size_t reserve, + size_t* size); /** * Return a send fragment to the modules free list. * @@ -374,12 +371,12 @@ extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( * */ extern void mca_btl_openib_send_frag_return( - struct mca_btl_base_module_t* btl, - struct mca_btl_openib_frag_t* -); + struct mca_btl_base_module_t* btl, + struct mca_btl_openib_frag_t* + ); -int mca_btl_openib_module_init(mca_btl_openib_module_t* mvapi_btl); +int mca_btl_openib_module_init(mca_btl_openib_module_t* openib_btl); #if defined(c_plusplus) || defined(__cplusplus) diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index af5f317995..82f0b7149d 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -31,11 +31,14 @@ #include "btl_openib.h" #include "btl_openib_frag.h" #include "btl_openib_endpoint.h" -#include "mca/btl/base/base.h" -#include -#include +#include "mca/btl/base/base.h" + + #include "datatype/convertor.h" #include "mca/mpool/mvapi/mpool_mvapi.h" +#include +#include + mca_btl_openib_component_t mca_btl_openib_component = { { @@ -240,17 +243,11 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, bool enable_progress_threads, bool enable_mpi_threads) { - VAPI_ret_t vapi_ret; struct ibv_device **ib_devs; - - VAPI_hca_hndl_t hca_hndl; - VAPI_hca_vendor_t hca_vendor; - VAPI_hca_cap_t hca_cap; - VAPI_hca_port_t hca_port; uint32_t num_devs; mca_btl_base_module_t** btls; uint32_t i,j, length; - struct mca_mpool_base_resources_t hca_pd; + struct mca_mpool_base_resources_t mpool_resources; opal_list_t btl_list; mca_btl_openib_module_t * mvapi_btl; mca_btl_base_selected_module_t* ib_selected; @@ -261,7 +258,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, struct dlist *dev_list; struct ibv_device* ib_dev; - + /* Determine the number of hca's available on the host */ dev_list = ibv_get_devices(); dlist_start(dev_list); @@ -298,37 +295,44 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, for(i = 0; i < num_devs; i++){ + struct ibv_device_attr* ib_attr; + struct ibv_context* ib_dev_context; ib_dev = ib_devs[i]; - bv_open_device(ib_dev); - - - vapi_ret = VAPI_query_hca_cap(hca_hndl, &hca_vendor, &hca_cap); - if(VAPI_OK != vapi_ret) { - opal_output(0, "%s:error getting hca properties\n", __func__); + ib_dev_context = ibv_open_device(ib_dev); + if(!ib_dev_context) { + opal_output(0, "%s: error obtaining device context for %s\n", __func__, ibv_get_device_name(ib_dev)); return NULL; } - - - /* Note ports are 1 based hence j = 1 */ - for(j = 1; j <= hca_cap.phys_port_num; j++){ - vapi_ret = VAPI_query_hca_port_prop(hca_hndl, (IB_port_t) j, &hca_port); - if(VAPI_OK != vapi_ret) { - opal_output(0, "%s:error getting hca port properties\n", __func__); - return NULL; - } - - if( PORT_ACTIVE == hca_port.state ){ - - mvapi_btl = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t)); - memcpy(mvapi_btl, &mca_btl_openib_module, sizeof(mca_btl_openib_module)); + + if(ibv_query_device(context, &ib_attr)){ + opal_output(0, "%s: error obtaining device attributes for %s\n", __func__, ibv_get_device_name(ib_dev)); + return NULL; + } + + + /* Note ports are 1 based hence j = 1 */ + + for(j = 1; j <= ib_dev_attr.phys_port_cnt; j++){ + struct ibv_port_attr* ib_port_attr; + if(ibv_query_port(ib_dev_context, (uint8_t) j, &ib_port_attr)){ + opal_output(0, "%s: error getting port attributes for device %s port number %d", + __func__, ibv_get_device_name(ib_dev), j); + return NULL; + } + + if( IBV_PORT_ACTIVE == ib_port_attr->state ){ + + openib_btl = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t)); + memcpy(openib_btl, &mca_btl_openib_module, sizeof(mca_btl_openib_module)); ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); - ib_selected->btl_module = (mca_btl_base_module_t*) mvapi_btl; - memcpy(mvapi_btl->hca_id, hca_ids[i], sizeof(VAPI_hca_id_t)); - mvapi_btl->nic = hca_hndl; - mvapi_btl->port_id = (IB_port_t) j; - mvapi_btl->port = hca_port; + ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl; + openib_btl->ib_dev = ib_dev; + openib_btl->ib_dev_context = ib_dev_context; + openib_btl->port_num = (uint8_t) j; + openib_btl->ib_port_attr = ib_port_attr; + opal_list_append(&btl_list, (opal_list_item_t*) ib_selected); mca_btl_openib_component.ib_num_btls ++; @@ -339,10 +343,10 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, /* Allocate space for btl modules */ - mca_btl_openib_component.mvapi_btls = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t) * + mca_btl_openib_component.openib_btls = (mca_btl_openib_module_t*) malloc(sizeof(mca_btl_openib_module_t) * mca_btl_openib_component.ib_num_btls); - if(NULL == mca_btl_openib_component.mvapi_btls) { + if(NULL == mca_btl_openib_component.openib_btls) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return NULL; } @@ -358,98 +362,92 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++){ item = opal_list_remove_first(&btl_list); ib_selected = (mca_btl_base_selected_module_t*)item; - mvapi_btl = (mca_btl_openib_module_t*) ib_selected->btl_module; - memcpy(&(mca_btl_openib_component.mvapi_btls[i]), mvapi_btl , sizeof(mca_btl_openib_module_t)); + openib_btl = (mca_btl_openib_module_t*) ib_selected->btl_module; + memcpy(&(mca_btl_openib_component.openib_btls[i]), openib_btl , sizeof(mca_btl_openib_module_t)); free(ib_selected); - free(mvapi_btl); + free(openib_btl); - mvapi_btl = &mca_btl_openib_component.mvapi_btls[i]; + openib_btl = &mca_btl_openib_component.openib_btls[i]; - /* Initialize the modules function pointers */ - - /* Initialize module state */ - OBJ_CONSTRUCT(&mvapi_btl->ib_lock, opal_mutex_t); - OBJ_CONSTRUCT(&mvapi_btl->send_free_eager, ompi_free_list_t); - OBJ_CONSTRUCT(&mvapi_btl->send_free_max, ompi_free_list_t); - OBJ_CONSTRUCT(&mvapi_btl->send_free_frag, ompi_free_list_t); + OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t); + OBJ_CONSTRUCT(&openib_btl->send_free_eager, ompi_free_list_t); + OBJ_CONSTRUCT(&openib_btl->send_free_max, ompi_free_list_t); + OBJ_CONSTRUCT(&openib_btl->send_free_frag, ompi_free_list_t); - OBJ_CONSTRUCT(&mvapi_btl->recv_free_eager, ompi_free_list_t); - OBJ_CONSTRUCT(&mvapi_btl->recv_free_max, ompi_free_list_t); + OBJ_CONSTRUCT(&openib_btl->recv_free_eager, ompi_free_list_t); + OBJ_CONSTRUCT(&openib_btl->recv_free_max, ompi_free_list_t); - OBJ_CONSTRUCT(&mvapi_btl->repost, opal_list_t); - OBJ_CONSTRUCT(&mvapi_btl->reg_mru_list, opal_list_t); + OBJ_CONSTRUCT(&openib_btl->repost, opal_list_t); + OBJ_CONSTRUCT(&openib_btl->reg_mru_list, opal_list_t); - if(mca_btl_openib_module_init(mvapi_btl) != OMPI_SUCCESS) { - free(hca_ids); + if(mca_btl_openib_module_init(openib_btl) != OMPI_SUCCESS) { + free(ib_devs); return NULL; } - hca_pd.hca = mvapi_btl->nic; - hca_pd.pd_tag = mvapi_btl->ptag; - + mpool_resources.ib_pd = openib_btl->ib_pd; + /* initialize the memory pool using the hca */ - mvapi_btl->ib_pool = + openib_btl->ib_pool = mca_mpool_base_module_create(mca_btl_openib_component.ib_mpool_name, - &mvapi_btl->super, - &hca_pd); + &openib_btl->super, + &mpool_resources); - if(NULL == mvapi_btl->ib_pool) { + if(NULL == openib_btl->ib_pool) { opal_output(0, "%s: error creating vapi memory pool! aborting ib btl initialization", __func__); return NULL; } + /* Initialize pool of send fragments */ - length = sizeof(mca_btl_openib_frag_t) + sizeof(mca_btl_openib_header_t) + - mvapi_btl->super.btl_eager_limit+ + openib_btl->super.btl_eager_limit+ 2*MCA_BTL_IB_FRAG_ALIGN; - ompi_free_list_init(&mvapi_btl->send_free_eager, + ompi_free_list_init(&openib_btl->send_free_eager, length, OBJ_CLASS(mca_btl_openib_send_frag_eager_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, - mvapi_btl->ib_pool); + openib_btl->ib_pool); - ompi_free_list_init(&mvapi_btl->recv_free_eager, + ompi_free_list_init(&openib_btl->recv_free_eager, length, OBJ_CLASS(mca_btl_openib_recv_frag_eager_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, - mvapi_btl->ib_pool); + openib_btl->ib_pool); length = sizeof(mca_btl_openib_frag_t) + sizeof(mca_btl_openib_header_t) + - mvapi_btl->super.btl_max_send_size+ + openib_btl->super.btl_max_send_size + 2*MCA_BTL_IB_FRAG_ALIGN; - ompi_free_list_init(&mvapi_btl->send_free_max, + ompi_free_list_init(&openib_btl->send_free_max, length, OBJ_CLASS(mca_btl_openib_send_frag_max_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, - mvapi_btl->ib_pool); - - - + openib_btl->ib_pool); + /* Initialize pool of receive fragments */ - ompi_free_list_init (&mvapi_btl->recv_free_max, + ompi_free_list_init (&openib_btl->recv_free_max, length, OBJ_CLASS (mca_btl_openib_recv_frag_max_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, - mca_btl_openib_component.ib_free_list_inc, mvapi_btl->ib_pool); + mca_btl_openib_component.ib_free_list_inc, openib_btl->ib_pool); @@ -460,24 +458,20 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules, - ompi_free_list_init(&mvapi_btl->send_free_frag, + ompi_free_list_init(&openib_btl->send_free_frag, length, OBJ_CLASS(mca_btl_openib_send_frag_frag_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, - mvapi_btl->ib_pool); + openib_btl->ib_pool); /* Initialize the rr_desc_post array for posting of rr*/ - mvapi_btl->rr_desc_post = (VAPI_rr_desc_t*) malloc((mca_btl_openib_component.ib_rr_buf_max * sizeof(VAPI_rr_desc_t))); + openib_btl->rr_desc_post = (struct ibv_recv_wr **) + malloc((mca_btl_openib_component.ib_rr_buf_max * sizeof(struct ibv_recv_wr*))); - /* This is now done by the memory pool passed to free_list_init.. Initialize the send descriptors */ - /* if(mca_btl_openib_send_frag_register(mvapi_btl) != OMPI_SUCCESS) { */ - /* free(hca_ids); */ - /* return NULL; */ - /* } */ - btls[i] = &mvapi_btl->super; + btls[i] = &openib_btl->super; } /* Post OOB receive to support dynamic connection setup */ @@ -502,10 +496,10 @@ int mca_btl_openib_component_progress() for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { VAPI_ret_t ret; VAPI_wc_desc_t comp; - mca_btl_openib_module_t* mvapi_btl = &mca_btl_openib_component.mvapi_btls[i]; + mca_btl_openib_module_t* openib_btl = &mca_btl_openib_component.openib_btls[i]; do{ - ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_high, &comp); + ret = VAPI_poll_cq(openib_btl->nic, openib_btl->cq_hndl_high, &comp); if(VAPI_OK == ret) { if(comp.status != VAPI_SUCCESS) { opal_output(0, "Got error : %s, Vendor code : %d Frag : %p", @@ -528,7 +522,7 @@ int mca_btl_openib_component_progress() /* Process a completed send */ frag = (mca_btl_openib_frag_t*) comp.id; frag->rc = OMPI_SUCCESS; - frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc); + frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); count++; break; @@ -539,10 +533,10 @@ int mca_btl_openib_component_progress() frag->rc=OMPI_SUCCESS; frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* advance the segment address past the header and subtract from the length..*/ - mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); + openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, frag->hdr->tag, &frag->base, openib_btl->ib_reg[frag->hdr->tag].cbdata); - OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_high, -1); + OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag); + OPAL_THREAD_ADD32(&openib_btl->rr_posted_high, -1); mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)comp.id)->endpoint, 0); @@ -557,7 +551,7 @@ int mca_btl_openib_component_progress() } while(VAPI_OK == ret); - ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_low, &comp); + ret = VAPI_poll_cq(openib_btl->nic, openib_btl->cq_hndl_low, &comp); if(VAPI_OK == ret) { if(comp.status != VAPI_SUCCESS) { opal_output(0, "Got error : %s, Vendor code : %d Frag : %p", @@ -574,7 +568,7 @@ int mca_btl_openib_component_progress() /* Process a completed send */ frag = (mca_btl_openib_frag_t*) comp.id; frag->rc = OMPI_SUCCESS; - frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc); + frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); count++; break; @@ -585,10 +579,10 @@ int mca_btl_openib_component_progress() frag->rc=OMPI_SUCCESS; frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* advance the segment address past the header and subtract from the length..*/ - mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); + openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, frag->hdr->tag, &frag->base, openib_btl->ib_reg[frag->hdr->tag].cbdata); - OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_max), (opal_list_item_t*) frag); - OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_low, -1); + OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_max), (opal_list_item_t*) frag); + OPAL_THREAD_ADD32(&openib_btl->rr_posted_low, -1); mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)comp.id)->endpoint, 0); diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index d130f5ecc6..ac60d30f56 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -35,62 +35,61 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); int mca_btl_openib_endpoint_create_qp( - mca_btl_openib_module_t* mvapi_btl, - VAPI_hca_hndl_t nic, - VAPI_pd_hndl_t ptag, - VAPI_cq_hndl_t cq_hndl, - VAPI_qp_hndl_t* qp_hndl, - VAPI_qp_prop_t* qp_prop, - int transport_type); + mca_btl_openib_module_t* openib_btl, + struct ibv_pd* pd, + struct ibv_cq* cq, + struct ibv_qp_attr* qp_attr, + struct ibv_qp** qp + ); + int mca_btl_openib_endpoint_qp_init_query( - - mca_btl_openib_module_t* mvapi_btl, - VAPI_hca_hndl_t nic, - VAPI_qp_hndl_t qp_hndl, - VAPI_qp_num_t remote_qp_num, - IB_lid_t remote_lid, - IB_port_t port_id - ); - + mca_btl_openib_module_t* openib_btl, + struct ibv_qp* qp, + struct ibv_qp_attr* attr, + uint32_t lcl_psn, + uint32_t rem_qp_num, + uint32_t rem_psn, + uint16_t rem_lid, + uint32_t port_num + ); -static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* mvapi_btl, mca_btl_openib_endpoint_t * endpoint, mca_btl_openib_frag_t * frag) +static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* openib_btl, + mca_btl_openib_endpoint_t * endpoint, + mca_btl_openib_frag_t * frag) { + struct ibv_qp* ib_qp; + struct ibv_send_wr *bad_wr; + frag->sr_desc.remote_qkey = 0; - frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr; + frag->sg_entry.addr = (uintprt_t) frag->hdr; - VAPI_qp_hndl_t qp_hndl; - if(frag->base.des_flags && MCA_BTL_DES_FLAGS_PRIORITY && frag->size <= mvapi_btl->super.btl_eager_limit){ - frag->sr_desc.remote_qp = endpoint->rem_qp_num_high; - qp_hndl = endpoint->lcl_qp_hndl_high; + if(frag->base.des_flags && MCA_BTL_DES_FLAGS_PRIORITY && frag->size <= openib_btl->super.btl_eager_limit){ + ib_qp = endpoint->lcl_qp_high; } else { - frag->sr_desc.remote_qp = endpoint->rem_qp_num_low; - qp_hndl = endpoint->lcl_qp_hndl_low; + ib_qp = endpoint->lcl_qp_low; } - frag->sr_desc.opcode = VAPI_SEND; - frag->sg_entry.len = frag->segment.seg_len + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* sizeof(mca_btl_openib_header_t); */ - - if(frag->sg_entry.len <= mvapi_btl->ib_inline_max) { - frag->ret = EVAPI_post_inline_sr(mvapi_btl->nic, - qp_hndl, - &frag->sr_desc); - - }else { - frag->ret = VAPI_post_sr(mvapi_btl->nic, - qp_hndl, - &frag->sr_desc); - } - + frag->sr_desc.opcode = IBV_WR_SEND; + frag->sr_desc.send_flags = IBV_SEND_SIGNALED; - if(VAPI_OK != frag->ret) + frag->sg_entry.length = frag->segment.seg_len + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* sizeof(mca_btl_openib_header_t); */ + + /* TODO: should check if we can inline send,, but can't find + * inline send defined in openib verbs api. + * if(frag->sg_entry.len <= openib_btl->ib_inline_max) { + */ + if(ibv_post_send(ib_qp, + frag->sr_desc, + &bad_wr)) { + opal_output(0, "%s: error posting send request\n", __func__); return OMPI_ERROR; - + } mca_btl_openib_endpoint_post_rr(endpoint, 1); - return OMPI_SUCCESS; + return OMPI_ERROR; } @@ -163,12 +162,27 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end ORTE_ERROR_LOG(rc); return rc; } - rc = orte_dps.pack(buffer, &endpoint->endpoint_btl->port.lid, 1, ORTE_UINT32); + + rc = orte_dps.pack(buffer, &endpoint->lcl_psn_high, 1, ORTE_UINT32); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + + rc = orte_dps.pack(buffer, &endpoint->lcl_psn_low, 1, ORTE_UINT32); if(rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; } + rc = orte_dps.pack(buffer, &endpoint->endpoint_btl->port.lid, 1, ORTE_UINT16); + if(rc != ORTE_SUCCESS) { + ORTE_ERROR_LOG(rc); + return rc; + } + + + /* send to endpoint */ rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0, mca_btl_openib_endpoint_send_cb, NULL); @@ -210,6 +224,14 @@ static int mca_btl_openib_endpoint_send_connect_ack(mca_btl_base_endpoint_t* end ORTE_ERROR_LOG(rc); return rc; } + if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT32))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if(ORTE_SUCCESS != (rc = orte_dps.pack(buffer, &zero, 1, ORTE_UINT16))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* send to endpoint */ rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_guid, buffer, ORTE_RML_TAG_DYNAMIC-1, 0, @@ -245,7 +267,17 @@ static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endp ORTE_ERROR_LOG(rc); return rc; } - rc = orte_dps.unpack(buffer, &endpoint->rem_lid, &cnt, ORTE_UINT32); + rc = orte_dps.unpack(buffer, &endpoint->rem_psn_high, &cnt, ORTE_UINT32); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = orte_dps.unpack(buffer, &endpoint->rem_psn_low, &cnt, ORTE_UINT32); + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + rc = orte_dps.unpack(buffer, &endpoint->rem_lid, &cnt, ORTE_UINT16); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; @@ -272,39 +304,37 @@ static int mca_btl_openib_endpoint_set_remote_info(mca_btl_base_endpoint_t* endp static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoint) { int rc; + mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl; /* Create the High Priority Queue Pair */ - if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_high, - &endpoint->lcl_qp_hndl_high, - &endpoint->lcl_qp_prop_high, - VAPI_TS_RC))) { + if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, + openib_btl->ib_pd, + openib_btl->ib_cq_high, + endpoint->lcl_qp_attr_high, + &endpoint->lcl_qp_high))) { opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); return rc; } - - + endpoint->lcl_psn_high = lrand48() & 0xffffff; + /* Create the Low Priority Queue Pair */ - if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_low, - &endpoint->lcl_qp_hndl_low, - &endpoint->lcl_qp_prop_low, - VAPI_TS_RC))) { + if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, + openib_btl->ib_pd, + openib_btl->ib_cq_low, + endpoint->lcl_qp_attr_low, + &endpoint->lcl_qp_low))) { opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); return rc; } + endpoint->lcl_psn_low = lrand48() & 0xffffff; DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", - endpoint->lcl_qp_prop_high.qp_num, - endpoint->lcl_qp_prop_low.qp_num, - mvapi_btl->port.lid); + endpoint->lcl_qp_high->qp_num, + endpoint->lcl_qp_low.qp_num, + openib_btl->ib_port_attr->lid); /* Send connection info over to remote endpoint */ endpoint->endpoint_state = MCA_BTL_IB_CONNECTING; @@ -323,46 +353,42 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t *endpoint, orte_buffer_t* buffer) { int rc; - - + mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl; + + /* Create the High Priority Queue Pair */ - if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_high, - &endpoint->lcl_qp_hndl_high, - &endpoint->lcl_qp_prop_high, - VAPI_TS_RC))) { + if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, + openib_btl->ib_pd, + openib_btl->ib_cq, + endpoint->lcl_qp_attr_high, + &endpoint->lcl_qp_high))) { opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); return rc; } - - + endpoint->lcl_psn_high = lrand48() & 0xffffff; + /* Create the Low Priority Queue Pair */ - if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->endpoint_btl->ptag, - endpoint->endpoint_btl->cq_hndl_low, - &endpoint->lcl_qp_hndl_low, - &endpoint->lcl_qp_prop_low, - VAPI_TS_RC))) { + if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_create_qp(openib_btl, + openib_btl->ib_pd, + openib_btl->ib_cq, + endpoint->lcl_qp_attr_low, + &endpoint->lcl_qp_low))) { opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); return rc; } + endpoint->lcl_psn_low = lrand48() & 0xffffff; DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", - endpoint->lcl_qp_prop_high.qp_num, - endpoint->lcl_qp_prop_low.qp_num, - mvapi_btl->port.lid); - - + endpoint->lcl_qp_high->qp_num, + endpoint->lcl_qp_low.qp_num, + openib_btl->ib_port_attr->lid); /* Set the remote side info */ mca_btl_openib_endpoint_set_remote_info(endpoint, buffer); - + /* Connect to endpoint */ rc = mca_btl_openib_endpoint_connect(endpoint); @@ -431,48 +457,48 @@ static void mca_btl_openib_endpoint_recv( /* Update status */ switch(endpoint_state) { - case MCA_BTL_IB_CLOSED : - /* We had this connection closed before. - * The endpoint is trying to connect. Move the - * status of this connection to CONNECTING, - * and then reply with our QP information */ - - if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, buffer))) { - opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", - ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); - break; - } - - /* Setup state as connected */ - ib_endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK; + case MCA_BTL_IB_CLOSED : + /* We had this connection closed before. + * The endpoint is trying to connect. Move the + * status of this connection to CONNECTING, + * and then reply with our QP information */ + + if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_reply_start_connect(ib_endpoint, buffer))) { + opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); break; + } + + /* Setup state as connected */ + ib_endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK; + break; + + case MCA_BTL_IB_CONNECTING : - case MCA_BTL_IB_CONNECTING : - - mca_btl_openib_endpoint_set_remote_info(ib_endpoint, buffer); - if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_connect(ib_endpoint))) { - opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", - ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); - break; - } - - /* Setup state as connected */ - mca_btl_openib_endpoint_connected(ib_endpoint); - - /* Send him an ack */ - mca_btl_openib_endpoint_send_connect_ack(ib_endpoint); + mca_btl_openib_endpoint_set_remote_info(ib_endpoint, buffer); + if(OMPI_SUCCESS != (rc = mca_btl_openib_endpoint_connect(ib_endpoint))) { + opal_output(0, "[%lu,%lu,%lu] %s:%d errcode %d\n", + ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); break; + } + + /* Setup state as connected */ + mca_btl_openib_endpoint_connected(ib_endpoint); - case MCA_BTL_IB_CONNECT_ACK: + /* Send him an ack */ + mca_btl_openib_endpoint_send_connect_ack(ib_endpoint); + break; - mca_btl_openib_endpoint_connected(ib_endpoint); + case MCA_BTL_IB_CONNECT_ACK: - break; + mca_btl_openib_endpoint_connected(ib_endpoint); - case MCA_BTL_IB_CONNECTED : - break; - default : - opal_output(0, "Connected -> Connecting not possible.\n"); + break; + + case MCA_BTL_IB_CONNECTED : + break; + default : + opal_output(0, "Connected -> Connecting not possible.\n"); } break; @@ -508,7 +534,7 @@ int mca_btl_openib_endpoint_send( ) { int rc; - mca_btl_openib_module_t *mvapi_btl; + mca_btl_openib_module_t *openib_btl; OPAL_THREAD_LOCK(&endpoint->endpoint_send_lock); @@ -551,7 +577,7 @@ int mca_btl_openib_endpoint_send( case MCA_BTL_IB_CONNECTED: { - mvapi_btl = endpoint->endpoint_btl; + openib_btl = endpoint->endpoint_btl; DEBUG_OUT("Send to : %d, len : %d, frag : %p", @@ -559,7 +585,7 @@ int mca_btl_openib_endpoint_send( frag->ib_buf.desc.sg_entry.len, frag); - rc = mca_btl_openib_endpoint_post_send(mvapi_btl, endpoint, frag); + rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag); break; } @@ -577,7 +603,7 @@ void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t* endpoint) { opal_list_item_t *frag_item; mca_btl_openib_frag_t *frag; - mca_btl_openib_module_t* mvapi_btl; + mca_btl_openib_module_t* openib_btl; /*Check if endpoint is connected */ if(endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) { @@ -590,10 +616,10 @@ void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t* endpoint) while(!opal_list_is_empty(&(endpoint->pending_send_frags))) { frag_item = opal_list_remove_first(&(endpoint->pending_send_frags)); frag = (mca_btl_openib_frag_t *) frag_item; - mvapi_btl = endpoint->endpoint_btl; + openib_btl = endpoint->endpoint_btl; /* We need to post this one */ - if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(mvapi_btl, endpoint, frag)) + if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag)) opal_output(0, "error in mca_btl_openib_endpoint_send"); } } @@ -608,26 +634,42 @@ int mca_btl_openib_endpoint_connect( mca_btl_openib_endpoint_t *endpoint) { int rc; + mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) endpoint->endpoint_btl; + /* Connection establishment RC */ - rc = mca_btl_openib_endpoint_qp_init_query(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->lcl_qp_hndl_high, - endpoint->rem_qp_num_high, - endpoint->rem_lid, - endpoint->endpoint_btl->port_id); + rc = mca_btl_openib_endpoint_qp_init_query( + openib_btl, + endpoint->lcl_qp_high, + endpoint->lcl_qp_attr_high, + endpoint->lcl_psn_high, + endpoint->rem_qp_num_high, + endpoint->rem_psn_high, + endpoint->rem_lid, + openib_btl->port_num + ); - rc = mca_btl_openib_endpoint_qp_init_query(endpoint->endpoint_btl, - endpoint->endpoint_btl->nic, - endpoint->lcl_qp_hndl_low, - endpoint->rem_qp_num_low, - endpoint->rem_lid, - endpoint->endpoint_btl->port_id); if(rc != OMPI_SUCCESS) { return rc; } - + rc = mca_btl_openib_endpoint_qp_init_query( + openib_btl, + endpoint->lcl_qp_low, + endpoint->lcl_qp_attr_low, + endpoint->lcl_psn_low, + endpoint->rem_qp_num_low, + endpoint->rem_psn_low, + endpoint->rem_lid, + openib_btl->port_num + ); + + + + if(rc != OMPI_SUCCESS) { + return rc; + } + mca_btl_openib_endpoint_post_rr(endpoint, 0); return OMPI_SUCCESS; @@ -636,165 +678,108 @@ int mca_btl_openib_endpoint_connect( int mca_btl_openib_endpoint_create_qp( - mca_btl_openib_module_t* mvapi_btl, - VAPI_hca_hndl_t nic, - VAPI_pd_hndl_t ptag, - VAPI_cq_hndl_t cq_hndl, - VAPI_qp_hndl_t* qp_hndl, - VAPI_qp_prop_t* qp_prop, - int transport_type) + mca_btl_openib_module_t* openib_btl, + struct ibv_pd* pd, + struct ibv_cq* cq, + struct ibv_qp_attr* qp_attr, + struct ibv_qp** qp + ) { + { + struct ibv_qp_init_attr qp_init_attr = { + .send_cq = cq, + .recv_cq = cq, + .cap = { + .max_send_wr = openib_btl->ib_wq_size, + .max_recv_wr = openib_btl->ib_wq_size, + .max_send_sge = openib_btl->ib_sg_list_size, + .max_recv_sge = openib_btl->ib_sg_list_size, + }, + .qp_type = IBV_QPT_RC + }; - VAPI_ret_t ret; - VAPI_qp_init_attr_t qp_init_attr; + (*qp) = ibv_create_qp(pd, &qp_init_attr); - switch(transport_type) { - - case VAPI_TS_RC: /* Set up RC qp parameters */ - qp_init_attr.cap.max_oust_wr_rq = mvapi_btl->ib_wq_size; - qp_init_attr.cap.max_oust_wr_sq = mvapi_btl->ib_wq_size; - qp_init_attr.cap.max_sg_size_rq = mvapi_btl->ib_sg_list_size; - qp_init_attr.cap.max_sg_size_sq = mvapi_btl->ib_sg_list_size; - qp_init_attr.pd_hndl = ptag; - /* We don't have Reliable Datagram Handle right now */ - qp_init_attr.rdd_hndl = 0; - - /* Signal all work requests on this queue pair */ - qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR; - qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR; - - /* Use Reliable Connected transport service */ - qp_init_attr.ts_type = VAPI_TS_RC; - + if(NULL == (*qp)) { + opal_output(0, "%s: error creating qp \n", __func__); + return OMPI_ERROR; + } + openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data; + + } + + { + qp_attr->qp_state = IBV_QPS_INIT; + qp_attr->pkey_index = openib_btl->ib_pkey_ix; + qp_attr->qp_port_num = openib_btl->port_num; + qp_attr->qp_access_flags = 0; - /* Set Send and Recv completion queues */ - qp_init_attr.rq_cq_hndl = cq_hndl; - qp_init_attr.sq_cq_hndl = cq_hndl; - - break; - case VAPI_TS_UD: /* Set up UD qp parameters */ - default: - return OMPI_ERR_NOT_IMPLEMENTED; - } - - ret = VAPI_create_qp(nic, &qp_init_attr, - qp_hndl, qp_prop); - - if(VAPI_OK != ret) { - MCA_BTL_IB_VAPI_ERROR(ret, "VAPI_create_qp"); - return OMPI_ERROR; - } + if(ibv_modify_qp((*qp), qp_attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS )) { + opal_output("%s: error modifying qp to INIT\n"); + return OMPI_ERROR; + } + } + return OMPI_SUCCESS; } int mca_btl_openib_endpoint_qp_init_query( - mca_btl_openib_module_t* mvapi_btl, - VAPI_hca_hndl_t nic, - VAPI_qp_hndl_t qp_hndl, - VAPI_qp_num_t remote_qp_num, - IB_lid_t remote_lid, - IB_port_t port_id + mca_btl_openib_module_t* openib_btl, + struct ibv_qp* qp, + struct ibv_qp_attr* attr, + uint32_t lcl_psn, + uint32_t rem_qp_num, + uint32_t rem_psn, + uint16_t rem_lid, + uint32_t port_num ) { + attr->qp_state = IBV_QPS_RTR; + attr->path_mth = openib_btl->ib_mtu; + attr->dest_qp_num = rem_qp_num; + attr->rq_psn = rem_psn; + attr->max_des_rd_atomic = openib_btl->ib_max_rdma_dst_ops; + attr->min_rnr_timer = openib_btl->ib_min_rnr_timer; + attr->ah_attr.is_global = 0; + attr->ah_attr.dlid = rem_lid; + attr->ah_attr.sl = openib_btl->ib_service_level; + attr->ah_attr.src_path_bits = openib_btl->ib_src_path_bits; + attr->ah_attr.port_num = port_num; - VAPI_ret_t ret; - VAPI_qp_attr_t qp_attr; - - VAPI_qp_attr_mask_t qp_attr_mask; - VAPI_qp_init_attr_t qp_init_attr; - VAPI_qp_cap_t qp_cap; - - /* Modifying QP to INIT */ - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - qp_attr.qp_state = VAPI_INIT; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE); - qp_attr.pkey_ix = mvapi_btl->ib_pkey_ix; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX); - qp_attr.port = port_id; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PORT); - qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_REMOTE_ATOMIC_FLAGS); - - ret = VAPI_modify_qp(nic, qp_hndl, - &qp_attr, &qp_attr_mask, &qp_cap); - - if(VAPI_OK != ret) { - MCA_BTL_IB_VAPI_ERROR(ret, "VAPI_modify_qp"); - return OMPI_ERROR; - } - - DEBUG_OUT("Modified to init..Qp %d", qp_hndl); - - /********************** INIT --> RTR ************************/ - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - qp_attr.qp_state = VAPI_RTR; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE); - qp_attr.qp_ous_rd_atom = mvapi_btl->ib_qp_ous_rd_atom; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_OUS_RD_ATOM); - qp_attr.path_mtu = mvapi_btl->ib_mtu; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PATH_MTU); - qp_attr.rq_psn = mvapi_btl->ib_psn; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RQ_PSN); - qp_attr.pkey_ix = mvapi_btl->ib_pkey_ix; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_PKEY_IX); - qp_attr.min_rnr_timer = mvapi_btl->ib_min_rnr_timer; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_MIN_RNR_TIMER); - - qp_attr.av.sl = mvapi_btl->ib_service_level; - qp_attr.av.grh_flag = FALSE; - qp_attr.av.static_rate = mvapi_btl->ib_static_rate; - qp_attr.av.src_path_bits = mvapi_btl->ib_src_path_bits; - - qp_attr.dest_qp_num = remote_qp_num; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_DEST_QP_NUM); - qp_attr.av.dlid = remote_lid; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_AV); - - ret = VAPI_modify_qp(nic, qp_hndl, - &qp_attr, &qp_attr_mask, &qp_cap); - - if(VAPI_OK != ret) { - MCA_BTL_IB_VAPI_ERROR(ret, "VAPI_modify_qp"); - return OMPI_ERROR; - } - - DEBUG_OUT("Modified to RTR..Qp %d", qp_hndl); - - /************** RTS *******************/ - QP_ATTR_MASK_CLR_ALL(qp_attr_mask); - qp_attr.qp_state = VAPI_RTS; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_QP_STATE); - qp_attr.sq_psn = mvapi_btl->ib_psn; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_SQ_PSN); - qp_attr.timeout = mvapi_btl->ib_timeout; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_TIMEOUT); - qp_attr.retry_count = mvapi_btl->ib_retry_count; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RETRY_COUNT); - qp_attr.rnr_retry = mvapi_btl->ib_rnr_retry; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_RNR_RETRY); - qp_attr.ous_dst_rd_atom = mvapi_btl->ib_max_rdma_dst_ops; - QP_ATTR_MASK_SET(qp_attr_mask, QP_ATTR_OUS_DST_RD_ATOM); - - ret = VAPI_modify_qp(nic, qp_hndl, - &qp_attr, &qp_attr_mask, &qp_cap); - - if(VAPI_OK != ret) { - MCA_BTL_IB_VAPI_ERROR(ret, "VAPI_modify_qp"); - return OMPI_ERROR; - } - DEBUG_OUT("Modified to RTS..Qp %d", qp_hndl); - - ret = VAPI_query_qp(nic, qp_hndl, &qp_attr, &qp_attr_mask, &qp_init_attr ); - if (ret != VAPI_OK) { - opal_output(0, "error querying the queue pair"); + if(ibv_modify_qp(qp, attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + opal_out(0, "%s: error modifing QP to RTR\n", __func__); return OMPI_ERROR; - } - - mvapi_btl->ib_inline_max = qp_init_attr.cap.max_inline_data_sq; - + } + attr->qp_state = IBV_QPS_RTS; + attr->timeout = openib_btl->ib_timeout; + attr->retry_cnt = openib_btl->ib_retry_count; + attr->rnr_retry = openib_btl->ib_rnr_retry; + attr->sq_psn = my_psn; + attr->max_rd_atomic = openib_btl->ib_max_rdma_dst_ops; + if (ibv_modify_qp(qp, attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + opal_output(0, "%s: error modifying QP to RTS\n", __func__); + return OMPI_FAILURE; + } return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index f647382e57..5265b13bad 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -26,7 +26,6 @@ #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif -#define MAX_POST_RR (16) OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t); /** @@ -87,28 +86,31 @@ struct mca_btl_base_endpoint_t { opal_list_t pending_send_frags; /**< list of pending send frags for this endpoint */ - - VAPI_qp_num_t rem_qp_num_high; - /* High priority remote side QP number */ - - VAPI_qp_num_t rem_qp_num_low; - /* Low prioirty remote size QP number */ - - IB_lid_t rem_lid; - /* Local identifier of the remote process */ - - VAPI_qp_hndl_t lcl_qp_hndl_high; - /* High priority local QP handle */ - VAPI_qp_hndl_t lcl_qp_hndl_low; - /* Low priority local QP handle */ + uint32_t rem_qp_num_high; + uint32_t rem_qp_num_low; + /* Remote QP number (Low and High priority) */ - VAPI_qp_prop_t lcl_qp_prop_high; - /* High priority local QP properties */ - - VAPI_qp_prop_t lcl_qp_prop_low; - /* Low priority local QP properties */ + uint16_t rem_lid; + /* Local identifier of the remote process */ + + + uint32_t rem_psn_high; + uint32_t rem_psn_low; + /* Remote processes port sequence number (Low and High) */ + + uint32_t lcl_psn_high; + uint32_t lcl_psn_low; + /* Local processes port sequence number (Low and High) */ + + struct ibv_qp* lcl_qp_high; + struct ibv_qp* lcl_qp_low; + /* Local QP (Low and High) */ + struct ibv_qp_attr* lcl_qp_attr_high; + struct ibv_qp_attr* lcl_qp_attr_low; + /* Local QP attributes (Low and High) */ + }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; @@ -122,77 +124,85 @@ void mca_btl_openib_post_recv(void); void mca_btl_openib_progress_send_frags(mca_btl_openib_endpoint_t*); static inline int mca_btl_openib_endpoint_post_rr_sub(int cnt, - mca_btl_openib_endpoint_t* endpoint, - ompi_free_list_t* frag_list, - uint32_t* rr_posted, - VAPI_hca_hndl_t nic, - VAPI_qp_hndl_t qp_hndl - ) + mca_btl_openib_endpoint_t* endpoint, + ompi_free_list_t* frag_list, + uint32_t* rr_posted, + struct ibv_qp* qp + ) { int rc, i; opal_list_item_t* item; mca_btl_openib_frag_t* frag; - mca_btl_openib_module_t *mvapi_btl = endpoint->endpoint_btl; - VAPI_rr_desc_t* rr_desc_post = mvapi_btl->rr_desc_post; + mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl; + struct ibv_recv_wr* bad_wr; - /* prepare frags and post receive requests */ + /* prepare frags and post receive requests, given, this is ugly, + * if openib doesn't plan on supporting a post_list method than + * this should be changed to simply loop through and post receives + * without bothering with the rr_desc_post array as it is not needed + */ for(i = 0; i < cnt; i++) { OMPI_FREE_LIST_WAIT(frag_list, item, rc); frag = (mca_btl_openib_frag_t*) item; frag->endpoint = endpoint; - frag->sg_entry.len = frag->size + ((unsigned char*) frag->segment.seg_addr.pval- (unsigned char*) frag->hdr); /* sizeof(mca_btl_openib_header_t); */ - rr_desc_post[i] = frag->rr_desc; + frag->sg_entry.length = frag->size + + ((unsigned char*) frag->segment.seg_addr.pval- + (unsigned char*) frag->hdr); + rr_desc_post[i] = frag->rr_desc; } - frag->ret = EVAPI_post_rr_list(nic, - qp_hndl, - cnt, - rr_desc_post); - if(VAPI_OK != frag->ret) { - MCA_BTL_IB_VAPI_ERROR(frag->ret, "EVAPI_post_rr_list"); - return OMPI_ERROR; + for(i=0; i< cnt; i++){ + + if(ibv_post_recv(qp, + rr_desc_post[i], + &bad_wr)) { + opal_output(0, "%s: error posting receive\n", __func__); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; } OPAL_THREAD_ADD32(rr_posted, cnt); return OMPI_SUCCESS; } static inline int mca_btl_openib_endpoint_post_rr( mca_btl_openib_endpoint_t * endpoint, int additional){ - mca_btl_openib_module_t * mvapi_btl = endpoint->endpoint_btl; + mca_btl_openib_module_t * openib_btl = endpoint->endpoint_btl; int rc; - OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); + OPAL_THREAD_LOCK(&openib_btl->ib_lock); - if(mvapi_btl->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){ + if(openib_btl->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){ - rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - mvapi_btl->rr_posted_high, + rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_high, endpoint, - &mvapi_btl->recv_free_eager, - &mvapi_btl->rr_posted_high, - mvapi_btl->nic, + &openib_btl->recv_free_eager, + &openib_btl->rr_posted_high, + openib_btl->nic, endpoint->lcl_qp_hndl_high ); if(rc != OMPI_SUCCESS){ - OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); return rc; } } - if(mvapi_btl->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){ + if(openib_btl->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){ - rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - mvapi_btl->rr_posted_low, + rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_low, endpoint, - &mvapi_btl->recv_free_max, - &mvapi_btl->rr_posted_low, - mvapi_btl->nic, + &openib_btl->recv_free_max, + &openib_btl->rr_posted_low, + openib_btl->nic, endpoint->lcl_qp_hndl_low ); if(rc != OMPI_SUCCESS) { - OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); return rc; } } - OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock); + OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); return OMPI_SUCCESS; diff --git a/ompi/mca/btl/openib/btl_openib_error.h b/ompi/mca/btl/openib/btl_openib_error.h index 00ceaaf1c0..bb821e9c1c 100644 --- a/ompi/mca/btl/openib/btl_openib_error.h +++ b/ompi/mca/btl/openib/btl_openib_error.h @@ -17,18 +17,12 @@ #ifndef MCA_BTL_IB_ERROR_H #define MCA_BTL_IB_ERROR_H -#include -#include -#include +#include /* * * */ -#define MCA_BTL_IB_VAPI_ERROR(vapi_ret, func_name) { \ - opal_output(0,"[%s:%d] ", __FILE__, __LINE__); \ - opal_output(0,"%s : %s",func_name,VAPI_strerror(vapi_ret)); \ -} /* Debug Print */ #if 0 diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index 903b29cad0..651c2aab77 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -1,12 +1,13 @@ #include "btl_openib_frag.h" -#include "mca/common/vapi/vapi_mem_reg.h" -#include "mca/mpool/mvapi/mpool_mvapi.h" +#include "mca/mpool/openib/mpool_openib.h" static void mca_btl_openib_frag_common_constructor( mca_btl_openib_frag_t* frag) { - mca_mpool_mvapi_registration_t* mem_hndl = (mca_mpool_mvapi_registration_t*) frag->base.super.user_data; + mca_mpool_openib_registration_t* registration = + (mca_mpool_openib_registration_t*) frag->base.super.user_data; + frag->hdr = (mca_btl_openib_header_t*) (frag+1); /* initialize the btl header to point to start at end of frag */ #if 0 mod = (unsigned long) frag->hdr % MCA_BTL_IB_FRAG_ALIGN; @@ -25,13 +26,13 @@ static void mca_btl_openib_frag_common_constructor( mca_btl_openib_frag_t* frag) } #endif - frag->mem_hndl = mem_hndl->hndl; + frag->mr = registration->mr; frag->segment.seg_len = frag->size; - frag->segment.seg_key.key32[0] = (uint32_t) mem_hndl->l_key; - frag->sg_entry.lkey = mem_hndl->l_key; - frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr; + frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey; + frag->sg_entry.addr = (uintprt_t) frag->hdr; + frag->sg_entry.length = frag->size; + frag->sg_entry.lkey = frag->mr->lkey; frag->base.des_flags = 0; - } @@ -44,13 +45,11 @@ static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* f frag->base.des_dst = NULL; frag->base.des_dst_cnt = 0; - frag->sr_desc.comp_type = VAPI_SIGNALED; - frag->sr_desc.opcode = VAPI_SEND; - frag->sr_desc.remote_qkey = 0; - frag->sr_desc.sg_lst_len = 1; - frag->sr_desc.sg_lst_p = &frag->sg_entry; - frag->sr_desc.id = (VAPI_virt_addr_t) (MT_virt_addr_t) frag; - + frag->sr_desc.wr_id = frag; + frag->sr_desc.sg_list = &frag->sg_entry; + frag->sr_desc.num_sge = 1; + frag->sr_desc.opcode = IBV_WR_SEND; + frag->sr_desc.send_flags = IBV_SEND_SIGNALED; } static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag) @@ -62,17 +61,11 @@ static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* f frag->base.des_src = NULL; frag->base.des_src_cnt = 0; - frag->rr_desc.comp_type = VAPI_SIGNALED; - frag->rr_desc.opcode = VAPI_RECEIVE; - frag->rr_desc.sg_lst_len = 1; - frag->rr_desc.sg_lst_p = &frag->sg_entry; - frag->rr_desc.id = (VAPI_virt_addr_t) (MT_virt_addr_t) frag; - - + frag->rr_desc.wr_id = frag; + frag->rr_desc.sg_list = &frag->sg_entry; + frag->rr_desc.num_sge = 1; } - - static void mca_btl_openib_send_frag_eager_constructor(mca_btl_openib_frag_t* frag) { diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 4bfa7d85c9..a7665705e0 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -22,10 +22,8 @@ #include "ompi_config.h" #include "btl_openib.h" -#include -#include -#include -#include "mca/mpool/mvapi/mpool_mvapi.h" +#include +#include "mca/mpool/openib/mpool_openib.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { @@ -55,14 +53,13 @@ struct mca_btl_openib_frag_t { int rc; union{ - VAPI_rr_desc_t rr_desc; - VAPI_sr_desc_t sr_desc; + struct ibv_recv_wr rr_desc; + struct ibv_send_wr sr_desc; }; - VAPI_sg_lst_entry_t sg_entry; - VAPI_mr_hndl_t mem_hndl; - VAPI_ret_t ret; + struct ibv_sge sg_entry; + struct ibv_mr *mr; mca_btl_openib_header_t *hdr; - mca_mpool_mvapi_registration_t * vapi_reg; + mca_mpool_openib_registration_t * vapi_reg; }; typedef struct mca_btl_openib_frag_t mca_btl_openib_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t); diff --git a/ompi/mca/mpool/openib/configure.stub b/ompi/mca/mpool/openib/configure.stub index 292112d2a3..7a9421bb58 100644 --- a/ompi/mca/mpool/openib/configure.stub +++ b/ompi/mca/mpool/openib/configure.stub @@ -88,7 +88,7 @@ AC_DEFUN([MCA_CONFIGURE_STUB],[ # Many vapi.h's have horrid semantics and don't obey ISOC99 # standards. So we have to turn off flags like -pedantic. Sigh. - CFLAGS="`echo $CFLAGS | sed 's/-pedantic//g'`" + #CFLAGS="`echo $CFLAGS | sed 's/-pedantic//g'`" @@ -99,14 +99,14 @@ mca_btl_openib_try_find_libvapi() { LDFLAGS="$LDFLAGS $EXTRA_LDFLAGS" vapi_badness= AC_CHECK_LIB([vapi], [$func1], [], [vapi_badness=true], - [-lmtl_common -lmpga -lmosal]) + [-libcm -lmthca]) if test "$vapi_badness" != ""; then AC_CHECK_LIB([pthread], [pthread_create], [pthread=yes LIBS="$LIBS -lpthread"], [pthread=no]) if test "$pthread" = "yes"; then AC_CHECK_LIB([vapi], [$func2], [], [], - [-lmtl_common -lmpga -lmosal]) + [-libcm -lmthca]) fi fi } diff --git a/ompi/mca/mpool/openib/mpool_openib.h b/ompi/mca/mpool/openib/mpool_openib.h index 5b9abda7cd..adf0658af3 100644 --- a/ompi/mca/mpool/openib/mpool_openib.h +++ b/ompi/mca/mpool/openib/mpool_openib.h @@ -57,30 +57,20 @@ OMPI_COMP_EXPORT extern mca_mpool_openib_component_t mca_mpool_openib_component; struct mca_mpool_base_resources_t { - VAPI_hca_hndl_t hca; /* the hca (nic) */ - VAPI_pd_hndl_t pd_tag; /* the protection domain */ + struct ibv_pd* ib_pd; }; typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t; struct mca_mpool_openib_module_t { mca_mpool_base_module_t super; mca_allocator_base_module_t * vapi_allocator; - struct mca_mpool_base_resources_t hca_pd; + struct mca_mpool_base_resources_t resources; }; typedef struct mca_mpool_openib_module_t mca_mpool_openib_module_t; - + struct mca_mpool_openib_registration_t { mca_mpool_base_registration_t base_reg; - VAPI_mr_hndl_t hndl; - /* Memory region handle */ - - VAPI_lkey_t l_key; - /* Local key to registered memory, needed for - * posting send/recv requests */ - - VAPI_rkey_t r_key; - /* Remote key to registered memory, need to send this - * to remote processes for incoming RDMA ops */ + struct ibv_mr *mr; bool is_leave_pinned; }; diff --git a/ompi/mca/mpool/openib/mpool_openib_component.c b/ompi/mca/mpool/openib/mpool_openib_component.c index 540407f814..cbdd88c521 100644 --- a/ompi/mca/mpool/openib/mpool_openib_component.c +++ b/ompi/mca/mpool/openib/mpool_openib_component.c @@ -168,8 +168,7 @@ static mca_mpool_base_module_t* mca_mpool_openib_init( mpool_module = (mca_mpool_openib_module_t*)malloc(sizeof(mca_mpool_openib_module_t)); mca_mpool_openib_module_init(mpool_module); - /* setup allocator TODO fix up */ - mpool_module->hca_pd = *resources; + mpool_module->resources = *resources; mpool_module->vapi_allocator = allocator_component->allocator_init(true, mca_common_vapi_segment_alloc, NULL, &mpool_module->super); if(NULL == mpool_module->vapi_allocator) { diff --git a/ompi/mca/mpool/openib/mpool_openib_module.c b/ompi/mca/mpool/openib/mpool_openib_module.c index c0d7d6673d..7e8da2eec8 100644 --- a/ompi/mca/mpool/openib/mpool_openib_module.c +++ b/ompi/mca/mpool/openib/mpool_openib_module.c @@ -17,8 +17,8 @@ #include "ompi_config.h" #include #include "opal/util/output.h" -#include "mca/mpool/mvapi/mpool_openib.h" - +#include "mca/mpool/openib/mpool_openib.h" +#include /* * Initializes the mpool module. @@ -54,51 +54,30 @@ void* mca_mpool_openib_alloc( * register memory */ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool, - void *addr, - size_t size, - mca_mpool_base_registration_t** registration){ - - + void *addr, + size_t size, + mca_mpool_base_registration_t** registration){ mca_mpool_openib_module_t * mpool_module = (mca_mpool_openib_module_t*) mpool; mca_mpool_openib_registration_t * vapi_reg; - VAPI_mrw_t mr_in, mr_out; - - VAPI_ret_t ret; - - memset(&mr_in, 0, sizeof(VAPI_mrw_t)); - memset(&mr_out, 0, sizeof(VAPI_mrw_t)); - + *registration = (mca_mpool_base_registration_t*) OBJ_NEW(mca_mpool_openib_registration_t); /* (void*) malloc(sizeof(mca_mpool_base_registration_t)); */ vapi_reg = (mca_mpool_openib_registration_t*) *registration; vapi_reg->base_reg.mpool = mpool; + + vapi_reg->mr = ibv_reg_mr( + mpool_module->resources->ib_pd, + addr, + size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE + ); + - vapi_reg->hndl = VAPI_INVAL_HNDL; - - - mr_in.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE; - mr_in.l_key = 0; - mr_in.r_key = 0; - mr_in.pd_hndl = mpool_module->hca_pd.pd_tag; - mr_in.size = size; - mr_in.start = (VAPI_virt_addr_t) (MT_virt_addr_t) addr; - mr_in.type = VAPI_MR; - - - ret = VAPI_register_mr( - mpool_module->hca_pd.hca, - &mr_in, - &vapi_reg->hndl, - &mr_out - ); - - if(VAPI_OK != ret){ - opal_output(0, "error pinning vapi memory\n"); + if(NULL == vapi_reg->mr){ + opal_output(0, "%s: error registering openib memory\n", __func__); return OMPI_ERROR; } - vapi_reg->l_key = mr_out.l_key; - vapi_reg->r_key = mr_out.r_key; vapi_reg->base_reg.base = addr; vapi_reg->base_reg.bound = (void*) ((char*) addr + size - 1);