1
1

Working version of openib btl ;-)

Fixed receive descriptor counts that limited mvapi and openib to 2 procs.                                                   
Begin porting error messages to use the BTL_ERROR macro. 

This commit was SVN r6554.
Этот коммит содержится в:
Galen Shipman 2005-07-19 21:04:22 +00:00
родитель acb9365793
Коммит 2f67ab82bb
15 изменённых файлов: 262 добавлений и 207 удалений

24
ompi/mca/btl/base/btl_base_error.h Обычный файл
Просмотреть файл

@ -0,0 +1,24 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_BASE_ERROR_H
#define MCA_BTL_BASE_ERROR_H

/**
 * Log an error message via opal_output, automatically prefixed with the
 * source location ("[file:line:function] ") of the call site.
 *
 * fmt  - printf-style format string for the message
 * args - optional printf arguments (GNU named-variadic form, as used
 *        elsewhere in this tree; ## swallows the comma when empty)
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * and is safe in unbraced if/else bodies.  __func__ is a string, so it
 * must be printed with %s (not %d), and the bracket is closed to keep
 * the location prefix well-formed.
 */
#define BTL_ERROR(fmt, args...) do { \
    opal_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, ##args); \
} while (0)

#endif /* MCA_BTL_BASE_ERROR_H */

Просмотреть файл

@ -828,5 +828,6 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl)
return OMPI_ERROR; return OMPI_ERROR;
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -157,11 +157,6 @@ struct mca_btl_mvapi_module_t {
mca_mpool_base_module_t* ib_pool; /**< ib memory pool */ mca_mpool_base_module_t* ib_pool; /**< ib memory pool */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
VAPI_rr_desc_t* rr_desc_post; VAPI_rr_desc_t* rr_desc_post;
/**< an array to allow posting of rr in one swoop */ /**< an array to allow posting of rr in one swoop */

Просмотреть файл

@ -488,12 +488,18 @@ int mca_btl_mvapi_component_progress()
uint32_t i; uint32_t i;
int count = 0; int count = 0;
mca_btl_mvapi_frag_t* frag; mca_btl_mvapi_frag_t* frag;
mca_btl_mvapi_endpoint_t* endpoint;
/* Poll for completions */ /* Poll for completions */
for(i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) { for(i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) {
VAPI_ret_t ret; VAPI_ret_t ret;
VAPI_wc_desc_t comp; VAPI_wc_desc_t comp;
mca_btl_mvapi_module_t* mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i]; mca_btl_mvapi_module_t* mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i];
/* we have two completion queues, one for "high" priority and one for "low".
* we will check the high priority and process them until there are none left.
* note that low priority messages are only processed one per progress call.
*/
do{ do{
ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_high, &comp); ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_high, &comp);
if(VAPI_OK == ret) { if(VAPI_OK == ret) {
@ -504,7 +510,7 @@ int mca_btl_mvapi_component_progress()
return OMPI_ERROR; return OMPI_ERROR;
} }
/* Handle n/w completions */ /* Handle work completions */
switch(comp.opcode) { switch(comp.opcode) {
case VAPI_CQE_RQ_RDMA_WITH_IMM: case VAPI_CQE_RQ_RDMA_WITH_IMM:
if(comp.imm_data_valid){ if(comp.imm_data_valid){
@ -515,7 +521,7 @@ int mca_btl_mvapi_component_progress()
case VAPI_CQE_SQ_RDMA_WRITE: case VAPI_CQE_SQ_RDMA_WRITE:
case VAPI_CQE_SQ_SEND_DATA : case VAPI_CQE_SQ_SEND_DATA :
/* Process a completed send */ /* Process a completed send or an rdma write */
frag = (mca_btl_mvapi_frag_t*) comp.id; frag = (mca_btl_mvapi_frag_t*) comp.id;
frag->rc = OMPI_SUCCESS; frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc); frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc);
@ -524,15 +530,18 @@ int mca_btl_mvapi_component_progress()
case VAPI_CQE_RQ_SEND_DATA: case VAPI_CQE_RQ_SEND_DATA:
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__); /* Process a RECV */
DEBUG_OUT("Got an recv completion" );
frag = (mca_btl_mvapi_frag_t*) comp.id; frag = (mca_btl_mvapi_frag_t*) comp.id;
endpoint = (mca_btl_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS; frag->rc=OMPI_SUCCESS;
frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/ /* advance the segment address past the header and subtract from the length..*/
mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager), (opal_list_item_t*) frag); OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_high, -1); OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0);
@ -540,7 +549,7 @@ int mca_btl_mvapi_component_progress()
break; break;
default: default:
opal_output(0, "Errorneous network completion"); opal_output(0, "Unhandled work completion opcode is %d", comp.opcode);
break; break;
} }
} }
@ -570,15 +579,17 @@ int mca_btl_mvapi_component_progress()
case VAPI_CQE_RQ_SEND_DATA: case VAPI_CQE_RQ_SEND_DATA:
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__); DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_mvapi_frag_t*) comp.id; frag = (mca_btl_mvapi_frag_t*) comp.id;
endpoint = (mca_btl_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS; frag->rc=OMPI_SUCCESS;
frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/ /* advance the segment address past the header and subtract from the length..*/
mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata); mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_max), (opal_list_item_t*) frag); OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_max), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_low, -1); OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1);
mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0); mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0);

Просмотреть файл

@ -114,6 +114,10 @@ static void mca_btl_mvapi_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
endpoint->rr_posted_high = 0;
endpoint->rr_posted_low = 0;
} }
/* /*

Просмотреть файл

@ -109,6 +109,10 @@ struct mca_btl_base_endpoint_t {
VAPI_qp_prop_t lcl_qp_prop_low; VAPI_qp_prop_t lcl_qp_prop_low;
/* Low priority local QP properties */ /* Low priority local QP properties */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
}; };
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@ -161,14 +165,14 @@ static inline int mca_btl_mvapi_endpoint_post_rr_sub(int cnt,
static inline int mca_btl_mvapi_endpoint_post_rr( mca_btl_mvapi_endpoint_t * endpoint, int additional){ static inline int mca_btl_mvapi_endpoint_post_rr( mca_btl_mvapi_endpoint_t * endpoint, int additional){
mca_btl_mvapi_module_t * mvapi_btl = endpoint->endpoint_btl; mca_btl_mvapi_module_t * mvapi_btl = endpoint->endpoint_btl;
int rc; int rc;
OPAL_THREAD_LOCK(&mvapi_btl->ib_lock); OPAL_THREAD_LOCK(&endpoint->ib_lock);
if(mvapi_btl->rr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){ if(endpoint->rr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+additional && endpoint->rr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){
rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - mvapi_btl->rr_posted_high, rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - endpoint->rr_posted_high,
endpoint, endpoint,
&mvapi_btl->recv_free_eager, &mvapi_btl->recv_free_eager,
&mvapi_btl->rr_posted_high, &endpoint->rr_posted_high,
mvapi_btl->nic, mvapi_btl->nic,
endpoint->lcl_qp_hndl_high endpoint->lcl_qp_hndl_high
); );
@ -177,12 +181,12 @@ static inline int mca_btl_mvapi_endpoint_post_rr( mca_btl_mvapi_endpoint_t * end
return rc; return rc;
} }
} }
if(mvapi_btl->rr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){ if(endpoint->rr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+additional && endpoint->rr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){
rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - mvapi_btl->rr_posted_low, rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - endpoint->rr_posted_low,
endpoint, endpoint,
&mvapi_btl->recv_free_max, &mvapi_btl->recv_free_max,
&mvapi_btl->rr_posted_low, &endpoint->rr_posted_low,
mvapi_btl->nic, mvapi_btl->nic,
endpoint->lcl_qp_hndl_low endpoint->lcl_qp_hndl_low
); );

Просмотреть файл

@ -21,7 +21,7 @@
#include "opal/util/if.h" #include "opal/util/if.h"
#include "mca/pml/pml.h" #include "mca/pml/pml.h"
#include "mca/btl/btl.h" #include "mca/btl/btl.h"
#include "mca/btl/base/btl_base_error.h"
#include "btl_openib.h" #include "btl_openib.h"
#include "btl_openib_frag.h" #include "btl_openib_frag.h"
#include "btl_openib_proc.h" #include "btl_openib_proc.h"
@ -127,6 +127,10 @@ int mca_btl_openib_del_procs(struct mca_btl_base_module_t* btl,
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/*
*Register callback function to support send/recv semantics
*/
int mca_btl_openib_register( int mca_btl_openib_register(
struct mca_btl_base_module_t* btl, struct mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_tag_t tag,
@ -179,7 +183,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
} }
/** /**
* * Return a segment
* *
*/ */
int mca_btl_openib_free( int mca_btl_openib_free(
@ -189,16 +193,16 @@ int mca_btl_openib_free(
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des; mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des;
if(frag->size == 0) { if(frag->size == 0) {
MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag);
OBJ_RELEASE(frag->openib_reg); OBJ_RELEASE(frag->openib_reg);
MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag);
} }
else if(frag->size == mca_btl_openib_component.max_send_size){ else if(frag->size == mca_btl_openib_component.max_send_size){
MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag); MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag);
} else if(frag->size == mca_btl_openib_component.eager_limit){ } else if(frag->size == mca_btl_openib_component.eager_limit){
MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag); MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag);
} else {
BTL_ERROR("invalid descriptor");
} }
return OMPI_SUCCESS; return OMPI_SUCCESS;
@ -265,13 +269,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
rc = mca_mpool_base_remove((void*) openib_reg->base_reg.base); rc = mca_mpool_base_remove((void*) openib_reg->base_reg.base);
if(OMPI_SUCCESS != rc) { if(OMPI_SUCCESS != rc) {
opal_output(0, "%s:%d:%s error removing memory region from memory pool tree", __FILE__, __LINE__, __func__); BTL_ERROR("error removing memory region from memory pool tree");
return NULL; return NULL;
} }
if(is_leave_pinned) { if(is_leave_pinned) {
if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)){ if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)){
opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); BTL_ERROR("error removing item from reg_mru_list");
return NULL; return NULL;
} }
} }
@ -290,7 +294,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
if(rc != OMPI_SUCCESS) { if(rc != OMPI_SUCCESS) {
opal_output(0,"%s:%d:%s error inserting memory region into memory pool tree", __FILE__, __LINE__, __func__); BTL_ERROR("error inserting memory region into memory pool tree");
return NULL; return NULL;
} }
@ -302,7 +306,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
} }
else if(is_leave_pinned) { else if(is_leave_pinned) {
if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)) { if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)) {
opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__); BTL_ERROR("error removing item from reg_mru_list");
return NULL; return NULL;
} }
opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg); opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg);
@ -399,13 +403,16 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
frag->sg_entry.lkey = openib_reg->mr->lkey; frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uintptr_t) iov.iov_base; frag->sg_entry.addr = (uintptr_t) iov.iov_base;
frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey; frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->rkey;
frag->base.des_src = &frag->segment; frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1; frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL; frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0; frag->base.des_dst_cnt = 0;
frag->openib_reg = openib_reg; frag->openib_reg = openib_reg;
DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu", frag->sg_entry.lkey, frag->sg_entry.addr);
return &frag->base; return &frag->base;
} else if (max_data+reserve <= btl->btl_eager_limit) { } else if (max_data+reserve <= btl->btl_eager_limit) {
@ -416,7 +423,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
} }
iov.iov_len = max_data; iov.iov_len = max_data;
iov.iov_base = frag->segment.seg_addr.pval + reserve; iov.iov_base = frag->segment.seg_addr.lval + reserve;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
*size = max_data; *size = max_data;
@ -507,8 +514,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
frag->base.des_flags = 0; frag->base.des_flags = 0;
if(NULL!= openib_reg){ if(NULL!= openib_reg){
reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)frag->segment.seg_addr.pval + 1;
bool is_leave_pinned = openib_reg->base_reg.is_leave_pinned; bool is_leave_pinned = openib_reg->base_reg.is_leave_pinned;
reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)frag->segment.seg_addr.pval + 1;
if(frag->segment.seg_len > reg_len ) { if(frag->segment.seg_len > reg_len ) {
size_t new_len = openib_reg->base_reg.bound - openib_reg->base_reg.base + 1 size_t new_len = openib_reg->base_reg.bound - openib_reg->base_reg.base + 1
@ -560,6 +568,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
} }
opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg); opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg);
} }
OBJ_RETAIN(openib_reg);
} else { } else {
if(mca_btl_openib_component.leave_pinned) { if(mca_btl_openib_component.leave_pinned) {
@ -622,14 +631,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
frag->sg_entry.lkey = openib_reg->mr->lkey; frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uintptr_t) frag->segment.seg_addr.pval; frag->sg_entry.addr = (uintptr_t) frag->segment.seg_addr.pval;
frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey; frag->segment.seg_key.key32[0] = frag->mr->rkey;
frag->base.des_dst = &frag->segment; frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1; frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL; frag->base.des_src = NULL;
frag->base.des_src_cnt = 0; frag->base.des_src_cnt = 0;
frag->openib_reg = openib_reg; frag->openib_reg = openib_reg;
OBJ_RETAIN(openib_reg); DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu frag->segment.seg_key.key32[0] = %lu" , frag->sg_entry.lkey, frag->sg_entry.addr, frag->segment.seg_key.key32[0]);
return &frag->base; return &frag->base;
} }
@ -712,11 +722,18 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor; mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;
frag->endpoint = endpoint; frag->endpoint = endpoint;
frag->sr_desc.opcode = IBV_WR_RDMA_WRITE; frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
frag->sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_src->seg_addr.pval; frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_dst->seg_addr.pval;
frag->sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0]; frag->sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0];
frag->sg_entry.addr = (uintptr_t) frag->base.des_src->seg_addr.pval; frag->sg_entry.addr = (uintptr_t) frag->base.des_src->seg_addr.pval;
frag->sg_entry.length = frag->base.des_src->seg_len; frag->sg_entry.length = frag->base.des_src->seg_len;
DEBUG_OUT("frag->sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu"
, frag->sr_desc.wr.rdma.remote_addr
, frag->sr_desc.wr.rdma.rkey
, frag->sg_entry.addr
, frag->sg_entry.length);
if(ibv_post_send(endpoint->lcl_qp_low, if(ibv_post_send(endpoint->lcl_qp_low,
&frag->sr_desc, &frag->sr_desc,
&bad_wr)){ &bad_wr)){
@ -785,7 +802,7 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
/* Allocate Protection Domain */ /* Allocate Protection Domain */
struct ibv_context *ctx; struct ibv_context *ctx;
openib_btl->poll_cq = false;
ctx = openib_btl->ib_dev_context; ctx = openib_btl->ib_dev_context;
openib_btl->ib_pd = ibv_alloc_pd(ctx); openib_btl->ib_pd = ibv_alloc_pd(ctx);
@ -821,5 +838,7 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
/* TODO: EVAPI_set_qsync_event_handler? */ /* TODO: EVAPI_set_qsync_event_handler? */
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }

Просмотреть файл

@ -155,14 +155,10 @@ struct mca_btl_openib_module_t {
mca_mpool_base_module_t* ib_pool; /**< ib memory pool */ mca_mpool_base_module_t* ib_pool; /**< ib memory pool */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
/**< an array to allow posting of rr in one swoop */ /**< an array to allow posting of rr in one swoop */
size_t ib_inline_max; /**< max size of inline send*/ size_t ib_inline_max; /**< max size of inline send*/
bool poll_cq;

Просмотреть файл

@ -31,6 +31,7 @@
#include "btl_openib_frag.h" #include "btl_openib_frag.h"
#include "btl_openib_endpoint.h" #include "btl_openib_endpoint.h"
#include "mca/btl/base/base.h" #include "mca/btl/base/base.h"
#include "mca/btl/base/btl_base_error.h"
#include "datatype/convertor.h" #include "datatype/convertor.h"
@ -253,12 +254,15 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
mca_btl_openib_module_t * openib_btl; mca_btl_openib_module_t * openib_btl;
mca_btl_base_selected_module_t* ib_selected; mca_btl_base_selected_module_t* ib_selected;
opal_list_item_t* item; opal_list_item_t* item;
struct dlist *dev_list;
struct ibv_device* ib_dev;
/* initialization */ /* initialization */
*num_btl_modules = 0; *num_btl_modules = 0;
num_devs = 0; num_devs = 0;
struct dlist *dev_list;
struct ibv_device* ib_dev;
/* Determine the number of hca's available on the host */ /* Determine the number of hca's available on the host */
dev_list = ibv_get_devices(); dev_list = ibv_get_devices();
@ -268,7 +272,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
num_devs++; num_devs++;
if(0 == num_devs) { if(0 == num_devs) {
opal_output(0, "No hca's found on this host! \n"); BTL_ERROR("No hca's found on this host!");
return NULL; return NULL;
} }
@ -298,50 +302,17 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
for(i = 0; i < num_devs; i++){ for(i = 0; i < num_devs; i++){
struct ibv_device_attr ib_dev_attr; struct ibv_device_attr ib_dev_attr;
struct ibv_context* ib_dev_context; struct ibv_context* ib_dev_context;
struct ibv_pd *my_pd;
struct ibv_mr *mr;
void* my_addr;
uint32_t my_size;
uint32_t my_indx;
uint32_t my_mult;
my_mult = 4096;
ib_dev = ib_devs[i]; ib_dev = ib_devs[i];
ib_dev_context = ibv_open_device(ib_dev); ib_dev_context = ibv_open_device(ib_dev);
if(!ib_dev_context) { if(!ib_dev_context) {
opal_output(0, "%s: error obtaining device context for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno)); BTL_ERROR(" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno));
return NULL; return NULL;
} }
my_pd = ibv_alloc_pd(ib_dev_context);
for(my_indx = 1; my_indx <= 8192; my_indx++){
my_size = my_mult * my_indx;
my_addr = memalign(4096, my_size);
memset(my_addr, 0, my_size);
mr = ibv_reg_mr(
my_pd,
my_addr,
my_size,
IBV_ACCESS_REMOTE_WRITE
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
);
if(NULL == mr){
opal_output(0, "%s: error on mr test! can't register %lu bytes, errno says %s \n", __func__, my_size, strerror(errno));
break;
}
else {
opal_output(0, "%s: successfully registerted %lu bytes", __func__, my_size);
ibv_dereg_mr(mr);
}
}
if(ibv_query_device(ib_dev_context, &ib_dev_attr)){ if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
opal_output(0, "%s: error obtaining device attributes for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno)); BTL_ERROR("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno));
return NULL; return NULL;
} }
@ -352,8 +323,8 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
struct ibv_port_attr* ib_port_attr; struct ibv_port_attr* ib_port_attr;
ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr)); ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr));
if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){ if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){
opal_output(0, "%s: error getting port attributes for device %s port number %d errno says %s", BTL_ERROR("error getting port attributes for device %s port number %d errno says %s",
__func__, ibv_get_device_name(ib_dev), j, strerror(errno)); ibv_get_device_name(ib_dev), j, strerror(errno));
return NULL; return NULL;
} }
@ -438,7 +409,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
&mpool_resources); &mpool_resources);
if(NULL == openib_btl->ib_pool) { if(NULL == openib_btl->ib_pool) {
opal_output(0, "%s: error creating vapi memory pool! aborting ib btl initialization", __func__); BTL_ERROR("error creating vapi memory pool! aborting ib btl initialization");
return NULL; return NULL;
} }
@ -531,103 +502,55 @@ int mca_btl_openib_component_progress()
uint32_t i, ne; uint32_t i, ne;
int count = 0; int count = 0;
mca_btl_openib_frag_t* frag; mca_btl_openib_frag_t* frag;
mca_btl_openib_endpoint_t* endpoint;
/* Poll for completions */ /* Poll for completions */
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
struct ibv_wc wc; struct ibv_wc wc;
memset(&wc, 0, sizeof(struct ibv_wc));
mca_btl_openib_module_t* openib_btl = &mca_btl_openib_component.openib_btls[i]; mca_btl_openib_module_t* openib_btl = &mca_btl_openib_component.openib_btls[i];
/* we have two completion queues, one for "high" priority and one for "low".
* we will check the high priority and process them until there are none left.
* note that low priority messages are only processed one per progress call.
*/
do{ do{
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc ); ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
if(ne < 0 ){ if(ne < 0 ){
opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno)); BTL_ERROR("error polling CQ with %d errno says %s\n", ne, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
else if(wc.status != IBV_WC_SUCCESS) { else if(wc.status != IBV_WC_SUCCESS) {
opal_output(0, "%s: error polling CQ with status %d for wr_id %d\n", BTL_ERROR("error polling CQ with status %d for wr_id %d\n",
__func__, wc.status, wc.wr_id);
wc.status, wc.wr_id);
return OMPI_ERROR; return OMPI_ERROR;
} }
else if(1 == ne) { else if(1 == ne) {
/* Handle n/w completions */ DEBUG_OUT("completion queue event says opcode is %d\n", wc.opcode);
/* Handle work completions */
switch(wc.opcode) { switch(wc.opcode) {
case IBV_WC_RECV_RDMA_WITH_IMM: case IBV_WC_RECV_RDMA_WITH_IMM:
opal_output(0, "Got an RDMA with Immediate data Not supported!\n"); BTL_ERROR("Got an RDMA with Immediate data Not supported!");
return OMPI_ERROR; return OMPI_ERROR;
case IBV_WC_RDMA_WRITE: case IBV_WC_RECV:
case IBV_WC_SEND : /* Process a RECV */
if(wc.opcode & IBV_WC_RECV){
/* process a recv completion (this should only occur for a send not an rdma) */
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/ DEBUG_OUT("Got an recv on the completion queue");
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&openib_btl->rr_posted_high, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
count++;
}
else {
/* Process a completed send */
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
}
break;
default:
opal_output(0, "Errorneous network completion");
break;
}
}
}
while(ne > 0);
ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc );
if(ne < 0){
opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno));
return OMPI_ERROR;
}
else if(wc.status != IBV_WC_SUCCESS) {
opal_output(0, "%s: error polling CQ with status %d for wr_id %d\n",
__func__,
wc.status, wc.wr_id);
return OMPI_ERROR;
}
else if(1 == ne) {
/* Handle n/w completions */
switch(wc.opcode) {
case IBV_WC_RECV_RDMA_WITH_IMM:
opal_output(0, "Got an RDMA with Immediate data Not supported!\n");
return OMPI_ERROR;
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
if(wc.opcode & IBV_WC_RECV){
/* process a recv completion (this should only occur for a send not an rdma) */
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_openib_frag_t*) wc.wr_id; frag = (mca_btl_openib_frag_t*) wc.wr_id;
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS; frag->rc=OMPI_SUCCESS;
frag->segment.seg_len = frag->segment.seg_len =
wc.byte_len- wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
/* advance the segment address past the header and subtract from the length..*/ /* advance the segment address past the header and subtract from the length..*/
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag, frag->hdr->tag,
@ -635,25 +558,84 @@ int mca_btl_openib_component_progress()
openib_btl->ib_reg[frag->hdr->tag].cbdata); openib_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag); OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&openib_btl->rr_posted_high, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
count++; count++;
} break;
else {
/* Process a completed send */ case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
/* Process a completed send or rdma write*/
frag = (mca_btl_openib_frag_t*) wc.wr_id; frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS; frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc); frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++; count++;
break;
break;
default:
BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode);
break;
} }
}
}
while(ne > 0);
ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc );
if(ne < 0){
BTL_ERROR("error polling CQ with %d errno says %s", ne, strerror(errno));
return OMPI_ERROR;
}
else if(wc.status != IBV_WC_SUCCESS) {
BTL_ERROR("error polling CQ with status %d for wr_id %d",
wc.status, wc.wr_id);
return OMPI_ERROR;
}
else if(1 == ne) {
/* Handle n/w completions */
switch(wc.opcode) {
case IBV_WC_RECV_RDMA_WITH_IMM:
BTL_ERROR("Got an RDMA with Immediate data Not supported!");
return OMPI_ERROR;
case IBV_WC_RECV:
/* process a recv completion (this should only occur for a send not an rdma) */
DEBUG_OUT( "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_openib_frag_t*) wc.wr_id;
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
/* advance the segment address past the header and subtract from the length..*/
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_max), (opal_list_item_t*) frag);
count++;
break;
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
/* Process a completed send */
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
break; break;
default: default:
opal_output(0, "Errorneous network completion"); BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode);
break; break;
} }
} }

Просмотреть файл

@ -74,14 +74,17 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
} }
frag->sr_desc.opcode = IBV_WR_SEND; frag->sr_desc.opcode = IBV_WR_SEND;
frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->sg_entry.length = frag->segment.seg_len + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* sizeof(mca_btl_openib_header_t); */ frag->sg_entry.length =
frag->segment.seg_len +
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
if(frag->sg_entry.length <= openib_btl->ib_inline_max) {
/* frag->sr_desc.send_flags |= IBV_SEND_INLINE; */
}
/* TODO: should check if we can inline send,, but can't find
* inline send defined in openib verbs api.
* if(frag->sg_entry.len <= openib_btl->ib_inline_max) {
*/
if(ibv_post_send(ib_qp, if(ibv_post_send(ib_qp,
&frag->sr_desc, &frag->sr_desc,
&bad_wr)) { &bad_wr)) {
@ -90,7 +93,7 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
} }
mca_btl_openib_endpoint_post_rr(endpoint, 1); mca_btl_openib_endpoint_post_rr(endpoint, 1);
return OMPI_ERROR; return OMPI_SUCCESS;
} }
@ -114,6 +117,14 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t); OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
endpoint->lcl_qp_attr_high = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr));
endpoint->lcl_qp_attr_low = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr));
memset(endpoint->lcl_qp_attr_high, 0, sizeof(struct ibv_qp_attr));
memset(endpoint->lcl_qp_attr_low, 0, sizeof(struct ibv_qp_attr));
endpoint->rr_posted_high = 0;
endpoint->rr_posted_low = 0;
} }
/* /*
@ -190,9 +201,9 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end
DEBUG_OUT("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d", DEBUG_OUT("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_prop_high.qp_num, endpoint->lcl_qp_high->qp_num,
endpoint->lcl_qp_prop_low.qp_num, endpoint->lcl_qp_low->qp_num,
endpoint->endpoint_btl->port.lid); endpoint->endpoint_btl->ib_port_attr->lid);
if(rc < 0) { if(rc < 0) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
@ -318,6 +329,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
return rc; return rc;
} }
srand48(getpid() * time(NULL));
endpoint->lcl_psn_high = lrand48() & 0xffffff; endpoint->lcl_psn_high = lrand48() & 0xffffff;
/* Create the Low Priority Queue Pair */ /* Create the Low Priority Queue Pair */
@ -334,7 +346,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_high->qp_num, endpoint->lcl_qp_high->qp_num,
endpoint->lcl_qp_low.qp_num, endpoint->lcl_qp_low->qp_num,
openib_btl->ib_port_attr->lid); openib_btl->ib_port_attr->lid);
/* Send connection info over to remote endpoint */ /* Send connection info over to remote endpoint */
@ -367,6 +379,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc); ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
return rc; return rc;
} }
srand48(getpid() * time(NULL));
endpoint->lcl_psn_high = lrand48() & 0xffffff; endpoint->lcl_psn_high = lrand48() & 0xffffff;
/* Create the Low Priority Queue Pair */ /* Create the Low Priority Queue Pair */
@ -383,7 +396,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d", DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_high->qp_num, endpoint->lcl_qp_high->qp_num,
endpoint->lcl_qp_low.qp_num, endpoint->lcl_qp_low->qp_num,
openib_btl->ib_port_attr->lid); openib_btl->ib_port_attr->lid);
@ -415,6 +428,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{ {
endpoint->endpoint_state = MCA_BTL_IB_CONNECTED; endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
endpoint->endpoint_btl->poll_cq = true;
mca_btl_openib_progress_send_frags(endpoint); mca_btl_openib_progress_send_frags(endpoint);
} }
@ -491,12 +505,13 @@ static void mca_btl_openib_endpoint_recv(
break; break;
case MCA_BTL_IB_CONNECT_ACK: case MCA_BTL_IB_CONNECT_ACK:
DEBUG_OUT("Got a connect ack from %d\n", endpoint->vpid);
mca_btl_openib_endpoint_connected(ib_endpoint); mca_btl_openib_endpoint_connected(ib_endpoint);
break; break;
case MCA_BTL_IB_CONNECTED : case MCA_BTL_IB_CONNECTED :
break; break;
default : default :
opal_output(0, "Connected -> Connecting not possible.\n"); opal_output(0, "Connected -> Connecting not possible.\n");
@ -581,9 +596,9 @@ int mca_btl_openib_endpoint_send(
DEBUG_OUT("Send to : %d, len : %d, frag : %p", DEBUG_OUT("Send to : %d, len : %d, frag : %p",
endpoint->endpoint_proc->proc_guid.vpid, endpoint->endpoint_proc->proc_guid.vpid,
frag->ib_buf.desc.sg_entry.len, frag->sg_entry.length,
frag); frag);
rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag); rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag);
@ -686,23 +701,27 @@ int mca_btl_openib_endpoint_create_qp(
) )
{ {
{ {
struct ibv_qp* my_qp;
struct ibv_qp_init_attr qp_init_attr; struct ibv_qp_init_attr qp_init_attr;
memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
qp_init_attr.send_cq = cq; qp_init_attr.send_cq = cq;
qp_init_attr.recv_cq = cq; qp_init_attr.recv_cq = cq;
qp_init_attr.cap.max_send_wr = mca_btl_openib_component.ib_wq_size; qp_init_attr.cap.max_send_wr = mca_btl_openib_component.ib_wq_size;
qp_init_attr.cap.max_recv_wr = mca_btl_openib_component.ib_wq_size; qp_init_attr.cap.max_recv_wr = mca_btl_openib_component.ib_wq_size;
qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size; qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size;
qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size; qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size;
qp_init_attr.qp_type = IBV_QPT_RC; qp_init_attr.qp_type = IBV_QPT_RC;
(*qp) = ibv_create_qp(pd, &qp_init_attr); my_qp = ibv_create_qp(pd, &qp_init_attr);
if(NULL == (*qp)) { if(NULL == my_qp) {
opal_output(0, "%s: error creating qp errno says %s\n", __func__, strerror(errno)); opal_output(0, "%s: error creating qp errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }
(*qp) = my_qp;
openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data; openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data;
} }
@ -711,7 +730,7 @@ int mca_btl_openib_endpoint_create_qp(
qp_attr->qp_state = IBV_QPS_INIT; qp_attr->qp_state = IBV_QPS_INIT;
qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix; qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix;
qp_attr->port_num = openib_btl->port_num; qp_attr->port_num = openib_btl->port_num;
qp_attr->qp_access_flags = 0; qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
if(ibv_modify_qp((*qp), qp_attr, if(ibv_modify_qp((*qp), qp_attr,
IBV_QP_STATE | IBV_QP_STATE |

Просмотреть файл

@ -1,3 +1,4 @@
/* /*
* Copyright (c) 2004-2005 The Trustees of Indiana University. * Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved. * All rights reserved.
@ -90,7 +91,7 @@ struct mca_btl_base_endpoint_t {
/**< lock for concurrent access to endpoint state */ /**< lock for concurrent access to endpoint state */
opal_list_t pending_send_frags; opal_list_t pending_send_frags;
/**< list of pending send frags for this endpoint */ /**< list of pending send frags for this endpotint */
uint32_t rem_qp_num_high; uint32_t rem_qp_num_high;
uint32_t rem_qp_num_low; uint32_t rem_qp_num_low;
@ -116,6 +117,10 @@ struct mca_btl_base_endpoint_t {
struct ibv_qp_attr* lcl_qp_attr_low; struct ibv_qp_attr* lcl_qp_attr_low;
/* Local QP attributes (Low and High) */ /* Local QP attributes (Low and High) */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
}; };
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@ -160,16 +165,15 @@ static inline int mca_btl_openib_endpoint_post_rr_sub(int cnt,
} }
for(i=0; i< cnt; i++){ for(i=0; i< cnt; i++){
if(ibv_post_recv(qp, if(ibv_post_recv(qp,
&rr_desc_post[i], &rr_desc_post[i],
&bad_wr)) { &bad_wr)) {
opal_output(0, "%s: error posting receive errno says %s\n", __func__, strerror(errno)); opal_output(0, "%s: error posting receive errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
}
} }
return OMPI_SUCCESS;
}
OPAL_THREAD_ADD32(rr_posted, cnt); OPAL_THREAD_ADD32(rr_posted, cnt);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -179,12 +183,12 @@ static inline int mca_btl_openib_endpoint_post_rr( mca_btl_openib_endpoint_t * e
int rc; int rc;
OPAL_THREAD_LOCK(&openib_btl->ib_lock); OPAL_THREAD_LOCK(&openib_btl->ib_lock);
if(openib_btl->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){ if(endpoint->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && endpoint->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){
rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_high, rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - endpoint->rr_posted_high,
endpoint, endpoint,
&openib_btl->recv_free_eager, &openib_btl->recv_free_eager,
&openib_btl->rr_posted_high, &endpoint->rr_posted_high,
endpoint->lcl_qp_high endpoint->lcl_qp_high
); );
if(rc != OMPI_SUCCESS){ if(rc != OMPI_SUCCESS){
@ -192,12 +196,12 @@ static inline int mca_btl_openib_endpoint_post_rr( mca_btl_openib_endpoint_t * e
return rc; return rc;
} }
} }
if(openib_btl->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){ if(endpoint->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && endpoint->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){
rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_low, rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - endpoint->rr_posted_low,
endpoint, endpoint,
&openib_btl->recv_free_max, &openib_btl->recv_free_max,
&openib_btl->rr_posted_low, &endpoint->rr_posted_low,
endpoint->lcl_qp_low endpoint->lcl_qp_low
); );
if(rc != OMPI_SUCCESS) { if(rc != OMPI_SUCCESS) {

Просмотреть файл

@ -45,11 +45,12 @@ static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* f
frag->base.des_dst = NULL; frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0; frag->base.des_dst_cnt = 0;
frag->sr_desc.wr_id = frag; frag->sr_desc.wr_id = (uint64_t) frag;
frag->sr_desc.sg_list = &frag->sg_entry; frag->sr_desc.sg_list = &frag->sg_entry;
frag->sr_desc.num_sge = 1; frag->sr_desc.num_sge = 1;
frag->sr_desc.opcode = IBV_WR_SEND; frag->sr_desc.opcode = IBV_WR_SEND;
frag->sr_desc.send_flags = IBV_SEND_SIGNALED; frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->sr_desc.next = NULL;
} }
static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag) static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag)
@ -61,9 +62,10 @@ static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* f
frag->base.des_src = NULL; frag->base.des_src = NULL;
frag->base.des_src_cnt = 0; frag->base.des_src_cnt = 0;
frag->rr_desc.wr_id = frag; frag->rr_desc.wr_id = (uint64_t) frag;
frag->rr_desc.sg_list = &frag->sg_entry; frag->rr_desc.sg_list = &frag->sg_entry;
frag->rr_desc.num_sge = 1; frag->rr_desc.num_sge = 1;
frag->rr_desc.next = NULL;
} }
static void mca_btl_openib_send_frag_eager_constructor(mca_btl_openib_frag_t* frag) static void mca_btl_openib_send_frag_eager_constructor(mca_btl_openib_frag_t* frag)

Просмотреть файл

@ -30,11 +30,11 @@ extern "C" {
static inline void * DOWN_ALIGN_ADDR(void * addr, uint32_t cnt) { static inline void * DOWN_ALIGN_ADDR(void * addr, uint32_t cnt) {
return (void*)((uintptr_t)(addr) & (~((uintptr_t)0) << (cnt))); return (void*)((uintptr_t)((unsigned char*) addr) & (~((uintptr_t)0) << (cnt)));
} }
static inline void* ALIGN_ADDR(void* addr, uint32_t cnt ) { static inline void* ALIGN_ADDR(void* addr, uint32_t cnt ) {
DOWN_ALIGN_ADDR(((addr) + ~(~((uintptr_t)0) << (cnt))), (cnt)); DOWN_ALIGN_ADDR((((unsigned char*) addr) + ~(~((uintptr_t)0) << (cnt))), (cnt));
return addr; return addr;
} }

Просмотреть файл

@ -70,13 +70,6 @@ static void mca_mpool_openib_registration_constructor( mca_mpool_openib_registra
static void mca_mpool_openib_registration_destructor( mca_mpool_openib_registration_t * registration ) static void mca_mpool_openib_registration_destructor( mca_mpool_openib_registration_t * registration )
{ {
mca_mpool_base_remove((void*) registration);
registration->base_reg.mpool->mpool_deregister(
registration->base_reg.mpool,
registration->base_reg.base,
0,
(mca_mpool_base_registration_t*) registration);
registration->base_reg.base = NULL; registration->base_reg.base = NULL;
registration->base_reg.bound = NULL; registration->base_reg.bound = NULL;
registration->base_reg.is_leave_pinned=false; registration->base_reg.is_leave_pinned=false;

Просмотреть файл

@ -57,6 +57,7 @@ void* mca_mpool_openib_alloc(
free(addr_malloc); free(addr_malloc);
return NULL; return NULL;
} }
(*registration)->alloc_base = addr_malloc;
return addr; return addr;
} }
@ -80,7 +81,7 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
mpool_module->resources.ib_pd, mpool_module->resources.ib_pd,
addr, addr,
size, size,
IBV_ACCESS_REMOTE_WRITE IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */ /* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
); );
@ -106,7 +107,7 @@ int mca_mpool_openib_deregister(mca_mpool_base_module_t* mpool, void *addr, size
mca_mpool_openib_module_t * mpool_openib = (mca_mpool_openib_module_t*) mpool; mca_mpool_openib_module_t * mpool_openib = (mca_mpool_openib_module_t*) mpool;
mca_mpool_openib_registration_t * openib_reg; mca_mpool_openib_registration_t * openib_reg;
openib_reg = (mca_mpool_openib_registration_t*) registration; openib_reg = (mca_mpool_openib_registration_t*) registration;
if(! ibv_dereg_mr(openib_reg->mr)){ if(ibv_dereg_mr(openib_reg->mr)){
opal_output(0, "%s: error unpinning openib memory errno says %s\n", __func__, strerror(errno)); opal_output(0, "%s: error unpinning openib memory errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR; return OMPI_ERROR;
} }