
Working version of openib btl ;-)

Fixed receive descriptor counts that limited mvapi and openib to 2 procs.
Begin porting error messages to use the BTL_ERROR macro. 

This commit was SVN r6554.
This commit is contained in:
Galen Shipman 2005-07-19 21:04:22 +00:00
parent acb9365793
commit 2f67ab82bb
15 changed files with 262 additions and 207 deletions

ompi/mca/btl/base/btl_base_error.h Normal file
View File

@@ -0,0 +1,24 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_BASE_ERROR_H
#define MCA_BTL_BASE_ERROR_H

#include "opal/util/output.h"

/* Report an error, automatically tagged with file, line, and function.
 * Note: __func__ is a string, so it must be printed with %s. */
#define BTL_ERROR(fmt, args...) { \
opal_output(0, "[%s:%d:%s] " fmt, __FILE__, __LINE__, __func__, ##args); \
}
#endif
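A quick usage sketch (an illustrative call site, not part of this commit): the macro tags every message with its origin, so callers pass only the event-specific format string and arguments, as the openib hunks below do.

/* BTL_ERROR prepends [file:line:function] via opal_output. */
if (0 == num_devs) {
    BTL_ERROR("No hca's found on this host!");
}
BTL_ERROR("error polling CQ with %d errno says %s", ne, strerror(errno));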

View File

@@ -828,5 +828,6 @@ int mca_btl_mvapi_module_init(mca_btl_mvapi_module_t *mvapi_btl)
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}

View File

@@ -157,11 +157,6 @@ struct mca_btl_mvapi_module_t {
mca_mpool_base_module_t* ib_pool; /**< ib memory pool */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
VAPI_rr_desc_t* rr_desc_post;
/**< an array to allow posting of rr in one swoop */

View File

@@ -487,13 +487,19 @@ int mca_btl_mvapi_component_progress()
{
uint32_t i;
int count = 0;
mca_btl_mvapi_frag_t* frag;
mca_btl_mvapi_endpoint_t* endpoint;
/* Poll for completions */
for(i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) {
VAPI_ret_t ret;
VAPI_wc_desc_t comp;
mca_btl_mvapi_module_t* mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i];
/* we have two completion queues, one for "high" priority and one for "low".
* we will check the high priority and process them until there are none left.
* note that low priority messages are only processed one per progress call.
*/
do{
ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_high, &comp);
if(VAPI_OK == ret) {
@@ -504,7 +510,7 @@ int mca_btl_mvapi_component_progress()
return OMPI_ERROR;
}
/* Handle n/w completions */
/* Handle work completions */
switch(comp.opcode) {
case VAPI_CQE_RQ_RDMA_WITH_IMM:
if(comp.imm_data_valid){
@@ -515,7 +521,7 @@ int mca_btl_mvapi_component_progress()
case VAPI_CQE_SQ_RDMA_WRITE:
case VAPI_CQE_SQ_SEND_DATA :
/* Process a completed send */
/* Process a completed send or an rdma write */
frag = (mca_btl_mvapi_frag_t*) comp.id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&mvapi_btl->super, frag->endpoint, &frag->base, frag->rc);
@@ -524,15 +530,18 @@ int mca_btl_mvapi_component_progress()
case VAPI_CQE_RQ_SEND_DATA:
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
/* Process a RECV */
DEBUG_OUT("Got an recv completion" );
frag = (mca_btl_mvapi_frag_t*) comp.id;
endpoint = (mca_btl_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/
mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_high, -1);
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0);
@@ -540,7 +549,7 @@ int mca_btl_mvapi_component_progress()
break;
default:
opal_output(0, "Errorneous network completion");
opal_output(0, "Unhandled work completion opcode is %d", comp.opcode);
break;
}
}
@@ -570,15 +579,17 @@ int mca_btl_mvapi_component_progress()
case VAPI_CQE_RQ_SEND_DATA:
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_mvapi_frag_t*) comp.id;
endpoint = (mca_btl_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/
mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_max), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&mvapi_btl->rr_posted_low, -1);
OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1);
mca_btl_mvapi_endpoint_post_rr(((mca_btl_mvapi_frag_t*)comp.id)->endpoint, 0);
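The comment at the top of this hunk states the polling policy: drain the high-priority CQ completely, but service at most one low-priority completion per progress call. A minimal sketch of that control flow, using hypothetical stand-ins (poll_cq, handle_completion) rather than the real VAPI calls:

typedef struct { int opcode; } completion_t;       /* stand-in for VAPI_wc_desc_t */
extern int poll_cq(void *cq, completion_t *comp);  /* hypothetical: 0 == got one */
extern int handle_completion(completion_t *comp);  /* hypothetical */

static int progress_one_btl(void *cq_high, void *cq_low)
{
    completion_t comp;
    int count = 0;
    /* Drain the high-priority CQ until it is empty... */
    while (0 == poll_cq(cq_high, &comp)) {
        count += handle_completion(&comp);
    }
    /* ...but take at most one low-priority completion per call, so bulk
     * transfers cannot starve latency-sensitive traffic. */
    if (0 == poll_cq(cq_low, &comp)) {
        count += handle_completion(&comp);
    }
    return count;
}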

View File

@@ -114,6 +114,10 @@ static void mca_btl_mvapi_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
endpoint->rr_posted_high = 0;
endpoint->rr_posted_low = 0;
}
/*

View File

@@ -109,6 +109,10 @@ struct mca_btl_base_endpoint_t {
VAPI_qp_prop_t lcl_qp_prop_low;
/* Low priority local QP properties */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@@ -161,14 +165,14 @@ static inline int mca_btl_mvapi_endpoint_post_rr_sub(int cnt,
static inline int mca_btl_mvapi_endpoint_post_rr( mca_btl_mvapi_endpoint_t * endpoint, int additional){
mca_btl_mvapi_module_t * mvapi_btl = endpoint->endpoint_btl;
int rc;
OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);
OPAL_THREAD_LOCK(&endpoint->ib_lock);
if(mvapi_btl->rr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){
if(endpoint->rr_posted_high <= mca_btl_mvapi_component.ib_rr_buf_min+additional && endpoint->rr_posted_high < mca_btl_mvapi_component.ib_rr_buf_max){
rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - mvapi_btl->rr_posted_high,
rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - endpoint->rr_posted_high,
endpoint,
&mvapi_btl->recv_free_eager,
&mvapi_btl->rr_posted_high,
&endpoint->rr_posted_high,
mvapi_btl->nic,
endpoint->lcl_qp_hndl_high
);
@@ -177,12 +181,12 @@ static inline int mca_btl_mvapi_endpoint_post_rr( mca_btl_mvapi_endpoint_t * end
return rc;
}
}
if(mvapi_btl->rr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+additional && mvapi_btl->rr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){
if(endpoint->rr_posted_low <= mca_btl_mvapi_component.ib_rr_buf_min+additional && endpoint->rr_posted_low < mca_btl_mvapi_component.ib_rr_buf_max){
rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - mvapi_btl->rr_posted_low,
rc = mca_btl_mvapi_endpoint_post_rr_sub(mca_btl_mvapi_component.ib_rr_buf_max - endpoint->rr_posted_low,
endpoint,
&mvapi_btl->recv_free_max,
&mvapi_btl->rr_posted_low,
&endpoint->rr_posted_low,
mvapi_btl->nic,
endpoint->lcl_qp_hndl_low
);
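This hunk carries the headline fix from the commit message: the posted-receive counters move from the BTL module, where one tally was shared by every peer, to the endpoint. With a shared tally, receives posted toward the first peers' QPs satisfied the threshold for everyone, so beyond two processes some QPs were apparently never replenished. A simplified sketch of the per-endpoint check (constant and helper names are illustrative):

#include <stdint.h>

#define RR_BUF_MIN 8                               /* assumed ib_rr_buf_min */
#define RR_BUF_MAX 16                              /* assumed ib_rr_buf_max */

typedef struct { int32_t rr_posted; } endpoint_t;  /* stand-in */
extern int post_rr_sub(int cnt, endpoint_t *ep);   /* hypothetical */

static inline int post_rr(endpoint_t *ep, int additional)
{
    /* Each endpoint tracks its own posted receives: once it drains to the
     * low-water mark, top it back up to the high-water mark. */
    if (ep->rr_posted <= RR_BUF_MIN + additional &&
        ep->rr_posted < RR_BUF_MAX) {
        return post_rr_sub(RR_BUF_MAX - ep->rr_posted, ep);
    }
    return 0;
}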

View File

@@ -21,7 +21,7 @@
#include "opal/util/if.h"
#include "mca/pml/pml.h"
#include "mca/btl/btl.h"
#include "mca/btl/base/btl_base_error.h"
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_proc.h"
@@ -127,6 +127,10 @@ int mca_btl_openib_del_procs(struct mca_btl_base_module_t* btl,
return OMPI_SUCCESS;
}
/*
* Register callback function to support send/recv semantics
*/
int mca_btl_openib_register(
struct mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
@@ -179,7 +183,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
}
/**
*
* Return a segment
*
*/
int mca_btl_openib_free(
@@ -189,16 +193,16 @@ int mca_btl_openib_free(
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des;
if(frag->size == 0) {
MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag);
OBJ_RELEASE(frag->openib_reg);
MCA_BTL_IB_FRAG_RETURN_FRAG(btl, frag);
}
else if(frag->size == mca_btl_openib_component.max_send_size){
MCA_BTL_IB_FRAG_RETURN_MAX(btl, frag);
} else if(frag->size == mca_btl_openib_component.eager_limit){
MCA_BTL_IB_FRAG_RETURN_EAGER(btl, frag);
} else {
BTL_ERROR("invalid descriptor");
}
return OMPI_SUCCESS;
@@ -265,13 +269,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
rc = mca_mpool_base_remove((void*) openib_reg->base_reg.base);
if(OMPI_SUCCESS != rc) {
opal_output(0, "%s:%d:%s error removing memory region from memory pool tree", __FILE__, __LINE__, __func__);
BTL_ERROR("error removing memory region from memory pool tree");
return NULL;
}
if(is_leave_pinned) {
if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)){
opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__);
BTL_ERROR("error removing item from reg_mru_list");
return NULL;
}
}
@@ -290,7 +294,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
if(rc != OMPI_SUCCESS) {
opal_output(0,"%s:%d:%s error inserting memory region into memory pool tree", __FILE__, __LINE__, __func__);
BTL_ERROR("error inserting memory region into memory pool tree");
return NULL;
}
@@ -302,7 +306,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
}
else if(is_leave_pinned) {
if(NULL == opal_list_remove_item(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg)) {
opal_output(0,"%s:%d:%s error removing item from reg_mru_list", __FILE__, __LINE__, __func__);
BTL_ERROR("error removing item from reg_mru_list");
return NULL;
}
opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg);
@@ -399,13 +403,16 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uintptr_t) iov.iov_base;
frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey;
frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->rkey;
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->openib_reg = openib_reg;
DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu", frag->sg_entry.lkey, frag->sg_entry.addr);
return &frag->base;
} else if (max_data+reserve <= btl->btl_eager_limit) {
@@ -416,7 +423,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
}
iov.iov_len = max_data;
iov.iov_base = frag->segment.seg_addr.pval + reserve;
iov.iov_base = frag->segment.seg_addr.lval + reserve;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after);
*size = max_data;
@@ -507,8 +514,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
frag->base.des_flags = 0;
if(NULL!= openib_reg){
reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)frag->segment.seg_addr.pval + 1;
bool is_leave_pinned = openib_reg->base_reg.is_leave_pinned;
reg_len = (unsigned char*)openib_reg->base_reg.bound - (unsigned char*)frag->segment.seg_addr.pval + 1;
if(frag->segment.seg_len > reg_len ) {
size_t new_len = openib_reg->base_reg.bound - openib_reg->base_reg.base + 1
@@ -560,6 +568,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
}
opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg);
}
OBJ_RETAIN(openib_reg);
} else {
if(mca_btl_openib_component.leave_pinned) {
@@ -601,7 +610,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
opal_output(0,"%s:%d:%s error inserting memory region into memory pool", __FILE__, __LINE__, __func__);
return NULL;
}
OBJ_RETAIN(openib_reg);
opal_list_append(&openib_btl->reg_mru_list, (opal_list_item_t*) openib_reg);
@@ -622,14 +631,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (uintptr_t) frag->segment.seg_addr.pval;
frag->segment.seg_key.key32[0] = (uint32_t) frag->mr->lkey;
frag->segment.seg_key.key32[0] = frag->mr->rkey;
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->openib_reg = openib_reg;
OBJ_RETAIN(openib_reg);
DEBUG_OUT("frag->sg_entry.lkey = %lu .addr = %llu frag->segment.seg_key.key32[0] = %lu" , frag->sg_entry.lkey, frag->sg_entry.addr, frag->segment.seg_key.key32[0]);
return &frag->base;
}
@@ -712,10 +722,17 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;
frag->endpoint = endpoint;
frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
frag->sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_src->seg_addr.pval;
frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->sr_desc.wr.rdma.remote_addr = (uintptr_t) frag->base.des_dst->seg_addr.pval;
frag->sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0];
frag->sg_entry.addr = (uintptr_t) frag->base.des_src->seg_addr.pval;
frag->sg_entry.length = frag->base.des_src->seg_len;
DEBUG_OUT("frag->sr_desc.wr.rdma.remote_addr = %llu .rkey = %lu frag->sg_entry.addr = %llu .length = %lu"
, frag->sr_desc.wr.rdma.remote_addr
, frag->sr_desc.wr.rdma.rkey
, frag->sg_entry.addr
, frag->sg_entry.length);
if(ibv_post_send(endpoint->lcl_qp_low,
&frag->sr_desc,
@@ -785,7 +802,7 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
/* Allocate Protection Domain */
struct ibv_context *ctx;
openib_btl->poll_cq = false;
ctx = openib_btl->ib_dev_context;
openib_btl->ib_pd = ibv_alloc_pd(ctx);
@@ -821,5 +838,7 @@ int mca_btl_openib_module_init(mca_btl_openib_module_t *openib_btl)
/* TODO: EVAPI_set_qsync_event_handler? */
return OMPI_SUCCESS;
}

View File

@@ -155,14 +155,10 @@ struct mca_btl_openib_module_t {
mca_mpool_base_module_t* ib_pool; /**< ib memory pool */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
/**< an array to allow posting of rr in one swoop */
size_t ib_inline_max; /**< max size of inline send*/
bool poll_cq;

View File

@@ -31,6 +31,7 @@
#include "btl_openib_frag.h"
#include "btl_openib_endpoint.h"
#include "mca/btl/base/base.h"
#include "mca/btl/base/btl_base_error.h"
#include "datatype/convertor.h"
@@ -253,12 +254,15 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
mca_btl_openib_module_t * openib_btl;
mca_btl_base_selected_module_t* ib_selected;
opal_list_item_t* item;
struct dlist *dev_list;
struct ibv_device* ib_dev;
/* initialization */
*num_btl_modules = 0;
num_devs = 0;
struct dlist *dev_list;
struct ibv_device* ib_dev;
/* Determine the number of hca's available on the host */
dev_list = ibv_get_devices();
@@ -268,7 +272,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
num_devs++;
if(0 == num_devs) {
opal_output(0, "No hca's found on this host! \n");
BTL_ERROR("No hca's found on this host!");
return NULL;
}
@@ -297,51 +301,18 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
for(i = 0; i < num_devs; i++){
struct ibv_device_attr ib_dev_attr;
struct ibv_context* ib_dev_context;
struct ibv_pd *my_pd;
struct ibv_mr *mr;
void* my_addr;
uint32_t my_size;
uint32_t my_indx;
uint32_t my_mult;
my_mult = 4096;
struct ibv_context* ib_dev_context;
ib_dev = ib_devs[i];
ib_dev_context = ibv_open_device(ib_dev);
if(!ib_dev_context) {
opal_output(0, "%s: error obtaining device context for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno));
BTL_ERROR(" error obtaining device context for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno));
return NULL;
}
my_pd = ibv_alloc_pd(ib_dev_context);
for(my_indx = 1; my_indx <= 8192; my_indx++){
my_size = my_mult * my_indx;
my_addr = memalign(4096, my_size);
memset(my_addr, 0, my_size);
mr = ibv_reg_mr(
my_pd,
my_addr,
my_size,
IBV_ACCESS_REMOTE_WRITE
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
);
if(NULL == mr){
opal_output(0, "%s: error on mr test! can't register %lu bytes, errno says %s \n", __func__, my_size, strerror(errno));
break;
}
else {
opal_output(0, "%s: successfully registerted %lu bytes", __func__, my_size);
ibv_dereg_mr(mr);
}
}
if(ibv_query_device(ib_dev_context, &ib_dev_attr)){
opal_output(0, "%s: error obtaining device attributes for %s errno says %s\n", __func__, ibv_get_device_name(ib_dev), strerror(errno));
BTL_ERROR("error obtaining device attributes for %s errno says %s\n", ibv_get_device_name(ib_dev), strerror(errno));
return NULL;
}
@@ -352,8 +323,8 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
struct ibv_port_attr* ib_port_attr;
ib_port_attr = (struct ibv_port_attr*) malloc(sizeof(struct ibv_port_attr));
if(ibv_query_port(ib_dev_context, (uint8_t) j, ib_port_attr)){
opal_output(0, "%s: error getting port attributes for device %s port number %d errno says %s",
__func__, ibv_get_device_name(ib_dev), j, strerror(errno));
BTL_ERROR("error getting port attributes for device %s port number %d errno says %s",
ibv_get_device_name(ib_dev), j, strerror(errno));
return NULL;
}
@@ -438,7 +409,7 @@ mca_btl_base_module_t** mca_btl_openib_component_init(int *num_btl_modules,
&mpool_resources);
if(NULL == openib_btl->ib_pool) {
opal_output(0, "%s: error creating vapi memory pool! aborting ib btl initialization", __func__);
BTL_ERROR("error creating vapi memory pool! aborting ib btl initialization");
return NULL;
}
@@ -531,68 +502,80 @@ int mca_btl_openib_component_progress()
uint32_t i, ne;
int count = 0;
mca_btl_openib_frag_t* frag;
mca_btl_openib_endpoint_t* endpoint;
/* Poll for completions */
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
struct ibv_wc wc;
memset(&wc, 0, sizeof(struct ibv_wc));
mca_btl_openib_module_t* openib_btl = &mca_btl_openib_component.openib_btls[i];
/* we have two completion queues, one for "high" priority and one for "low".
* we will check the high priority and process them until there are none left.
* note that low priority messages are only processed one per progress call.
*/
do{
ne=ibv_poll_cq(openib_btl->ib_cq_high, 1, &wc );
if(ne < 0 ){
opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno));
BTL_ERROR("error polling CQ with %d errno says %s\n", ne, strerror(errno));
return OMPI_ERROR;
}
else if(wc.status != IBV_WC_SUCCESS) {
opal_output(0, "%s: error polling CQ with status %d for wr_id %d\n",
__func__,
wc.status, wc.wr_id);
BTL_ERROR("error polling CQ with status %d for wr_id %d\n",
wc.status, wc.wr_id);
return OMPI_ERROR;
}
else if(1 == ne) {
/* Handle n/w completions */
else if(1 == ne) {
DEBUG_OUT("completion queue event says opcode is %d\n", wc.opcode);
/* Handle work completions */
switch(wc.opcode) {
case IBV_WC_RECV_RDMA_WITH_IMM:
opal_output(0, "Got an RDMA with Immediate data Not supported!\n");
BTL_ERROR("Got an RDMA with Immediate data Not supported!");
return OMPI_ERROR;
case IBV_WC_RECV:
/* Process a RECV */
DEBUG_OUT("Got an recv on the completion queue");
frag = (mca_btl_openib_frag_t*) wc.wr_id;
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
OPAL_THREAD_ADD32(&endpoint->rr_posted_high, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
/* advance the segment address past the header and subtract from the length..*/
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
count++;
break;
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
if(wc.opcode & IBV_WC_RECV){
/* process a recv completion (this should only occur for a send not an rdma) */
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&openib_btl->rr_posted_high, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
count++;
}
else {
/* Process a completed send */
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
}
/* Process a completed send or rdma write*/
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
break;
break;
default:
opal_output(0, "Errorneous network completion");
BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode);
break;
}
}
@@ -601,12 +584,11 @@ int mca_btl_openib_component_progress()
ne=ibv_poll_cq(openib_btl->ib_cq_low, 1, &wc );
if(ne < 0){
opal_output(0, "%s: error polling CQ with %d errno says %s\n", __func__, ne, strerror(errno));
BTL_ERROR("error polling CQ with %d errno says %s", ne, strerror(errno));
return OMPI_ERROR;
}
else if(wc.status != IBV_WC_SUCCESS) {
opal_output(0, "%s: error polling CQ with status %d for wr_id %d\n",
__func__,
BTL_ERROR("error polling CQ with status %d for wr_id %d",
wc.status, wc.wr_id);
return OMPI_ERROR;
}
@@ -614,46 +596,46 @@ int mca_btl_openib_component_progress()
/* Handle n/w completions */
switch(wc.opcode) {
case IBV_WC_RECV_RDMA_WITH_IMM:
opal_output(0, "Got an RDMA with Immediate data Not supported!\n");
BTL_ERROR("Got an RDMA with Immediate data Not supported!");
return OMPI_ERROR;
case IBV_WC_RECV:
/* process a recv completion (this should only occur for a send not an rdma) */
DEBUG_OUT( "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_openib_frag_t*) wc.wr_id;
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
frag->rc=OMPI_SUCCESS;
/* advance the segment address past the header and subtract from the length..*/
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
OPAL_THREAD_ADD32(&endpoint->rr_posted_low, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_max), (opal_list_item_t*) frag);
count++;
break;
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND :
if(wc.opcode & IBV_WC_RECV){
/* process a recv completion (this should only occur for a send not an rdma) */
DEBUG_OUT(0, "%s:%d ib recv under redesign\n", __FILE__, __LINE__);
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc=OMPI_SUCCESS;
frag->segment.seg_len =
wc.byte_len-
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* advance the segment address past the header and subtract from the length..*/
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag,
&frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(openib_btl->recv_free_eager), (opal_list_item_t*) frag);
OPAL_THREAD_ADD32(&openib_btl->rr_posted_high, -1);
mca_btl_openib_endpoint_post_rr(((mca_btl_openib_frag_t*)wc.wr_id)->endpoint, 0);
count++;
}
else {
/* Process a completed send */
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
}
/* Process a completed send */
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;
frag->base.des_cbfunc(&openib_btl->super, frag->endpoint, &frag->base, frag->rc);
count++;
break;
default:
opal_output(0, "Errorneous network completion");
BTL_ERROR("Unhandled work completion opcode is %d", wc.opcode);
break;
}
}

View File

@@ -74,14 +74,17 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
}
frag->sr_desc.opcode = IBV_WR_SEND;
frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->sg_entry.length =
frag->segment.seg_len +
((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
frag->sg_entry.length = frag->segment.seg_len + ((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr); /* sizeof(mca_btl_openib_header_t); */
if(frag->sg_entry.length <= openib_btl->ib_inline_max) {
/* frag->sr_desc.send_flags |= IBV_SEND_INLINE; */
}
/* TODO: should check if we can inline send,, but can't find
* inline send defined in openib verbs api.
* if(frag->sg_entry.len <= openib_btl->ib_inline_max) {
*/
if(ibv_post_send(ib_qp,
&frag->sr_desc,
&bad_wr)) {
@@ -90,7 +93,7 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
}
mca_btl_openib_endpoint_post_rr(endpoint, 1);
return OMPI_ERROR;
return OMPI_SUCCESS;
}
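On the inline-send TODO above: the flag the commented-out line anticipates, IBV_SEND_INLINE, does exist in later libibverbs releases. Assuming ib_inline_max holds cap.max_inline_data from QP creation (which the create_qp hunk further down records), the check would look roughly like this sketch:

/* Payloads up to ib_inline_max can be copied into the WQE itself,
 * sparing the HCA a DMA read of the send buffer. */
frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
if (frag->sg_entry.length <= openib_btl->ib_inline_max) {
    frag->sr_desc.send_flags |= IBV_SEND_INLINE;
}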
@@ -114,6 +117,14 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
OBJ_CONSTRUCT(&endpoint->endpoint_send_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->endpoint_recv_lock, opal_mutex_t);
OBJ_CONSTRUCT(&endpoint->pending_send_frags, opal_list_t);
endpoint->lcl_qp_attr_high = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr));
endpoint->lcl_qp_attr_low = (struct ibv_qp_attr *) malloc(sizeof(struct ibv_qp_attr));
memset(endpoint->lcl_qp_attr_high, 0, sizeof(struct ibv_qp_attr));
memset(endpoint->lcl_qp_attr_low, 0, sizeof(struct ibv_qp_attr));
endpoint->rr_posted_high = 0;
endpoint->rr_posted_low = 0;
}
/*
@@ -190,9 +201,9 @@ static int mca_btl_openib_endpoint_send_connect_req(mca_btl_base_endpoint_t* end
DEBUG_OUT("Sending High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_prop_high.qp_num,
endpoint->lcl_qp_prop_low.qp_num,
endpoint->endpoint_btl->port.lid);
endpoint->lcl_qp_high->qp_num,
endpoint->lcl_qp_low->qp_num,
endpoint->endpoint_btl->ib_port_attr->lid);
if(rc < 0) {
ORTE_ERROR_LOG(rc);
@@ -318,6 +329,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
return rc;
}
srand48(getpid() * time(NULL));
endpoint->lcl_psn_high = lrand48() & 0xffffff;
/* Create the Low Priority Queue Pair */
@@ -334,7 +346,7 @@ static int mca_btl_openib_endpoint_start_connect(mca_btl_base_endpoint_t* endpoi
DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_high->qp_num,
endpoint->lcl_qp_low.qp_num,
endpoint->lcl_qp_low->qp_num,
openib_btl->ib_port_attr->lid);
/* Send connection info over to remote endpoint */
@@ -367,6 +379,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
ORTE_NAME_ARGS(orte_process_info.my_name), __FILE__,__LINE__,rc);
return rc;
}
srand48(getpid() * time(NULL));
endpoint->lcl_psn_high = lrand48() & 0xffffff;
/* Create the Low Priority Queue Pair */
@@ -383,7 +396,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
DEBUG_OUT("Initialized High Priority QP num = %d, Low Priority QP num = %d, LID = %d",
endpoint->lcl_qp_high->qp_num,
endpoint->lcl_qp_low.qp_num,
endpoint->lcl_qp_low->qp_num,
openib_btl->ib_port_attr->lid);
@@ -415,6 +428,7 @@ static int mca_btl_openib_endpoint_reply_start_connect(mca_btl_openib_endpoint_t
static void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
endpoint->endpoint_btl->poll_cq = true;
mca_btl_openib_progress_send_frags(endpoint);
}
@@ -491,12 +505,13 @@ static void mca_btl_openib_endpoint_recv(
break;
case MCA_BTL_IB_CONNECT_ACK:
DEBUG_OUT("Got a connect ack from %d\n", endpoint->vpid);
mca_btl_openib_endpoint_connected(ib_endpoint);
break;
case MCA_BTL_IB_CONNECTED :
break;
default :
opal_output(0, "Connected -> Connecting not possible.\n");
@@ -581,9 +596,9 @@ int mca_btl_openib_endpoint_send(
DEBUG_OUT("Send to : %d, len : %d, frag : %p",
endpoint->endpoint_proc->proc_guid.vpid,
frag->ib_buf.desc.sg_entry.len,
frag);
endpoint->endpoint_proc->proc_guid.vpid,
frag->sg_entry.length,
frag);
rc = mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag);
@@ -686,23 +701,27 @@ int mca_btl_openib_endpoint_create_qp(
)
{
struct ibv_qp* my_qp;
struct ibv_qp_init_attr qp_init_attr;
memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
qp_init_attr.send_cq = cq;
qp_init_attr.recv_cq = cq;
qp_init_attr.cap.max_send_wr = mca_btl_openib_component.ib_wq_size;
qp_init_attr.cap.max_recv_wr = mca_btl_openib_component.ib_wq_size;
qp_init_attr.cap.max_send_sge = mca_btl_openib_component.ib_sg_list_size;
qp_init_attr.cap.max_recv_sge = mca_btl_openib_component.ib_sg_list_size;
qp_init_attr.qp_type = IBV_QPT_RC;
(*qp) = ibv_create_qp(pd, &qp_init_attr);
my_qp = ibv_create_qp(pd, &qp_init_attr);
if(NULL == (*qp)) {
if(NULL == my_qp) {
opal_output(0, "%s: error creating qp errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR;
}
(*qp) = my_qp;
openib_btl->ib_inline_max = qp_init_attr.cap.max_inline_data;
}
@@ -711,7 +730,7 @@ int mca_btl_openib_endpoint_create_qp(
qp_attr->qp_state = IBV_QPS_INIT;
qp_attr->pkey_index = mca_btl_openib_component.ib_pkey_ix;
qp_attr->port_num = openib_btl->port_num;
qp_attr->qp_access_flags = 0;
qp_attr->qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
if(ibv_modify_qp((*qp), qp_attr,
IBV_QP_STATE |

View File

@@ -1,3 +1,4 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
@@ -90,7 +91,7 @@ struct mca_btl_base_endpoint_t {
/**< lock for concurrent access to endpoint state */
opal_list_t pending_send_frags;
/**< list of pending send frags for this endpoint */
/**< list of pending send frags for this endpotint */
uint32_t rem_qp_num_high;
uint32_t rem_qp_num_low;
@@ -115,7 +116,11 @@ struct mca_btl_base_endpoint_t {
struct ibv_qp_attr* lcl_qp_attr_high;
struct ibv_qp_attr* lcl_qp_attr_low;
/* Local QP attributes (Low and High) */
uint32_t rr_posted_high; /**< number of high priority rr posted to the nic*/
uint32_t rr_posted_low; /**< number of low priority rr posted to the nic*/
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@@ -160,16 +165,15 @@ static inline int mca_btl_openib_endpoint_post_rr_sub(int cnt,
}
for(i=0; i< cnt; i++){
if(ibv_post_recv(qp,
&rr_desc_post[i],
&bad_wr)) {
opal_output(0, "%s: error posting receive errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR;
}
}
return OMPI_SUCCESS;
}
OPAL_THREAD_ADD32(rr_posted, cnt);
return OMPI_SUCCESS;
}
@@ -179,12 +183,12 @@ static inline int mca_btl_openib_endpoint_post_rr( mca_btl_openib_endpoint_t * e
int rc;
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
if(openib_btl->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){
if(endpoint->rr_posted_high <= mca_btl_openib_component.ib_rr_buf_min+additional && endpoint->rr_posted_high < mca_btl_openib_component.ib_rr_buf_max){
rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_high,
rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - endpoint->rr_posted_high,
endpoint,
&openib_btl->recv_free_eager,
&openib_btl->rr_posted_high,
&endpoint->rr_posted_high,
endpoint->lcl_qp_high
);
if(rc != OMPI_SUCCESS){
@@ -192,12 +196,12 @@ static inline int mca_btl_openib_endpoint_post_rr( mca_btl_openib_endpoint_t * e
return rc;
}
}
if(openib_btl->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && openib_btl->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){
if(endpoint->rr_posted_low <= mca_btl_openib_component.ib_rr_buf_min+additional && endpoint->rr_posted_low < mca_btl_openib_component.ib_rr_buf_max){
rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - openib_btl->rr_posted_low,
rc = mca_btl_openib_endpoint_post_rr_sub(mca_btl_openib_component.ib_rr_buf_max - endpoint->rr_posted_low,
endpoint,
&openib_btl->recv_free_max,
&openib_btl->rr_posted_low,
&endpoint->rr_posted_low,
endpoint->lcl_qp_low
);
if(rc != OMPI_SUCCESS) {

View File

@@ -45,11 +45,12 @@ static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* f
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->sr_desc.wr_id = frag;
frag->sr_desc.wr_id = (uint64_t) frag;
frag->sr_desc.sg_list = &frag->sg_entry;
frag->sr_desc.num_sge = 1;
frag->sr_desc.opcode = IBV_WR_SEND;
frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->sr_desc.next = NULL;
}
static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag)
@@ -60,10 +61,11 @@ static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* f
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->rr_desc.wr_id = frag;
frag->rr_desc.wr_id = (uint64_t) frag;
frag->rr_desc.sg_list = &frag->sg_entry;
frag->rr_desc.num_sge = 1;
frag->rr_desc.next = NULL;
}
static void mca_btl_openib_send_frag_eager_constructor(mca_btl_openib_frag_t* frag)
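The (uint64_t) casts above stash the frag pointer in the work request's 64-bit wr_id field, and the new next = NULL properly terminates the single-entry WR list. The progress loop recovers the pointer from the completion, as the component_progress hunks do:

/* wc.wr_id round-trips the frag pointer that was posted with the WR. */
frag = (mca_btl_openib_frag_t*) wc.wr_id;
frag->rc = OMPI_SUCCESS;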

View File

@@ -30,11 +30,11 @@ extern "C" {
static inline void * DOWN_ALIGN_ADDR(void * addr, uint32_t cnt) {
return (void*)((uintptr_t)(addr) & (~((uintptr_t)0) << (cnt)));
return (void*)((uintptr_t)((unsigned char*) addr) & (~((uintptr_t)0) << (cnt)));
}
static inline void* ALIGN_ADDR(void* addr, uint32_t cnt ) {
DOWN_ALIGN_ADDR(((addr) + ~(~((uintptr_t)0) << (cnt))), (cnt));
DOWN_ALIGN_ADDR((((unsigned char*) addr) + ~(~((uintptr_t)0) << (cnt))), (cnt));
return addr;
}
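The change above only adds unsigned char* casts so the arithmetic is not performed on void* (a GCC extension). The mask logic is unchanged: shifting ~0 left by cnt clears the low cnt bits, rounding an address down to a 2^cnt boundary, and adding 2^cnt - 1 first rounds up. A worked example with cnt = 12 (4 KiB pages):

void *p = (void*) 0x12345abc;
DOWN_ALIGN_ADDR(p, 12);  /* 0x12345abc & 0x...fffff000 == 0x12345000 */
ALIGN_ADDR(p, 12);       /* (0x12345abc + 0xfff) & mask == 0x12346000; note that
                          * as written the rounded value is discarded and addr
                          * itself is returned unchanged */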

View File

@@ -70,13 +70,6 @@ static void mca_mpool_openib_registration_constructor( mca_mpool_openib_registra
static void mca_mpool_openib_registration_destructor( mca_mpool_openib_registration_t * registration )
{
mca_mpool_base_remove((void*) registration);
registration->base_reg.mpool->mpool_deregister(
registration->base_reg.mpool,
registration->base_reg.base,
0,
(mca_mpool_base_registration_t*) registration);
registration->base_reg.base = NULL;
registration->base_reg.bound = NULL;
registration->base_reg.is_leave_pinned=false;

View File

@@ -57,6 +57,7 @@ void* mca_mpool_openib_alloc(
free(addr_malloc);
return NULL;
}
(*registration)->alloc_base = addr_malloc;
return addr;
}
@@ -80,7 +81,7 @@ int mca_mpool_openib_register(mca_mpool_base_module_t* mpool,
mpool_module->resources.ib_pd,
addr,
size,
IBV_ACCESS_REMOTE_WRITE
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE
/* IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE */
);
@@ -106,7 +107,7 @@ int mca_mpool_openib_deregister(mca_mpool_base_module_t* mpool, void *addr, size
mca_mpool_openib_module_t * mpool_openib = (mca_mpool_openib_module_t*) mpool;
mca_mpool_openib_registration_t * openib_reg;
openib_reg = (mca_mpool_openib_registration_t*) registration;
if(! ibv_dereg_mr(openib_reg->mr)){
if(ibv_dereg_mr(openib_reg->mr)){
opal_output(0, "%s: error unpinning openib memory errno says %s\n", __func__, strerror(errno));
return OMPI_ERROR;
}
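The flipped test in this last hunk matches the libibverbs convention: allocation-style calls return a pointer that is NULL on failure, while action calls such as ibv_dereg_mr return 0 on success and non-zero on error, so the old negated check reported failure on success. A sketch of the paired register/deregister pattern (pd, addr, and size assumed in scope):

struct ibv_mr *mr = ibv_reg_mr(pd, addr, size,
                               IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
if (NULL == mr) {
    return OMPI_ERROR;              /* registration failed */
}
if (ibv_dereg_mr(mr)) {
    return OMPI_ERROR;              /* non-zero return: deregistration failed */
}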