1
1

Use the OMPI object system to make the fragment hierarchy more object-oriented. The

main idea (apart from cleanup) is to avoid initialising unneeded fields
and to use the C type-checking system to catch obvious errors.

This commit was SVN r16779.
Этот коммит содержится в:
Gleb Natapov 2007-11-28 07:11:14 +00:00
родитель 267cd2342a
Коммит 6a2d210b7d
8 изменённых файлов: 559 добавлений и 510 удалений

Просмотреть файл

@ -466,7 +466,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
uint8_t order,
size_t size)
{
mca_btl_openib_frag_t* frag = NULL;
mca_btl_openib_com_frag_t* frag = NULL;
mca_btl_openib_module_t* openib_btl;
int rc;
openib_btl = (mca_btl_openib_module_t*) btl;
@ -475,13 +475,12 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
if(NULL == frag)
return NULL;
/* GMS is this necessary anymore ? */
frag->segment.seg_len = size;
frag->base.order = order;
frag->base.des_flags = 0;
/* not all upper layer users set this */
to_base_frag(frag)->segment.seg_len = size;
to_base_frag(frag)->base.order = order;
assert(frag->qp_idx <= order);
return (mca_btl_base_descriptor_t*)frag;
assert(to_send_frag(frag)->qp_idx <= order);
return &to_base_frag(frag)->base;
}
/**
@ -494,19 +493,32 @@ int mca_btl_openib_free(
struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des)
{
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des;
/* is this fragment pointing at user memory? */
if(((MCA_BTL_OPENIB_FRAG_SEND_USER == frag->type) ||
(MCA_BTL_OPENIB_FRAG_RECV_USER == frag->type))
&& frag->registration != NULL) {
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
(mca_mpool_base_registration_t*)
frag->registration);
frag->registration = NULL;
if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) ||
MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) {
mca_btl_openib_com_frag_t* frag = to_com_frag(des);
if(frag->registration != NULL) {
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
(mca_mpool_base_registration_t*)frag->registration);
frag->registration = NULL;
}
}
MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*) btl), frag);
/* reset those field on free so we will not have to do it on alloc */
to_base_frag(des)->base.des_flags = 0;
if(MCA_BTL_OPENIB_FRAG_RECV == openib_frag_type(des) ||
MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) {
to_base_frag(des)->base.des_src = NULL;
to_base_frag(des)->base.des_src_cnt = 0;
} else if(MCA_BTL_OPENIB_FRAG_SEND == openib_frag_type(des) ||
MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) {
to_base_frag(des)->base.des_dst = NULL;
to_base_frag(des)->base.des_dst_cnt = 0;
if(MCA_BTL_OPENIB_FRAG_SEND == openib_frag_type(des))
to_com_frag(des)->sg_entry.addr = (uint64_t)to_send_frag(des)->hdr;
}
MCA_BTL_IB_FRAG_RETURN(des);
return OMPI_SUCCESS;
}
@ -546,8 +558,8 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
)
{
mca_btl_openib_module_t *openib_btl;
mca_btl_openib_frag_t *frag = NULL;
mca_btl_openib_reg_t *openib_reg;
mca_btl_openib_com_frag_t *frag = NULL;
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data = *size;
@ -574,38 +586,35 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
iov.iov_base, max_data, 0, &registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
MCA_BTL_IB_FRAG_RETURN(frag);
return NULL;
}
/* keep track of the registration we did */
frag->registration = (mca_btl_openib_reg_t*)registration;
to_com_frag(frag)->registration =
(mca_btl_openib_reg_t*)registration;
}
openib_reg = (mca_btl_openib_reg_t*)registration;
frag->base.order = order;
frag->base.des_flags = 0;
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
frag->sg_entry.length = max_data;
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (unsigned long)iov.iov_base;
frag->sg_entry.addr = (uint64_t)iov.iov_base;
frag->segment.seg_len = max_data;
frag->segment.seg_addr.pval = iov.iov_base;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
to_base_frag(frag)->base.order = order;
to_base_frag(frag)->segment.seg_len = max_data;
to_base_frag(frag)->segment.seg_addr.pval = iov.iov_base;
to_base_frag(frag)->segment.seg_key.key32[0] =
(uint32_t)frag->sg_entry.lkey;
assert(MCA_BTL_NO_ORDER == order);
BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
"frag->segment.seg_key.key32[0] = %lu",
frag->sg_entry.lkey, frag->sg_entry.addr,
frag->segment.seg_key.key32[0]));
frag->sg_entry.lkey));
return &frag->base;
return &to_base_frag(frag)->base;
}
}
@ -621,20 +630,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
return NULL;
iov.iov_len = max_data;
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
iov.iov_base = (unsigned char*)
to_base_frag(frag)->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
*size = max_data;
frag->segment.seg_len = max_data + reserve;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
/* frag->base.order = order; */
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
frag->base.order = order;
return &frag->base;
*size = max_data;
to_base_frag(frag)->segment.seg_len = max_data + reserve;
to_base_frag(frag)->base.order = order;
return &to_base_frag(frag)->base;
}
/**
@ -661,9 +665,10 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
size_t* size)
{
mca_btl_openib_module_t *openib_btl;
mca_btl_openib_frag_t *frag;
mca_btl_openib_com_frag_t *frag;
mca_btl_openib_reg_t *openib_reg;
int rc;
void *buffer;
openib_btl = (mca_btl_openib_module_t*)btl;
@ -672,16 +677,16 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
return NULL;
}
ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
ompi_convertor_get_current_pointer(convertor, &buffer);
if(NULL == registration){
/* we didn't get a memory registration passed in, so we have to
* register the region ourselves
*/
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
frag->segment.seg_addr.pval, *size, 0, &registration);
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, 0,
&registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
MCA_BTL_IB_FRAG_RETURN(frag);
return NULL;
}
/* keep track of the registration we did */
@ -691,24 +696,19 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
frag->sg_entry.length = *size;
frag->sg_entry.lkey = openib_reg->mr->lkey;
frag->sg_entry.addr = (unsigned long) frag->segment.seg_addr.pval;
frag->sg_entry.addr = (uint64_t)buffer;
frag->segment.seg_len = *size;
frag->segment.seg_key.key32[0] = openib_reg->mr->rkey;
frag->base.order = order;
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->base.des_flags = 0;
to_base_frag(frag)->segment.seg_addr.pval = buffer;
to_base_frag(frag)->segment.seg_len = *size;
to_base_frag(frag)->segment.seg_key.key32[0] = openib_reg->mr->rkey;
to_base_frag(frag)->base.order = order;
BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
"frag->segment.seg_key.key32[0] = %lu",
frag->sg_entry.lkey, frag->sg_entry.addr,
frag->segment.seg_key.key32[0]));
openib_reg->mr->rkey));
return &frag->base;
return &to_base_frag(frag)->base;
}
static int mca_btl_finalize_hca(struct mca_btl_openib_hca_t *hca)
@ -826,7 +826,7 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(BTL_OPENIB_QP_TYPE_SRQ(qp)){
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(openib_btl,
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags);
if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){
@ -895,13 +895,13 @@ int mca_btl_openib_send(
mca_btl_base_tag_t tag)
{
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)descriptor;
assert(frag->type == MCA_BTL_OPENIB_FRAG_SEND);
mca_btl_openib_send_frag_t* frag = to_send_frag(descriptor);
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND);
frag->endpoint = endpoint;
to_com_frag(frag)->endpoint = endpoint;
frag->hdr->tag = tag;
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
return mca_btl_openib_endpoint_send(endpoint, frag);
}
@ -913,52 +913,50 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor)
{
int rc = OMPI_SUCCESS;
struct ibv_send_wr* bad_wr;
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;
/* mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; */
int qp = frag->base.order;
mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor);
int qp = descriptor->order;
uint64_t rem_addr = descriptor->des_dst->seg_addr.lval;
uint32_t rkey = descriptor->des_dst->seg_key.key32[0];
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER ||
openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND);
if(MCA_BTL_NO_ORDER == qp)
qp = mca_btl_openib_component.rdma_qp;
/* setup for queued requests */
frag->endpoint = endpoint;
frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_WRITE;
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_put_frags, (opal_list_item_t *)frag);
opal_list_append(&endpoint->pending_put_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return rc;
/* post descriptor */
} else {
int ib_rc;
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
if ((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) !=
(ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
frag->wr_desc.sr_desc.wr.rdma.remote_addr = opal_swap_bytes8(frag->base.des_dst->seg_addr.lval);
frag->wr_desc.sr_desc.wr.rdma.rkey = opal_swap_bytes4(frag->base.des_dst->seg_key.key32[0]);
} else
#endif
{
frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_dst->seg_addr.lval;
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0];
}
frag->sg_entry.addr = (unsigned long) frag->base.des_src->seg_addr.pval;
frag->sg_entry.length = frag->base.des_src->seg_len;
frag->base.order = qp;
ib_rc = ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr);
if(ib_rc)
rc = OMPI_ERROR;
return OMPI_SUCCESS;
}
return rc;
/* post descriptor */
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
if((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN)
!= (ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
rem_addr = opal_swap_bytes8(rem_addr);
rkey = opal_swap_bytes4(rkey);
}
#endif
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
frag->sr_desc.wr.rdma.rkey = rkey;
to_com_frag(frag)->sg_entry.addr =
(uint64_t)descriptor->des_src->seg_addr.pval;
to_com_frag(frag)->sg_entry.length = descriptor->des_src->seg_len;
to_com_frag(frag)->endpoint = endpoint;
descriptor->order = qp;
/* Setting opcode on a frag constructor isn't enough since prepare_src
* may return send_frag instead of put_frag */
frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
if(ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->sr_desc, &bad_wr))
return OMPI_ERROR;
return OMPI_SUCCESS;
}
@ -970,69 +968,58 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* descriptor)
{
int rc;
struct ibv_send_wr* bad_wr;
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;
/* mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; */
int qp = frag->base.order;
frag->endpoint = endpoint;
frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_READ;
mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor);
int qp = descriptor->order;
uint64_t rem_addr = descriptor->des_src->seg_addr.lval;
uint32_t rkey = descriptor->des_src->seg_key.key32[0];
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER);
if(MCA_BTL_NO_ORDER == qp)
qp = mca_btl_openib_component.rdma_qp;
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return OMPI_SUCCESS;
}
/* check for a get token */
} else if(OPAL_THREAD_ADD32(&endpoint->get_tokens,-1) < 0) {
if(OPAL_THREAD_ADD32(&endpoint->get_tokens,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1);
OPAL_THREAD_ADD32(&endpoint->get_tokens,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
return OMPI_SUCCESS;
} else {
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
if ((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) !=
(ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
frag->wr_desc.sr_desc.wr.rdma.remote_addr = opal_swap_bytes8(frag->base.des_src->seg_addr.lval);
frag->wr_desc.sr_desc.wr.rdma.rkey = opal_swap_bytes4(frag->base.des_src->seg_key.key32[0]);
} else
#endif
{
frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_src->seg_addr.lval;
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_src->seg_key.key32[0];
}
frag->sg_entry.addr = (unsigned long) frag->base.des_dst->seg_addr.pval;
frag->sg_entry.length = frag->base.des_dst->seg_len;
frag->base.order = qp;
if(ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr)){
BTL_ERROR(("error posting send request errno (%d) says %s",
errno, strerror(errno)));
rc = ORTE_ERROR;
} else {
rc = ORTE_SUCCESS;
}
}
return rc;
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
if((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN)
!= (ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
rem_addr = opal_swap_bytes8(rem_addr);
rkey = opal_swap_bytes4(rkey);
}
#endif
frag->sr_desc.wr.rdma.remote_addr = rem_addr;
frag->sr_desc.wr.rdma.rkey = rkey;
to_com_frag(frag)->sg_entry.addr =
(uint64_t)descriptor->des_dst->seg_addr.pval;
to_com_frag(frag)->sg_entry.length = descriptor->des_dst->seg_len;
to_com_frag(frag)->endpoint = endpoint;
descriptor->order = qp;
if(ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->sr_desc, &bad_wr))
return OMPI_ERROR;
return OMPI_SUCCESS;
}
int mca_btl_openib_ft_event(int state) {
if(OPAL_CRS_CHECKPOINT == state) {
;

Просмотреть файл

@ -546,13 +546,11 @@ static inline int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl,
for(i = 0; i < num_post; i++) {
ompi_free_list_item_t* item;
mca_btl_openib_frag_t* frag;
OMPI_FREE_LIST_WAIT(free_list, item, rc);
frag = (mca_btl_openib_frag_t*)item;
frag->base.order = qp;
frag->endpoint = NULL;
to_base_frag(item)->base.order = qp;
to_com_frag(item)->endpoint = NULL;
if(ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq,
&frag->wr_desc.rd_desc,
&to_recv_frag(item)->rd_desc,
&bad_wr)) {
BTL_ERROR(("error posting receive descriptors to shared "
"receive queue: %s", strerror(errno)));

Просмотреть файл

@ -86,7 +86,7 @@ static void merge_values(ompi_btl_openib_ini_values_t *target,
ompi_btl_openib_ini_values_t *src);
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_frag_t *frag,
mca_btl_openib_recv_frag_t *frag,
size_t byte_len, const int prio);
static char* btl_openib_component_status_to_string(enum ibv_wc_status status);
static int btl_openib_component_progress(void);
@ -95,9 +95,7 @@ static void btl_openib_frag_progress_pending_pp(
mca_btl_base_endpoint_t *endpoint,
const int qp);
static void btl_openib_frag_progress_pending_srq(
mca_btl_openib_module_t* openib_btl,
mca_btl_base_endpoint_t *endpoint,
const int qp);
mca_btl_openib_module_t* openib_btl, const int qp);
static void btl_openib_frag_progress_pending_put_get(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int qp);
@ -226,20 +224,20 @@ static int btl_openib_modex_send(void)
static void btl_openib_control(struct mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
mca_btl_base_descriptor_t* des,
void* cbdata)
{
/* dont return credits used for control messages */
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)descriptor;
mca_btl_openib_endpoint_t* endpoint = frag->endpoint;
mca_btl_openib_control_header_t *ctl_hdr = frag->segment.seg_addr.pval;
/* don't return credits used for control messages */
mca_btl_openib_endpoint_t* endpoint = to_com_frag(des)->endpoint;
mca_btl_openib_control_header_t *ctl_hdr =
to_base_frag(des)->segment.seg_addr.pval;
mca_btl_openib_eager_rdma_header_t *rdma_hdr;
mca_btl_openib_rdma_credits_header_t *credits_hdr;
int qp = frag->qp_idx;
int qp = to_recv_frag(des)->qp_idx;
if(BTL_OPENIB_EAGER_RDMA_QP(qp)) {
/* if not sent via rdma */
if(!MCA_BTL_OPENIB_RDMA_FRAG(frag) &&
if(!MCA_BTL_OPENIB_RDMA_FRAG(des) &&
ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, 1);
/* rd_posted don't account for rsv preposts for credit message but
@ -733,15 +731,13 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
openib_btl->super.btl_mpool = openib_btl->hca->mpool;
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
length = sizeof(mca_btl_openib_send_user_frag_t);
init_data->length = length;
init_data->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
init_data->order = mca_btl_openib_component.rdma_qp;
init_data->list = &openib_btl->send_user_free;
if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->send_user_free,
length, 2, OBJ_CLASS(mca_btl_openib_send_user_frag_t),
sizeof(mca_btl_openib_put_frag_t), 2,
OBJ_CLASS(mca_btl_openib_put_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
@ -750,15 +746,13 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
}
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
length = sizeof(mca_btl_openib_recv_user_frag_t);
init_data->length = length;
init_data->type = MCA_BTL_OPENIB_FRAG_RECV_USER;
init_data->order = mca_btl_openib_component.rdma_qp;
init_data->list = &openib_btl->recv_user_free;
if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->recv_user_free,
length, 2, OBJ_CLASS(mca_btl_openib_recv_user_frag_t),
sizeof(mca_btl_openib_get_frag_t), 2,
OBJ_CLASS(mca_btl_openib_get_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
@ -767,19 +761,17 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
}
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
length = sizeof(mca_btl_openib_send_frag_control_t) +
length = sizeof(mca_btl_openib_send_control_frag_t) +
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
sizeof(mca_btl_openib_eager_rdma_header_t);
init_data->length = sizeof(mca_btl_openib_eager_rdma_header_t);
init_data->type = MCA_BTL_OPENIB_FRAG_CONTROL;
init_data->order = mca_btl_openib_component.eager_rdma_qp;
init_data->list = &openib_btl->send_free_control;
if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->send_free_control,
length, mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_send_frag_control_t),
OBJ_CLASS(mca_btl_openib_send_control_frag_t),
mca_btl_openib_component.ib_free_list_num, -1,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool, mca_btl_openib_frag_init,
@ -805,8 +797,6 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
sizeof(mca_btl_openib_footer_t) +
mca_btl_openib_component.qp_infos[qp].size;
init_data->length = mca_btl_openib_component.qp_infos[qp].size;
init_data->type = MCA_BTL_OPENIB_FRAG_SEND;
init_data->order = qp;
init_data->list = &openib_btl->qps[qp].send_free;
@ -827,8 +817,6 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
sizeof(mca_btl_openib_footer_t) +
mca_btl_openib_component.qp_infos[qp].size;
init_data->length = mca_btl_openib_component.qp_infos[qp].size;
init_data->type = MCA_BTL_OPENIB_FRAG_RECV;
init_data->order = qp;
init_data->list = &openib_btl->qps[qp].recv_free;
@ -1135,36 +1123,37 @@ static void merge_values(ompi_btl_openib_ini_values_t *target,
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_frag_t *frag,
mca_btl_openib_recv_frag_t *frag,
size_t byte_len, const int qp)
{
mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
mca_btl_openib_header_t *hdr = frag->hdr;
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH((*(frag->hdr)));
BTL_OPENIB_HEADER_NTOH(*hdr);
}
/* advance the segment address past the header and subtract from the
* length..*/
frag->segment.seg_len = byte_len - sizeof(mca_btl_openib_header_t);
des->des_dst->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
/* call registered callback */
openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super,
frag->hdr->tag, &frag->base,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
openib_btl->ib_reg[hdr->tag].cbfunc(&openib_btl->super, hdr->tag, des,
openib_btl->ib_reg[hdr->tag].cbdata);
if(BTL_OPENIB_IS_RDMA_CREDITS(frag->hdr->credits) &&
BTL_OPENIB_CREDITS(frag->hdr->credits) > 0) {
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits) &&
BTL_OPENIB_CREDITS(hdr->credits) > 0) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,
BTL_OPENIB_CREDITS(frag->hdr->credits));
BTL_OPENIB_CREDITS(hdr->credits));
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp) && frag->hdr->credits > 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits,
frag->hdr->credits);
if(BTL_OPENIB_QP_TYPE_PP(qp) && hdr->credits > 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits,
hdr->credits);
}
}
if(frag->hdr->cm_seen) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent,
-frag->hdr->cm_seen);
if(hdr->cm_seen) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -hdr->cm_seen);
}
/* We may receive credits here so try to progress only things that
@ -1258,21 +1247,19 @@ static void btl_openib_frag_progress_pending_pp(
mca_btl_base_endpoint_t *endpoint, const int qp)
{
opal_list_item_t *frag_item;
mca_btl_openib_frag_t* frag;
opal_list_item_t *frag;
size_t i, len = opal_list_get_size(&endpoint->qps[qp].pending_frags);
/* check to see if we need to progress any pending descriptors */
for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0 &&
BTL_OPENIB_TOKENS(endpoint, qp) > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item =
opal_list_remove_first(&(endpoint->qps[qp].pending_frags));
frag = opal_list_remove_first(&(endpoint->qps[qp].pending_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
if(NULL == frag)
break;
if(mca_btl_openib_endpoint_send(frag->endpoint, frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
if(mca_btl_openib_endpoint_send(endpoint, to_send_frag(frag)) ==
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
}
@ -1280,43 +1267,39 @@ static void btl_openib_frag_progress_pending_pp(
static void btl_openib_frag_progress_pending_put_get(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int qp) {
opal_list_item_t *frag_item;
mca_btl_openib_frag_t* frag;
opal_list_item_t *frag;
size_t i, len = opal_list_get_size(&endpoint->pending_get_frags);
for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0 &&
endpoint->get_tokens > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_get_frags));
frag = opal_list_remove_first(&(endpoint->pending_get_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
if(NULL == frag)
break;
if(mca_btl_openib_get((mca_btl_base_module_t *)openib_btl,
frag->endpoint, (mca_btl_base_descriptor_t*)frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
if(mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, endpoint,
&to_base_frag(frag)->base) == OMPI_ERR_OUT_OF_RESOURCE)
break;
}
len = opal_list_get_size(&endpoint->pending_put_frags);
for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_put_frags));
frag = opal_list_remove_first(&(endpoint->pending_put_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
if(NULL == frag)
break;
if(mca_btl_openib_put((mca_btl_base_module_t*)openib_btl,
frag->endpoint, (mca_btl_base_descriptor_t*)frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
if(mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, endpoint,
&to_base_frag(frag)->base) == OMPI_ERR_OUT_OF_RESOURCE)
break;
}
}
static void btl_openib_frag_progress_pending_srq(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int qp)
mca_btl_openib_module_t* openib_btl, const int qp)
{
opal_list_item_t *frag_item;
mca_btl_openib_frag_t* frag;
opal_list_item_t *frag;
size_t i, len;
assert(BTL_OPENIB_QP_TYPE_SRQ(qp));
@ -1325,13 +1308,13 @@ static void btl_openib_frag_progress_pending_srq(
for(i = 0; i < len && openib_btl->qps[qp].u.srq_qp.sd_credits > 0; i++) {
/* dequeue resources due to global flow control */
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
frag_item =
frag =
opal_list_remove_first(&openib_btl->qps[qp].u.srq_qp.pending_frags);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
if(NULL == frag)
break;
if(mca_btl_openib_endpoint_send(frag->endpoint, frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
if(mca_btl_openib_endpoint_send(to_com_frag(frag)->endpoint,
to_send_frag(frag)) == OMPI_ERR_OUT_OF_RESOURCE)
break;
}
}
@ -1381,7 +1364,6 @@ static int btl_openib_component_progress(void)
{
int i, j, c;
int count = 0, ret;
mca_btl_openib_frag_t* frag;
mca_btl_openib_endpoint_t* endpoint;
#if OMPI_HAVE_THREADS
@ -1395,6 +1377,7 @@ static int btl_openib_component_progress(void)
* queues.
*/
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
mca_btl_openib_recv_frag_t* frag;
mca_btl_openib_module_t* openib_btl = mca_btl_openib_component.openib_btls[i];
c = openib_btl->eager_rdma_buffers_count;
@ -1414,7 +1397,7 @@ static int btl_openib_component_progress(void)
int qp;
opal_atomic_rmb();
if(endpoint->nbo) {
BTL_OPENIB_FOOTER_NTOH((*frag->ftr));
BTL_OPENIB_FOOTER_NTOH(*frag->ftr);
}
size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
#if OMPI_ENABLE_DEBUG
@ -1429,11 +1412,12 @@ static int btl_openib_component_progress(void)
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
size + sizeof(mca_btl_openib_footer_t));
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) +
to_base_frag(frag)->segment.seg_addr.pval =
((unsigned char* )frag->hdr) +
sizeof(mca_btl_openib_header_t);
ret = btl_openib_handle_incoming(openib_btl,
frag->endpoint, frag,
to_com_frag(frag)->endpoint, frag,
size - sizeof(mca_btl_openib_footer_t),
frag->qp_idx);
if (ret != MPI_SUCCESS) {
@ -1446,7 +1430,7 @@ static int btl_openib_component_progress(void)
MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
while (endpoint->eager_rdma_local.tail !=
endpoint->eager_rdma_local.head) {
mca_btl_openib_frag_t *tf;
mca_btl_openib_recv_frag_t *tf;
tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint,
endpoint->eager_rdma_local.tail);
if (MCA_BTL_OPENIB_RDMA_FRAG_LOCAL (tf))
@ -1492,8 +1476,9 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
{
static char *cq_name[] = {"HP CQ", "LP CQ"};
int cq, qp;
int count = 0,ne = 0, ret;
mca_btl_openib_frag_t* frag;
int count = 0,ne = 0;
mca_btl_openib_com_frag_t* frag;
mca_btl_base_descriptor_t *des;
mca_btl_openib_endpoint_t* endpoint;
mca_btl_openib_module_t *openib_btl = NULL;
struct ibv_wc wc;
@ -1504,15 +1489,24 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
ne = ibv_poll_cq(hca->ib_cq[cq], 1, &wc);
if(0 == ne)
continue;
if(ne < 0 || wc.status != IBV_WC_SUCCESS)
if(ne < 0)
goto error;
frag = (mca_btl_openib_frag_t*) (unsigned long) wc.wr_id;
qp = frag->base.order;
des = (mca_btl_base_descriptor_t*)(uintptr_t)wc.wr_id;
frag = to_com_frag(des);
/* For receive fragments "order" contains QP idx the fragment was posted
* to. For send fragments "order" contains QP idx the fragment was send
* through */
qp = des->order;
endpoint = frag->endpoint;
if(endpoint)
openib_btl = endpoint->endpoint_btl;
if(wc.status != IBV_WC_SUCCESS)
goto error;
/* Handle work completions */
switch(wc.opcode) {
case IBV_WC_RDMA_READ:
@ -1522,8 +1516,7 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
case IBV_WC_RDMA_WRITE:
case IBV_WC_SEND:
/* Process a completed send/put/get */
frag->base.des_cbfunc(&openib_btl->super, endpoint, &frag->base,
OMPI_SUCCESS);
des->des_cbfunc(&openib_btl->super, endpoint, des, OMPI_SUCCESS);
/* return send wqe */
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);
@ -1531,7 +1524,7 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
if(IBV_WC_SEND == wc.opcode && BTL_OPENIB_QP_TYPE_SRQ(qp)) {
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
/* new SRQ credit available. Try to progress pending frags*/
btl_openib_frag_progress_pending_srq(openib_btl, endpoint, qp);
btl_openib_frag_progress_pending_srq(openib_btl, qp);
}
/* new wqe or/and get token available. Try to progress pending frags */
btl_openib_frag_progress_pending_pp(endpoint, qp);
@ -1548,13 +1541,15 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
}
/* Process a RECV */
ret = btl_openib_handle_incoming(openib_btl, endpoint, frag, wc.byte_len, qp);
if (ret != OMPI_SUCCESS) {
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
if(btl_openib_handle_incoming(openib_btl, endpoint,
to_recv_frag(frag), wc.byte_len, qp) != OMPI_SUCCESS) {
openib_btl->error_cb(&openib_btl->super,
MCA_BTL_ERROR_FLAGS_FATAL);
return 0;
}
OMPI_FREE_LIST_RETURN(frag->list, (ompi_free_list_item_t*) frag);
MCA_BTL_IB_FRAG_RETURN(frag);
if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
OPAL_THREAD_ADD32((int32_t*)
&openib_btl->qps[qp].u.srq_qp.rd_posted, -1);
@ -1596,21 +1591,16 @@ error:
} else {
static int flush_err_printed[] = {0, 0};
ompi_proc_t* remote_proc = NULL;
frag = (mca_btl_openib_frag_t*) (unsigned long) wc.wr_id;
if(frag) {
endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint;
if(endpoint &&
endpoint->endpoint_proc &&
endpoint->endpoint_proc->proc_ompi) {
remote_proc = endpoint->endpoint_proc->proc_ompi;
}
if(frag && endpoint && endpoint->endpoint_proc &&
endpoint->endpoint_proc->proc_ompi) {
remote_proc = endpoint->endpoint_proc->proc_ompi;
}
if(wc.status != IBV_WC_WR_FLUSH_ERR || !flush_err_printed[cq]++) {
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
"status number %d for wr_id %llu opcode %d qp_idx %d",
cq_name[cq],
btl_openib_component_status_to_string(wc.status),
wc.status, wc.wr_id, wc.opcode, frag->qp_idx));
wc.status, wc.wr_id, wc.opcode, qp));
}
if(wc.status == IBV_WC_RETRY_EXC_ERR) {
opal_show_help("help-mpi-btl-openib.txt",

Просмотреть файл

@ -45,7 +45,7 @@ struct mca_btl_openib_eager_rdma_remote_t {
typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remote_t;
#define MCA_BTL_OPENIB_RDMA_FRAG(F) \
((F)->type == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)
(openib_frag_type(F) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)
#define EAGER_RDMA_BUFFER_REMOTE (0)
#define EAGER_RDMA_BUFFER_LOCAL (0xff)

Просмотреть файл

@ -48,62 +48,65 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
static int post_send(mca_btl_openib_module_t *openib_btl,
mca_btl_openib_endpoint_t *endpoint, mca_btl_openib_frag_t *frag,
mca_btl_openib_endpoint_t *endpoint, mca_btl_openib_send_frag_t *frag,
const int qp, const int do_rdma)
{
struct ibv_send_wr *bad_wr;
struct ibv_send_wr *bad_wr;
mca_btl_base_segment_t *seg = &to_base_frag(frag)->segment;
struct ibv_sge *sg = &to_com_frag(frag)->sg_entry;
struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc;
assert(!do_rdma || BTL_OPENIB_EAGER_RDMA_QP(qp));
frag->sg_entry.length = frag->segment.seg_len +
sizeof(mca_btl_openib_header_t) +
sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) +
(do_rdma ? sizeof(mca_btl_openib_footer_t) : 0);
if(frag->sg_entry.length <= openib_btl->ib_inline_max) {
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED|IBV_SEND_INLINE;
if(sg->length <= openib_btl->ib_inline_max) {
sr_desc->send_flags = IBV_SEND_SIGNALED|IBV_SEND_INLINE;
} else {
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
sr_desc->send_flags = IBV_SEND_SIGNALED;
}
if(endpoint->nbo)
BTL_OPENIB_HEADER_HTON((*(frag->hdr)));
BTL_OPENIB_HEADER_HTON(*frag->hdr);
if(do_rdma) {
int32_t head;
mca_btl_openib_footer_t* ftr =
(mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) +
frag->segment.seg_len);
frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_WRITE;
MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, frag->sg_entry.length);
mca_btl_openib_footer_t* ftr = (mca_btl_openib_footer_t*)
(((char*)seg->seg_addr.pval) + seg->seg_len);
sr_desc->opcode = IBV_WR_RDMA_WRITE;
MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length);
MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
#if OMPI_ENABLE_DEBUG
((mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) +
frag->segment.seg_len))->seq =
endpoint->eager_rdma_remote.seq++;
((mca_btl_openib_footer_t*)(((char*)seg->seg_addr.pval) +
seg->seg_len))->seq = endpoint->eager_rdma_remote.seq++;
#endif
if(endpoint->nbo)
BTL_OPENIB_FOOTER_HTON((*ftr));
frag->wr_desc.sr_desc.wr.rdma.rkey = endpoint->eager_rdma_remote.rkey;
sr_desc->wr.rdma.rkey = endpoint->eager_rdma_remote.rkey;
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(endpoint->eager_rdma_remote.head, head);
frag->wr_desc.sr_desc.wr.rdma.remote_addr =
sr_desc->wr.rdma.remote_addr =
endpoint->eager_rdma_remote.base.lval +
head * openib_btl->eager_rdma_frag_size +
sizeof(mca_btl_openib_header_t) +
mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_footer_t);
frag->wr_desc.sr_desc.wr.rdma.remote_addr -= frag->sg_entry.length;
sr_desc->wr.rdma.remote_addr -= sg->length;
} else {
if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND_WITH_IMM;
frag->wr_desc.sr_desc.imm_data = endpoint->rem_info.rem_index;
sr_desc->opcode = IBV_WR_SEND_WITH_IMM;
sr_desc->imm_data = endpoint->rem_info.rem_index;
} else {
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
sr_desc->opcode = IBV_WR_SEND;
}
}
frag->base.order = qp;
return ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr);
to_base_frag(frag)->base.order = qp;
assert(sg->addr == (uint64_t)frag->hdr);
return ibv_post_send(endpoint->qps[qp].lcl_qp, sr_desc, &bad_wr);
}
/*
@ -112,7 +115,7 @@ static int post_send(mca_btl_openib_module_t *openib_btl,
static int btl_openib_acquire_send_resources(
mca_btl_openib_module_t *openib_btl,
mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_frag_t *frag, int *qp, int *do_rdma)
mca_btl_openib_send_frag_t *frag, int *qp, int *do_rdma)
{
if(*do_rdma) {
if(OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) < 0) {
@ -164,22 +167,23 @@ static int btl_openib_acquire_send_resources(
} while(0 == OPAL_ATOMIC_CMPSET_32(&FROM, TO, 0))
/* this function is called with endpoint->endpoint_lock held */
static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* openib_btl,
mca_btl_openib_endpoint_t * endpoint,
mca_btl_openib_frag_t * frag)
static inline int mca_btl_openib_endpoint_post_send(
mca_btl_openib_module_t* openib_btl,
mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_send_frag_t *frag)
{
mca_btl_openib_header_t *hdr = frag->hdr;
mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
int do_rdma = 0, qp, ib_rc;
int32_t cm_return;
frag->sg_entry.addr = (unsigned long) frag->hdr;
if(frag->base.order != MCA_BTL_NO_ORDER) {
qp = frag->base.order; /* if order is provided use it */
if(des->order != MCA_BTL_NO_ORDER) {
qp = des->order; /* if order is provided use it */
} else {
qp = frag->qp_idx;
if(frag->segment.seg_len <= mca_btl_openib_component.eager_limit &&
(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY))
if(des->des_src->seg_len <= mca_btl_openib_component.eager_limit &&
(des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY))
do_rdma = 1; /* High priority frag. Try to send over eager RDMA */
}
@ -187,42 +191,41 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
&do_rdma) == OMPI_ERR_OUT_OF_RESOURCE)
return OMPI_SUCCESS;
frag->hdr->credits = 0;
hdr->credits = 0;
if(BTL_OPENIB_EAGER_RDMA_QP(qp)) {
GET_CREDITS(endpoint->eager_rdma_local.credits, frag->hdr->credits);
if(frag->hdr->credits)
frag->hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
if(hdr->credits)
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
}
if(BTL_OPENIB_QP_TYPE_PP(qp) &&
0 == frag->hdr->credits) {
GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
}
GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
/* cm_seen is only 8 bits, but cm_return is 32 bits */
if(cm_return > 255) {
frag->hdr->cm_seen = 255;
hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
frag->hdr->cm_seen = cm_return;
hdr->cm_seen = cm_return;
}
ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma);
if(ib_rc) {
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH((*(frag->hdr)));
BTL_OPENIB_HEADER_NTOH(*hdr);
}
if(BTL_OPENIB_IS_RDMA_CREDITS(frag->hdr->credits)) {
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
BTL_OPENIB_CREDITS(frag->hdr->credits));
BTL_OPENIB_CREDITS(hdr->credits));
}
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);
if(do_rdma) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
} else {
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
@ -371,8 +374,7 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl,
&endpoint->qps[qp].pending_frags);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags);
if(ibv_destroy_qp(endpoint->qps[qp].lcl_qp)) {
BTL_ERROR(("Failed to destroy QP:%d\n", qp));
}
@ -382,16 +384,13 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
}
OBJ_DESTRUCT(&endpoint->endpoint_lock);
/* Clean pending lists */
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl,
&endpoint->pending_lazy_frags);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags);
OBJ_DESTRUCT(&endpoint->pending_lazy_frags);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl,
&endpoint->pending_get_frags);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags);
OBJ_DESTRUCT(&endpoint->pending_get_frags);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl,
&endpoint->pending_put_frags);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags);
OBJ_DESTRUCT(&endpoint->pending_put_frags);
}
@ -422,8 +421,8 @@ int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
opal_list_item_t *frag_item;
mca_btl_openib_frag_t *frag;
mca_btl_openib_module_t* openib_btl;
mca_btl_openib_send_frag_t *frag;
mca_btl_openib_module_t *openib_btl;
endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
@ -434,22 +433,22 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
/* While there are frags in the list, process them */
while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
frag = (mca_btl_openib_frag_t *) frag_item;
frag = to_send_frag(frag_item);
openib_btl = endpoint->endpoint_btl;
/* We need to post this one */
if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag))
if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl,
endpoint, frag))
BTL_ERROR(("Error posting send"));
}
}
/*
* Attempt to send a fragment using a given endpoint. If the endpoint is not
* connected, queue the fragment and start the connection as required.
*/
int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint,
mca_btl_openib_frag_t* frag)
mca_btl_openib_send_frag_t* frag)
{
int rc;
bool call_progress = false;
@ -530,7 +529,7 @@ static void mca_btl_openib_endpoint_credits(
int qp;
mca_btl_openib_frag_t *frag = (mca_btl_openib_frag_t*)descriptor;
mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(descriptor);
qp = frag->qp_idx;
@ -554,7 +553,7 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
const int qp)
{
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
mca_btl_openib_frag_t* frag;
mca_btl_openib_send_control_frag_t* frag;
mca_btl_openib_rdma_credits_header_t *credits_hdr;
int do_rdma = 0, ib_rc;
int32_t cm_return;
@ -565,11 +564,19 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, ib_rc);
frag->qp_idx = qp;
endpoint->qps[qp].credit_frag = frag;
/* set those once and forever */
to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
to_base_frag(frag)->base.des_cbdata = NULL;
to_com_frag(frag)->endpoint = endpoint;
frag->hdr->tag = MCA_BTL_TAG_BTL;
to_base_frag(frag)->segment.seg_len =
sizeof(mca_btl_openib_rdma_credits_header_t);
}
assert(frag->qp_idx == qp);
credits_hdr =
(mca_btl_openib_rdma_credits_header_t*)frag->segment.seg_addr.pval;
(mca_btl_openib_rdma_credits_header_t*)
to_base_frag(frag)->segment.seg_addr.pval;
if(BTL_OPENIB_EAGER_RDMA_QP(qp)) {
if(OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) < 0) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
@ -587,11 +594,6 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
}
}
frag->base.des_cbfunc = mca_btl_openib_endpoint_credits;
frag->base.des_cbdata = NULL;
frag->endpoint = endpoint;
frag->hdr->tag = MCA_BTL_TAG_BTL;
GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
@ -612,24 +614,26 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
if(endpoint->nbo)
BTL_OPENIB_RDMA_CREDITS_HEADER_HTON((*credits_hdr));
frag->segment.seg_len = sizeof(mca_btl_openib_rdma_credits_header_t);
frag->sg_entry.addr = (unsigned long)frag->hdr;
if((ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma))) {
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH((*frag->hdr));
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH((*credits_hdr));
}
BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
if(do_rdma)
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
else
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
BTL_ERROR(("error posting send request errno %d says %s", ib_rc,
strerror(errno)));
if(!(ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma)))
return;
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH(*frag->hdr);
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
}
BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
frag->hdr->credits);
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
credits_hdr->rdma_credits);
if(do_rdma)
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
else
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
BTL_ERROR(("error posting send request errno %d says %s", ib_rc,
strerror(errno)));
}
/* local callback function for completion of eager rdma connect */
@ -639,17 +643,16 @@ static void mca_btl_openib_endpoint_eager_rdma_connect_cb(
struct mca_btl_base_descriptor_t* descriptor,
int status)
{
MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl),
((mca_btl_openib_frag_t*)descriptor));
MCA_BTL_IB_FRAG_RETURN(descriptor);
}
/* send the eager rdma conect message to the remote endpoint */
/* send the eager rdma connect message to the remote endpoint */
static int mca_btl_openib_endpoint_send_eager_rdma(
mca_btl_base_endpoint_t* endpoint)
{
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
mca_btl_openib_eager_rdma_header_t *rdma_hdr;
mca_btl_openib_frag_t* frag;
mca_btl_openib_send_control_frag_t* frag;
int rc;
MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, rc);
@ -657,13 +660,17 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
return -1;
}
frag->base.des_cbfunc = mca_btl_openib_endpoint_eager_rdma_connect_cb;
frag->base.des_cbdata = NULL;
frag->endpoint = endpoint;
frag->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
to_base_frag(frag)->base.des_cbfunc =
mca_btl_openib_endpoint_eager_rdma_connect_cb;
to_base_frag(frag)->base.des_cbdata = NULL;
to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
to_send_frag(frag)->qp_idx = 0;
to_base_frag(frag)->segment.seg_len =
sizeof(mca_btl_openib_eager_rdma_header_t);
to_com_frag(frag)->endpoint = endpoint;
frag->hdr->tag = MCA_BTL_TAG_BTL;
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)frag->segment.seg_addr.pval;
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval;
rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval);
@ -675,7 +682,6 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
rdma_hdr->control.type,
sizeof(mca_btl_openib_eager_rdma_header_t)
));
frag->segment.seg_len = sizeof(mca_btl_openib_eager_rdma_header_t);
if(endpoint->nbo) {
BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));
@ -688,7 +694,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
));
}
if (mca_btl_openib_endpoint_send(endpoint, frag) != OMPI_SUCCESS) {
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
MCA_BTL_IB_FRAG_RETURN(frag);
BTL_ERROR(("Error sending RDMA buffer", strerror(errno)));
return -1;
}
@ -742,18 +748,18 @@ void mca_btl_openib_endpoint_connect_eager_rdma(
item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);
init_data.length = mca_btl_openib_component.eager_limit;
init_data.order = mca_btl_openib_component.eager_rdma_qp;
init_data.type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
init_data.list = NULL;
mca_btl_openib_frag_init(item, &init_data);
frag = (mca_btl_openib_recv_frag_t*) item;
frag->ftr = (mca_btl_openib_footer_t*)((char*)frag->segment.seg_addr.pval
+ frag->size);
frag = to_recv_frag(item);
to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
to_com_frag(frag)->endpoint = endpoint;
frag->ftr = (mca_btl_openib_footer_t*)
((char*)to_base_frag(frag)->segment.seg_addr.pval +
mca_btl_openib_component.eager_limit);
MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
((mca_btl_openib_frag_t*)item)->endpoint = endpoint;
}
endpoint->eager_rdma_local.frags = headers_buf;

Просмотреть файл

@ -121,7 +121,7 @@ struct mca_btl_openib_endpoint_qp_t {
case of PP QP, if there is
no credit available */
int32_t rd_credit_send_lock; /**< Lock credit send fragment */
struct mca_btl_openib_frag_t *credit_frag;
mca_btl_openib_send_control_frag_t *credit_frag;
union {
mca_btl_openib_endpoint_srq_qp_t srq_qp;
mca_btl_openib_endpoint_pp_qp_t pp_qp;
@ -199,7 +199,7 @@ typedef mca_btl_base_endpoint_t mca_btl_openib_endpoint_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t);
int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint,
struct mca_btl_openib_frag_t* frag);
struct mca_btl_openib_send_frag_t* frag);
void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int);
void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint);
@ -233,13 +233,11 @@ static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpo
for(i = 0; i < (num_post + cm_received); i++) {
ompi_free_list_item_t* item;
mca_btl_openib_frag_t* frag;
OMPI_FREE_LIST_WAIT(free_list, item, rc);
frag = (mca_btl_openib_frag_t*)item;
frag->endpoint = endpoint;
frag->base.order = qp;
to_base_frag(item)->base.order = qp;
to_com_frag(item)->endpoint = endpoint;
if(ibv_post_recv(endpoint->qps[qp].lcl_qp,
&frag->wr_desc.rd_desc,
&to_recv_frag(item)->rd_desc,
&bad_wr)) {
BTL_ERROR(("error posting receive errno says %s\n",
strerror(errno)));

Просмотреть файл

@ -22,136 +22,172 @@
#include "btl_openib_frag.h"
#include "btl_openib_eager_rdma.h"
void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx) {
mca_btl_openib_frag_init_data_t* init_data =
(mca_btl_openib_frag_init_data_t*) ctx;
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) item;
mca_btl_openib_reg_t* registration =
(mca_btl_openib_reg_t*)frag->base.super.registration;
frag->size = init_data->length;
assert(init_data->order != 255);
frag->base.order = MCA_BTL_NO_ORDER;
frag->type = init_data->type;
frag->list = init_data->list;
frag->qp_idx = init_data->order;
frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr;
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
if(registration) {
frag->registration = registration;
frag->sg_entry.lkey = registration->mr->lkey;
frag->segment.seg_key.key32[0] = frag->sg_entry.lkey;
void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx)
{
mca_btl_openib_frag_init_data_t* init_data = ctx;
mca_btl_openib_frag_t *frag = to_base_frag(item);
if(MCA_BTL_OPENIB_FRAG_RECV == frag->type) {
to_recv_frag(frag)->qp_idx = init_data->order;
to_com_frag(frag)->sg_entry.length =
mca_btl_openib_component.qp_infos[init_data->order].size +
sizeof(mca_btl_openib_header_t);
}
/* init the segment address to start after the btl header */
frag->segment.seg_len = frag->size;
frag->sg_entry.addr = (unsigned long) frag->hdr;
frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t);
frag->base.des_flags = 0;
return;
if(MCA_BTL_OPENIB_FRAG_SEND == frag->type)
to_send_frag(frag)->qp_idx = init_data->order;
frag->list = init_data->list;
}
static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->wr_desc.sr_desc.wr_id = (unsigned long) frag;
frag->wr_desc.sr_desc.sg_list = &frag->sg_entry;
frag->wr_desc.sr_desc.num_sge = 1;
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->wr_desc.sr_desc.next = NULL;
/*
 * Constructor for the root of the fragment class hierarchy
 * (mca_btl_openib_frag_t).  A freshly constructed fragment carries no
 * ordering constraint; code that needs one overwrites base.order later.
 */
static void base_constructor(mca_btl_openib_frag_t *frag)
{
frag->base.order = MCA_BTL_NO_ORDER;
}
static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->wr_desc.rd_desc.wr_id = (unsigned long) frag;
frag->wr_desc.rd_desc.sg_list = &frag->sg_entry;
frag->wr_desc.rd_desc.num_sge = 1;
frag->wr_desc.rd_desc.next = NULL;
/*
 * Constructor for mca_btl_openib_com_frag_t.
 *
 * Caches the memory registration attached to the underlying free-list item
 * (base.super.registration).  When a registration is present, its local key
 * is copied both into the scatter/gather entry and into the first 32-bit
 * key slot of the descriptor segment.  Without a registration both keys are
 * left untouched.
 */
static void com_constructor(mca_btl_openib_com_frag_t *frag)
{
    mca_btl_openib_frag_t *base = to_base_frag(frag);

    frag->registration =
        (mca_btl_openib_reg_t*)base->base.super.registration;

    /* nothing to propagate for fragments that own no registration */
    if(NULL == frag->registration) {
        return;
    }

    frag->sg_entry.lkey = frag->registration->mr->lkey;
    base->segment.seg_key.key32[0] = frag->sg_entry.lkey;
}
static void out_constructor(mca_btl_openib_out_frag_t *frag)
{
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
static void mca_btl_openib_recv_user_frag_constructor(mca_btl_openib_frag_t* frag)
{
frag->registration = NULL;
frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr;
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
/* init the segment address to start after the btl header */
frag->segment.seg_len = frag->size;
frag->sg_entry.addr = (unsigned long) frag->hdr;
frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t);
frag->base.des_flags = 0;
base_frag->base.des_src = &base_frag->segment;
base_frag->base.des_src_cnt = 1;
base_frag->base.des_dst = NULL;
base_frag->base.des_dst_cnt = 0;
mca_btl_openib_recv_frag_common_constructor(frag);
frag->sr_desc.wr_id = (uint64_t)frag;
frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
frag->sr_desc.num_sge = 1;
frag->sr_desc.opcode = IBV_WR_SEND;
frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->sr_desc.next = NULL;
}
static void in_constructor(mca_btl_openib_in_frag_t *frag)
{
mca_btl_openib_frag_t *base_frag = to_base_frag(frag);
static void mca_btl_openib_send_user_frag_constructor(mca_btl_openib_frag_t* frag)
{
frag->registration = NULL;
frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr;
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
/* init the segment address to start after the btl header */
frag->segment.seg_len = frag->size;
frag->sg_entry.addr = (unsigned long) frag->hdr;
frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t);
frag->base.des_flags = 0;
base_frag->base.des_dst = &base_frag->segment;
base_frag->base.des_dst_cnt = 1;
base_frag->base.des_src = NULL;
base_frag->base.des_src_cnt = 0;
}
mca_btl_openib_send_frag_common_constructor(frag);
/*
 * Constructor for mca_btl_openib_send_frag_t.
 *
 * Tags the fragment as MCA_BTL_OPENIB_FRAG_SEND and lays out its buffer:
 * the BTL header sits at the start of the free-list item's memory
 * (base.super.ptr), the payload segment starts right after the header, and
 * the scatter/gather entry is aimed at the header so that header and
 * payload go out together.
 */
static void send_constructor(mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_frag_t *base = to_base_frag(frag);
    mca_btl_openib_header_t *hdr =
        (mca_btl_openib_header_t*)base->base.super.ptr;

    base->type = MCA_BTL_OPENIB_FRAG_SEND;
    frag->hdr = hdr;

    /* payload begins immediately behind the BTL header */
    base->segment.seg_addr.pval = ((unsigned char*)hdr) + sizeof(*hdr);

    /* NOTE(review): post_send() asserts sg_entry.addr against this very
     * cast, so the conversion is kept as is.  A direct pointer -> uint64_t
     * cast may sign-extend on 32-bit targets; if it is ever changed to go
     * through uintptr_t, change the assert in post_send() in the same
     * patch. */
    to_com_frag(frag)->sg_entry.addr = (uint64_t)hdr;
}
/*
 * Constructor for mca_btl_openib_recv_frag_t.
 *
 * Tags the fragment as MCA_BTL_OPENIB_FRAG_RECV, lays out its buffer (BTL
 * header at the start of the free-list item's memory, payload segment right
 * behind it) and pre-builds the receive work request so that posting the
 * fragment later only needs a pointer to rd_desc.
 *
 * Pointers are converted to the 64-bit verbs fields through uintptr_t.
 * A direct pointer -> uint64_t cast has implementation-defined results when
 * pointers are narrower than 64 bits (GCC sign-extends), which would put a
 * bogus buffer address into sg_entry.addr on 32-bit builds.
 */
static void recv_constructor(mca_btl_openib_recv_frag_t *frag)
{
    mca_btl_openib_frag_t *base_frag = to_base_frag(frag);

    base_frag->type = MCA_BTL_OPENIB_FRAG_RECV;

    frag->hdr = (mca_btl_openib_header_t*)base_frag->base.super.ptr;
    /* payload begins immediately behind the BTL header */
    base_frag->segment.seg_addr.pval =
        ((unsigned char*)frag->hdr) + sizeof(mca_btl_openib_header_t);
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr;

    /* wr_id carries the fragment pointer so that the completion handler
     * can recover the fragment from the work completion */
    frag->rd_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->rd_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->rd_desc.num_sge = 1;
    frag->rd_desc.next = NULL;
}
/*
 * Constructor for mca_btl_openib_send_control_frag_t.  The class derives
 * from the send fragment, so the only thing left to do here is to replace
 * the type tag set by the parent constructor with
 * MCA_BTL_OPENIB_FRAG_CONTROL.
 */
static void send_control_constructor(mca_btl_openib_send_control_frag_t *frag)
{
to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_CONTROL;
}
/*
 * Constructor for mca_btl_openib_put_frag_t.
 *
 * A put fragment is an outgoing fragment whose send work request performs
 * an RDMA write; it is tagged MCA_BTL_OPENIB_FRAG_SEND_USER.
 */
static void put_constructor(mca_btl_openib_put_frag_t *frag)
{
    /* the put frag type is an alias of the out frag struct, so the send
     * work request can be reached directly */
    frag->sr_desc.opcode = IBV_WR_RDMA_WRITE;
    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
}
/*
 * Constructor for mca_btl_openib_get_frag_t.
 *
 * A get fragment is tagged MCA_BTL_OPENIB_FRAG_RECV_USER and owns a send
 * work request that is pre-built here as a signaled RDMA read with a single
 * scatter/gather entry.
 *
 * The fragment pointer is stored in wr_id through uintptr_t: a direct
 * pointer -> uint64_t cast has implementation-defined results when pointers
 * are narrower than 64 bits and draws a pointer-to-int-cast warning on
 * 32-bit builds.
 */
static void get_constructor(mca_btl_openib_get_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_RECV_USER;

    /* wr_id carries the fragment pointer so that the completion handler
     * can recover the fragment from the work completion */
    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
    frag->sr_desc.next = NULL;
}
OBJ_CLASS_INSTANCE(
mca_btl_openib_frag_t,
mca_btl_base_descriptor_t,
NULL,
base_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_frag_common_constructor,
NULL);
mca_btl_openib_com_frag_t,
mca_btl_openib_frag_t,
com_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_frag_control_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_frag_common_constructor,
mca_btl_openib_out_frag_t,
mca_btl_openib_com_frag_t,
out_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_in_frag_t,
mca_btl_openib_com_frag_t,
in_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_frag_t,
mca_btl_openib_out_frag_t,
send_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_frag_t,
mca_btl_openib_in_frag_t,
recv_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_control_frag_t,
mca_btl_openib_send_frag_t,
send_control_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_put_frag_t,
mca_btl_openib_out_frag_t,
put_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_user_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_user_frag_constructor,
mca_btl_openib_get_frag_t,
mca_btl_openib_in_frag_t,
get_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_user_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_recv_user_frag_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_recv_frag_common_constructor,
NULL);

Просмотреть файл

@ -43,14 +43,14 @@ typedef struct mca_btl_openib_header_t mca_btl_openib_header_t;
#define BTL_OPENIB_IS_RDMA_CREDITS(I) ((I)&BTL_OPENIB_RDMA_CREDITS_FLAG)
#define BTL_OPENIB_CREDITS(I) ((I)&~BTL_OPENIB_RDMA_CREDITS_FLAG)
#define BTL_OPENIB_HEADER_HTON(h) \
do { \
h.credits = htons(h.credits); \
#define BTL_OPENIB_HEADER_HTON(h) \
do { \
(h).credits = htons((h).credits); \
} while (0)
#define BTL_OPENIB_HEADER_NTOH(h) \
do { \
h.credits = ntohs(h.credits); \
#define BTL_OPENIB_HEADER_NTOH(h) \
do { \
(h).credits = ntohs((h).credits); \
} while (0)
@ -79,13 +79,13 @@ typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t;
#if OMPI_ENABLE_DEBUG
#define BTL_OPENIB_FOOTER_HTON(h) \
do { \
h.seq = htonl(h.seq); \
(h).seq = htonl((h).seq); \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
#define BTL_OPENIB_FOOTER_NTOH(h) \
do { \
h.seq = ntohl(h.seq); \
(h).seq = ntohl((h).seq); \
MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \
} while (0)
#else
@ -144,7 +144,7 @@ do { \
#define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h) \
do { \
h.rdma_credits = ntohs(h.rdma_credits); \
(h).rdma_credits = ntohs((h).rdma_credits); \
} while (0)
enum mca_btl_openib_frag_type_t {
@ -157,48 +157,84 @@ enum mca_btl_openib_frag_type_t {
};
typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t;
#define openib_frag_type(f) (to_base_frag(f)->type)
/**
* IB send fragment derived type.
* IB fragment derived type.
*/
struct mca_btl_openib_frag_t {
/* base openib frag */
typedef struct mca_btl_openib_frag_t {
mca_btl_base_descriptor_t base;
struct mca_btl_base_endpoint_t *endpoint;
mca_btl_openib_footer_t *ftr;
mca_btl_openib_header_t *hdr;
mca_btl_base_segment_t segment;
size_t size;
mca_btl_openib_frag_type_t type;
union{
struct ibv_recv_wr rd_desc;
struct ibv_send_wr sr_desc;
} wr_desc;
struct ibv_sge sg_entry;
struct mca_btl_openib_reg_t *registration;
ompi_free_list_t* list;
uint8_t qp_idx;
};
typedef struct mca_btl_openib_frag_t mca_btl_openib_frag_t;
} mca_btl_openib_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_t;
#define to_base_frag(f) ((mca_btl_openib_frag_t*)(f))
/* frag used for communication: extends the base frag with the state shared
 * by the outgoing (out) and incoming (in) fragment classes derived from it */
typedef struct mca_btl_openib_com_frag_t {
mca_btl_openib_frag_t super;
/* scatter/gather entry; the work request constructors point sg_list at it */
struct ibv_sge sg_entry;
/* memory registration; com_constructor copies it from
 * base.super.registration (may be NULL) */
struct mca_btl_openib_reg_t *registration;
/* endpoint the fragment is being used with; assigned by the code that
 * prepares the fragment for posting, not by a constructor */
struct mca_btl_base_endpoint_t *endpoint;
} mca_btl_openib_com_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t);
#define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f))
/* outgoing frag: a communication frag plus the send work request that is
 * handed to ibv_post_send() */
typedef struct mca_btl_openib_out_frag_t {
mca_btl_openib_com_frag_t super;
/* send work request; out_constructor fills in the defaults (single sge,
 * IBV_WR_SEND, signaled) */
struct ibv_send_wr sr_desc;
} mca_btl_openib_out_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_out_frag_t);
#define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f))
typedef struct mca_btl_openib_com_frag_t mca_btl_openib_in_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t);
#define to_in_frag(f) ((mca_btl_openib_in_frag_t*)(f))
/* send frag: an outgoing frag that carries a BTL header in front of the
 * payload */
typedef struct mca_btl_openib_send_frag_t {
mca_btl_openib_out_frag_t super;
/* BTL header; send_constructor points it at base.super.ptr */
mca_btl_openib_header_t *hdr;
/* BTL footer pointer; not assigned by the constructors in this file --
 * NOTE(review): confirm who sets it for send frags */
mca_btl_openib_footer_t *ftr;
/* QP index; mca_btl_openib_frag_init() takes it from init_data->order and
 * it is the QP used when the descriptor specifies no order */
uint8_t qp_idx;
} mca_btl_openib_send_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_user_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_user_frag_t);
#define to_send_frag(f) ((mca_btl_openib_send_frag_t*)(f))
typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_user_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_user_frag_t);
/* receive frag: an incoming frag with a BTL header in front of the payload
 * and its own receive work request */
typedef struct mca_btl_openib_recv_frag_t {
mca_btl_openib_in_frag_t super;
/* BTL header; recv_constructor points it at base.super.ptr */
mca_btl_openib_header_t *hdr;
/* BTL footer; set for eager RDMA frags to the address eager_limit bytes
 * past the start of the payload segment */
mca_btl_openib_footer_t *ftr;
/* receive work request handed to ibv_post_recv(); pre-built by
 * recv_constructor */
struct ibv_recv_wr rd_desc;
/* QP index; mca_btl_openib_frag_init() takes it from init_data->order */
uint8_t qp_idx;
} mca_btl_openib_recv_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f))
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_control_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t);
typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t);
#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f))
/* get frag: an incoming frag (data lands in the local segment) that is
 * nevertheless driven by a send work request, because an RDMA read is
 * posted on the send queue */
typedef struct mca_btl_openib_get_frag_t {
mca_btl_openib_in_frag_t super;
/* send work request; get_constructor sets it up as a signaled
 * IBV_WR_RDMA_READ with a single sge */
struct ibv_send_wr sr_desc;
} mca_btl_openib_get_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t);
#define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f))
typedef struct mca_btl_openib_send_frag_t mca_btl_openib_send_control_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t);
#define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f))
/*
* Allocate an IB send descriptor
*
@ -208,7 +244,7 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t);
do { \
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&(btl)->send_free_control, item, rc); \
frag = (mca_btl_openib_frag_t*)item; \
frag = to_send_control_frag(item); \
} while(0)
#define MCA_BTL_IB_FRAG_ALLOC_BY_SIZE(btl, frag, _size, rc) \
@ -222,42 +258,40 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t);
break; \
} \
} \
frag = (mca_btl_openib_frag_t*)item; \
frag = to_com_frag(item); \
} while(0);
#define MCA_BTL_IB_FRAG_ALLOC_SEND_USER(btl, frag, rc) \
do { \
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_GET(&(btl)->send_user_free, item, rc); \
frag = (mca_btl_openib_frag_t*)item; \
frag = to_com_frag(item); \
} while(0)
#define MCA_BTL_IB_FRAG_ALLOC_RECV_USER(btl, frag, rc) \
do { \
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_GET(&(btl)->recv_user_free, item, rc); \
frag = (mca_btl_openib_frag_t*) item; \
frag = to_com_frag(item); \
} while(0)
#define MCA_BTL_IB_FRAG_RETURN(btl, frag) \
#define MCA_BTL_IB_FRAG_RETURN(frag) \
do { \
OMPI_FREE_LIST_RETURN(frag->list, \
OMPI_FREE_LIST_RETURN(to_base_frag(frag)->list, \
(ompi_free_list_item_t*)(frag)); \
} while(0);
#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(btl,list) \
while(!opal_list_is_empty(list)){ \
opal_list_item_t *frag_item; \
frag_item = opal_list_remove_first(list); \
MCA_BTL_IB_FRAG_RETURN(btl, ((mca_btl_openib_frag_t*)frag_item)); \
} \
/*
 * Drain a list of pending fragments: every item is removed from the list
 * and handed back to the free list recorded in the fragment itself.
 *
 * list: pointer to the opal list to empty; it is evaluated once per
 *       iteration, so pass an expression without side effects.
 *
 * The last line deliberately has no trailing backslash: a continuation
 * there would splice whatever line follows the macro into its body.
 */
#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list)                          \
    while(!opal_list_is_empty(list)){                                     \
        opal_list_item_t *frag_item;                                      \
        frag_item = opal_list_remove_first(list);                         \
        MCA_BTL_IB_FRAG_RETURN(frag_item);                                \
    }
struct mca_btl_openib_module_t;
struct mca_btl_openib_frag_init_data_t {
uint8_t order;
size_t length;
mca_btl_openib_frag_type_t type;
ompi_free_list_t* list;
};
typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t;