diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index 98be9f9802..cb53a7bbf7 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -466,7 +466,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc( uint8_t order, size_t size) { - mca_btl_openib_frag_t* frag = NULL; + mca_btl_openib_com_frag_t* frag = NULL; mca_btl_openib_module_t* openib_btl; int rc; openib_btl = (mca_btl_openib_module_t*) btl; @@ -475,13 +475,12 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc( if(NULL == frag) return NULL; - /* GMS is this necessary anymore ? */ - frag->segment.seg_len = size; - frag->base.order = order; - frag->base.des_flags = 0; + /* not all upper layer users set this */ + to_base_frag(frag)->segment.seg_len = size; + to_base_frag(frag)->base.order = order; - assert(frag->qp_idx <= order); - return (mca_btl_base_descriptor_t*)frag; + assert(to_send_frag(frag)->qp_idx <= order); + return &to_base_frag(frag)->base; } /** @@ -494,19 +493,32 @@ int mca_btl_openib_free( struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des) { - mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des; - /* is this fragment pointing at user memory? 
*/ - if(((MCA_BTL_OPENIB_FRAG_SEND_USER == frag->type) || - (MCA_BTL_OPENIB_FRAG_RECV_USER == frag->type)) - && frag->registration != NULL) { - btl->btl_mpool->mpool_deregister(btl->btl_mpool, - (mca_mpool_base_registration_t*) - frag->registration); - frag->registration = NULL; + if(MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des) || + MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) { + mca_btl_openib_com_frag_t* frag = to_com_frag(des); + + if(frag->registration != NULL) { + btl->btl_mpool->mpool_deregister(btl->btl_mpool, + (mca_mpool_base_registration_t*)frag->registration); + frag->registration = NULL; + } } - - MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*) btl), frag); + + /* reset those field on free so we will not have to do it on alloc */ + to_base_frag(des)->base.des_flags = 0; + if(MCA_BTL_OPENIB_FRAG_RECV == openib_frag_type(des) || + MCA_BTL_OPENIB_FRAG_RECV_USER == openib_frag_type(des)) { + to_base_frag(des)->base.des_src = NULL; + to_base_frag(des)->base.des_src_cnt = 0; + } else if(MCA_BTL_OPENIB_FRAG_SEND == openib_frag_type(des) || + MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) { + to_base_frag(des)->base.des_dst = NULL; + to_base_frag(des)->base.des_dst_cnt = 0; + if(MCA_BTL_OPENIB_FRAG_SEND == openib_frag_type(des)) + to_com_frag(des)->sg_entry.addr = (uint64_t)to_send_frag(des)->hdr; + } + MCA_BTL_IB_FRAG_RETURN(des); return OMPI_SUCCESS; } @@ -546,8 +558,8 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( ) { mca_btl_openib_module_t *openib_btl; - mca_btl_openib_frag_t *frag = NULL; mca_btl_openib_reg_t *openib_reg; + mca_btl_openib_com_frag_t *frag = NULL; struct iovec iov; uint32_t iov_count = 1; size_t max_data = *size; @@ -574,38 +586,35 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( rc = btl->btl_mpool->mpool_register(btl->btl_mpool, iov.iov_base, max_data, 0, &registration); if(OMPI_SUCCESS != rc || NULL == registration) { - MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); + 
MCA_BTL_IB_FRAG_RETURN(frag); return NULL; } /* keep track of the registration we did */ - frag->registration = (mca_btl_openib_reg_t*)registration; + to_com_frag(frag)->registration = + (mca_btl_openib_reg_t*)registration; } openib_reg = (mca_btl_openib_reg_t*)registration; - frag->base.order = order; - frag->base.des_flags = 0; - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->base.des_flags = 0; frag->sg_entry.length = max_data; frag->sg_entry.lkey = openib_reg->mr->lkey; - frag->sg_entry.addr = (unsigned long)iov.iov_base; + frag->sg_entry.addr = (uint64_t)iov.iov_base; - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey; + to_base_frag(frag)->base.order = order; + to_base_frag(frag)->segment.seg_len = max_data; + to_base_frag(frag)->segment.seg_addr.pval = iov.iov_base; + to_base_frag(frag)->segment.seg_key.key32[0] = + (uint32_t)frag->sg_entry.lkey; assert(MCA_BTL_NO_ORDER == order); BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu " "frag->segment.seg_key.key32[0] = %lu", frag->sg_entry.lkey, frag->sg_entry.addr, - frag->segment.seg_key.key32[0])); + frag->sg_entry.lkey)); - return &frag->base; + + return &to_base_frag(frag)->base; } } @@ -621,20 +630,15 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( return NULL; iov.iov_len = max_data; - iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve; + iov.iov_base = (unsigned char*) + to_base_frag(frag)->segment.seg_addr.pval + reserve; rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data); - *size = max_data; - frag->segment.seg_len = max_data + reserve; - frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey; - /* frag->base.order = order; */ - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - 
frag->base.des_flags = 0; - frag->base.order = order; - return &frag->base; + *size = max_data; + to_base_frag(frag)->segment.seg_len = max_data + reserve; + to_base_frag(frag)->base.order = order; + + return &to_base_frag(frag)->base; } /** @@ -661,9 +665,10 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( size_t* size) { mca_btl_openib_module_t *openib_btl; - mca_btl_openib_frag_t *frag; + mca_btl_openib_com_frag_t *frag; mca_btl_openib_reg_t *openib_reg; int rc; + void *buffer; openib_btl = (mca_btl_openib_module_t*)btl; @@ -672,16 +677,16 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( return NULL; } - ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) ); + ompi_convertor_get_current_pointer(convertor, &buffer); if(NULL == registration){ /* we didn't get a memory registration passed in, so we have to * register the region ourselves */ - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - frag->segment.seg_addr.pval, *size, 0, &registration); + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, 0, + &registration); if(OMPI_SUCCESS != rc || NULL == registration) { - MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); + MCA_BTL_IB_FRAG_RETURN(frag); return NULL; } /* keep track of the registration we did */ @@ -691,24 +696,19 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( frag->sg_entry.length = *size; frag->sg_entry.lkey = openib_reg->mr->lkey; - frag->sg_entry.addr = (unsigned long) frag->segment.seg_addr.pval; + frag->sg_entry.addr = (uint64_t)buffer; - frag->segment.seg_len = *size; - frag->segment.seg_key.key32[0] = openib_reg->mr->rkey; - - frag->base.order = order; - frag->base.des_dst = &frag->segment; - frag->base.des_dst_cnt = 1; - frag->base.des_src = NULL; - frag->base.des_src_cnt = 0; - frag->base.des_flags = 0; + to_base_frag(frag)->segment.seg_addr.pval = buffer; + to_base_frag(frag)->segment.seg_len = *size; + to_base_frag(frag)->segment.seg_key.key32[0] = openib_reg->mr->rkey; + 
to_base_frag(frag)->base.order = order; BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu " "frag->segment.seg_key.key32[0] = %lu", frag->sg_entry.lkey, frag->sg_entry.addr, - frag->segment.seg_key.key32[0])); + openib_reg->mr->rkey)); - return &frag->base; + return &to_base_frag(frag)->base; } static int mca_btl_finalize_hca(struct mca_btl_openib_hca_t *hca) @@ -826,7 +826,7 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl) for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { if(BTL_OPENIB_QP_TYPE_SRQ(qp)){ - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(openib_btl, + MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( &openib_btl->qps[qp].u.srq_qp.pending_frags); if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){ @@ -895,13 +895,13 @@ int mca_btl_openib_send( mca_btl_base_tag_t tag) { - - mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)descriptor; - assert(frag->type == MCA_BTL_OPENIB_FRAG_SEND); + mca_btl_openib_send_frag_t* frag = to_send_frag(descriptor); + + assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND); - frag->endpoint = endpoint; + to_com_frag(frag)->endpoint = endpoint; frag->hdr->tag = tag; - frag->wr_desc.sr_desc.opcode = IBV_WR_SEND; + return mca_btl_openib_endpoint_send(endpoint, frag); } @@ -913,52 +913,50 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, mca_btl_base_endpoint_t* endpoint, mca_btl_base_descriptor_t* descriptor) { - int rc = OMPI_SUCCESS; struct ibv_send_wr* bad_wr; - mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor; -/* mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; */ - int qp = frag->base.order; + mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor); + int qp = descriptor->order; + uint64_t rem_addr = descriptor->des_dst->seg_addr.lval; + uint32_t rkey = descriptor->des_dst->seg_key.key32[0]; + + assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER || + openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND); if(MCA_BTL_NO_ORDER == qp) qp = 
mca_btl_openib_component.rdma_qp; - /* setup for queued requests */ - frag->endpoint = endpoint; - frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_WRITE; - /* check for a send wqe */ if (OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,-1) < 0) { OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1); OPAL_THREAD_LOCK(&endpoint->endpoint_lock); - opal_list_append(&endpoint->pending_put_frags, (opal_list_item_t *)frag); + opal_list_append(&endpoint->pending_put_frags, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - return rc; - - /* post descriptor */ - } else { - int ib_rc; - - frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; -#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT - if ((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) != - (ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) { - frag->wr_desc.sr_desc.wr.rdma.remote_addr = opal_swap_bytes8(frag->base.des_dst->seg_addr.lval); - frag->wr_desc.sr_desc.wr.rdma.rkey = opal_swap_bytes4(frag->base.des_dst->seg_key.key32[0]); - } else -#endif - { - frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_dst->seg_addr.lval; - frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0]; - } - frag->sg_entry.addr = (unsigned long) frag->base.des_src->seg_addr.pval; - frag->sg_entry.length = frag->base.des_src->seg_len; - - frag->base.order = qp; - ib_rc = ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr); - if(ib_rc) - rc = OMPI_ERROR; + return OMPI_SUCCESS; } - return rc; + /* post descriptor */ +#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT + if((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) + != (ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) { + rem_addr = opal_swap_bytes8(rem_addr); + rkey = opal_swap_bytes4(rkey); + } +#endif + frag->sr_desc.wr.rdma.remote_addr = rem_addr; + frag->sr_desc.wr.rdma.rkey = rkey; + + to_com_frag(frag)->sg_entry.addr = + (uint64_t)descriptor->des_src->seg_addr.pval; + 
to_com_frag(frag)->sg_entry.length = descriptor->des_src->seg_len; + to_com_frag(frag)->endpoint = endpoint; + + descriptor->order = qp; + /* Setting opcode on a frag constructor isn't enough since prepare_src + * may return send_frag instead of put_frag */ + frag->sr_desc.opcode = IBV_WR_RDMA_WRITE; + if(ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->sr_desc, &bad_wr)) + return OMPI_ERROR; + + return OMPI_SUCCESS; } @@ -970,69 +968,58 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl, mca_btl_base_endpoint_t* endpoint, mca_btl_base_descriptor_t* descriptor) { - int rc; struct ibv_send_wr* bad_wr; - mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor; -/* mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; */ - int qp = frag->base.order; - frag->endpoint = endpoint; - frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_READ; + mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor); + int qp = descriptor->order; + uint64_t rem_addr = descriptor->des_src->seg_addr.lval; + uint32_t rkey = descriptor->des_src->seg_key.key32[0]; + + assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER); if(MCA_BTL_NO_ORDER == qp) qp = mca_btl_openib_component.rdma_qp; /* check for a send wqe */ if (OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,-1) < 0) { - OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1); OPAL_THREAD_LOCK(&endpoint->endpoint_lock); opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return OMPI_SUCCESS; + } /* check for a get token */ - } else if(OPAL_THREAD_ADD32(&endpoint->get_tokens,-1) < 0) { - + if(OPAL_THREAD_ADD32(&endpoint->get_tokens,-1) < 0) { OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1); OPAL_THREAD_ADD32(&endpoint->get_tokens,1); OPAL_THREAD_LOCK(&endpoint->endpoint_lock); opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return OMPI_SUCCESS; - - } else { - - 
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; -#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT - if ((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) != - (ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) { - frag->wr_desc.sr_desc.wr.rdma.remote_addr = opal_swap_bytes8(frag->base.des_src->seg_addr.lval); - frag->wr_desc.sr_desc.wr.rdma.rkey = opal_swap_bytes4(frag->base.des_src->seg_key.key32[0]); - } else -#endif - { - frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_src->seg_addr.lval; - frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_src->seg_key.key32[0]; - } - frag->sg_entry.addr = (unsigned long) frag->base.des_dst->seg_addr.pval; - frag->sg_entry.length = frag->base.des_dst->seg_len; - - frag->base.order = qp; - if(ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr)){ - BTL_ERROR(("error posting send request errno (%d) says %s", - errno, strerror(errno))); - rc = ORTE_ERROR; - } else { - rc = ORTE_SUCCESS; - } } - - return rc; +#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT + if((endpoint->endpoint_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) + != (ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) { + rem_addr = opal_swap_bytes8(rem_addr); + rkey = opal_swap_bytes4(rkey); + } +#endif + frag->sr_desc.wr.rdma.remote_addr = rem_addr; + frag->sr_desc.wr.rdma.rkey = rkey; + + to_com_frag(frag)->sg_entry.addr = + (uint64_t)descriptor->des_dst->seg_addr.pval; + to_com_frag(frag)->sg_entry.length = descriptor->des_dst->seg_len; + to_com_frag(frag)->endpoint = endpoint; + + descriptor->order = qp; + if(ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->sr_desc, &bad_wr)) + return OMPI_ERROR; + + return OMPI_SUCCESS; } - - int mca_btl_openib_ft_event(int state) { if(OPAL_CRS_CHECKPOINT == state) { ; diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 7164c47e3c..584536fb6d 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -546,13 
+546,11 @@ static inline int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, for(i = 0; i < num_post; i++) { ompi_free_list_item_t* item; - mca_btl_openib_frag_t* frag; OMPI_FREE_LIST_WAIT(free_list, item, rc); - frag = (mca_btl_openib_frag_t*)item; - frag->base.order = qp; - frag->endpoint = NULL; + to_base_frag(item)->base.order = qp; + to_com_frag(item)->endpoint = NULL; if(ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, - &frag->wr_desc.rd_desc, + &to_recv_frag(item)->rd_desc, &bad_wr)) { BTL_ERROR(("error posting receive descriptors to shared " "receive queue: %s", strerror(errno))); diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index b9da4a59d2..3e5c0b8506 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -86,7 +86,7 @@ static void merge_values(ompi_btl_openib_ini_values_t *target, ompi_btl_openib_ini_values_t *src); static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, mca_btl_openib_endpoint_t *endpoint, - mca_btl_openib_frag_t *frag, + mca_btl_openib_recv_frag_t *frag, size_t byte_len, const int prio); static char* btl_openib_component_status_to_string(enum ibv_wc_status status); static int btl_openib_component_progress(void); @@ -95,9 +95,7 @@ static void btl_openib_frag_progress_pending_pp( mca_btl_base_endpoint_t *endpoint, const int qp); static void btl_openib_frag_progress_pending_srq( - mca_btl_openib_module_t* openib_btl, - mca_btl_base_endpoint_t *endpoint, - const int qp); + mca_btl_openib_module_t* openib_btl, const int qp); static void btl_openib_frag_progress_pending_put_get( mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint, const int qp); @@ -226,20 +224,20 @@ static int btl_openib_modex_send(void) static void btl_openib_control(struct mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, - mca_btl_base_descriptor_t* descriptor, + mca_btl_base_descriptor_t* des, void* cbdata) 
{ - /* dont return credits used for control messages */ - mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)descriptor; - mca_btl_openib_endpoint_t* endpoint = frag->endpoint; - mca_btl_openib_control_header_t *ctl_hdr = frag->segment.seg_addr.pval; + /* don't return credits used for control messages */ + mca_btl_openib_endpoint_t* endpoint = to_com_frag(des)->endpoint; + mca_btl_openib_control_header_t *ctl_hdr = + to_base_frag(des)->segment.seg_addr.pval; mca_btl_openib_eager_rdma_header_t *rdma_hdr; mca_btl_openib_rdma_credits_header_t *credits_hdr; - int qp = frag->qp_idx; + int qp = to_recv_frag(des)->qp_idx; if(BTL_OPENIB_EAGER_RDMA_QP(qp)) { /* if not sent via rdma */ - if(!MCA_BTL_OPENIB_RDMA_FRAG(frag) && + if(!MCA_BTL_OPENIB_RDMA_FRAG(des) && ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) { OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, 1); /* rd_posted don't account for rsv preposts for credit message but @@ -733,15 +731,13 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) openib_btl->super.btl_mpool = openib_btl->hca->mpool; init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t)); - length = sizeof(mca_btl_openib_send_user_frag_t); - init_data->length = length; - init_data->type = MCA_BTL_OPENIB_FRAG_SEND_USER; init_data->order = mca_btl_openib_component.rdma_qp; init_data->list = &openib_btl->send_user_free; if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->send_user_free, - length, 2, OBJ_CLASS(mca_btl_openib_send_user_frag_t), + sizeof(mca_btl_openib_put_frag_t), 2, + OBJ_CLASS(mca_btl_openib_put_frag_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, @@ -750,15 +746,13 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) } init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t)); - length = sizeof(mca_btl_openib_recv_user_frag_t); - init_data->length = length; - init_data->type = MCA_BTL_OPENIB_FRAG_RECV_USER; 
init_data->order = mca_btl_openib_component.rdma_qp; init_data->list = &openib_btl->recv_user_free; if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->recv_user_free, - length, 2, OBJ_CLASS(mca_btl_openib_recv_user_frag_t), + sizeof(mca_btl_openib_get_frag_t), 2, + OBJ_CLASS(mca_btl_openib_get_frag_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, @@ -767,19 +761,17 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) } init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t)); - length = sizeof(mca_btl_openib_send_frag_control_t) + + length = sizeof(mca_btl_openib_send_control_frag_t) + sizeof(mca_btl_openib_header_t) + sizeof(mca_btl_openib_footer_t) + sizeof(mca_btl_openib_eager_rdma_header_t); - init_data->length = sizeof(mca_btl_openib_eager_rdma_header_t); - init_data->type = MCA_BTL_OPENIB_FRAG_CONTROL; init_data->order = mca_btl_openib_component.eager_rdma_qp; init_data->list = &openib_btl->send_free_control; if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->send_free_control, length, mca_btl_openib_component.buffer_alignment, - OBJ_CLASS(mca_btl_openib_send_frag_control_t), + OBJ_CLASS(mca_btl_openib_send_control_frag_t), mca_btl_openib_component.ib_free_list_num, -1, mca_btl_openib_component.ib_free_list_inc, openib_btl->super.btl_mpool, mca_btl_openib_frag_init, @@ -805,8 +797,6 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) sizeof(mca_btl_openib_footer_t) + mca_btl_openib_component.qp_infos[qp].size; - init_data->length = mca_btl_openib_component.qp_infos[qp].size; - init_data->type = MCA_BTL_OPENIB_FRAG_SEND; init_data->order = qp; init_data->list = &openib_btl->qps[qp].send_free; @@ -827,8 +817,6 @@ static int finish_btl_init(mca_btl_openib_module_t *openib_btl) sizeof(mca_btl_openib_footer_t) + mca_btl_openib_component.qp_infos[qp].size; - init_data->length = mca_btl_openib_component.qp_infos[qp].size; - init_data->type = 
MCA_BTL_OPENIB_FRAG_RECV; init_data->order = qp; init_data->list = &openib_btl->qps[qp].recv_free; @@ -1135,36 +1123,37 @@ static void merge_values(ompi_btl_openib_ini_values_t *target, static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl, mca_btl_openib_endpoint_t *endpoint, - mca_btl_openib_frag_t *frag, + mca_btl_openib_recv_frag_t *frag, size_t byte_len, const int qp) { + mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base; + mca_btl_openib_header_t *hdr = frag->hdr; + if(endpoint->nbo) { - BTL_OPENIB_HEADER_NTOH((*(frag->hdr))); + BTL_OPENIB_HEADER_NTOH(*hdr); } /* advance the segment address past the header and subtract from the * length..*/ - frag->segment.seg_len = byte_len - sizeof(mca_btl_openib_header_t); + des->des_dst->seg_len = byte_len - sizeof(mca_btl_openib_header_t); /* call registered callback */ - openib_btl->ib_reg[frag->hdr->tag].cbfunc(&openib_btl->super, - frag->hdr->tag, &frag->base, - openib_btl->ib_reg[frag->hdr->tag].cbdata); + openib_btl->ib_reg[hdr->tag].cbfunc(&openib_btl->super, hdr->tag, des, + openib_btl->ib_reg[hdr->tag].cbdata); - if(BTL_OPENIB_IS_RDMA_CREDITS(frag->hdr->credits) && - BTL_OPENIB_CREDITS(frag->hdr->credits) > 0) { + if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits) && + BTL_OPENIB_CREDITS(hdr->credits) > 0) { OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, - BTL_OPENIB_CREDITS(frag->hdr->credits)); + BTL_OPENIB_CREDITS(hdr->credits)); } else { - if(BTL_OPENIB_QP_TYPE_PP(qp) && frag->hdr->credits > 0) { - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, - frag->hdr->credits); + if(BTL_OPENIB_QP_TYPE_PP(qp) && hdr->credits > 0) { + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, + hdr->credits); } } - if(frag->hdr->cm_seen) { - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, - -frag->hdr->cm_seen); + if(hdr->cm_seen) { + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -hdr->cm_seen); } /* We may receive credits here so try to progress only things that @@ 
-1258,21 +1247,19 @@ static void btl_openib_frag_progress_pending_pp( mca_btl_base_endpoint_t *endpoint, const int qp) { - opal_list_item_t *frag_item; - mca_btl_openib_frag_t* frag; + opal_list_item_t *frag; size_t i, len = opal_list_get_size(&endpoint->qps[qp].pending_frags); /* check to see if we need to progress any pending descriptors */ for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0 && BTL_OPENIB_TOKENS(endpoint, qp) > 0; i++) { OPAL_THREAD_LOCK(&endpoint->endpoint_lock); - frag_item = - opal_list_remove_first(&(endpoint->qps[qp].pending_frags)); + frag = opal_list_remove_first(&(endpoint->qps[qp].pending_frags)); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item)) + if(NULL == frag) break; - if(mca_btl_openib_endpoint_send(frag->endpoint, frag) == - OMPI_ERR_OUT_OF_RESOURCE) + if(mca_btl_openib_endpoint_send(endpoint, to_send_frag(frag)) == + OMPI_ERR_OUT_OF_RESOURCE) break; } } @@ -1280,43 +1267,39 @@ static void btl_openib_frag_progress_pending_pp( static void btl_openib_frag_progress_pending_put_get( mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint, const int qp) { - opal_list_item_t *frag_item; - mca_btl_openib_frag_t* frag; + opal_list_item_t *frag; size_t i, len = opal_list_get_size(&endpoint->pending_get_frags); + for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0 && endpoint->get_tokens > 0; i++) { OPAL_THREAD_LOCK(&endpoint->endpoint_lock); - frag_item = opal_list_remove_first(&(endpoint->pending_get_frags)); + frag = opal_list_remove_first(&(endpoint->pending_get_frags)); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item)) + if(NULL == frag) break; - if(mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, - frag->endpoint, (mca_btl_base_descriptor_t*)frag) == - OMPI_ERR_OUT_OF_RESOURCE) + if(mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, endpoint, + &to_base_frag(frag)->base) == OMPI_ERR_OUT_OF_RESOURCE) 
break; } len = opal_list_get_size(&endpoint->pending_put_frags); for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0; i++) { OPAL_THREAD_LOCK(&endpoint->endpoint_lock); - frag_item = opal_list_remove_first(&(endpoint->pending_put_frags)); + frag = opal_list_remove_first(&(endpoint->pending_put_frags)); OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); - if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item)) + if(NULL == frag) break; - if(mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, - frag->endpoint, (mca_btl_base_descriptor_t*)frag) == - OMPI_ERR_OUT_OF_RESOURCE) + if(mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, endpoint, + &to_base_frag(frag)->base) == OMPI_ERR_OUT_OF_RESOURCE) break; } } static void btl_openib_frag_progress_pending_srq( - mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint, - const int qp) + mca_btl_openib_module_t* openib_btl, const int qp) { - opal_list_item_t *frag_item; - mca_btl_openib_frag_t* frag; + opal_list_item_t *frag; size_t i, len; assert(BTL_OPENIB_QP_TYPE_SRQ(qp)); @@ -1325,13 +1308,13 @@ static void btl_openib_frag_progress_pending_srq( for(i = 0; i < len && openib_btl->qps[qp].u.srq_qp.sd_credits > 0; i++) { /* dequeue resources due to global flow control */ OPAL_THREAD_LOCK(&openib_btl->ib_lock); - frag_item = + frag = opal_list_remove_first(&openib_btl->qps[qp].u.srq_qp.pending_frags); OPAL_THREAD_UNLOCK(&openib_btl->ib_lock); - if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item)) + if(NULL == frag) break; - if(mca_btl_openib_endpoint_send(frag->endpoint, frag) == - OMPI_ERR_OUT_OF_RESOURCE) + if(mca_btl_openib_endpoint_send(to_com_frag(frag)->endpoint, + to_send_frag(frag)) == OMPI_ERR_OUT_OF_RESOURCE) break; } } @@ -1381,7 +1364,6 @@ static int btl_openib_component_progress(void) { int i, j, c; int count = 0, ret; - mca_btl_openib_frag_t* frag; mca_btl_openib_endpoint_t* endpoint; #if OMPI_HAVE_THREADS @@ -1395,6 +1377,7 @@ static int btl_openib_component_progress(void) * queues. 
*/ for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { + mca_btl_openib_recv_frag_t* frag; mca_btl_openib_module_t* openib_btl = mca_btl_openib_component.openib_btls[i]; c = openib_btl->eager_rdma_buffers_count; @@ -1414,7 +1397,7 @@ static int btl_openib_component_progress(void) int qp; opal_atomic_rmb(); if(endpoint->nbo) { - BTL_OPENIB_FOOTER_NTOH((*frag->ftr)); + BTL_OPENIB_FOOTER_NTOH(*frag->ftr); } size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr); #if OMPI_ENABLE_DEBUG @@ -1429,11 +1412,12 @@ static int btl_openib_component_progress(void) OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock); frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) - size + sizeof(mca_btl_openib_footer_t)); - frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + + to_base_frag(frag)->segment.seg_addr.pval = + ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); ret = btl_openib_handle_incoming(openib_btl, - frag->endpoint, frag, + to_com_frag(frag)->endpoint, frag, size - sizeof(mca_btl_openib_footer_t), frag->qp_idx); if (ret != MPI_SUCCESS) { @@ -1446,7 +1430,7 @@ static int btl_openib_component_progress(void) MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr); while (endpoint->eager_rdma_local.tail != endpoint->eager_rdma_local.head) { - mca_btl_openib_frag_t *tf; + mca_btl_openib_recv_frag_t *tf; tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint, endpoint->eager_rdma_local.tail); if (MCA_BTL_OPENIB_RDMA_FRAG_LOCAL (tf)) @@ -1492,8 +1476,9 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca) { static char *cq_name[] = {"HP CQ", "LP CQ"}; int cq, qp; - int count = 0,ne = 0, ret; - mca_btl_openib_frag_t* frag; + int count = 0,ne = 0; + mca_btl_openib_com_frag_t* frag; + mca_btl_base_descriptor_t *des; mca_btl_openib_endpoint_t* endpoint; mca_btl_openib_module_t *openib_btl = NULL; struct ibv_wc wc; @@ -1504,15 +1489,24 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca) ne = ibv_poll_cq(hca->ib_cq[cq], 1, &wc); if(0 == ne) 
continue; - if(ne < 0 || wc.status != IBV_WC_SUCCESS) + if(ne < 0) goto error; - - frag = (mca_btl_openib_frag_t*) (unsigned long) wc.wr_id; - qp = frag->base.order; + + des = (mca_btl_base_descriptor_t*)(uintptr_t)wc.wr_id; + frag = to_com_frag(des); + + /* For receive fragments "order" contains QP idx the fragment was posted + * to. For send fragments "order" contains QP idx the fragment was send + * through */ + qp = des->order; endpoint = frag->endpoint; + if(endpoint) openib_btl = endpoint->endpoint_btl; + if(wc.status != IBV_WC_SUCCESS) + goto error; + /* Handle work completions */ switch(wc.opcode) { case IBV_WC_RDMA_READ: @@ -1522,8 +1516,7 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca) case IBV_WC_RDMA_WRITE: case IBV_WC_SEND: /* Process a completed send/put/get */ - frag->base.des_cbfunc(&openib_btl->super, endpoint, &frag->base, - OMPI_SUCCESS); + des->des_cbfunc(&openib_btl->super, endpoint, des, OMPI_SUCCESS); /* return send wqe */ OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1); @@ -1531,7 +1524,7 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca) if(IBV_WC_SEND == wc.opcode && BTL_OPENIB_QP_TYPE_SRQ(qp)) { OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); /* new SRQ credit available. Try to progress pending frags*/ - btl_openib_frag_progress_pending_srq(openib_btl, endpoint, qp); + btl_openib_frag_progress_pending_srq(openib_btl, qp); } /* new wqe or/and get token available. 
Try to progress pending frags */ btl_openib_frag_progress_pending_pp(endpoint, qp); @@ -1548,13 +1541,15 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca) } /* Process a RECV */ - ret = btl_openib_handle_incoming(openib_btl, endpoint, frag, wc.byte_len, qp); - if (ret != OMPI_SUCCESS) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL); + if(btl_openib_handle_incoming(openib_btl, endpoint, + to_recv_frag(frag), wc.byte_len, qp) != OMPI_SUCCESS) { + openib_btl->error_cb(&openib_btl->super, + MCA_BTL_ERROR_FLAGS_FATAL); return 0; } - OMPI_FREE_LIST_RETURN(frag->list, (ompi_free_list_item_t*) frag); + MCA_BTL_IB_FRAG_RETURN(frag); + if(BTL_OPENIB_QP_TYPE_SRQ(qp)) { OPAL_THREAD_ADD32((int32_t*) &openib_btl->qps[qp].u.srq_qp.rd_posted, -1); @@ -1596,21 +1591,16 @@ error: } else { static int flush_err_printed[] = {0, 0}; ompi_proc_t* remote_proc = NULL; - frag = (mca_btl_openib_frag_t*) (unsigned long) wc.wr_id; - if(frag) { - endpoint = (mca_btl_openib_endpoint_t*) frag->endpoint; - if(endpoint && - endpoint->endpoint_proc && - endpoint->endpoint_proc->proc_ompi) { - remote_proc = endpoint->endpoint_proc->proc_ompi; - } + if(frag && endpoint && endpoint->endpoint_proc && + endpoint->endpoint_proc->proc_ompi) { + remote_proc = endpoint->endpoint_proc->proc_ompi; } if(wc.status != IBV_WC_WR_FLUSH_ERR || !flush_err_printed[cq]++) { BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s " "status number %d for wr_id %llu opcode %d qp_idx %d", cq_name[cq], btl_openib_component_status_to_string(wc.status), - wc.status, wc.wr_id, wc.opcode, frag->qp_idx)); + wc.status, wc.wr_id, wc.opcode, qp)); } if(wc.status == IBV_WC_RETRY_EXC_ERR) { opal_show_help("help-mpi-btl-openib.txt", diff --git a/ompi/mca/btl/openib/btl_openib_eager_rdma.h b/ompi/mca/btl/openib/btl_openib_eager_rdma.h index 53ba5ea900..ac9423a404 100644 --- a/ompi/mca/btl/openib/btl_openib_eager_rdma.h +++ b/ompi/mca/btl/openib/btl_openib_eager_rdma.h @@ -45,7 +45,7 @@ 
struct mca_btl_openib_eager_rdma_remote_t { typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remote_t; #define MCA_BTL_OPENIB_RDMA_FRAG(F) \ - ((F)->type == MCA_BTL_OPENIB_FRAG_EAGER_RDMA) + (openib_frag_type(F) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA) #define EAGER_RDMA_BUFFER_REMOTE (0) #define EAGER_RDMA_BUFFER_LOCAL (0xff) diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index de06827e5a..b495534565 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -48,62 +48,65 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint); static int post_send(mca_btl_openib_module_t *openib_btl, - mca_btl_openib_endpoint_t *endpoint, mca_btl_openib_frag_t *frag, + mca_btl_openib_endpoint_t *endpoint, mca_btl_openib_send_frag_t *frag, const int qp, const int do_rdma) { - struct ibv_send_wr *bad_wr; + struct ibv_send_wr *bad_wr; + mca_btl_base_segment_t *seg = &to_base_frag(frag)->segment; + struct ibv_sge *sg = &to_com_frag(frag)->sg_entry; + struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc; assert(!do_rdma || BTL_OPENIB_EAGER_RDMA_QP(qp)); - frag->sg_entry.length = frag->segment.seg_len + - sizeof(mca_btl_openib_header_t) + + sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) + (do_rdma ? 
sizeof(mca_btl_openib_footer_t) : 0); - if(frag->sg_entry.length <= openib_btl->ib_inline_max) { - frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED|IBV_SEND_INLINE; + if(sg->length <= openib_btl->ib_inline_max) { + sr_desc->send_flags = IBV_SEND_SIGNALED|IBV_SEND_INLINE; } else { - frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED; + sr_desc->send_flags = IBV_SEND_SIGNALED; } if(endpoint->nbo) - BTL_OPENIB_HEADER_HTON((*(frag->hdr))); + BTL_OPENIB_HEADER_HTON(*frag->hdr); if(do_rdma) { int32_t head; - mca_btl_openib_footer_t* ftr = - (mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) + - frag->segment.seg_len); - frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_WRITE; - MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, frag->sg_entry.length); + mca_btl_openib_footer_t* ftr = (mca_btl_openib_footer_t*) + (((char*)seg->seg_addr.pval) + seg->seg_len); + sr_desc->opcode = IBV_WR_RDMA_WRITE; + MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length); MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr); #if OMPI_ENABLE_DEBUG - ((mca_btl_openib_footer_t*)(((char*)frag->segment.seg_addr.pval) + - frag->segment.seg_len))->seq = - endpoint->eager_rdma_remote.seq++; + ((mca_btl_openib_footer_t*)(((char*)seg->seg_addr.pval) + + seg->seg_len))->seq = endpoint->eager_rdma_remote.seq++; #endif if(endpoint->nbo) BTL_OPENIB_FOOTER_HTON((*ftr)); - frag->wr_desc.sr_desc.wr.rdma.rkey = endpoint->eager_rdma_remote.rkey; + sr_desc->wr.rdma.rkey = endpoint->eager_rdma_remote.rkey; MCA_BTL_OPENIB_RDMA_MOVE_INDEX(endpoint->eager_rdma_remote.head, head); - frag->wr_desc.sr_desc.wr.rdma.remote_addr = + sr_desc->wr.rdma.remote_addr = endpoint->eager_rdma_remote.base.lval + head * openib_btl->eager_rdma_frag_size + sizeof(mca_btl_openib_header_t) + mca_btl_openib_component.eager_limit + sizeof(mca_btl_openib_footer_t); - frag->wr_desc.sr_desc.wr.rdma.remote_addr -= frag->sg_entry.length; + sr_desc->wr.rdma.remote_addr -= sg->length; } else { if(BTL_OPENIB_QP_TYPE_SRQ(qp)) { - frag->wr_desc.sr_desc.opcode = 
IBV_WR_SEND_WITH_IMM; - frag->wr_desc.sr_desc.imm_data = endpoint->rem_info.rem_index; + sr_desc->opcode = IBV_WR_SEND_WITH_IMM; + sr_desc->imm_data = endpoint->rem_info.rem_index; } else { - frag->wr_desc.sr_desc.opcode = IBV_WR_SEND; + sr_desc->opcode = IBV_WR_SEND; } } - frag->base.order = qp; - return ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr); + to_base_frag(frag)->base.order = qp; + + assert(sg->addr == (uint64_t)frag->hdr); + + return ibv_post_send(endpoint->qps[qp].lcl_qp, sr_desc, &bad_wr); } /* @@ -112,7 +115,7 @@ static int post_send(mca_btl_openib_module_t *openib_btl, static int btl_openib_acquire_send_resources( mca_btl_openib_module_t *openib_btl, mca_btl_openib_endpoint_t *endpoint, - mca_btl_openib_frag_t *frag, int *qp, int *do_rdma) + mca_btl_openib_send_frag_t *frag, int *qp, int *do_rdma) { if(*do_rdma) { if(OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) < 0) { @@ -164,22 +167,23 @@ static int btl_openib_acquire_send_resources( } while(0 == OPAL_ATOMIC_CMPSET_32(&FROM, TO, 0)) /* this function os called with endpoint->endpoint_lock held */ -static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* openib_btl, - mca_btl_openib_endpoint_t * endpoint, - mca_btl_openib_frag_t * frag) +static inline int mca_btl_openib_endpoint_post_send( + mca_btl_openib_module_t* openib_btl, + mca_btl_openib_endpoint_t *endpoint, + mca_btl_openib_send_frag_t *frag) { + mca_btl_openib_header_t *hdr = frag->hdr; + mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base; int do_rdma = 0, qp, ib_rc; int32_t cm_return; - frag->sg_entry.addr = (unsigned long) frag->hdr; - - if(frag->base.order != MCA_BTL_NO_ORDER) { - qp = frag->base.order; /* if order is provided use it */ + if(des->order != MCA_BTL_NO_ORDER) { + qp = des->order; /* if order is provided use it */ } else { qp = frag->qp_idx; - if(frag->segment.seg_len <= mca_btl_openib_component.eager_limit && - (frag->base.des_flags & 
MCA_BTL_DES_FLAGS_PRIORITY)) + if(des->des_src->seg_len <= mca_btl_openib_component.eager_limit && + (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) do_rdma = 1; /* High priority frag. Try to send over eager RDMA */ } @@ -187,42 +191,41 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope &do_rdma) == OMPI_ERR_OUT_OF_RESOURCE) return OMPI_SUCCESS; - frag->hdr->credits = 0; + hdr->credits = 0; if(BTL_OPENIB_EAGER_RDMA_QP(qp)) { - GET_CREDITS(endpoint->eager_rdma_local.credits, frag->hdr->credits); - if(frag->hdr->credits) - frag->hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG; + GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits); + if(hdr->credits) + hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG; } - if(BTL_OPENIB_QP_TYPE_PP(qp) && - 0 == frag->hdr->credits) { - GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits); + if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) { + GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits); } GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return); /* cm_seen is only 8 bytes, but cm_return is 32 bytes */ if(cm_return > 255) { - frag->hdr->cm_seen = 255; + hdr->cm_seen = 255; cm_return -= 255; OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return); } else { - frag->hdr->cm_seen = cm_return; + hdr->cm_seen = cm_return; } ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma); if(ib_rc) { if(endpoint->nbo) { - BTL_OPENIB_HEADER_NTOH((*(frag->hdr))); + BTL_OPENIB_HEADER_NTOH(*hdr); } - if(BTL_OPENIB_IS_RDMA_CREDITS(frag->hdr->credits)) { + if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) { OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, - BTL_OPENIB_CREDITS(frag->hdr->credits)); + BTL_OPENIB_CREDITS(hdr->credits)); } OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1); if(do_rdma) { OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1); } else { if(BTL_OPENIB_QP_TYPE_PP(qp)) { - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, 
frag->hdr->credits); + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits); OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1); } else { OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); @@ -371,8 +374,7 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags); - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl, - &endpoint->qps[qp].pending_frags); + MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].pending_frags); if(ibv_destroy_qp(endpoint->qps[qp].lcl_qp)) { BTL_ERROR(("Failed to destroy QP:%d\n", qp)); } @@ -382,16 +384,13 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) } OBJ_DESTRUCT(&endpoint->endpoint_lock); /* Clean pending lists */ - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl, - &endpoint->pending_lazy_frags); + MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags); OBJ_DESTRUCT(&endpoint->pending_lazy_frags); - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl, - &endpoint->pending_get_frags); + MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags); OBJ_DESTRUCT(&endpoint->pending_get_frags); - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(endpoint->endpoint_btl, - &endpoint->pending_put_frags); + MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags); OBJ_DESTRUCT(&endpoint->pending_put_frags); } @@ -422,8 +421,8 @@ int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint) void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) { opal_list_item_t *frag_item; - mca_btl_openib_frag_t *frag; - mca_btl_openib_module_t* openib_btl; + mca_btl_openib_send_frag_t *frag; + mca_btl_openib_module_t *openib_btl; endpoint->endpoint_state = MCA_BTL_IB_CONNECTED; @@ -434,22 +433,22 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint) /* While 
there are frags in the list, process them */ while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) { frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags)); - frag = (mca_btl_openib_frag_t *) frag_item; + frag = to_send_frag(frag_item); openib_btl = endpoint->endpoint_btl; /* We need to post this one */ - if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, endpoint, frag)) + if(OMPI_SUCCESS != mca_btl_openib_endpoint_post_send(openib_btl, + endpoint, frag)) BTL_ERROR(("Error posting send")); } } - /* * Attempt to send a fragment using a given endpoint. If the endpoint is not * connected, queue the fragment and start the connection as required. */ int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint, - mca_btl_openib_frag_t* frag) + mca_btl_openib_send_frag_t* frag) { int rc; bool call_progress = false; @@ -530,7 +529,7 @@ static void mca_btl_openib_endpoint_credits( int qp; - mca_btl_openib_frag_t *frag = (mca_btl_openib_frag_t*)descriptor; + mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(descriptor); qp = frag->qp_idx; @@ -554,7 +553,7 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, const int qp) { mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; - mca_btl_openib_frag_t* frag; + mca_btl_openib_send_control_frag_t* frag; mca_btl_openib_rdma_credits_header_t *credits_hdr; int do_rdma = 0, ib_rc; int32_t cm_return; @@ -565,11 +564,19 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, ib_rc); frag->qp_idx = qp; endpoint->qps[qp].credit_frag = frag; + /* set those once and forever */ + to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits; + to_base_frag(frag)->base.des_cbdata = NULL; + to_com_frag(frag)->endpoint = endpoint; + frag->hdr->tag = MCA_BTL_TAG_BTL; + to_base_frag(frag)->segment.seg_len = + sizeof(mca_btl_openib_rdma_credits_header_t); } 
assert(frag->qp_idx == qp); credits_hdr = - (mca_btl_openib_rdma_credits_header_t*)frag->segment.seg_addr.pval; + (mca_btl_openib_rdma_credits_header_t*) + to_base_frag(frag)->segment.seg_addr.pval; if(BTL_OPENIB_EAGER_RDMA_QP(qp)) { if(OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) < 0) { OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1); @@ -587,11 +594,6 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, } } - frag->base.des_cbfunc = mca_btl_openib_endpoint_credits; - frag->base.des_cbdata = NULL; - frag->endpoint = endpoint; - - frag->hdr->tag = MCA_BTL_TAG_BTL; GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits); GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return); @@ -612,24 +614,26 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint, if(endpoint->nbo) BTL_OPENIB_RDMA_CREDITS_HEADER_HTON((*credits_hdr)); - frag->segment.seg_len = sizeof(mca_btl_openib_rdma_credits_header_t); - frag->sg_entry.addr = (unsigned long)frag->hdr; - if((ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma))) { - if(endpoint->nbo) { - BTL_OPENIB_HEADER_NTOH((*frag->hdr)); - BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH((*credits_hdr)); - } - BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp); - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits); - OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits); - if(do_rdma) - OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1); - else - OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); - BTL_ERROR(("error posting send request errno %d says %s", ib_rc, - strerror(errno))); + if(!(ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma))) + return; + + if(endpoint->nbo) { + BTL_OPENIB_HEADER_NTOH(*frag->hdr); + BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr); } + BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp); + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, + 
frag->hdr->credits); + OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, + credits_hdr->rdma_credits); + if(do_rdma) + OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1); + else + OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1); + + BTL_ERROR(("error posting send request errno %d says %s", ib_rc, + strerror(errno))); } /* local callback function for completion of eager rdma connect */ @@ -639,17 +643,16 @@ static void mca_btl_openib_endpoint_eager_rdma_connect_cb( struct mca_btl_base_descriptor_t* descriptor, int status) { - MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*)btl), - ((mca_btl_openib_frag_t*)descriptor)); + MCA_BTL_IB_FRAG_RETURN(descriptor); } -/* send the eager rdma conect message to the remote endpoint */ +/* send the eager rdma connect message to the remote endpoint */ static int mca_btl_openib_endpoint_send_eager_rdma( mca_btl_base_endpoint_t* endpoint) { mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; mca_btl_openib_eager_rdma_header_t *rdma_hdr; - mca_btl_openib_frag_t* frag; + mca_btl_openib_send_control_frag_t* frag; int rc; MCA_BTL_IB_FRAG_ALLOC_CREDIT_WAIT(openib_btl, frag, rc); @@ -657,13 +660,17 @@ static int mca_btl_openib_endpoint_send_eager_rdma( return -1; } - frag->base.des_cbfunc = mca_btl_openib_endpoint_eager_rdma_connect_cb; - frag->base.des_cbdata = NULL; - frag->endpoint = endpoint; - frag->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; + to_base_frag(frag)->base.des_cbfunc = + mca_btl_openib_endpoint_eager_rdma_connect_cb; + to_base_frag(frag)->base.des_cbdata = NULL; + to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; + to_send_frag(frag)->qp_idx = 0; + to_base_frag(frag)->segment.seg_len = + sizeof(mca_btl_openib_eager_rdma_header_t); + to_com_frag(frag)->endpoint = endpoint; frag->hdr->tag = MCA_BTL_TAG_BTL; - rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)frag->segment.seg_addr.pval; + rdma_hdr = 
(mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval; rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA; rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey; rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval); @@ -675,7 +682,6 @@ static int mca_btl_openib_endpoint_send_eager_rdma( rdma_hdr->control.type, sizeof(mca_btl_openib_eager_rdma_header_t) )); - frag->segment.seg_len = sizeof(mca_btl_openib_eager_rdma_header_t); if(endpoint->nbo) { BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr)); @@ -688,7 +694,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma( )); } if (mca_btl_openib_endpoint_send(endpoint, frag) != OMPI_SUCCESS) { - MCA_BTL_IB_FRAG_RETURN(openib_btl, frag); + MCA_BTL_IB_FRAG_RETURN(frag); BTL_ERROR(("Error sending RDMA buffer", strerror(errno))); return -1; } @@ -742,18 +748,18 @@ void mca_btl_openib_endpoint_connect_eager_rdma( item->ptr = buf + i * openib_btl->eager_rdma_frag_size; OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t); - init_data.length = mca_btl_openib_component.eager_limit; init_data.order = mca_btl_openib_component.eager_rdma_qp; - init_data.type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA; init_data.list = NULL; mca_btl_openib_frag_init(item, &init_data); - frag = (mca_btl_openib_recv_frag_t*) item; - frag->ftr = (mca_btl_openib_footer_t*)((char*)frag->segment.seg_addr.pval - + frag->size); + frag = to_recv_frag(item); + to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA; + to_com_frag(frag)->endpoint = endpoint; + frag->ftr = (mca_btl_openib_footer_t*) + ((char*)to_base_frag(frag)->segment.seg_addr.pval + + mca_btl_openib_component.eager_limit); MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr); - ((mca_btl_openib_frag_t*)item)->endpoint = endpoint; } endpoint->eager_rdma_local.frags = headers_buf; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index b0d2f6992e..97fa9f4330 100644 --- 
a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -121,7 +121,7 @@ struct mca_btl_openib_endpoint_qp_t { case of PP QP, if there is no credit available */ int32_t rd_credit_send_lock; /**< Lock credit send fragment */ - struct mca_btl_openib_frag_t *credit_frag; + mca_btl_openib_send_control_frag_t *credit_frag; union { mca_btl_openib_endpoint_srq_qp_t srq_qp; mca_btl_openib_endpoint_pp_qp_t pp_qp; @@ -199,7 +199,7 @@ typedef mca_btl_base_endpoint_t mca_btl_openib_endpoint_t; OBJ_CLASS_DECLARATION(mca_btl_openib_endpoint_t); int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint, - struct mca_btl_openib_frag_t* frag); + struct mca_btl_openib_send_frag_t* frag); void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int); void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*); int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint); @@ -233,13 +233,11 @@ static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpo for(i = 0; i < (num_post + cm_received); i++) { ompi_free_list_item_t* item; - mca_btl_openib_frag_t* frag; OMPI_FREE_LIST_WAIT(free_list, item, rc); - frag = (mca_btl_openib_frag_t*)item; - frag->endpoint = endpoint; - frag->base.order = qp; + to_base_frag(item)->base.order = qp; + to_com_frag(item)->endpoint = endpoint; if(ibv_post_recv(endpoint->qps[qp].lcl_qp, - &frag->wr_desc.rd_desc, + &to_recv_frag(item)->rd_desc, &bad_wr)) { BTL_ERROR(("error posting receive errno says %s\n", strerror(errno))); diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index 92da433aee..6010a7c32a 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -22,136 +22,172 @@ #include "btl_openib_frag.h" #include "btl_openib_eager_rdma.h" -void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx) { - - mca_btl_openib_frag_init_data_t* init_data = - 
(mca_btl_openib_frag_init_data_t*) ctx; - mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) item; - mca_btl_openib_reg_t* registration = - (mca_btl_openib_reg_t*)frag->base.super.registration; - - frag->size = init_data->length; - assert(init_data->order != 255); - frag->base.order = MCA_BTL_NO_ORDER; - frag->type = init_data->type; - frag->list = init_data->list; - frag->qp_idx = init_data->order; - - frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr; - frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); - - if(registration) { - frag->registration = registration; - frag->sg_entry.lkey = registration->mr->lkey; - frag->segment.seg_key.key32[0] = frag->sg_entry.lkey; +void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx) +{ + mca_btl_openib_frag_init_data_t* init_data = ctx; + mca_btl_openib_frag_t *frag = to_base_frag(item); + + if(MCA_BTL_OPENIB_FRAG_RECV == frag->type) { + to_recv_frag(frag)->qp_idx = init_data->order; + to_com_frag(frag)->sg_entry.length = + mca_btl_openib_component.qp_infos[init_data->order].size + + sizeof(mca_btl_openib_header_t); } - - /* init the segment address to start after the btl header */ - frag->segment.seg_len = frag->size; - frag->sg_entry.addr = (unsigned long) frag->hdr; - frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t); - frag->base.des_flags = 0; - return; + if(MCA_BTL_OPENIB_FRAG_SEND == frag->type) + to_send_frag(frag)->qp_idx = init_data->order; + + frag->list = init_data->list; } - - -static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* frag) -{ - frag->base.des_src = &frag->segment; - frag->base.des_src_cnt = 1; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - - - frag->wr_desc.sr_desc.wr_id = (unsigned long) frag; - frag->wr_desc.sr_desc.sg_list = &frag->sg_entry; - frag->wr_desc.sr_desc.num_sge = 1; - frag->wr_desc.sr_desc.opcode = IBV_WR_SEND; - frag->wr_desc.sr_desc.send_flags = 
IBV_SEND_SIGNALED; - frag->wr_desc.sr_desc.next = NULL; +static void base_constructor(mca_btl_openib_frag_t *frag) +{ + frag->base.order = MCA_BTL_NO_ORDER; } -static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag) -{ - frag->base.des_dst = &frag->segment; - frag->base.des_dst_cnt = 1; - frag->base.des_src = NULL; - frag->base.des_src_cnt = 0; - - frag->wr_desc.rd_desc.wr_id = (unsigned long) frag; - frag->wr_desc.rd_desc.sg_list = &frag->sg_entry; - frag->wr_desc.rd_desc.num_sge = 1; - frag->wr_desc.rd_desc.next = NULL; +static void com_constructor(mca_btl_openib_com_frag_t *frag) +{ + mca_btl_openib_frag_t *base_frag = to_base_frag(frag); + mca_btl_openib_reg_t* reg = + (mca_btl_openib_reg_t*)base_frag->base.super.registration; + + frag->registration = reg; + + if(reg) { + frag->sg_entry.lkey = reg->mr->lkey; + base_frag->segment.seg_key.key32[0] = reg->mr->lkey; + } } +static void out_constructor(mca_btl_openib_out_frag_t *frag) +{ + mca_btl_openib_frag_t *base_frag = to_base_frag(frag); -static void mca_btl_openib_recv_user_frag_constructor(mca_btl_openib_frag_t* frag) -{ - frag->registration = NULL; - frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr; - frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); - - /* init the segment address to start after the btl header */ - frag->segment.seg_len = frag->size; - frag->sg_entry.addr = (unsigned long) frag->hdr; - frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t); - frag->base.des_flags = 0; + base_frag->base.des_src = &base_frag->segment; + base_frag->base.des_src_cnt = 1; + base_frag->base.des_dst = NULL; + base_frag->base.des_dst_cnt = 0; - mca_btl_openib_recv_frag_common_constructor(frag); + frag->sr_desc.wr_id = (uint64_t)frag; + frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry; + frag->sr_desc.num_sge = 1; + frag->sr_desc.opcode = IBV_WR_SEND; + frag->sr_desc.send_flags = IBV_SEND_SIGNALED; + 
frag->sr_desc.next = NULL; } +static void in_constructor(mca_btl_openib_in_frag_t *frag) +{ + mca_btl_openib_frag_t *base_frag = to_base_frag(frag); -static void mca_btl_openib_send_user_frag_constructor(mca_btl_openib_frag_t* frag) -{ - frag->registration = NULL; - frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr; - frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); - - /* init the segment address to start after the btl header */ - frag->segment.seg_len = frag->size; - frag->sg_entry.addr = (unsigned long) frag->hdr; - frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t); - frag->base.des_flags = 0; + base_frag->base.des_dst = &base_frag->segment; + base_frag->base.des_dst_cnt = 1; + base_frag->base.des_src = NULL; + base_frag->base.des_src_cnt = 0; +} - mca_btl_openib_send_frag_common_constructor(frag); +static void send_constructor(mca_btl_openib_send_frag_t *frag) +{ + mca_btl_openib_frag_t *base_frag = to_base_frag(frag); + + base_frag->type = MCA_BTL_OPENIB_FRAG_SEND; + + frag->hdr = (mca_btl_openib_header_t*)base_frag->base.super.ptr; + base_frag->segment.seg_addr.pval = + ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); + to_com_frag(frag)->sg_entry.addr = (uint64_t)frag->hdr; +} + +static void recv_constructor(mca_btl_openib_recv_frag_t *frag) +{ + mca_btl_openib_frag_t *base_frag = to_base_frag(frag); + + base_frag->type = MCA_BTL_OPENIB_FRAG_RECV; + + frag->hdr = (mca_btl_openib_header_t*)base_frag->base.super.ptr; + base_frag->segment.seg_addr.pval = + ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); + to_com_frag(frag)->sg_entry.addr = (uint64_t)frag->hdr; + + frag->rd_desc.wr_id = (uint64_t)frag; + frag->rd_desc.sg_list = &to_com_frag(frag)->sg_entry; + frag->rd_desc.num_sge = 1; + frag->rd_desc.next = NULL; +} + +static void send_control_constructor(mca_btl_openib_send_control_frag_t *frag) +{ + to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_CONTROL; +} 
+ +static void put_constructor(mca_btl_openib_put_frag_t *frag) +{ + to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER; + to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; +} + +static void get_constructor(mca_btl_openib_get_frag_t *frag) +{ + to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_RECV_USER; + + frag->sr_desc.wr_id = (uint64_t)frag; + frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry; + frag->sr_desc.num_sge = 1; + frag->sr_desc.opcode = IBV_WR_RDMA_READ; + frag->sr_desc.send_flags = IBV_SEND_SIGNALED; + frag->sr_desc.next = NULL; } OBJ_CLASS_INSTANCE( mca_btl_openib_frag_t, mca_btl_base_descriptor_t, - NULL, + base_constructor, NULL); OBJ_CLASS_INSTANCE( - mca_btl_openib_send_frag_t, - mca_btl_base_descriptor_t, - mca_btl_openib_send_frag_common_constructor, - NULL); - + mca_btl_openib_com_frag_t, + mca_btl_openib_frag_t, + com_constructor, + NULL); OBJ_CLASS_INSTANCE( - mca_btl_openib_send_frag_control_t, - mca_btl_base_descriptor_t, - mca_btl_openib_send_frag_common_constructor, + mca_btl_openib_out_frag_t, + mca_btl_openib_com_frag_t, + out_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_openib_in_frag_t, + mca_btl_openib_com_frag_t, + in_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_openib_send_frag_t, + mca_btl_openib_out_frag_t, + send_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_openib_recv_frag_t, + mca_btl_openib_in_frag_t, + recv_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_openib_send_control_frag_t, + mca_btl_openib_send_frag_t, + send_control_constructor, + NULL); + +OBJ_CLASS_INSTANCE( + mca_btl_openib_put_frag_t, + mca_btl_openib_out_frag_t, + put_constructor, NULL); OBJ_CLASS_INSTANCE( - mca_btl_openib_send_user_frag_t, - mca_btl_base_descriptor_t, - mca_btl_openib_send_user_frag_constructor, + mca_btl_openib_get_frag_t, + mca_btl_openib_in_frag_t, + get_constructor, NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_recv_user_frag_t, - mca_btl_base_descriptor_t, - 
mca_btl_openib_recv_user_frag_constructor, - NULL); - -OBJ_CLASS_INSTANCE( - mca_btl_openib_recv_frag_t, - mca_btl_base_descriptor_t, - mca_btl_openib_recv_frag_common_constructor, - NULL); - diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 470b30f374..a2a5ca303b 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -43,14 +43,14 @@ typedef struct mca_btl_openib_header_t mca_btl_openib_header_t; #define BTL_OPENIB_IS_RDMA_CREDITS(I) ((I)&BTL_OPENIB_RDMA_CREDITS_FLAG) #define BTL_OPENIB_CREDITS(I) ((I)&~BTL_OPENIB_RDMA_CREDITS_FLAG) -#define BTL_OPENIB_HEADER_HTON(h) \ -do { \ - h.credits = htons(h.credits); \ +#define BTL_OPENIB_HEADER_HTON(h) \ +do { \ + (h).credits = htons((h).credits); \ } while (0) -#define BTL_OPENIB_HEADER_NTOH(h) \ -do { \ - h.credits = ntohs(h.credits); \ +#define BTL_OPENIB_HEADER_NTOH(h) \ +do { \ + (h).credits = ntohs((h).credits); \ } while (0) @@ -79,13 +79,13 @@ typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t; #if OMPI_ENABLE_DEBUG #define BTL_OPENIB_FOOTER_HTON(h) \ do { \ - h.seq = htonl(h.seq); \ + (h).seq = htonl((h).seq); \ MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \ } while (0) #define BTL_OPENIB_FOOTER_NTOH(h) \ do { \ - h.seq = ntohl(h.seq); \ + (h).seq = ntohl((h).seq); \ MCA_BTL_OPENIB_FTR_SIZE_REVERSE(h); \ } while (0) #else @@ -144,7 +144,7 @@ do { \ #define BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(h) \ do { \ - h.rdma_credits = ntohs(h.rdma_credits); \ + (h).rdma_credits = ntohs((h).rdma_credits); \ } while (0) enum mca_btl_openib_frag_type_t { @@ -157,48 +157,84 @@ enum mca_btl_openib_frag_type_t { }; typedef enum mca_btl_openib_frag_type_t mca_btl_openib_frag_type_t; +#define openib_frag_type(f) (to_base_frag(f)->type) /** - * IB send fragment derived type. + * IB fragment derived type. 
*/ -struct mca_btl_openib_frag_t { + +/* base openib frag */ +typedef struct mca_btl_openib_frag_t { mca_btl_base_descriptor_t base; - struct mca_btl_base_endpoint_t *endpoint; - mca_btl_openib_footer_t *ftr; - mca_btl_openib_header_t *hdr; mca_btl_base_segment_t segment; - size_t size; mca_btl_openib_frag_type_t type; - union{ - struct ibv_recv_wr rd_desc; - struct ibv_send_wr sr_desc; - } wr_desc; - struct ibv_sge sg_entry; - struct mca_btl_openib_reg_t *registration; ompi_free_list_t* list; - uint8_t qp_idx; -}; -typedef struct mca_btl_openib_frag_t mca_btl_openib_frag_t; +} mca_btl_openib_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t); -typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_t; +#define to_base_frag(f) ((mca_btl_openib_frag_t*)(f)) + +/* frag used for communication */ +typedef struct mca_btl_openib_com_frag_t { + mca_btl_openib_frag_t super; + struct ibv_sge sg_entry; + struct mca_btl_openib_reg_t *registration; + struct mca_btl_base_endpoint_t *endpoint; +} mca_btl_openib_com_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_com_frag_t); + +#define to_com_frag(f) ((mca_btl_openib_com_frag_t*)(f)) + +typedef struct mca_btl_openib_out_frag_t { + mca_btl_openib_com_frag_t super; + struct ibv_send_wr sr_desc; +} mca_btl_openib_out_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_out_frag_t); + +#define to_out_frag(f) ((mca_btl_openib_out_frag_t*)(f)) + +typedef struct mca_btl_openib_com_frag_t mca_btl_openib_in_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_in_frag_t); + +#define to_in_frag(f) ((mca_btl_openib_in_frag_t*)(f)) + +typedef struct mca_btl_openib_send_frag_t { + mca_btl_openib_out_frag_t super; + mca_btl_openib_header_t *hdr; + mca_btl_openib_footer_t *ftr; + uint8_t qp_idx; +} mca_btl_openib_send_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t); -typedef struct mca_btl_openib_frag_t mca_btl_openib_send_user_frag_t; - -OBJ_CLASS_DECLARATION(mca_btl_openib_send_user_frag_t); +#define to_send_frag(f) 
((mca_btl_openib_send_frag_t*)(f)) -typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_user_frag_t; - -OBJ_CLASS_DECLARATION(mca_btl_openib_recv_user_frag_t); +typedef struct mca_btl_openib_recv_frag_t { + mca_btl_openib_in_frag_t super; + mca_btl_openib_header_t *hdr; + mca_btl_openib_footer_t *ftr; + struct ibv_recv_wr rd_desc; + uint8_t qp_idx; +} mca_btl_openib_recv_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t); -typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_frag_t; - -OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t); +#define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f)) -typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_control_t; - -OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t); +typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); +#define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f)) + +typedef struct mca_btl_openib_get_frag_t { + mca_btl_openib_in_frag_t super; + struct ibv_send_wr sr_desc; +} mca_btl_openib_get_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t); + +#define to_get_frag(f) ((mca_btl_openib_get_frag_t*)(f)) + +typedef struct mca_btl_openib_send_frag_t mca_btl_openib_send_control_frag_t; +OBJ_CLASS_DECLARATION(mca_btl_openib_send_control_frag_t); + +#define to_send_control_frag(f) ((mca_btl_openib_send_control_frag_t*)(f)) /* * Allocate an IB send descriptor * @@ -208,7 +244,7 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t); do { \ ompi_free_list_item_t *item; \ OMPI_FREE_LIST_WAIT(&(btl)->send_free_control, item, rc); \ - frag = (mca_btl_openib_frag_t*)item; \ + frag = to_send_control_frag(item); \ } while(0) #define MCA_BTL_IB_FRAG_ALLOC_BY_SIZE(btl, frag, _size, rc) \ @@ -222,42 +258,40 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t); break; \ } \ } \ - frag = (mca_btl_openib_frag_t*)item; \ + frag = to_com_frag(item); \ } while(0); #define MCA_BTL_IB_FRAG_ALLOC_SEND_USER(btl, 
frag, rc) \ do { \ ompi_free_list_item_t *item; \ OMPI_FREE_LIST_GET(&(btl)->send_user_free, item, rc); \ - frag = (mca_btl_openib_frag_t*)item; \ + frag = to_com_frag(item); \ } while(0) #define MCA_BTL_IB_FRAG_ALLOC_RECV_USER(btl, frag, rc) \ do { \ ompi_free_list_item_t *item; \ OMPI_FREE_LIST_GET(&(btl)->recv_user_free, item, rc); \ - frag = (mca_btl_openib_frag_t*) item; \ + frag = to_com_frag(item); \ } while(0) -#define MCA_BTL_IB_FRAG_RETURN(btl, frag) \ +#define MCA_BTL_IB_FRAG_RETURN(frag) \ do { \ - OMPI_FREE_LIST_RETURN(frag->list, \ + OMPI_FREE_LIST_RETURN(to_base_frag(frag)->list, \ (ompi_free_list_item_t*)(frag)); \ } while(0); -#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(btl,list) \ - while(!opal_list_is_empty(list)){ \ - opal_list_item_t *frag_item; \ - frag_item = opal_list_remove_first(list); \ - MCA_BTL_IB_FRAG_RETURN(btl, ((mca_btl_openib_frag_t*)frag_item)); \ - } \ +#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(list) \ + while(!opal_list_is_empty(list)){ \ + opal_list_item_t *frag_item; \ + frag_item = opal_list_remove_first(list); \ + MCA_BTL_IB_FRAG_RETURN(frag_item); \ + } \ struct mca_btl_openib_module_t; struct mca_btl_openib_frag_init_data_t { uint8_t order; - size_t length; - mca_btl_openib_frag_type_t type; ompi_free_list_t* list; }; typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t;