1
1

Adding send_immediate (sendi) implementation to openib btl.

This commit was SVN r20881.
Этот коммит содержится в:
Pavel Shamis 2009-03-25 16:53:26 +00:00
родитель c2d8fae9a0
Коммит d25b7203a2
5 изменённых файлов: 291 добавлений и 90 удалений

Просмотреть файл

@ -89,7 +89,7 @@ mca_btl_openib_module_t mca_btl_openib_module = {
mca_btl_openib_prepare_src,
mca_btl_openib_prepare_dst,
mca_btl_openib_send,
NULL, /* send immediate */
mca_btl_openib_sendi, /* send immediate */
mca_btl_openib_put,
mca_btl_openib_get,
mca_btl_base_dump,
@ -1054,6 +1054,171 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
return rc;
}
/*
 * Send immediate - minimum function calls, minimum checks, send the data ASAP.
 *
 * Tries to pack the PML header plus payload into a freshly allocated send
 * fragment and post it right away.  If the BTL cannot send the message
 * immediately (endpoint not connected, fragments already pending on the QP,
 * no WQE, no credits, no free fragment, or post_send() fails), every resource
 * acquired so far is given back, a descriptor of the required size is
 * allocated with mca_btl_openib_alloc() and handed to the PML through
 * *descriptor.
 *
 * The endpoint lock is taken on entry and released on every exit path.
 *
 * @param btl          (IN)  BTL instance
 * @param ep           (IN)  Endpoint
 * @param convertor    (IN)  Datatype convertor used to pack the payload
 * @param header       (IN)  PML header, copied to the start of the fragment
 * @param header_size  (IN)  PML header size
 * @param payload_size (IN)  Payload size (0 means header only)
 * @param order        (IN)  Order, only forwarded to mca_btl_openib_alloc()
 * @param flags        (IN)  Descriptor flags
 * @param tag          (IN)  Tag stored in the BTL header
 * @param descriptor   (OUT) Fallback descriptor when the send could not be
 *                           posted; may be NULL if the caller does not want one
 *
 * @return OMPI_SUCCESS if the message was posted,
 *         OMPI_ERR_RESOURCE_BUSY otherwise.
 */
int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* ep,
    struct ompi_convertor_t* convertor,
    void* header,
    size_t header_size,
    size_t payload_size,
    uint8_t order,
    uint32_t flags,
    mca_btl_base_tag_t tag,
    mca_btl_base_descriptor_t** descriptor)
{
    mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
    size_t size = payload_size + header_size;
    size_t eager_limit;
    int rc,
        qp = frag_size_to_order(obtl, size),
        /* index into no_wqe_pending_frags[]: 0 for priority sends, 1 otherwise */
        prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY),
        ib_rc;
    int32_t cm_return;
    bool do_rdma = false;
    ompi_free_list_item_t* item = NULL;
    mca_btl_openib_frag_t *frag;
    mca_btl_openib_header_t *hdr;

    OPAL_THREAD_LOCK(&ep->endpoint_lock);

    if (OPAL_UNLIKELY(MCA_BTL_IB_CONNECTED != ep->endpoint_state)) {
        goto cant_send;
    }

    /* If there are pending messages on the qp we can not send ahead of them */
    if(OPAL_UNLIKELY(!opal_list_is_empty(&ep->qps[qp].no_wqe_pending_frags[prio]))) {
        goto cant_send;
    }

    /* Allocate WQE; a failed attempt is undone by qp_put_wqe() below */
    if(OPAL_UNLIKELY(qp_get_wqe(ep, qp) < 0)) {
        goto no_credits_or_wqe;
    }

    /* eager rdma or send ? Check eager rdma credits */
    /* Note: Maybe we want to implement isend only for eager rdma ?*/
    eager_limit = mca_btl_openib_component.eager_limit +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t);

    if(OPAL_LIKELY(size <= eager_limit)) {
        if(acquire_eager_rdma_send_credit(ep) == OMPI_SUCCESS) {
            do_rdma = true;
        }
    }

    /* Check send credits if it is not rdma.  This is done by hand here
     * (acquire_send_credit() takes a fragment, and none has been allocated
     * at this point). */
    if(!do_rdma) {
        if(BTL_OPENIB_QP_TYPE_PP(qp)) {
            if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, -1) < 0)){
                OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
                goto no_credits_or_wqe;
            }
        } else {
            if(OPAL_UNLIKELY(OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, -1) < 0)){
                OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
                goto no_credits_or_wqe;
            }
        }
    }

    /* Allocate fragment */
    OMPI_FREE_LIST_GET(&obtl->device->qps[qp].send_free, item, rc);
    if(OPAL_UNLIKELY(NULL == item)) {
        /* we don't return NULL because maybe later we will try to coalesce */
        goto no_frags;
    }
    frag = to_base_frag(item);
    hdr = to_send_frag(item)->hdr;
    frag->segment.seg_len = size;
    frag->base.order = qp;
    frag->base.des_flags = flags;
    hdr->tag = tag;
    to_com_frag(item)->endpoint = ep;

    /* put match header */
    memcpy(frag->segment.seg_addr.pval, header, header_size);

    /* Pack data */
    if(payload_size) {
        size_t max_data;
        struct iovec iov;
        uint32_t iov_count;
        /* pack the data into the fragment, right behind the header */
        iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size);
        iov.iov_len = max_data = payload_size;
        iov_count = 1;

        (void)ompi_convertor_pack( convertor, &iov, &iov_count, &max_data);

        assert(max_data == payload_size);
    }

    /* Set all credits: piggyback eager RDMA credits if there are any,
     * otherwise (non-RDMA send on a per-peer QP) piggyback receive credits */
    BTL_OPENIB_GET_CREDITS(ep->eager_rdma_local.credits, hdr->credits);
    if(hdr->credits)
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;

    if(!do_rdma) {
        if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        }
    } else {
        hdr->credits |= (qp << 11);
    }

    BTL_OPENIB_GET_CREDITS(ep->qps[qp].u.pp_qp.cm_return, cm_return);
    /* cm_seen is only 8 bits wide, but cm_return is 32 bits: report at most
     * 255 now and put the remainder back for a later message */
    if(cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    ib_rc = post_send(ep, to_send_frag(item), do_rdma);
    if(!ib_rc) {
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        return OMPI_SUCCESS;
    }

    /* Failed to send, do clean up all allocated resources */
    /* NOTE(review): the fragment taken from send_free is not returned to the
     * free list on this path, and the cm_return credits moved into
     * hdr->cm_seen are not restored - looks like a leak, confirm and fix with
     * the fragment-return helper of this BTL. */
    if(ep->nbo) {
        /* post_send() converted the header; convert it back before reading it */
        BTL_OPENIB_HEADER_NTOH(*hdr);
    }
    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&ep->eager_rdma_local.credits,
            BTL_OPENIB_CREDITS(hdr->credits));
    } else if (!do_rdma && BTL_OPENIB_QP_TYPE_PP(qp)) {
        /* Receive credits were drained only when no RDMA credits were
         * piggybacked, so they are given back only in that case; otherwise
         * the RDMA credit value (flag bit included) would be added to
         * rd_credits by mistake. */
        OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.rd_credits,
            hdr->credits);
    }
no_frags:
    /* give back the send credit / eager RDMA token acquired above */
    if(do_rdma) {
        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, 1);
    } else {
        if(BTL_OPENIB_QP_TYPE_PP(qp)) {
            OPAL_THREAD_ADD32(&ep->qps[qp].u.pp_qp.sd_credits, 1);
        } else if BTL_OPENIB_QP_TYPE_SRQ(qp){
            OPAL_THREAD_ADD32(&obtl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }
no_credits_or_wqe:
    qp_put_wqe(ep, qp);
cant_send:
    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
    /* We can not send the data directly, so we just return descriptor */
    if (NULL != descriptor) {
        *descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags);
    }
    return OMPI_ERR_RESOURCE_BUSY;
}
/*
* Initiate a send.
*/
@ -1082,6 +1247,8 @@ int mca_btl_openib_send(
frag->hdr->tag = tag;
}
des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return mca_btl_openib_endpoint_send(ep, frag);
}
@ -1102,6 +1269,8 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER ||
openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND);
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
int rc;
OPAL_THREAD_LOCK(&ep->endpoint_lock);
@ -1171,6 +1340,8 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl,
assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER);
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
int rc;
OPAL_THREAD_LOCK(&ep->endpoint_lock);

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
@ -487,6 +487,32 @@ extern int mca_btl_openib_send(
mca_btl_base_tag_t tag
);
/**
* PML->BTL Initiate an immediate send of the specified size.
*
* The PML header and the packed payload are copied into a BTL fragment and
* posted at once.  When that is not possible the BTL allocates a descriptor
* of header_size + payload_size bytes and returns it through descriptor.
*
* @param btl (IN) BTL instance
* @param ep (IN) Endpoint
* @param convertor (IN) Datatype convertor used to pack the payload
* @param header (IN) PML header
* @param header_size (IN) PML header size
* @param payload_size (IN) Payload size
* @param order (IN) Order
* @param flags (IN) Flags
* @param tag (IN) Tag
* @param descriptor (OUT) Message descriptor, set only when the send could
*                         not be posted immediately
*
* @return OMPI_SUCCESS if the message was posted, OMPI_ERR_RESOURCE_BUSY if
*         it was not and a descriptor was returned instead.
*/
extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct ompi_convertor_t* convertor,
void* header,
size_t header_size,
size_t payload_size,
uint8_t order,
uint32_t flags,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t** descriptor
);
/**
* PML->BTL Initiate a put of the specified size.
*

Просмотреть файл

@ -2820,7 +2820,9 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
}
/* Process a completed send/put/get */
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
des->des_cbfunc(&openib_btl->super, endpoint, des,OMPI_SUCCESS);
}
if( btl_ownership ) {
mca_btl_openib_free(&openib_btl->super, des);
}

Просмотреть файл

@ -47,69 +47,6 @@
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
/*
 * Fill in the send work request of a fragment and post it on the QP stored
 * in the fragment's base.order.
 *
 * ep   - endpoint the fragment is sent to
 * frag - send fragment; its segment length and header must already be set
 * rdma - true: write the message into the peer's eager RDMA region with
 *        IBV_WR_RDMA_WRITE (a footer is appended); false: post a regular send
 *
 * Returns whatever ibv_post_send() returns.
 */
static int post_send(mca_btl_openib_endpoint_t *ep,
mca_btl_openib_send_frag_t *frag, const bool rdma)
{
mca_btl_openib_module_t *openib_btl = ep->endpoint_btl;
mca_btl_base_segment_t *seg = &to_base_frag(frag)->segment;
struct ibv_sge *sg = &to_com_frag(frag)->sg_entry;
struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc;
struct ibv_send_wr *bad_wr;
int qp = to_base_frag(frag)->base.order;
/* total bytes to transfer: payload + BTL header (+ footer for RDMA)
 * + whatever has been coalesced onto this fragment */
sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) +
(rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length;
sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]));
/* presumably converts the header to network byte order for peers flagged
 * with nbo - confirm against the macro definition */
if(ep->nbo)
BTL_OPENIB_HEADER_HTON(*frag->hdr);
if(rdma) {
int32_t head;
/* the footer occupies the last bytes of the message */
mca_btl_openib_footer_t* ftr =
(mca_btl_openib_footer_t*)(((char*)frag->hdr) + sg->length -
sizeof(mca_btl_openib_footer_t));
sr_desc->opcode = IBV_WR_RDMA_WRITE;
MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length);
MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
#if OMPI_ENABLE_DEBUG
ftr->seq = ep->eager_rdma_remote.seq++;
#endif
if(ep->nbo)
BTL_OPENIB_FOOTER_HTON(*ftr);
sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
/* NOTE(review): presumably yields the current remote head index in
 * `head` and advances it - confirm against the macro definition */
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
/* Take the offset header + eager_limit + footer inside remote slot
 * `head`, then back up by the message length, so that the message ends
 * exactly at that offset. */
sr_desc->wr.rdma.remote_addr =
ep->eager_rdma_remote.base.lval +
head * openib_btl->eager_rdma_frag_size +
sizeof(mca_btl_openib_header_t) +
mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_footer_t);
sr_desc->wr.rdma.remote_addr -= sg->length;
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
sr_desc->opcode = IBV_WR_SEND;
} else {
/* non per-peer QPs carry the remote index as immediate data */
sr_desc->opcode = IBV_WR_SEND_WITH_IMM;
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
sr_desc->imm_data = htonl(ep->rem_info.rem_index);
#else
sr_desc->imm_data = ep->rem_info.rem_index;
#endif
}
}
#if HAVE_XRC
if(BTL_OPENIB_QP_TYPE_XRC(qp))
sr_desc->xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
/* the scatter/gather entry must point at the fragment header */
assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr);
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
}
static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
mca_btl_openib_send_frag_t *frag)
{
@ -126,17 +63,6 @@ static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
return OMPI_SUCCESS;
}
/*
 * Try to take one eager RDMA token of the given endpoint.
 *
 * Returns OMPI_SUCCESS when a token was taken, or
 * OMPI_ERR_OUT_OF_RESOURCE (token counter left unchanged) when none was left.
 */
static inline int
acquire_eager_rdma_send_credit(mca_btl_openib_endpoint_t *endpoint)
{
    /* take a token first; a non-negative result means we really got one */
    if (OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) >= 0) {
        return OMPI_SUCCESS;
    }

    /* nothing was available: undo the decrement */
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_send_frag_t *frag)
{
@ -166,11 +92,6 @@ static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
return OMPI_SUCCESS;
}
/*
 * Atomically move all credits out of FROM into TO: read the current value
 * and retry until the compare-and-set that resets FROM to 0 succeeds.
 * FROM must be an lvalue (its address is taken); both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define GET_CREDITS(FROM, TO)                              \
    do {                                                   \
        (TO) = (FROM);                                     \
    } while(0 == OPAL_ATOMIC_CMPSET_32(&(FROM), (TO), 0))
/* this function is called with endpoint->endpoint_lock held */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_send_frag_t *frag)
@ -205,19 +126,19 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
return OMPI_ERR_RESOURCE_BUSY;
}
GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
if(hdr->credits)
hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;
if(!do_rdma) {
if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
}
} else {
hdr->credits |= (qp << 11);
}
GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
/* cm_seen is only 8 bits wide, but cm_return is 32 bits */
if(cm_return > 255) {
hdr->cm_seen = 255;
@ -605,7 +526,7 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
base_des->des_cbfunc = cts_sent;
base_des->des_cbdata = NULL;
base_des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
base_des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
base_des->order = mca_btl_openib_component.credits_qp;
openib_frag->segment.seg_len = sizeof(mca_btl_openib_control_header_t);
com_frag->endpoint = endpoint;
@ -834,6 +755,7 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
to_base_frag(frag)->base.des_cbdata = NULL;
to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;;
to_com_frag(frag)->endpoint = endpoint;
frag->hdr->tag = MCA_BTL_TAG_BTL;
to_base_frag(frag)->segment.seg_len =
@ -854,10 +776,10 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
}
}
GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
frag->hdr->cm_seen = 0;
GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
if(cm_return > 255) {
frag->hdr->cm_seen = 255;
cm_return -= 255;
@ -866,7 +788,7 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
frag->hdr->cm_seen = cm_return;
}
GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
credits_hdr->qpn = qp;
credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;
@ -924,7 +846,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
to_base_frag(frag)->base.des_cbfunc =
mca_btl_openib_endpoint_eager_rdma_connect_cb;
to_base_frag(frag)->base.des_cbdata = NULL;
to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
to_base_frag(frag)->segment.seg_len =
sizeof(mca_btl_openib_eager_rdma_header_t);

Просмотреть файл

@ -376,6 +376,11 @@ static inline int mca_btl_openib_endpoint_post_rr(
OPAL_ATOMIC_CMPSET_32(&(E)->qps[(Q)].rd_credit_send_lock, 0, 1)
#define BTL_OPENIB_CREDITS_SEND_UNLOCK(E, Q) \
OPAL_ATOMIC_CMPSET_32(&(E)->qps[(Q)].rd_credit_send_lock, 1, 0)
/*
 * Atomically move all credits out of FROM into TO: read the current value
 * and retry until the compare-and-set that resets FROM to 0 succeeds.
 * FROM must be an lvalue (its address is taken); both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define BTL_OPENIB_GET_CREDITS(FROM, TO)                   \
    do {                                                   \
        (TO) = (FROM);                                     \
    } while(0 == OPAL_ATOMIC_CMPSET_32(&(FROM), (TO), 0))
static inline bool check_eager_rdma_credits(const mca_btl_openib_endpoint_t *ep)
{
@ -450,6 +455,81 @@ ib_send_flags(uint32_t size, mca_btl_openib_endpoint_qp_t *qp)
return IBV_SEND_SIGNALED |
((size <= qp->ib_inline_max) ? IBV_SEND_INLINE : 0);
}
/*
 * Try to take one eager RDMA token of the given endpoint.
 *
 * Returns OMPI_SUCCESS when a token was taken, or
 * OMPI_ERR_OUT_OF_RESOURCE (token counter left unchanged) when none was left.
 */
static inline int
acquire_eager_rdma_send_credit(mca_btl_openib_endpoint_t *endpoint)
{
    /* take a token first; a non-negative result means we really got one */
    if (OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, -1) >= 0) {
        return OMPI_SUCCESS;
    }

    /* nothing was available: undo the decrement */
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
 * Fill in the send work request of a fragment and post it on the QP stored
 * in the fragment's base.order.
 *
 * ep   - endpoint the fragment is sent to
 * frag - send fragment; its segment length and header must already be set
 * rdma - true: write the message into the peer's eager RDMA region with
 *        IBV_WR_RDMA_WRITE (a footer is appended); false: post a regular send
 *
 * Returns whatever ibv_post_send() returns.
 */
static inline int post_send(mca_btl_openib_endpoint_t *ep,
mca_btl_openib_send_frag_t *frag, const bool rdma)
{
mca_btl_openib_module_t *openib_btl = ep->endpoint_btl;
mca_btl_base_segment_t *seg = &to_base_frag(frag)->segment;
struct ibv_sge *sg = &to_com_frag(frag)->sg_entry;
struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc;
struct ibv_send_wr *bad_wr;
int qp = to_base_frag(frag)->base.order;
/* total bytes to transfer: payload + BTL header (+ footer for RDMA)
 * + whatever has been coalesced onto this fragment */
sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) +
(rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length;
sr_desc->send_flags = ib_send_flags(sg->length, &(ep->qps[qp]));
/* presumably converts the header to network byte order for peers flagged
 * with nbo - confirm against the macro definition */
if(ep->nbo)
BTL_OPENIB_HEADER_HTON(*frag->hdr);
if(rdma) {
int32_t head;
/* the footer occupies the last bytes of the message */
mca_btl_openib_footer_t* ftr =
(mca_btl_openib_footer_t*)(((char*)frag->hdr) + sg->length -
sizeof(mca_btl_openib_footer_t));
sr_desc->opcode = IBV_WR_RDMA_WRITE;
MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length);
MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
#if OMPI_ENABLE_DEBUG
ftr->seq = ep->eager_rdma_remote.seq++;
#endif
if(ep->nbo)
BTL_OPENIB_FOOTER_HTON(*ftr);
sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
/* NOTE(review): presumably yields the current remote head index in
 * `head` and advances it - confirm against the macro definition */
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
/* Take the offset header + eager_limit + footer inside remote slot
 * `head`, then back up by the message length, so that the message ends
 * exactly at that offset. */
sr_desc->wr.rdma.remote_addr =
ep->eager_rdma_remote.base.lval +
head * openib_btl->eager_rdma_frag_size +
sizeof(mca_btl_openib_header_t) +
mca_btl_openib_component.eager_limit +
sizeof(mca_btl_openib_footer_t);
sr_desc->wr.rdma.remote_addr -= sg->length;
} else {
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
sr_desc->opcode = IBV_WR_SEND;
} else {
/* non per-peer QPs carry the remote index as immediate data */
sr_desc->opcode = IBV_WR_SEND_WITH_IMM;
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
sr_desc->imm_data = htonl(ep->rem_info.rem_index);
#else
sr_desc->imm_data = ep->rem_info.rem_index;
#endif
}
}
#if HAVE_XRC
if(BTL_OPENIB_QP_TYPE_XRC(qp))
sr_desc->xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
/* the scatter/gather entry must point at the fragment header */
assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr);
return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr);
}
END_C_DECLS
#endif