1
1

Implement the "credit management for credit messages" protocol. On each message, a

sender piggybacks the number of credit messages it has received from its peer. The
number of outstanding credit messages is limited. This is needed so that we never
fall back to HW flow control.

This commit was SVN r15580.
Этот коммит содержится в:
Gleb Natapov 2007-07-24 15:19:51 +00:00
родитель 45a7a0650b
Коммит 5b7d3faedc
4 изменённых файлов: 69 добавлений и 23 удалений

Просмотреть файл

@ -228,16 +228,14 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
/* if not sent via rdma */
if(!MCA_BTL_OPENIB_RDMA_FRAG(frag) &&
ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, -1);
/* assert(endpoint->qps[qp].u.pp_qp.rd_credits >= -(mca_btl_openib_component.qp_infos[qp].rd_num - mca_btl_openib_component.qp_infos[qp].rd_low)); */
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, 1);
OPAL_THREAD_ADD32((int32_t*)&endpoint->qps[qp].u.pp_qp.rd_posted, 1);
}
} else if (ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, -1);
/* assert(endpoint->qps[qp].u.pp_qp.rd_credits >= -(mca_btl_openib_component.qp_infos[qp].rd_num - mca_btl_openib_component.qp_infos[qp].rd_low)); */
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, 1);
OPAL_THREAD_ADD32((int32_t*)&endpoint->qps[qp].u.pp_qp.rd_posted, 1);
}
switch (ctl_hdr->type) {
case MCA_BTL_OPENIB_CONTROL_CREDITS:
credits_hdr = (mca_btl_openib_rdma_credits_header_t*)ctl_hdr;
@ -1163,6 +1161,11 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
}
}
if(frag->hdr->cm_seen) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent,
-frag->hdr->cm_seen);
}
/* We may receive credits here so try to progress only things that
* may be pending because of credit shortage */
if(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type ||
@ -1606,10 +1609,10 @@ error:
}
if(wc.status != IBV_WC_WR_FLUSH_ERR || !flush_err_printed[cq]++) {
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
"status number %d for wr_id %llu opcode %d",
"status number %d for wr_id %llu opcode %d qp_idx %d",
cq_name[cq],
btl_openib_component_status_to_string(wc.status),
wc.status, wc.wr_id, wc.opcode));
wc.status, wc.wr_id, wc.opcode, frag->qp_idx));
abort();
}
if(wc.status == IBV_WC_RETRY_EXC_ERR) {

Просмотреть файл

@ -221,6 +221,14 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
frag->hdr->credits = 0;
}
if(endpoint->qps[qp].u.pp_qp.cm_return) {
frag->hdr->cm_seen = endpoint->qps[qp].u.pp_qp.cm_return;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return,
-frag->hdr->cm_seen);
} else {
frag->hdr->cm_seen = 0;
}
ib_rc = post_send(openib_btl, endpoint, frag, qp, do_rdma);
if(ib_rc) {
if(endpoint->nbo) {
@ -287,10 +295,14 @@ static void mca_btl_openib_endpoint_construct_qp(mca_btl_base_endpoint_t *endpoi
* now has credits even if the receive buffers are not yet posted
*/
endpoint->qps[qp].u.pp_qp.rd_credits =
-(mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv);
-mca_btl_openib_component.qp_infos[qp].rd_num;
endpoint->qps[qp].u.pp_qp.rd_posted = 0;
endpoint->qps[qp].u.pp_qp.cm_sent = 0;
endpoint->qps[qp].u.pp_qp.cm_return =
-mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
endpoint->qps[qp].u.pp_qp.cm_received =
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
/* initialize the local view of credits */
endpoint->qps[qp].u.pp_qp.sd_credits =
@ -1302,6 +1314,16 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
}
}
if(0 == do_rdma) {
if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
(mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
OPAL_THREAD_ADD32(&endpoint->qps[qp].rd_pending_credit_chks,
-endpoint->qps[qp].rd_pending_credit_chks);
return;
}
}
frag->base.des_cbfunc = mca_btl_openib_endpoint_credits;
frag->base.des_cbdata = NULL;
frag->endpoint = endpoint;
@ -1315,6 +1337,14 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
} else {
frag->hdr->credits = 0;
}
if(endpoint->qps[qp].u.pp_qp.cm_return) {
frag->hdr->cm_seen = endpoint->qps[qp].u.pp_qp.cm_return;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return,
-frag->hdr->cm_seen);
} else {
frag->hdr->cm_seen = 0;
}
/* send eager RDMA credits only for high prio */
if(BTL_OPENIB_EAGER_RDMA_QP(qp) && endpoint->eager_rdma_local.credits > 0) {
credits_hdr->rdma_credits = endpoint->eager_rdma_local.credits;
@ -1340,6 +1370,8 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
if(do_rdma)
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
else
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
BTL_ERROR(("error posting send request errno %d says %s", ib_rc,
strerror(errno)));
}

Просмотреть файл

@ -97,6 +97,9 @@ struct mca_btl_openib_endpoint_pp_qp_t {
*/
int32_t rd_posted; /**< number of descriptors posted to the nic*/
int32_t rd_credits; /**< number of credits to return to peer */
int32_t cm_received; /**< Credit messages received */
int32_t cm_return; /**< how many credits to return */
int32_t cm_sent; /**< Outstanding number of credit messages */
}; typedef struct mca_btl_openib_endpoint_pp_qp_t mca_btl_openib_endpoint_pp_qp_t;
@ -211,26 +214,25 @@ static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpo
const int qp)
{
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
int rd_num =
mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
int rd_rsv = mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
int cm_received, rd_posted, rd_low;
assert(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type);
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
if((endpoint->qps[qp].u.pp_qp.rd_posted - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) <=
mca_btl_openib_component.qp_infos[qp].rd_low + additional &&
endpoint->qps[qp].u.pp_qp.rd_posted <
rd_num) {
cm_received = endpoint->qps[qp].u.pp_qp.cm_received;
rd_posted = endpoint->qps[qp].u.pp_qp.rd_posted;
rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
if(cm_received >= (rd_rsv >> 2) || rd_posted <= rd_low) {
int rc;
int32_t i, num_post = rd_num - endpoint->qps[qp].u.pp_qp.rd_posted;
int32_t i, num_post = rd_num - rd_posted;
struct ibv_recv_wr* bad_wr;
ompi_free_list_t *free_list;
assert(num_post >= 0);
free_list = &openib_btl->qps[qp].recv_free;
for(i = 0; i < num_post; i++) {
for(i = 0; i < (num_post + cm_received); i++) {
ompi_free_list_item_t* item;
mca_btl_openib_frag_t* frag;
OMPI_FREE_LIST_WAIT(free_list, item, rc);
@ -246,8 +248,16 @@ static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpo
return OMPI_ERROR;
}
}
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, num_post);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, num_post);
if(num_post > 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, num_post);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, num_post);
}
if(cm_received > 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return,
cm_received);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received,
-cm_received);
}
assert(endpoint->qps[qp].u.pp_qp.rd_credits < rd_num);
assert(endpoint->qps[qp].u.pp_qp.rd_credits >= 0);
}

Просмотреть файл

@ -36,6 +36,7 @@ struct mca_btl_openib_header_t {
uint8_t padding[1];
#endif
uint16_t credits;
uint16_t cm_seen;
};
typedef struct mca_btl_openib_header_t mca_btl_openib_header_t;
#define BTL_OPENIB_RDMA_CREDITS_FLAG (1<<15)