1
1

Send all explicit credits for PP QPs of all orders over smallest PP qp.

This commit was SVN r16781.
Этот коммит содержится в:
Gleb Natapov 2007-11-28 07:13:34 +00:00
родитель a9f864d15c
Коммит 5463eb892c
6 изменённых файлов: 228 добавлений и 202 удалений

Просмотреть файл

@ -195,6 +195,7 @@ struct mca_btl_openib_component_t {
int want_fork_support;
#endif
int rdma_qp;
int credits_qp; /* qp used for software flow control */
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;

Просмотреть файл

@ -71,10 +71,6 @@
static int btl_openib_component_open(void);
static int btl_openib_component_close(void);
static int btl_openib_modex_send(void);
static void btl_openib_control(struct mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata);
static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
uint8_t port_num, uint16_t pkey_index,
struct ibv_port_attr *ib_port_attr);
@ -84,10 +80,6 @@ static mca_btl_base_module_t **btl_openib_component_init(
bool enable_mpi_threads);
static void merge_values(ompi_btl_openib_ini_values_t *target,
ompi_btl_openib_ini_values_t *src);
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_recv_frag_t *frag,
size_t byte_len);
static char* btl_openib_component_status_to_string(enum ibv_wc_status status);
static int btl_openib_component_progress(void);
static int btl_openib_module_progress(mca_btl_openib_hca_t *hca);
@ -220,50 +212,19 @@ static int btl_openib_modex_send(void)
* Active Message Callback function on control message.
*/
static void btl_openib_control(struct mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata)
static void btl_openib_control(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des,
void* cbdata)
{
/* don't return credits used for control messages */
mca_btl_openib_endpoint_t* endpoint = to_com_frag(des)->endpoint;
mca_btl_openib_endpoint_t* ep = to_com_frag(des)->endpoint;
mca_btl_openib_control_header_t *ctl_hdr =
to_base_frag(des)->segment.seg_addr.pval;
mca_btl_openib_eager_rdma_header_t *rdma_hdr;
mca_btl_openib_rdma_credits_header_t *credits_hdr;
int qp;
switch (ctl_hdr->type) {
case MCA_BTL_OPENIB_CONTROL_CREDITS:
credits_hdr = (mca_btl_openib_rdma_credits_header_t*)ctl_hdr;
if(endpoint->nbo) {
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
}
qp = credits_hdr->qpn;
/* if not sent via rdma */
if(!MCA_BTL_OPENIB_RDMA_FRAG(des)) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, 1);
/* rd_posted don't account for rsv preposts for credit message but
* receive path decreased it for each message received no matter if
* it is credit message or not. So fix rd_posted value here. */
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, 1);
} else {
mca_btl_openib_header_t *hdr = to_recv_frag(des)->hdr;
/* if received via rdma the update credits here since they will not
* be update in handle_incomming() function because qp num is not
* known there */
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits,
hdr->credits);
progress_pending_frags_pp(endpoint, qp);
}
if(credits_hdr->rdma_credits) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,
credits_hdr->rdma_credits);
progress_pending_eager_rdma(endpoint);
}
assert(0); /* Credit message is handled elsewhere */
break;
case MCA_BTL_OPENIB_CONTROL_RDMA:
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr;
@ -275,7 +236,7 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
rdma_hdr->rdma_start.ival
));
if(endpoint->nbo) {
if(ep->nbo) {
BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(*rdma_hdr);
}
@ -284,14 +245,13 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
(unsigned long) rdma_hdr->rdma_start.lval,
rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival));
if (endpoint->eager_rdma_remote.base.pval) {
if (ep->eager_rdma_remote.base.pval) {
BTL_ERROR(("Got RDMA connect twice!"));
return;
}
endpoint->eager_rdma_remote.rkey = rdma_hdr->rkey;
endpoint->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval;
endpoint->eager_rdma_remote.tokens =
mca_btl_openib_component.eager_rdma_num - 1;
ep->eager_rdma_remote.rkey = rdma_hdr->rkey;
ep->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval;
ep->eager_rdma_remote.tokens=mca_btl_openib_component.eager_rdma_num - 1;
break;
default:
BTL_ERROR(("Unknown message type received by BTL"));
@ -1124,45 +1084,102 @@ static void merge_values(ompi_btl_openib_ini_values_t *target,
}
}
/*
 * Tell whether a received fragment is an explicit credit message.
 *
 * The fragment qualifies when its header tag is MCA_BTL_TAG_BTL and the
 * control header found at the start of its segment has the type
 * MCA_BTL_OPENIB_CONTROL_CREDITS.  The control header is dereferenced only
 * after the tag test has passed.
 */
static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
{
    mca_btl_openib_control_header_t *ctl_hdr =
        to_base_frag(frag)->segment.seg_addr.pval;
    bool has_btl_tag = (MCA_BTL_TAG_BTL == frag->hdr->tag);

    return has_btl_tag && (MCA_BTL_OPENIB_CONTROL_CREDITS == ctl_hdr->type);
}
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_endpoint_t *ep,
mca_btl_openib_recv_frag_t *frag,
size_t byte_len)
{
mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
mca_btl_openib_header_t *hdr = frag->hdr;
int rqp = to_base_frag(frag)->base.order, cqp;
uint16_t rcredits = 0, credits;
bool is_credit_msg;
if(endpoint->nbo) {
if(ep->nbo) {
BTL_OPENIB_HEADER_NTOH(*hdr);
}
/* advance the segment address past the header and subtract from the
* length..*/
* length.*/
des->des_dst->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
/* call registered callback */
openib_btl->ib_reg[hdr->tag].cbfunc(&openib_btl->super, hdr->tag, des,
if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
/* call registered callback */
openib_btl->ib_reg[hdr->tag].cbfunc(&openib_btl->super, hdr->tag, des,
openib_btl->ib_reg[hdr->tag].cbdata);
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
if(BTL_OPENIB_CREDITS(hdr->credits) > 0) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,
BTL_OPENIB_CREDITS(hdr->credits));
progress_pending_eager_rdma(endpoint);
cqp = rqp;
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
rcredits = BTL_OPENIB_CREDITS(hdr->credits);
hdr->credits = 0;
}
} else {
int qp = to_base_frag(frag)->base.order;
if(BTL_OPENIB_QP_TYPE_PP(qp) && hdr->credits > 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits,
hdr->credits);
progress_pending_frags_pp(endpoint, qp);
mca_btl_openib_rdma_credits_header_t *chdr=des->des_dst->seg_addr.pval;
if(ep->nbo) {
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
}
if(hdr->cm_seen)
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent,-hdr->cm_seen);
cqp = chdr->qpn;
rcredits = chdr->rdma_credits;
}
credits = hdr->credits;
if(hdr->cm_seen)
OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);
/* Now return fragment. Don't touch hdr after this point! */
if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
mca_btl_openib_eager_rdma_local_t *erl = &ep->eager_rdma_local;
OPAL_THREAD_LOCK(&erl->lock);
MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
while(erl->tail != erl->head) {
mca_btl_openib_recv_frag_t *tf;
tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(ep, erl->tail);
if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(tf))
break;
OPAL_THREAD_ADD32(&erl->credits, 1);
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(erl->tail);
}
OPAL_THREAD_UNLOCK(&erl->lock);
} else {
MCA_BTL_IB_FRAG_RETURN(frag);
if(BTL_OPENIB_QP_TYPE_SRQ(rqp)) {
mca_btl_openib_module_t *btl = ep->endpoint_btl;
OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
mca_btl_openib_post_srr(btl, 0, rqp);
} else {
if(OPAL_UNLIKELY(is_credit_msg))
OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
else
OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
mca_btl_openib_endpoint_post_rr(ep, cqp);
}
}
if(rcredits > 0) {
OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
progress_pending_eager_rdma(ep);
}
assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);
if(credits) {
OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
progress_pending_frags_pp(ep, cqp);
}
send_credits(ep, (cqp != MCA_BTL_NO_ORDER) ? cqp :
mca_btl_openib_component.credits_qp);
return OMPI_SUCCESS;
}
@ -1449,23 +1466,6 @@ static int btl_openib_component_progress(void)
return 0;
}
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
while (endpoint->eager_rdma_local.tail !=
endpoint->eager_rdma_local.head) {
mca_btl_openib_recv_frag_t *tf;
tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint,
endpoint->eager_rdma_local.tail);
if (MCA_BTL_OPENIB_RDMA_FRAG_LOCAL (tf))
break;
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, 1);
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.tail);
}
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
/* send credits over qp 0 since it should always be present
* anyway */
send_credits(endpoint, 0);
count++;
} else
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
@ -1573,21 +1573,8 @@ static int btl_openib_module_progress(mca_btl_openib_hca_t* hca)
return 0;
}
MCA_BTL_IB_FRAG_RETURN(frag);
if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
OPAL_THREAD_ADD32((int32_t*)
&openib_btl->qps[qp].u.srq_qp.rd_posted, -1);
mca_btl_openib_post_srr(openib_btl, 0, qp);
} else {
OPAL_THREAD_ADD32((int32_t*)
&endpoint->qps[qp].u.pp_qp.rd_posted, -1);
mca_btl_openib_endpoint_post_rr(endpoint, 0, qp);
}
count++;
send_credits(endpoint, qp);
/* decide if it is time to setup an eager rdma channel */
if (!endpoint->eager_rdma_local.base.pval &&
endpoint->use_eager_rdma &&

Просмотреть файл

@ -426,7 +426,7 @@ int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
mca_btl_openib_post_srr(endpoint->endpoint_btl, 1, qp);
} else {
mca_btl_openib_endpoint_post_rr(endpoint, 1, qp);
mca_btl_openib_endpoint_post_rr(endpoint, qp);
}
}
@ -535,26 +535,27 @@ int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* endpoint,
static void mca_btl_openib_endpoint_credits(
mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_btl_base_descriptor_t* descriptor,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status)
{
int qp;
mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(descriptor);
mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(des);
qp = frag->qp_idx;
/* we don't acquire a wqe or token for credit message - so decrement */
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, -1);
/* we don't acquire a WQE for credit message - so decrement.
* Note: doing it for QP used for credit management */
OPAL_THREAD_ADD32(&ep->qps[des->order].sd_wqe, -1);
if(check_send_credits(endpoint, qp))
mca_btl_openib_endpoint_send_credits(endpoint, qp);
if(check_send_credits(ep, qp) || check_eager_rdma_credits(ep))
mca_btl_openib_endpoint_send_credits(ep, qp);
else {
BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
BTL_OPENIB_CREDITS_SEND_UNLOCK(ep, qp);
/* check one more time if credits are available after unlock */
send_credits(endpoint, qp);
send_credits(ep, qp);
}
}
@ -579,7 +580,7 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
frag->qp_idx = qp;
endpoint->qps[qp].credit_frag = frag;
/* set those once and forever */
to_base_frag(frag)->base.order = qp;
to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
to_base_frag(frag)->base.des_cbdata = NULL;
to_com_frag(frag)->endpoint = endpoint;
@ -589,8 +590,7 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
}
assert(frag->qp_idx == qp);
credits_hdr =
(mca_btl_openib_rdma_credits_header_t*)
credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
to_base_frag(frag)->segment.seg_addr.pval;
if(acquire_eager_rdma_send_credit(endpoint) == MPI_SUCCESS) {
do_rdma = true;
@ -606,15 +606,13 @@ void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
frag->hdr->cm_seen = 0;
if(!do_rdma) {
GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
if(cm_return > 255) {
frag->hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
frag->hdr->cm_seen = cm_return;
}
GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
if(cm_return > 255) {
frag->hdr->cm_seen = 255;
cm_return -= 255;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
} else {
frag->hdr->cm_seen = cm_return;
}
GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
@ -675,7 +673,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
mca_btl_openib_endpoint_eager_rdma_connect_cb;
to_base_frag(frag)->base.des_cbdata = NULL;
to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
to_send_frag(frag)->qp_idx = 0;
to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
to_base_frag(frag)->segment.seg_len =
sizeof(mca_btl_openib_eager_rdma_header_t);
to_com_frag(frag)->endpoint = endpoint;

Просмотреть файл

@ -206,72 +206,69 @@ void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*);
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*);
static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpoint,
const int additional,
const int qp)
/*
 * Post num_post receive buffers on queue pair index qp of endpoint ep.
 *
 * Each buffer is taken from the recv_free list that the endpoint's BTL module
 * keeps for this qp, tagged with the qp index (base.order) and the owning
 * endpoint, and then handed to ibv_post_recv() on ep->qps[qp].lcl_qp.
 *
 * Returns OMPI_SUCCESS when all buffers were posted, or OMPI_ERROR as soon as
 * one ibv_post_recv() call fails; buffers posted before the failure are left
 * posted.
 */
static inline int post_recvs(mca_btl_base_endpoint_t *ep, const int qp,
        const int num_post)
{
    ompi_free_list_t *recv_list = &ep->endpoint_btl->qps[qp].recv_free;
    struct ibv_recv_wr *bad_wr;
    int posted;

    for(posted = 0; posted < num_post; posted++) {
        ompi_free_list_item_t *item;
        int rc;

        /* NOTE(review): the status stored in rc here is overwritten below
         * without being inspected */
        OMPI_FREE_LIST_WAIT(recv_list, item, rc);

        to_base_frag(item)->base.order = qp;
        to_com_frag(item)->endpoint = ep;

        rc = ibv_post_recv(ep->qps[qp].lcl_qp, &to_recv_frag(item)->rd_desc,
                &bad_wr);
        if(rc) {
            BTL_ERROR(("error posting receive on qp %d (%d from %d)\n",
                        qp, posted, num_post));
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}
static inline int mca_btl_openib_endpoint_post_rr(
mca_btl_base_endpoint_t *endpoint, const int qp)
{
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
int rd_rsv = mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
int cqp = mca_btl_openib_component.credits_qp, rc;
int cm_received, rd_posted, rd_low;
assert(BTL_OPENIB_QP_TYPE_PP(qp));
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
cm_received = endpoint->qps[qp].u.pp_qp.cm_received;
rd_posted = endpoint->qps[qp].u.pp_qp.rd_posted;
rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
if(cm_received >= (rd_rsv >> 2) || rd_posted <= rd_low) {
int rc;
int32_t i, num_post = rd_num - rd_posted;
struct ibv_recv_wr* bad_wr;
ompi_free_list_t *free_list;
free_list = &openib_btl->qps[qp].recv_free;
for(i = 0; i < (num_post + cm_received); i++) {
ompi_free_list_item_t* item;
OMPI_FREE_LIST_WAIT(free_list, item, rc);
to_base_frag(item)->base.order = qp;
to_com_frag(item)->endpoint = endpoint;
if(ibv_post_recv(endpoint->qps[qp].lcl_qp,
&to_recv_frag(item)->rd_desc,
&bad_wr)) {
BTL_ERROR(("error posting receive errno says %s\n",
strerror(errno)));
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_ERROR;
}
}
if(num_post > 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, num_post);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, num_post);
}
if(cm_received > 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return,
cm_received);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received,
-cm_received);
}
assert(endpoint->qps[qp].u.pp_qp.rd_credits <= rd_num);
assert(endpoint->qps[qp].u.pp_qp.rd_credits >= 0);
/* post receive buffers */
if(rd_posted <= rd_low) {
int num_post = rd_num - rd_posted;
if((rc = post_recvs(endpoint, qp, num_post)) != OMPI_SUCCESS)
return rc;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, num_post);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, num_post);
}
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_SUCCESS;
}
static inline int mca_btl_openib_endpoint_post_rr_all(mca_btl_base_endpoint_t *endpoint,
const int additional)
{
int qp;
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++){
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
mca_btl_openib_endpoint_post_rr(endpoint, additional, qp);
}
/* post buffers for credit management on credit management qp */
if(cm_received >= (rd_rsv >> 2)) {
if((rc = post_recvs(endpoint, cqp, cm_received)) != OMPI_SUCCESS)
return rc;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_received);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_received, -cm_received);
}
assert(endpoint->qps[qp].u.pp_qp.rd_credits <= rd_num);
assert(endpoint->qps[qp].u.pp_qp.rd_credits >= 0);
assert(endpoint->qps[qp].u.pp_qp.rd_credits <= rd_num);
assert(endpoint->qps[qp].u.pp_qp.rd_credits >= 0);
return OMPI_SUCCESS;
}
@ -280,28 +277,38 @@ static inline int mca_btl_openib_endpoint_post_rr_all(mca_btl_base_endpoint_t *e
#define BTL_OPENIB_CREDITS_SEND_UNLOCK(E, Q) \
OPAL_ATOMIC_CMPSET_32(&(E)->qps[(Q)].rd_credit_send_lock, 1, 0)
static inline bool check_send_credits(mca_btl_openib_endpoint_t *endpoint,
const int qp)
static inline bool check_eager_rdma_credits(const mca_btl_openib_endpoint_t *ep)
{
if(endpoint->eager_rdma_local.credits > endpoint->eager_rdma_local.rd_win)
return true;
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
if(endpoint->qps[qp].u.pp_qp.rd_credits >=
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win) {
return true;
}
}
return false;
return (ep->eager_rdma_local.credits > ep->eager_rdma_local.rd_win) ? true :
false;
}
static inline void send_credits(mca_btl_openib_endpoint_t *endpoint,
const int qp)
static inline bool
check_send_credits(const mca_btl_openib_endpoint_t *ep, const int qp)
{
if(check_send_credits(endpoint, qp) &&
BTL_OPENIB_CREDITS_SEND_TRYLOCK(endpoint, qp))
mca_btl_openib_endpoint_send_credits(endpoint, qp);
if(!BTL_OPENIB_QP_TYPE_PP(qp))
return false;
return (ep->qps[qp].u.pp_qp.rd_credits >=
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win) ? true : false;
}
/*
 * Send a credit message to the peer if any credits are waiting to be returned.
 *
 * For a per-peer (PP) qp the message is sent on that qp's behalf when either
 * check_send_credits() or check_eager_rdma_credits() reports pending credits.
 * For any other qp only eager RDMA credits are considered, and the component's
 * credits_qp is used instead of the qp passed in.
 *
 * The actual send happens only if the per-qp credit-send trylock is obtained.
 */
static inline void send_credits(mca_btl_openib_endpoint_t *ep, int qp)
{
    if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
        /* not a PP qp: fall back to the qp dedicated to flow control */
        qp = mca_btl_openib_component.credits_qp;
        if(!check_eager_rdma_credits(ep))
            return;
    } else if(!check_send_credits(ep, qp) && !check_eager_rdma_credits(ep)) {
        /* neither PP credits nor eager RDMA credits are pending */
        return;
    }

    if(BTL_OPENIB_CREDITS_SEND_TRYLOCK(ep, qp))
        mca_btl_openib_endpoint_send_credits(ep, qp);
}
END_C_DECLS

Просмотреть файл

@ -483,7 +483,8 @@ static int mca_btl_openib_mca_setup_qps(void)
char *default_qps = "P,128,256,128,16:S,1024,256,128,32:S,4096,256,128,32:S,65536,256,128,32";
uint32_t max_qp_size, max_size_needed;
int32_t min_freelist_size = 0;
int smallest_pp_qp = 0;
reg_string("receive_queues",
"Colon-delimited, coma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
default_qps, &str, 0);
@ -498,7 +499,9 @@ static int mca_btl_openib_mca_setup_qps(void)
while (queues[qp] != NULL) {
if (0 == strncmp("P,", queues[qp], 2)) {
num_pp_qps++;
num_pp_qps++;
if(smallest_pp_qp > qp)
smallest_pp_qp = qp;
} else if (0 == strncmp("S,", queues[qp], 2)) {
num_srq_qps++;
} else {
@ -638,6 +641,7 @@ static int mca_btl_openib_mca_setup_qps(void)
}
mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
mca_btl_openib_component.credits_qp = smallest_pp_qp;
/* Register any MCA params for the connect pseudo-components */

Просмотреть файл

@ -238,7 +238,10 @@ static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint)
attr.qp_state = IBV_QPS_RTS;
attr.timeout = mca_btl_openib_component.ib_timeout;
attr.retry_cnt = mca_btl_openib_component.ib_retry_count;
attr.rnr_retry = mca_btl_openib_component.ib_rnr_retry;
/* On PP QPs we have SW flow control, no need for rnr retries. Setting
* it to zero helps to catch bugs */
attr.rnr_retry = BTL_OPENIB_QP_TYPE_PP(i) ? 0 :
mca_btl_openib_component.ib_rnr_retry;
attr.sq_psn = endpoint->qps[i].lcl_psn;
attr.max_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
if (ibv_modify_qp(qp, &attr,
@ -264,25 +267,51 @@ static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint)
*/
static int qp_create_all(mca_btl_base_endpoint_t* endpoint)
{
int qp, rc, prio;
int qp, rc, prio, pp_qp_num = 0;
int32_t rd_rsv_total = 0;
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp)
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
rd_rsv_total +=
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
pp_qp_num++;
}
/* if there are no PP QPs we still need a reserved WQE for eager RDMA flow
* control */
if(0 == pp_qp_num && true == endpoint->use_eager_rdma)
pp_qp_num = 1;
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
struct ibv_srq *srq = NULL;
uint32_t max_recv_wr, max_send_wr;
int32_t rd_rsv, rd_num_credits;
/* If the size for this qp is <= the eager limit, make it a
high priority QP. Otherwise, make it a low priority QP. */
prio = (mca_btl_openib_component.qp_infos[qp].size <=
mca_btl_openib_component.eager_limit) ?
BTL_OPENIB_HP_CQ : BTL_OPENIB_LP_CQ;
if(MCA_BTL_OPENIB_PP_QP == mca_btl_openib_component.qp_infos[qp].type) {
max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
max_send_wr = mca_btl_openib_component.qp_infos[qp].rd_num + 1;
if(qp == 0)
prio = BTL_OPENIB_HP_CQ; /* smallest qp is always HP */
/* The QP used for SW flow control needs some additional resources */
if(qp == mca_btl_openib_component.credits_qp) {
rd_rsv = rd_rsv_total;
rd_num_credits = pp_qp_num;
} else {
rd_rsv = rd_num_credits = 0;
}
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num + rd_rsv;
max_send_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
rd_num_credits;
} else {
srq = endpoint->endpoint_btl->qps[qp].u.srq_qp.srq;
max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num;
max_send_wr = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max
+ 1;
+ rd_num_credits;
}
rc = qp_create_one(endpoint, prio, qp, srq, max_recv_wr, max_send_wr);