1
1

btl/openib: fix message coalescing

There was a bug in the openib btl handling this valid sequence of
calls:

desc = btl_alloc ();
btl_free (desc);

When triggered the bug would cause either fragment loss or undefined
behavior (SEGV, etc). The problem occured because btl_alloc contained
the logic to modify the pending fragment (length, etc) and these
changes were not corrected if the fragment was freed instead of sent.

To fix this issue I 1) moved some of the coalescing logic to the
btl_send function, and 2) retry the coalesced fragment on btl_free
if it was never sent. This appears to completely address the issue.
Этот коммит содержится в:
Nathan Hjelm 2014-11-04 14:26:17 -07:00
родитель 9aaac11648
Коммит 9bae131589
2 изменённых файлов: 46 добавлений и 19 удалений

Просмотреть файл

@ -1133,7 +1133,8 @@ ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order,
/* check if pending fragment has enough space for coalescing */ /* check if pending fragment has enough space for coalescing */
static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list, static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
opal_mutex_t *lock, mca_btl_base_endpoint_t *ep, size_t size) opal_mutex_t *lock, struct mca_btl_base_endpoint_t *ep, size_t size,
mca_btl_openib_coalesced_frag_t **cfrag)
{ {
mca_btl_openib_send_frag_t *frag = NULL; mca_btl_openib_send_frag_t *frag = NULL;
@ -1158,11 +1159,21 @@ static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
qp = to_base_frag(frag)->base.order; qp = to_base_frag(frag)->base.order;
if(total_length <= mca_btl_openib_component.qp_infos[qp].size) if(total_length <= mca_btl_openib_component.qp_infos[qp].size) {
/* make sure we can allocate a coalescing frag before returning success */
*cfrag = alloc_coalesced_frag();
if (OPAL_LIKELY(NULL != cfrag)) {
(*cfrag)->send_frag = frag;
(*cfrag)->sent = false;
opal_list_remove_first(frag_list); opal_list_remove_first(frag_list);
else } else {
frag = NULL; frag = NULL;
} }
} else {
frag = NULL;
}
}
OPAL_THREAD_UNLOCK(lock); OPAL_THREAD_UNLOCK(lock);
return frag; return frag;
@ -1188,7 +1199,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl; mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
int qp = frag_size_to_order(obtl, size); int qp = frag_size_to_order(obtl, size);
mca_btl_openib_send_frag_t *sfrag = NULL; mca_btl_openib_send_frag_t *sfrag = NULL;
mca_btl_openib_coalesced_frag_t *cfrag; mca_btl_openib_coalesced_frag_t *cfrag = NULL;
assert(qp != MCA_BTL_NO_ORDER); assert(qp != MCA_BTL_NO_ORDER);
@ -1197,26 +1208,25 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY); int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio], sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
&ep->endpoint_lock, ep, size); &ep->endpoint_lock, ep, size, &cfrag);
if (NULL == sfrag) { if (NULL == sfrag) {
if(BTL_OPENIB_QP_TYPE_PP(qp)) { if(BTL_OPENIB_QP_TYPE_PP(qp)) {
sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio], sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio],
&ep->endpoint_lock, ep, size); &ep->endpoint_lock, ep, size, &cfrag);
} else { } else {
sfrag = check_coalescing( sfrag = check_coalescing(
&obtl->qps[qp].u.srq_qp.pending_frags[prio], &obtl->qps[qp].u.srq_qp.pending_frags[prio],
&obtl->ib_lock, ep, size); &obtl->ib_lock, ep, size, &cfrag);
} }
} }
} }
if(NULL == sfrag) if (NULL == sfrag) {
return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags); return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags);
}
/* begin coalescing message */ /* begin coalescing message */
cfrag = alloc_coalesced_frag();
cfrag->send_frag = sfrag;
/* fix up new coalescing header if this is the first coalesced frag */ /* fix up new coalescing header if this is the first coalesced frag */
if(sfrag->hdr != sfrag->chdr) { if(sfrag->hdr != sfrag->chdr) {
@ -1250,10 +1260,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
to_base_frag(cfrag)->segment.base.seg_addr.pval = cfrag->hdr + 1; to_base_frag(cfrag)->segment.base.seg_addr.pval = cfrag->hdr + 1;
to_base_frag(cfrag)->segment.base.seg_len = size; to_base_frag(cfrag)->segment.base.seg_len = size;
/* save coalesced fragment on a main fragment; we will need it after send /* NTH: there is no reason to append the coalesced fragment here. No more
* completion to free it and to call upper layer callback */ * fragments will be added until either send or free has been called on
opal_list_append(&sfrag->coalesced_frags, (opal_list_item_t*)cfrag); * the coalesced frag. */
sfrag->coalesced_length += (size+sizeof(mca_btl_openib_header_coalesced_t));
to_base_frag(cfrag)->base.des_flags = flags; to_base_frag(cfrag)->base.des_flags = flags;
@ -1306,6 +1315,15 @@ int mca_btl_openib_free(
default: default:
break; break;
} }
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED && !to_coalesced_frag(des)->sent) {
mca_btl_openib_send_frag_t *sfrag = to_coalesced_frag(des)->send_frag;
/* the coalesced fragment would have sent the original fragment but that
* will not happen so send the fragment now */
mca_btl_openib_endpoint_send(to_com_frag(sfrag)->endpoint, sfrag);
}
MCA_BTL_IB_FRAG_RETURN(des); MCA_BTL_IB_FRAG_RETURN(des);
return OPAL_SUCCESS; return OPAL_SUCCESS;
@ -1888,11 +1906,19 @@ int mca_btl_openib_send(
openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED); openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED);
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) { if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) {
frag = to_coalesced_frag(des)->send_frag;
/* save coalesced fragment on a main fragment; we will need it after send
* completion to free it and to call upper layer callback */
opal_list_append(&frag->coalesced_frags, (opal_list_item_t*) des);
frag->coalesced_length += to_coalesced_frag(des)->hdr->alloc_size +
sizeof(mca_btl_openib_header_coalesced_t);
to_coalesced_frag(des)->sent = true;
to_coalesced_frag(des)->hdr->tag = tag; to_coalesced_frag(des)->hdr->tag = tag;
to_coalesced_frag(des)->hdr->size = des->des_local->seg_len; to_coalesced_frag(des)->hdr->size = des->des_local->seg_len;
if(ep->nbo) if(ep->nbo)
BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr); BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr);
frag = to_coalesced_frag(des)->send_frag;
} else { } else {
frag = to_send_frag(des); frag = to_send_frag(des);
to_com_frag(des)->endpoint = ep; to_com_frag(des)->endpoint = ep;

Просмотреть файл

@ -371,6 +371,7 @@ typedef struct mca_btl_openib_coalesced_frag_t {
mca_btl_openib_frag_t super; mca_btl_openib_frag_t super;
mca_btl_openib_send_frag_t *send_frag; mca_btl_openib_send_frag_t *send_frag;
mca_btl_openib_header_coalesced_t *hdr; mca_btl_openib_header_coalesced_t *hdr;
bool sent;
} mca_btl_openib_coalesced_frag_t; } mca_btl_openib_coalesced_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t); OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);