1
1

btl/ugni: fix race condition that causes completions to be dropped

The send code in the ugni btl has an optimization that enables it to
return 1 (fragment gone) in some cases. This optimization involved
removing the btl ownership and callback flags to ensure the fragment
stuck around long enough for its completion flag to be checked. This
works fine for the single-threaded case but not in the multi-threaded
case. It is possible that a fragment will be completed by another
thread while a thread is in mca_btl_ugni_send. This competition can
lead to a leaked fragment, missed callback, or both. To fix the issue
without removing the optimization a reference count has been added to
the fragment. Callbacks and fragment release will not be made until
the fragment reference count has reach 0. The count is incremented
before sending the frag and decremented after the completion flag has
been checked. The fix has been verified to work using a multi-threaded
RMA benchmark with the osc/pt2pt component.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-02-02 12:01:05 -07:00
родитель 14704201e2
Коммит cd11fc3081
2 изменённых файлов: 40 добавлений и 15 удалений

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
@ -66,6 +66,7 @@ struct mca_btl_ugni_base_frag_t;
typedef struct mca_btl_ugni_base_frag_t {
mca_btl_base_descriptor_t base;
volatile int32_t ref_cnt;
uint32_t msg_id;
uint16_t hdr_size;
uint16_t flags;
@ -148,6 +149,7 @@ static inline int mca_btl_ugni_frag_alloc (mca_btl_base_endpoint_t *ep,
if (OPAL_LIKELY(NULL != *frag)) {
(*frag)->my_list = list;
(*frag)->endpoint = ep;
(*frag)->ref_cnt = 1;
return OPAL_SUCCESS;
}
@ -169,10 +171,16 @@ static inline int mca_btl_ugni_frag_return (mca_btl_ugni_base_frag_t *frag)
return OPAL_SUCCESS;
}
static inline void mca_btl_ugni_frag_complete (mca_btl_ugni_base_frag_t *frag, int rc) {
frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE;
static inline bool mca_btl_ugni_frag_del_ref (mca_btl_ugni_base_frag_t *frag, int rc) {
int32_t ref_cnt;
BTL_VERBOSE(("frag complete. flags = %d", frag->base.des_flags));
opal_atomic_mb ();
ref_cnt = OPAL_THREAD_ADD32(&frag->ref_cnt, -1);
if (ref_cnt) {
assert (ref_cnt > 0);
return false;
}
/* call callback if specified */
if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
@ -182,6 +190,20 @@ static inline void mca_btl_ugni_frag_complete (mca_btl_ugni_base_frag_t *frag, i
if (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) {
mca_btl_ugni_frag_return (frag);
}
return true;
}
static inline void mca_btl_ugni_frag_complete (mca_btl_ugni_base_frag_t *frag, int rc) {
BTL_VERBOSE(("frag complete. flags = %d", frag->base.des_flags));
frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE;
mca_btl_ugni_frag_del_ref (frag, rc);
}
static inline bool mca_btl_ugni_frag_check_complete (mca_btl_ugni_base_frag_t *frag) {
return !!(MCA_BTL_UGNI_FRAG_COMPLETE & frag->flags);
}
#define MCA_BTL_UGNI_FRAG_ALLOC_SMSG(ep, frag) \

Просмотреть файл

@ -25,7 +25,6 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor;
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
int flags_save = frag->base.des_flags;
int rc;
/* tag and len are at the same location in eager and smsg frag hdrs */
@ -43,34 +42,38 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor,
OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, size));
/* temporarily disable ownership and callback flags so we can reliably check the complete flag */
frag->base.des_flags &= ~(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
/* add a reference to prevent the fragment from being returned until after the
* completion flag is checked. */
++frag->ref_cnt;
frag->flags &= ~MCA_BTL_UGNI_FRAG_COMPLETE;
rc = mca_btl_ugni_send_frag (endpoint, frag);
if (OPAL_LIKELY(frag->flags & MCA_BTL_UGNI_FRAG_COMPLETE)) {
if (OPAL_LIKELY(mca_btl_ugni_frag_check_complete (frag))) {
/* fast path: remote side has received the frag */
frag->base.des_flags = flags_save;
mca_btl_ugni_frag_complete (frag, OPAL_SUCCESS);
(void) mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
return 1;
}
if ((OPAL_SUCCESS == rc) && (frag->flags & MCA_BTL_UGNI_FRAG_BUFFERED) && (flags_save & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
if ((OPAL_SUCCESS == rc) && (frag->flags & MCA_BTL_UGNI_FRAG_BUFFERED) && (frag->flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
/* fast(ish) path: btl owned buffered frag. report send as complete */
frag->base.des_flags = flags_save & ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
bool call_callback = !!(frag->flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
frag->flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (OPAL_LIKELY(flags_save & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
if (call_callback) {
frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc);
}
(void) mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
return 1;
}
/* slow(ish) path: remote side hasn't received the frag. call the frag's callback when
we get the local smsg/msgq or remote rdma completion */
frag->base.des_flags = flags_save | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
/* queue up request */