1
1
openmpi/opal/mca/btl/ugni/btl_ugni_send.c
Nathan Hjelm cd11fc3081 btl/ugni: fix race condition that causes completions to be dropped
The send code in the ugni btl has an optimization that enables it to
return 1 (fragment gone) in some cases. This optimization involved
removing the btl ownership and callback flags to ensure the fragment
stuck around long enough for its completion flag to be checked. This
works fine for the single-threaded case but not in the multi-threaded
case. It is possible that a fragment will be completed by another
thread while a thread is in mca_btl_ugni_send. This competition can
lead to a leaked fragment, missed callback, or both. To fix the issue
without removing the optimization a reference count has been added to
the fragment. Callbacks and fragment release will not be made until
the fragment reference count has reach 0. The count is incremented
before sending the frag and decremented after the completion flag has
been checked. The fix has been verified to work using a multi-threaded
RMA benchmark with the osc/pt2pt component.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2016-02-02 12:14:31 -07:00

182 строки
6.6 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_smsg.h"
#include "btl_ugni_prepare.h"
int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *descriptor,
mca_btl_base_tag_t tag)
{
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor;
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
int rc;
/* tag and len are at the same location in eager and smsg frag hdrs */
frag->hdr.send.lag = (tag << 24) | size;
rc = mca_btl_ugni_check_endpoint_state (endpoint);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
OPAL_THREAD_LOCK(&endpoint->lock);
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&endpoint->lock);
return OPAL_SUCCESS;
}
BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor,
OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, size));
/* add a reference to prevent the fragment from being returned until after the
* completion flag is checked. */
++frag->ref_cnt;
frag->flags &= ~MCA_BTL_UGNI_FRAG_COMPLETE;
rc = mca_btl_ugni_send_frag (endpoint, frag);
if (OPAL_LIKELY(mca_btl_ugni_frag_check_complete (frag))) {
/* fast path: remote side has received the frag */
(void) mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
return 1;
}
if ((OPAL_SUCCESS == rc) && (frag->flags & MCA_BTL_UGNI_FRAG_BUFFERED) && (frag->flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
/* fast(ish) path: btl owned buffered frag. report send as complete */
bool call_callback = !!(frag->flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
frag->flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
if (call_callback) {
frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc);
}
(void) mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
return 1;
}
/* slow(ish) path: remote side hasn't received the frag. call the frag's callback when
we get the local smsg/msgq or remote rdma completion */
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
/* queue up request */
if (false == endpoint->wait_listed) {
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
if (false == endpoint->wait_listed) {
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
endpoint->wait_listed = true;
}
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
}
OPAL_THREAD_LOCK(&endpoint->lock);
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&endpoint->lock);
rc = OPAL_SUCCESS;
}
return rc;
}
int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint,
struct opal_convertor_t *convertor,
void *header, size_t header_size,
size_t payload_size, uint8_t order,
uint32_t flags, mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t **descriptor)
{
size_t total_size = header_size + payload_size;
mca_btl_ugni_base_frag_t *frag = NULL;
size_t packed_size = payload_size;
int rc;
do {
if (OPAL_UNLIKELY(OPAL_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint))) {
break;
}
flags |= MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
if (0 == payload_size) {
frag = (mca_btl_ugni_base_frag_t *) mca_btl_ugni_prepare_src_send_nodata (btl, endpoint, order, header_size,
flags);
} else {
frag = (mca_btl_ugni_base_frag_t *) mca_btl_ugni_prepare_src_send_buffered (btl, endpoint, convertor, order,
header_size, &packed_size, flags);
}
assert (packed_size == payload_size);
if (OPAL_UNLIKELY(NULL == frag)) {
break;
}
frag->hdr.send.lag = (tag << 24) | total_size;
memcpy (frag->segments[0].seg_addr.pval, header, header_size);
rc = mca_btl_ugni_send_frag (endpoint, frag);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
mca_btl_ugni_frag_return (frag);
break;
}
return OPAL_SUCCESS;
} while (0);
if (NULL != descriptor) {
*descriptor = NULL;
}
return OPAL_ERR_OUT_OF_RESOURCE;
}
int mca_btl_ugni_progress_send_wait_list (mca_btl_base_endpoint_t *endpoint)
{
mca_btl_ugni_base_frag_t *frag=NULL;
int rc;
do {
OPAL_THREAD_LOCK(&endpoint->lock);
frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&endpoint->frag_wait_list);
OPAL_THREAD_UNLOCK(&endpoint->lock);
if (NULL == frag) {
break;
}
if (OPAL_LIKELY(!(frag->flags & MCA_BTL_UGNI_FRAG_RESPONSE))) {
rc = mca_btl_ugni_send_frag (endpoint, frag);
} else {
rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
}
if (OPAL_UNLIKELY(OPAL_SUCCESS > rc)) {
if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
OPAL_THREAD_LOCK(&endpoint->lock);
opal_list_prepend (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&endpoint->lock);
} else {
mca_btl_ugni_frag_complete (frag, rc);
}
return rc;
}
} while(1);
return OPAL_SUCCESS;
}