1
1

btl/ugni: fix race condition when adding endpoint to wait list

This commit fixes a race condition that can cause an endpoint to be
added to the wait list multiple times. To fix the issue, an additional
check has been added, performed while the wait list lock is held, to
ensure the endpoint is not already on the wait list. The wait list
processing code has also been updated to hold the wait list lock until
all wait listed endpoints have been handled. This reduces the chance
that an endpoint that is being processed by the wait list code is
re-added to the list by a competing send.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-02-02 11:57:35 -07:00
родитель 728a97c558
Коммит 14704201e2
3 изменённых файлов: 17 добавлений и 24 удалений

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
@ -589,32 +589,21 @@ mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
int count;
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
count = opal_list_get_size(&ugni_module->ep_wait_list);
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
count = opal_list_get_size(&ugni_module->ep_wait_list);
do {
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
if (endpoint != NULL) {
endpoint->wait_listed = false;
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
if (OPAL_SUCCESS != rc && false == endpoint->wait_listed) {
endpoint->wait_listed = true;
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
if (OPAL_SUCCESS != rc) {
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
} else {
endpoint->wait_listed = false;
}
}
--count;
if (count == 0) break;
} while (endpoint != NULL) ;
} while (endpoint != NULL && --count > 0) ;
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
return rc;
}

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2013 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
@ -158,9 +158,11 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
rc = mca_btl_ugni_progress_send_wait_list (ep);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
ep->wait_listed = true;
OPAL_THREAD_LOCK(&ep->btl->ep_wait_list_lock);
opal_list_append (&ep->btl->ep_wait_list, &ep->super);
if (false == ep->wait_listed) {
opal_list_append (&ep->btl->ep_wait_list, &ep->super);
ep->wait_listed = true;
}
OPAL_THREAD_UNLOCK(&ep->btl->ep_wait_list_lock);
}

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
@ -76,9 +76,11 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
/* queue up request */
if (false == endpoint->wait_listed) {
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
if (false == endpoint->wait_listed) {
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
endpoint->wait_listed = true;
}
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
endpoint->wait_listed = true;
}
OPAL_THREAD_LOCK(&endpoint->lock);