btl/ugni: fix race condition when adding endpoint to wait list
This commit fixes a race condition that can cause an endpoint to be added to the wait list multiple times. To fix the issue, an additional check has been added, performed once the wait list lock has been acquired, to ensure the endpoint is not already on the wait list. The wait list processing code has also been updated to hold the wait list lock until all wait-listed endpoints have been handled. This reduces the chance that an endpoint that is being processed by the wait list code is re-added to the list by a competing send. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
728a97c558
Коммит
14704201e2
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -589,32 +589,21 @@ mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
|
||||
int count;
|
||||
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
count = opal_list_get_size(&ugni_module->ep_wait_list);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
count = opal_list_get_size(&ugni_module->ep_wait_list);
|
||||
|
||||
do {
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
if (endpoint != NULL) {
|
||||
|
||||
endpoint->wait_listed = false;
|
||||
|
||||
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
|
||||
|
||||
if (OPAL_SUCCESS != rc && false == endpoint->wait_listed) {
|
||||
|
||||
endpoint->wait_listed = true;
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
} else {
|
||||
endpoint->wait_listed = false;
|
||||
}
|
||||
}
|
||||
|
||||
--count;
|
||||
if (count == 0) break;
|
||||
|
||||
} while (endpoint != NULL) ;
|
||||
} while (endpoint != NULL && --count > 0) ;
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -158,9 +158,11 @@ static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
|
||||
|
||||
rc = mca_btl_ugni_progress_send_wait_list (ep);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
|
||||
ep->wait_listed = true;
|
||||
OPAL_THREAD_LOCK(&ep->btl->ep_wait_list_lock);
|
||||
opal_list_append (&ep->btl->ep_wait_list, &ep->super);
|
||||
if (false == ep->wait_listed) {
|
||||
opal_list_append (&ep->btl->ep_wait_list, &ep->super);
|
||||
ep->wait_listed = true;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ep->btl->ep_wait_list_lock);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
@ -76,9 +76,11 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
|
||||
/* queue up request */
|
||||
if (false == endpoint->wait_listed) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
if (false == endpoint->wait_listed) {
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
endpoint->wait_listed = true;
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
endpoint->wait_listed = true;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user