1
1

opal/free_list: fix race condition

There was a race condition in opal_free_list_get. Code throughout the
Open MPI codebase was assuming that a NULL return from this function
was due to an out-of-memory condition. In some cases this can lead to
a fatal condition (MPI_Irecv and MPI_Isend in pml/ob1 for
example). Before this commit opal_free_list_get_mt looked like this:

```c
static inline opal_free_list_item_t *opal_free_list_get_mt (opal_free_list_t *flist)
{
    opal_free_list_item_t *item =
        (opal_free_list_item_t*) opal_lifo_pop_atomic (&flist->super);

    if (OPAL_UNLIKELY(NULL == item)) {
        opal_mutex_lock (&flist->fl_lock);
        opal_free_list_grow_st (flist, flist->fl_num_per_alloc);
        opal_mutex_unlock (&flist->fl_lock);
        item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&flist->super);
    }

    return item;
}
```

The problem is that in a multithreaded environment it *is* possible for the
free list to be grown successfully but the thread calling
opal_free_list_get_mt to be left without an item. This happens if,
between the calls to opal_lifo_push_atomic in opal_free_list_grow_st
and the call to opal_lifo_pop_atomic, other threads pop all the items
added to the free list.

This commit fixes the issue by ensuring the thread that successfully
grew the free list **always** gets a free list item.

Fixes #2921

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2018-10-16 12:33:03 -06:00
родитель d1881519f9
Коммит 5c770a7bec
2 изменённых файлов: 31 добавлений и 17 удалений

Просмотреть файл

@@ -13,7 +13,7 @@
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@@ -155,13 +155,13 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_
flist->ctx = ctx;
if (num_elements_to_alloc) {
return opal_free_list_grow_st (flist, num_elements_to_alloc);
return opal_free_list_grow_st (flist, num_elements_to_alloc, NULL);
}
return OPAL_SUCCESS;
}
int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements)
int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements, opal_free_list_item_t **item_out)
{
unsigned char *ptr, *payload_ptr = NULL;
opal_free_list_memory_t *alloc_ptr;
@@ -263,10 +263,16 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements)
/* NTH: in case the free list may be accessed from multiple threads
* use the atomic lifo push. The overhead is small compared to the
* overall overhead of opal_free_list_grow(). */
opal_lifo_push_atomic (&flist->super, &item->super);
if (item_out && 0 == i) {
/* ensure the thread that is growing the free list always gets an item
* if one is available */
*item_out = item;
} else {
opal_lifo_push_atomic (&flist->super, &item->super);
}
ptr += head_size;
payload_ptr += elem_size;
}
if (OPAL_SUCCESS != rc && 0 == num_elements) {
@@ -298,7 +304,7 @@ int opal_free_list_resize_mt(opal_free_list_t *flist, size_t size)
opal_mutex_lock (&flist->fl_lock);
do {
ret = opal_free_list_grow_st (flist, flist->fl_num_per_alloc);
ret = opal_free_list_grow_st (flist, flist->fl_num_per_alloc, NULL);
if (OPAL_SUCCESS != ret) {
break;
}

Просмотреть файл

@@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2010-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@@ -146,6 +146,7 @@ OPAL_DECLSPEC int opal_free_list_init (opal_free_list_t *free_list,
*
* @param flist (IN) Free list to grow
* @param num_elements (IN) Number of elements to add
* @param item_out (OUT) Location to store new free list item (can be NULL)
*
* @returns OPAL_SUCCESS if any elements were added
* @returns OPAL_ERR_OUT_OF_RESOURCE if no elements could be added
@@ -155,8 +156,14 @@ OPAL_DECLSPEC int opal_free_list_init (opal_free_list_t *free_list,
* that may be accessed by multiple threads simultaneously. Note: this is an
* internal function that will be used when needed by opal_free_list_get* and
* opal_free_list_wait*.
*
* The item_out parameter can be used to ensure that the thread calling this
* function always gets a free list item if the list is successfully grown.
* This eliminates a race condition with code that simply calls free_list_get
* and assumes NULL is an out of memory condition (which it wasn't necessarily
* before this parameter was added).
*/
OPAL_DECLSPEC int opal_free_list_grow_st (opal_free_list_t *flist, size_t num_elements);
OPAL_DECLSPEC int opal_free_list_grow_st (opal_free_list_t *flist, size_t num_elements, opal_free_list_item_t **item_out);
/**
* Grow the free list to be at least size elements.
@@ -195,9 +202,8 @@ static inline opal_free_list_item_t *opal_free_list_get_mt (opal_free_list_t *fl
if (OPAL_UNLIKELY(NULL == item)) {
opal_mutex_lock (&flist->fl_lock);
opal_free_list_grow_st (flist, flist->fl_num_per_alloc);
opal_free_list_grow_st (flist, flist->fl_num_per_alloc, &item);
opal_mutex_unlock (&flist->fl_lock);
item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&flist->super);
}
return item;
@@ -209,8 +215,7 @@ static inline opal_free_list_item_t *opal_free_list_get_st (opal_free_list_t *fl
(opal_free_list_item_t*) opal_lifo_pop_st (&flist->super);
if (OPAL_UNLIKELY(NULL == item)) {
opal_free_list_grow_st (flist, flist->fl_num_per_alloc);
item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&flist->super);
opal_free_list_grow_st (flist, flist->fl_num_per_alloc, &item);
}
return item;
@@ -253,7 +258,7 @@ static inline opal_free_list_item_t *opal_free_list_wait_mt (opal_free_list_t *f
while (NULL == item) {
if (!opal_mutex_trylock (&fl->fl_lock)) {
if (fl->fl_max_to_alloc <= fl->fl_num_allocated ||
OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc)) {
OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc, &item)) {
fl->fl_num_waiting++;
opal_condition_wait (&fl->fl_condition, &fl->fl_lock);
fl->fl_num_waiting--;
@@ -274,7 +279,9 @@ static inline opal_free_list_item_t *opal_free_list_wait_mt (opal_free_list_t *f
opal_mutex_lock (&fl->fl_lock);
}
opal_mutex_unlock (&fl->fl_lock);
item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&fl->super);
if (NULL == item) {
item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&fl->super);
}
}
return item;
@@ -287,12 +294,13 @@ static inline opal_free_list_item_t *opal_free_list_wait_st (opal_free_list_t *f
while (NULL == item) {
if (fl->fl_max_to_alloc <= fl->fl_num_allocated ||
OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc)) {
OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc, &item)) {
/* try to make progress */
opal_progress ();
}
item = (opal_free_list_item_t *) opal_lifo_pop (&fl->super);
if (NULL == item) {
item = (opal_free_list_item_t *) opal_lifo_pop (&fl->super);
}
}
return item;