1
1
openmpi/opal/class/opal_free_list.h
Nathan Hjelm 5c770a7bec opal/free_list: fix race condition
There was a race condition in opal_free_list_get. Code throughout the
Open MPI codebase was assuming that a NULL return from this function
was due to an out-of-memory condition. In some cases this can lead to
a fatal condition (MPI_Irecv and MPI_Isend in pml/ob1 for
example). Before this commit opal_free_list_get_mt looked like this:

```c
static inline opal_free_list_item_t *opal_free_list_get_mt (opal_free_list_t *flist)
{
    opal_free_list_item_t *item =
        (opal_free_list_item_t*) opal_lifo_pop_atomic (&flist->super);

    if (OPAL_UNLIKELY(NULL == item)) {
        opal_mutex_lock (&flist->fl_lock);
        opal_free_list_grow_st (flist, flist->fl_num_per_alloc);
        opal_mutex_unlock (&flist->fl_lock);
        item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&flist->super);
    }

    return item;
}
```

The problem is in a multithreaded environment is *is* possible for the
free list to be grown successfully but the thread calling
opal_free_list_get_mt to be left without an item. The happens if
between the calls to opal_lifo_push_atomic in opal_free_list_grow_st
and the call to opal_lifo_pop_atomic other threads pop all the items
added to the free list.

This commit fixes the issue by ensuring the thread that successfully
grew the free list **always** gets a free list item.

Fixes #2921

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2018-10-16 13:17:09 -06:00

376 строки
14 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 IBM Corporation. All rights reserved.
* Copyright (c) 2010-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_FREE_LIST_H
#define OPAL_FREE_LIST_H
#include "opal_config.h"
#include "opal/class/opal_lifo.h"
#include "opal/prefetch.h"
#include "opal/threads/condition.h"
#include "opal/constants.h"
#include "opal/runtime/opal.h"
BEGIN_C_DECLS
struct mca_mem_pool_t;
struct opal_free_list_item_t;
/**
* Free list item initializtion function.
*
* @param item (IN) Free list item to initialize
* @param ctx (IN) Free list initialization context
*
* @returns OPAL_SUCCESS on success
* @returns opal error code on failure
*
* This function attempts to initialize the free list item
* specified in item. If the item can be initialized the
* function should return OPAL_SUCCESS. On any error
* opal_free_list_grow will stop initializing new items.
*/
typedef int (*opal_free_list_item_init_fn_t) (
struct opal_free_list_item_t *item, void *ctx);
struct opal_free_list_t {
/** Items in a free list are stored last-in first-out */
opal_lifo_t super;
/** Maximum number of items to allocate in the free list */
size_t fl_max_to_alloc;
/** Current number of items allocated */
size_t fl_num_allocated;
/** Number of items to allocate when growing the free list */
size_t fl_num_per_alloc;
/** Number of threads waiting on free list item availability */
size_t fl_num_waiting;
/** Size of each free list item */
size_t fl_frag_size;
/** Free list item alignment */
size_t fl_frag_alignment;
/** Free list item buffer size */
size_t fl_payload_buffer_size;
/** Free list item buffer alignment */
size_t fl_payload_buffer_alignment;
/** Class of free list items */
opal_class_t *fl_frag_class;
/** mpool to use for free list buffer allocation (posix_memalign/malloc
* are used if this is NULL) */
struct mca_mpool_base_module_t *fl_mpool;
/** registration cache */
struct mca_rcache_base_module_t *fl_rcache;
/** Multi-threaded lock. Used when the free list is empty. */
opal_mutex_t fl_lock;
/** Multi-threaded condition. Used when threads are waiting on free
* list item availability. */
opal_condition_t fl_condition;
/** List of free list allocation */
opal_list_t fl_allocations;
/** Flags to pass to the rcache register function */
int fl_rcache_reg_flags;
/** Free list item initialization function */
opal_free_list_item_init_fn_t item_init;
/** Initialization function context */
void *ctx;
};
typedef struct opal_free_list_t opal_free_list_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_free_list_t);
struct mca_mpool_base_registration_t;
struct opal_free_list_item_t
{
opal_list_item_t super;
struct mca_rcache_base_registration_t *registration;
void *ptr;
};
typedef struct opal_free_list_item_t opal_free_list_item_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_free_list_item_t);
/**
* Initialize a free list.
*
* @param free_list (IN) Free list.
* @param frag_size (IN) Size of each element - allocated by malloc.
* @param frag_alignment (IN) Fragment alignment.
* @param frag_class (IN) opal_class_t of element - used to initialize allocated elements.
* @param payload_buffer_size (IN) Size of payload buffer - allocated from mpool.
* @param payload_buffer_alignment (IN) Payload buffer alignment.
* @param num_elements_to_alloc (IN) Initial number of elements to allocate.
* @param max_elements_to_alloc (IN) Maximum number of elements to allocate.
* @param num_elements_per_alloc (IN) Number of elements to grow by per allocation.
* @param mpool (IN) Optional memory pool for allocations.
* @param rcache_reg_flags (IN) Flags to pass to rcache registration function.
* @param rcache (IN) Optional registration cache.
* @param item_init (IN) Optional item initialization function
* @param ctx (IN) Initialization function context.
*/
OPAL_DECLSPEC int opal_free_list_init (opal_free_list_t *free_list,
size_t frag_size,
size_t frag_alignment,
opal_class_t* frag_class,
size_t payload_buffer_size,
size_t payload_buffer_alignment,
int num_elements_to_alloc,
int max_elements_to_alloc,
int num_elements_per_alloc,
struct mca_mpool_base_module_t *mpool,
int rcache_reg_flags,
struct mca_rcache_base_module_t *rcache,
opal_free_list_item_init_fn_t item_init,
void *ctx);
/**
* Grow the free list by at most num_elements elements.
*
* @param flist (IN) Free list to grow
* @param num_elements (IN) Number of elements to add
* @param item_out (OUT) Location to store new free list item (can be NULL)
*
* @returns OPAL_SUCCESS if any elements were added
* @returns OPAL_ERR_OUT_OF_RESOURCE if no elements could be added
*
* This function will attempt to grow the free list by num_elements items. The
* caller must hold the free list lock if calling this function on a free list
* that may be accessed by multiple threads simultaneously. Note: this is an
* internal function that will be used when needed by opal_free_list_get* and
* opal_free_list_wait*.
*
* The item_out parameter can be used to ensure that the thread calling this
* function always gets a free list item if the list is successfully grown.
* This eliminates a race condition with code that simply calls free_list_get
* and assumes NULL is an out of memory condition (which it wasn't necessarily
* before this parameter was added).
*/
OPAL_DECLSPEC int opal_free_list_grow_st (opal_free_list_t *flist, size_t num_elements, opal_free_list_item_t **item_out);
/**
* Grow the free list to be at least size elements.
*
* @param flist (IN) Free list to resize.
* @param size (IN) New size
*
* @returns OPAL_SUCCESS if the free list was resized
* @returns OPAL_ERR_OUT_OF_RESOURCE if resources could not be allocated
*
* This function will not shrink the list if it is already larger than size
* and may grow it past size if necessary (it will grow in num_elements_per_alloc
* chunks). This function is thread-safe and will obtain the free list lock before
* growing the free list.
*/
OPAL_DECLSPEC int opal_free_list_resize_mt (opal_free_list_t *flist, size_t size);
/**
* Attemp to obtain an item from a free list.
*
* @param fl (IN) Free list.
* @param item (OUT) Allocated item.
*
* If the requested item is not available the free list is grown to
* accomodate the request - unless the max number of allocations has
* been reached. If this is the case - a NULL pointer is returned
* to the caller. This function comes in three flavor: thread safe
* (opal_free_list_get_mt), single threaded (opal_free_list_get_st),
* and opal_using_threads conditioned (opal_free_list_get).
*/
static inline opal_free_list_item_t *opal_free_list_get_mt (opal_free_list_t *flist)
{
opal_free_list_item_t *item =
(opal_free_list_item_t*) opal_lifo_pop_atomic (&flist->super);
if (OPAL_UNLIKELY(NULL == item)) {
opal_mutex_lock (&flist->fl_lock);
opal_free_list_grow_st (flist, flist->fl_num_per_alloc, &item);
opal_mutex_unlock (&flist->fl_lock);
}
return item;
}
static inline opal_free_list_item_t *opal_free_list_get_st (opal_free_list_t *flist)
{
opal_free_list_item_t *item =
(opal_free_list_item_t*) opal_lifo_pop_st (&flist->super);
if (OPAL_UNLIKELY(NULL == item)) {
opal_free_list_grow_st (flist, flist->fl_num_per_alloc, &item);
}
return item;
}
static inline opal_free_list_item_t *opal_free_list_get (opal_free_list_t *flist)
{
if (opal_using_threads ()) {
return opal_free_list_get_mt (flist);
}
return opal_free_list_get_st (flist);
}
/** compatibility macro */
#define OPAL_FREE_LIST_GET(fl, item) \
(item) = opal_free_list_get (fl)
/**
* Blocking call to obtain an item from a free list.
*
* @param fl (IN) Free list.
* @param item (OUT) Allocated item.
*
* If the requested item is not available the free list is grown to
* accomodate the request - unless the max number of allocations has
* been reached. In this case the caller is blocked until an item
* is returned to the list.
*/
/** compatibility macro */
#define OPAL_FREE_LIST_WAIT(fl, item) \
(item) = opal_free_list_wait (fl)
static inline opal_free_list_item_t *opal_free_list_wait_mt (opal_free_list_t *fl)
{
opal_free_list_item_t *item =
(opal_free_list_item_t *) opal_lifo_pop_atomic (&fl->super);
while (NULL == item) {
if (!opal_mutex_trylock (&fl->fl_lock)) {
if (fl->fl_max_to_alloc <= fl->fl_num_allocated ||
OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc, &item)) {
fl->fl_num_waiting++;
opal_condition_wait (&fl->fl_condition, &fl->fl_lock);
fl->fl_num_waiting--;
} else {
if (0 < fl->fl_num_waiting) {
if (1 == fl->fl_num_waiting) {
opal_condition_signal (&fl->fl_condition);
} else {
opal_condition_broadcast (&fl->fl_condition);
}
}
}
} else {
/* If I wasn't able to get the lock in the begining when I finaly grab it
* the one holding the lock in the begining already grow the list. I will
* release the lock and try to get a new element until I succeed.
*/
opal_mutex_lock (&fl->fl_lock);
}
opal_mutex_unlock (&fl->fl_lock);
if (NULL == item) {
item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&fl->super);
}
}
return item;
}
static inline opal_free_list_item_t *opal_free_list_wait_st (opal_free_list_t *fl)
{
opal_free_list_item_t *item =
(opal_free_list_item_t *) opal_lifo_pop (&fl->super);
while (NULL == item) {
if (fl->fl_max_to_alloc <= fl->fl_num_allocated ||
OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc, &item)) {
/* try to make progress */
opal_progress ();
}
if (NULL == item) {
item = (opal_free_list_item_t *) opal_lifo_pop (&fl->super);
}
}
return item;
}
static inline opal_free_list_item_t *opal_free_list_wait (opal_free_list_t *fl)
{
if (opal_using_threads ()) {
return opal_free_list_wait_mt (fl);
} else {
return opal_free_list_wait_st (fl);
}
}
/**
* Return an item to a free list.
*
* @param fl (IN) Free list.
* @param item (OUT) Allocated item.
*
*/
static inline void opal_free_list_return_mt (opal_free_list_t *flist,
opal_free_list_item_t *item)
{
opal_list_item_t* original;
original = opal_lifo_push_atomic (&flist->super, &item->super);
if (&flist->super.opal_lifo_ghost == original) {
if (flist->fl_num_waiting > 0) {
/* one one item is being returned so it doesn't make sense to wake
* more than a single waiting thread. additionally, posix thread
* semantics do not require that the lock be held to signal the
* condition variable. */
opal_condition_signal (&flist->fl_condition);
}
}
}
static inline void opal_free_list_return_st (opal_free_list_t *flist,
opal_free_list_item_t *item)
{
opal_list_item_t* original;
original = opal_lifo_push_st (&flist->super, &item->super);
if (&flist->super.opal_lifo_ghost == original) {
if (flist->fl_num_waiting > 0) {
/* one one item is being returned so it doesn't make sense to wake
* more than a single waiting thread. additionally, posix thread
* semantics do not require that the lock be held to signal the
* condition variable. */
opal_condition_signal (&flist->fl_condition);
}
}
}
static inline void opal_free_list_return (opal_free_list_t *flist,
opal_free_list_item_t *item)
{
if (opal_using_threads ()) {
opal_free_list_return_mt (flist, item);
} else {
opal_free_list_return_st (flist, item);
}
}
/** compatibility macro */
#define OPAL_FREE_LIST_RETURN(fl, item) \
opal_free_list_return (fl, item)
END_C_DECLS
#endif