From eaa98af52cc65c3c101d389d307db45ea081ef5d Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 16 Oct 2018 12:33:03 -0600 Subject: [PATCH] opal/free_list: fix race condition There was a race condition in opal_free_list_get. Code throughout the Open MPI codebase was assuming that a NULL return from this function was due to an out-of-memory condition. In some cases this can lead to a fatal condition (MPI_Irecv and MPI_Isend in pml/ob1 for example). Before this commit opal_free_list_get_mt looked like this: ```c static inline opal_free_list_item_t *opal_free_list_get_mt (opal_free_list_t *flist) { opal_free_list_item_t *item = (opal_free_list_item_t*) opal_lifo_pop_atomic (&flist->super); if (OPAL_UNLIKELY(NULL == item)) { opal_mutex_lock (&flist->fl_lock); opal_free_list_grow_st (flist, flist->fl_num_per_alloc); opal_mutex_unlock (&flist->fl_lock); item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&flist->super); } return item; } ``` The problem is in a multithreaded environment is *is* possible for the free list to be grown successfully but the thread calling opal_free_list_get_mt to be left without an item. The happens if between the calls to opal_lifo_push_atomic in opal_free_list_grow_st and the call to opal_lifo_pop_atomic other threads pop all the items added to the free list. This commit fixes the issue by ensuring the thread that successfully grew the free list **always** gets a free list item. Fixes #2921 Signed-off-by: Nathan Hjelm (cherry picked from commit 5c770a7becc496f63b9f9a59151206236416f4f4) Signed-off-by: Nathan Hjelm --- opal/class/opal_free_list.c | 18 ++++++++++++------ opal/class/opal_free_list.h | 30 +++++++++++++++++++----------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/opal/class/opal_free_list.c b/opal/class/opal_free_list.c index b7c38b22f4..517d8ee0d3 100644 --- a/opal/class/opal_free_list.c +++ b/opal/class/opal_free_list.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved. * Copyright (c) 2010-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2018 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -155,13 +155,13 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_ flist->ctx = ctx; if (num_elements_to_alloc) { - return opal_free_list_grow_st (flist, num_elements_to_alloc); + return opal_free_list_grow_st (flist, num_elements_to_alloc, NULL); } return OPAL_SUCCESS; } -int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements) +int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements, opal_free_list_item_t **item_out) { unsigned char *ptr, *payload_ptr = NULL; opal_free_list_memory_t *alloc_ptr; @@ -263,10 +263,16 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements) /* NTH: in case the free list may be accessed from multiple threads * use the atomic lifo push. The overhead is small compared to the * overall overhead of opal_free_list_grow(). */ - opal_lifo_push_atomic (&flist->super, &item->super); + if (item_out && 0 == i) { + /* ensure the thread that is growing the free list always gets an item + * if one is available */ + *item_out = item; + } else { + opal_lifo_push_atomic (&flist->super, &item->super); + } + ptr += head_size; payload_ptr += elem_size; - } if (OPAL_SUCCESS != rc && 0 == num_elements) { @@ -298,7 +304,7 @@ int opal_free_list_resize_mt(opal_free_list_t *flist, size_t size) opal_mutex_lock (&flist->fl_lock); do { - ret = opal_free_list_grow_st (flist, flist->fl_num_per_alloc); + ret = opal_free_list_grow_st (flist, flist->fl_num_per_alloc, NULL); if (OPAL_SUCCESS != ret) { break; } diff --git a/opal/class/opal_free_list.h b/opal/class/opal_free_list.h index 1e1de3e8e8..b7fd192021 100644 --- a/opal/class/opal_free_list.h +++ b/opal/class/opal_free_list.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2010-2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -146,6 +146,7 @@ OPAL_DECLSPEC int opal_free_list_init (opal_free_list_t *free_list, * * @param flist (IN) Free list to grow * @param num_elements (IN) Number of elements to add + * @param item_out (OUT) Location to store new free list item (can be NULL) * * @returns OPAL_SUCCESS if any elements were added * @returns OPAL_ERR_OUT_OF_RESOURCE if no elements could be added @@ -155,8 +156,14 @@ OPAL_DECLSPEC int opal_free_list_init (opal_free_list_t *free_list, * that may be accessed by multiple threads simultaneously. Note: this is an * internal function that will be used when needed by opal_free_list_get* and * opal_free_list_wait*. + * + * The item_out parameter can be used to ensure that the thread calling this + * function always gets a free list item if the list is successfully grown. + * This eliminates a race condition with code that simply calls free_list_get + * and assumes NULL is an out of memory condition (which it wasn't necessarily + * before this parameter was added). */ -OPAL_DECLSPEC int opal_free_list_grow_st (opal_free_list_t *flist, size_t num_elements); +OPAL_DECLSPEC int opal_free_list_grow_st (opal_free_list_t *flist, size_t num_elements, opal_free_list_item_t **item_out); /** * Grow the free list to be at least size elements. @@ -195,9 +202,8 @@ static inline opal_free_list_item_t *opal_free_list_get_mt (opal_free_list_t *fl if (OPAL_UNLIKELY(NULL == item)) { opal_mutex_lock (&flist->fl_lock); - opal_free_list_grow_st (flist, flist->fl_num_per_alloc); + opal_free_list_grow_st (flist, flist->fl_num_per_alloc, &item); opal_mutex_unlock (&flist->fl_lock); - item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&flist->super); } return item; @@ -209,8 +215,7 @@ static inline opal_free_list_item_t *opal_free_list_get_st (opal_free_list_t *fl (opal_free_list_item_t*) opal_lifo_pop_st (&flist->super); if (OPAL_UNLIKELY(NULL == item)) { - opal_free_list_grow_st (flist, flist->fl_num_per_alloc); - item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&flist->super); + opal_free_list_grow_st (flist, flist->fl_num_per_alloc, &item); } return item; @@ -253,7 +258,7 @@ static inline opal_free_list_item_t *opal_free_list_wait_mt (opal_free_list_t *f while (NULL == item) { if (!opal_mutex_trylock (&fl->fl_lock)) { if (fl->fl_max_to_alloc <= fl->fl_num_allocated || - OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc)) { + OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc, &item)) { fl->fl_num_waiting++; opal_condition_wait (&fl->fl_condition, &fl->fl_lock); fl->fl_num_waiting--; @@ -274,7 +279,9 @@ static inline opal_free_list_item_t *opal_free_list_wait_mt (opal_free_list_t *f opal_mutex_lock (&fl->fl_lock); } opal_mutex_unlock (&fl->fl_lock); - item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&fl->super); + if (NULL == item) { + item = (opal_free_list_item_t *) opal_lifo_pop_atomic (&fl->super); + } } return item; @@ -287,12 +294,13 @@ static inline opal_free_list_item_t *opal_free_list_wait_st (opal_free_list_t *f while (NULL == item) { if (fl->fl_max_to_alloc <= fl->fl_num_allocated || - OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc)) { + OPAL_SUCCESS != opal_free_list_grow_st (fl, fl->fl_num_per_alloc, &item)) { /* try to make progress */ opal_progress (); } - - item = (opal_free_list_item_t *) opal_lifo_pop (&fl->super); + if (NULL == item) { + item = (opal_free_list_item_t *) opal_lifo_pop (&fl->super); + } } return item;