From 76002ada84436db394ba93496c4a94d40f3f880f Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 7 Jan 2020 21:43:58 -0700 Subject: [PATCH 1/2] opal: make interval tree resilient to similar intervals There are cases where the same interval may be in the tree multiple times. This generally isn't a problem when searching the tree but may cause issues when attempting to delete a particular registration from the tree. The issue is fixed by breaking a low value tie by checking the high value then the interval data. If the high, low, and data of a new insertion exactly matches an existing interval then an assertion is raised. Signed-off-by: Nathan Hjelm (cherry picked from commit 1145abc0b790f82ea25e24a3becad91ff502769c) --- opal/class/opal_interval_tree.c | 55 +++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/opal/class/opal_interval_tree.c b/opal/class/opal_interval_tree.c index e8ccda2024..110dccdacc 100644 --- a/opal/class/opal_interval_tree.c +++ b/opal/class/opal_interval_tree.c @@ -12,6 +12,7 @@ * All rights reserved. * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2020 Google, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -39,7 +40,7 @@ static opal_interval_tree_node_t *opal_interval_tree_next (opal_interval_tree_t opal_interval_tree_node_t *node); static opal_interval_tree_node_t * opal_interval_tree_find_node(opal_interval_tree_t *tree, uint64_t low, uint64_t high, - bool exact, void *data); + void *data); static opal_interval_tree_node_t *left_rotate (opal_interval_tree_t *tree, opal_interval_tree_node_t *x); static opal_interval_tree_node_t *right_rotate (opal_interval_tree_t *tree, opal_interval_tree_node_t *x); @@ -355,31 +356,54 @@ int opal_interval_tree_insert (opal_interval_tree_t *tree, void *value, uint64_t return OPAL_SUCCESS; } +static int opal_interval_tree_compare_node(opal_interval_tree_node_t *node, uint64_t low, uint64_t high, void *data) { + if ((data && node->low == low && node->high == high && node->data == data) || + (!data && node->low <= low && node->high >= high)) { + return 0; + } + if (node->low > low) { + return -1; + } + if (node->low < low) { + return 1; + } + if (node->high < high) { + return -1; + } + if (node->high > high) { + return 1; + } + if (node->data > data) { + return -1; + } + return 1; +} + static opal_interval_tree_node_t *opal_interval_tree_find_interval(opal_interval_tree_t *tree, opal_interval_tree_node_t *node, uint64_t low, - uint64_t high, bool exact, void *data) + uint64_t high, void *data) { if (node == &tree->nill) { return NULL; } - if (((exact && node->low == low && node->high == high) || (!exact && node->low <= low && node->high >= high)) && - (!data || node->data == data)) { + int check = opal_interval_tree_compare_node(node, low, high, data); + if (0 == check) { return node; } - if (low <= node->low) { - return opal_interval_tree_find_interval (tree, node->left, low, high, exact, data); + if (-1 == check) { + return opal_interval_tree_find_interval (tree, node->left, low, high, data); } - return opal_interval_tree_find_interval (tree, node->right, low, high, exact, data); 
+ return opal_interval_tree_find_interval (tree, node->right, low, high, data); } /* Finds the node in the tree based on the key and returns a pointer * to the node. This is a bit a code duplication, but this has to be fast * so we go ahead with the duplication */ -static opal_interval_tree_node_t *opal_interval_tree_find_node(opal_interval_tree_t *tree, uint64_t low, uint64_t high, bool exact, void *data) +static opal_interval_tree_node_t *opal_interval_tree_find_node(opal_interval_tree_t *tree, uint64_t low, uint64_t high, void *data) { - return opal_interval_tree_find_interval (tree, tree->root.left, low, high, exact, data); + return opal_interval_tree_find_interval (tree, tree->root.left, low, high, data); } void *opal_interval_tree_find_overlapping (opal_interval_tree_t *tree, uint64_t low, uint64_t high) @@ -388,7 +412,7 @@ void *opal_interval_tree_find_overlapping (opal_interval_tree_t *tree, uint64_t opal_interval_tree_node_t *node; token = opal_interval_tree_reader_get_token (tree); - node = opal_interval_tree_find_node (tree, low, high, true, NULL); + node = opal_interval_tree_find_node (tree, low, high, NULL); opal_interval_tree_reader_return_token (tree, token); return node ? 
node->data : NULL; @@ -536,7 +560,7 @@ int opal_interval_tree_delete (opal_interval_tree_t *tree, uint64_t low, uint64_ opal_interval_tree_node_t *node; opal_interval_tree_write_lock (tree); - node = opal_interval_tree_find_node (tree, low, high, true, data); + node = opal_interval_tree_find_node (tree, low, high, data); if (NULL == node) { opal_interval_tree_write_unlock (tree); return OPAL_ERR_NOT_FOUND; @@ -618,18 +642,23 @@ static void opal_interval_tree_insert_node (opal_interval_tree_t *tree, opal_int node->right = nill; /* find the leaf where we will insert the node */ + int check = -1; while (n != nill) { + check = opal_interval_tree_compare_node(n, node->low, node->high, node->data); + /* node already exists */ + assert (0 != check); + if (n->max < node->high) { n->max = node->high; } parent = n; - n = ((node->low < n->low) ? n->left : n->right); + n = (-1 == check) ? n->left : n->right; assert (nill == n || n->parent == parent); } /* place it on either the left or the right */ - if ((node->low < parent->low)) { + if (-1 == check) { parent->left = node; } else { parent->right = node; From a64a7c8a0a20b6b883b55abc1b92d6db1f75adb8 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 7 Jan 2020 21:48:01 -0700 Subject: [PATCH 2/2] btl/vader: fix issues with xpmem registration invalidation This commit fixes an issue discovered in the XPMEM registration cache. It was possible for a registration to be invalidated by multiple threads leading to a double-free situation or re-use of an invalidated registration. This commit fixes the issue by setting the INVALID flag on a registration when it will be deleted. The flag is set while iterating over the tree to take advantage of the fact that a registration can not be removed from the VMA tree by a thread while another thread is traversing the VMA tree. 
References #6524 References #7030 Closes #6534 Signed-off-by: Nathan Hjelm (cherry picked from commit f86f805be1145ace46b570c5c518555b38e58cee) --- opal/mca/btl/vader/btl_vader_xpmem.c | 69 +++++++++++++++++++--------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/opal/mca/btl/vader/btl_vader_xpmem.c b/opal/mca/btl/vader/btl_vader_xpmem.c index 219c0bd5f7..5bfe8e1a64 100644 --- a/opal/mca/btl/vader/btl_vader_xpmem.c +++ b/opal/mca/btl/vader/btl_vader_xpmem.c @@ -5,6 +5,7 @@ * Copyright (c) 2014 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,8 +45,7 @@ static int vader_check_reg (mca_rcache_base_registration_t *reg, void *ctx) { vader_check_reg_ctx_t *vader_ctx = (vader_check_reg_ctx_t *) ctx; - if ((intptr_t) reg->alloc_base != vader_ctx->ep->peer_smp_rank || - (reg->flags & MCA_RCACHE_FLAGS_PERSIST)) { + if ((intptr_t) reg->alloc_base != vader_ctx->ep->peer_smp_rank) { /* ignore this registration */ return OPAL_SUCCESS; } @@ -53,10 +53,26 @@ static int vader_check_reg (mca_rcache_base_registration_t *reg, void *ctx) vader_ctx->reg[0] = reg; if (vader_ctx->bound <= (uintptr_t) reg->bound && vader_ctx->base >= (uintptr_t) reg->base) { - opal_atomic_add (®->ref_count, 1); + if (0 == opal_atomic_fetch_add_32 (®->ref_count, 1)) { + /* registration is being deleted by a thread in vader_return_registration. the + * VMA tree implementation will block in mca_rcache_delete until we finish + * iterating over the VMA tree so it is safe to just ignore this registration + * and continue. */ + vader_ctx->reg[0] = NULL; + return OPAL_SUCCESS; + } return 1; } + if (MCA_RCACHE_FLAGS_INVALID & opal_atomic_fetch_or_32(®->flags, MCA_RCACHE_FLAGS_INVALID)) { + /* another thread has already marked this registration as invalid. ignore and continue. 
*/ + vader_ctx->reg[0] = NULL; + return OPAL_SUCCESS; + } + + /* let the caller know we found an overlapping registration that can be coalesced into + * the requested interval. the caller will remove the last reference and delete the + * registration. */ return 2; } @@ -67,8 +83,12 @@ void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_ ref_count = opal_atomic_add_fetch_32 (®->ref_count, -1); if (OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_RCACHE_FLAGS_PERSIST))) { - mca_rcache_base_vma_delete (vma_module, reg); - +#if OPAL_DEBUG + int ret = mca_rcache_base_vma_delete (vma_module, reg); + assert (OPAL_SUCCESS == ret); +#else + (void) mca_rcache_base_vma_delete (vma_module, reg); +#endif opal_memchecker_base_mem_noaccess (reg->rcache_context, (uintptr_t)(reg->bound - reg->base)); (void)xpmem_detach (reg->rcache_context); OBJ_RELEASE (reg); @@ -100,16 +120,9 @@ mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpo /* several segments may match the base pointer */ rc = mca_rcache_base_vma_iterate (vma_module, (void *) base, bound - base, true, vader_check_reg, &check_ctx); if (2 == rc) { - /* remove this pointer from the rcache and decrement its reference count - (so it is detached later) */ - mca_rcache_base_vma_delete (vma_module, reg); - - /* start the new segment from the lower of the two bases */ - base = (uintptr_t) reg->base < base ? (uintptr_t) reg->base : base; - - /* remove the last reference to this registration */ - vader_return_registration (reg, ep); - + bound = bound < (uintptr_t) reg->bound ? (uintptr_t) reg->bound : bound; + base = base > (uintptr_t) reg->base ? 
(uintptr_t) reg->base : base; + vader_return_registration(reg, ep); reg = NULL; } @@ -151,13 +164,16 @@ mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpo return reg; } +struct vader_cleanup_reg_ctx { + mca_btl_vader_endpoint_t *ep; + opal_list_t *registrations; +}; + static int mca_btl_vader_endpoint_xpmem_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx) { - mca_btl_vader_endpoint_t *ep = (mca_btl_vader_endpoint_t *) ctx; - if ((intptr_t) reg->alloc_base == ep->peer_smp_rank) { - /* otherwise dereg will fail on assert */ - reg->ref_count = 0; - OBJ_RELEASE(reg); + struct vader_cleanup_reg_ctx *cleanup_ctx = (struct vader_cleanup_reg_ctx *) ctx; + if ((intptr_t) reg->alloc_base == cleanup_ctx->ep->peer_smp_rank) { + opal_list_append(cleanup_ctx->registrations, ®->super.super); } return OPAL_SUCCESS; @@ -165,11 +181,22 @@ static int mca_btl_vader_endpoint_xpmem_rcache_cleanup (mca_rcache_base_registra void mca_btl_vader_xpmem_cleanup_endpoint (struct mca_btl_base_endpoint_t *ep) { + mca_rcache_base_registration_t *reg; + opal_list_t registrations; + struct vader_cleanup_reg_ctx cleanup_ctx = {.ep = ep, .registrations = ®istrations}; + + OBJ_CONSTRUCT(®istrations, opal_list_t); + /* clean out the registration cache */ (void) mca_rcache_base_vma_iterate (mca_btl_vader_component.vma_module, NULL, (size_t) -1, true, mca_btl_vader_endpoint_xpmem_rcache_cleanup, - (void *) ep); + (void *) &cleanup_ctx); + while (NULL != (reg = (mca_rcache_base_registration_t *) opal_list_remove_first(®istrations))) { + vader_return_registration (reg, ep); + } + OBJ_DESTRUCT(®istrations); + if (ep->segment_base) { xpmem_release (ep->segment_data.xpmem.apid); ep->segment_data.xpmem.apid = 0;