1
1
openmpi/opal/mca/rcache/base/rcache_base_vma_tree.c
Nathan Hjelm d4afb16f5a opal: rework mpool and rcache frameworks
This commit rewrites both the mpool and rcache frameworks. Summary of
changes:

 - Before this change a significant portion of the rcache
   functionality lived in mpool components. This meant that it was
   impossible to add a new memory pool to use with rdma networks
   (ugni, openib, etc) without duplicating the functionality of an
   existing mpool component. All the registration functionality has
   been removed from the mpool and placed in the rcache framework.

 - All registration cache mpools components (udreg, grdma, gpusm,
   rgpusm) have been changed to rcache components. rcaches are
   allocated and released in the same way mpool components were.

 - It is now valid to pass NULL as the resources argument when
   creating an rcache. At this time the gpusm and rgpusm components
   support this. All other rcache components require non-NULL
   resources.

 - A new mpool component has been added: hugepage. This component
   supports huge page allocations on linux.

 - Memory pools are now allocated using "hints". Each mpool component
   is queried with the hints and returns a priority. The current hints
   supported are NULL (uses posix_memalign/malloc), page_size=x (huge
   page mpool), and mpool=x.

 - The sm mpool has been moved to common/sm. This reflects that the sm
   mpool is specialized and not meant for any general
   allocations. This mpool may be moved back into the mpool framework
   if there is any objection.

 - The opal_free_list_init arguments have been updated. The unused0
   argument is now used to pass in the registration cache module. The
   mpool registration flags are now rcache registration flags.

 - All components have been updated to make use of the new framework
   interfaces.

As this commit makes significant changes to both the mpool and rcache
frameworks both versions have been bumped to 3.0.0.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
2016-03-14 10:50:41 -06:00

566 строки
18 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2013 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
*
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal/util/output.h"
#include "rcache_base_vma_tree.h"
/* Class instance for the entries stored on a VMA item's reg_list. It derives
 * from opal_list_item_t and registers no constructor or destructor. */
OBJ_CLASS_INSTANCE(mca_rcache_base_vma_reg_list_item_t, opal_list_item_t, NULL, NULL);
/* Constructor for a VMA item: sets up the (initially empty) list that holds
 * the registrations attached to this item. */
static void mca_rcache_base_vma_item_construct (mca_rcache_base_vma_item_t *vma_item)
{
OBJ_CONSTRUCT(&vma_item->reg_list, opal_list_t);
}
/* Destructor for a VMA item: tears down the item's registration list.
 * NOTE(review): OPAL_LIST_DESTRUCT presumably also releases any entries
 * still on the list -- confirm against opal_list.h. */
static void mca_rcache_base_vma_item_destruct (mca_rcache_base_vma_item_t *vma_item)
{
OPAL_LIST_DESTRUCT(&vma_item->reg_list);
}
/* Class instance for VMA items. It derives from opal_list_item_t so items can
 * be linked on a module's vma_list, and uses the constructor/destructor
 * pair above to manage the per-item registration list. */
OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_list_item_t,
mca_rcache_base_vma_item_construct,
mca_rcache_base_vma_item_destruct);
/**
* Function for the red black tree to compare 2 keys
*
* @param key1 a pointer to the 1st key
* @param key2 a pointer to the second key
*
* @retval -1 if key1 is below key2
* @retval 1 if key 1 is above key2
* @retval 0 if the keys are the same
*/
static int mca_rcache_base_vma_tree_node_compare(void *key1, void *key2)
{
mca_rcache_base_vma_item_t *vma1 = (mca_rcache_base_vma_item_t *) key1,
*vma2 = (mca_rcache_base_vma_item_t *) key2;
if (vma1->start < vma2->start) {
return -1;
}
if (vma1->start > vma2->start) {
return 1;
}
return 0;
}
/**
 * Lookup callback: positions an address relative to a VMA item.
 *
 * @param key1 the address being searched for, passed by value as a pointer
 * @param key2 pointer to the VMA item to test against
 *
 * @retval 1 if the address lies above the item's end
 * @retval 0 if the address lies within [start, end] of the item
 * @retval -1 if the address lies below the item's start
 */
static int mca_rcache_base_vma_tree_node_compare_search(void *key1, void *key2)
{
    const uintptr_t target = (uintptr_t) key1;
    const mca_rcache_base_vma_item_t *node = (mca_rcache_base_vma_item_t *) key2;

    if (target > node->end) {
        return 1;
    }

    return (target >= node->start) ? 0 : -1;
}
static int mca_rcache_base_vma_tree_node_compare_closest(void *key1, void *key2)
{
mca_rcache_base_vma_item_t *vma = (mca_rcache_base_vma_item_t *) key2, *prev_vma;
uintptr_t addr = (uintptr_t) key1;
if (vma->end < addr) {
return 1;
}
if (vma->start <= addr) {
return 0;
}
prev_vma = (mca_rcache_base_vma_item_t *) opal_list_get_prev (&vma->super);
if (prev_vma == (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma->vma_module->vma_list)
|| prev_vma->end < addr) {
return 0;
}
return -1;
}
/**
 * Allocate a VMA item covering [start, end] and register it in the module's
 * red-black tree. The item is NOT linked onto vma_list here; callers do that.
 *
 * @param vma_module module that owns the tree; also stored in the new item
 * @param start first address covered by the item
 * @param end last address covered by the item
 *
 * @returns the new item, or NULL if the allocation or the tree insertion
 *          failed (in which case nothing is leaked and the tree is unchanged)
 */
static inline
mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_t *vma_module,
                                                     uintptr_t start, uintptr_t end)
{
    mca_rcache_base_vma_item_t *vma_item = OBJ_NEW(mca_rcache_base_vma_item_t);

    if (NULL == vma_item) {
        return NULL;
    }

    vma_item->start = start;
    vma_item->end = end;
    vma_item->vma_module = vma_module;

    /* an item that is missing from the tree can never be found again by the
     * lookup functions, so treat a failed insertion like a failed allocation */
    if (OPAL_SUCCESS != opal_rb_tree_insert (&vma_module->rb_tree, vma_item, vma_item)) {
        OBJ_RELEASE(vma_item);
        return NULL;
    }

    return vma_item;
}
/**
 * Rank two registrations for ordering within a VMA item's registration list.
 *
 * Persistent registrations rank above non-persistent ones; within the same
 * persistence class the registration with the higher bound ranks higher, and
 * the registration's own address breaks remaining ties.
 *
 * The result is computed with explicit comparisons rather than by narrowing
 * a pointer difference to int, which could yield the wrong sign (or 0) when
 * the two addresses are more than INT_MAX apart.
 *
 * @param reg1 first registration
 * @param reg2 second registration
 *
 * @retval 1 if reg1 ranks above reg2
 * @retval -1 if reg1 ranks below reg2
 * @retval 0 if reg1 and reg2 are the same registration
 */
static inline int mca_rcache_base_vma_compare_regs (mca_rcache_base_registration_t *reg1,
                                                    mca_rcache_base_registration_t *reg2)
{
    const int persist1 = !!(reg1->flags & MCA_RCACHE_FLAGS_PERSIST);
    const int persist2 = !!(reg2->flags & MCA_RCACHE_FLAGS_PERSIST);

    /* persistent registrations are on top */
    if (persist1 != persist2) {
        return persist1 ? 1 : -1;
    }

    if (reg1->bound != reg2->bound) {
        return ((uintptr_t) reg1->bound > (uintptr_t) reg2->bound) ? 1 : -1;
    }

    /* tie breaker */
    if (reg1 == reg2) {
        return 0;
    }

    return ((uintptr_t) reg1 > (uintptr_t) reg2) ? 1 : -1;
}
/**
 * Attach a registration to a VMA item, keeping the item's registration list
 * ordered by mca_rcache_base_vma_compare_regs (highest ranked first).
 *
 * @param vma_item item whose reg_list receives the new entry
 * @param reg registration to attach
 *
 * @retval 0 on success
 * @retval -1 if the list entry could not be allocated
 */
static inline int mca_rcache_base_vma_add_reg (mca_rcache_base_vma_item_t *vma_item,
                                               struct mca_rcache_base_registration_t *reg)
{
    mca_rcache_base_vma_reg_list_item_t *cursor;
    mca_rcache_base_vma_reg_list_item_t *new_entry = OBJ_NEW(mca_rcache_base_vma_reg_list_item_t);

    if (NULL == new_entry) {
        return -1;
    }

    new_entry->reg = reg;

    OPAL_LIST_FOREACH(cursor, &vma_item->reg_list, mca_rcache_base_vma_reg_list_item_t) {
        /* stop at the first existing entry that does not outrank the new one */
        if (mca_rcache_base_vma_compare_regs (cursor->reg, reg) <= 0) {
            opal_list_insert_pos (&vma_item->reg_list, &cursor->super, &new_entry->super);
            return 0;
        }
    }

    /* every existing entry outranks the new one (or the list is empty) */
    opal_list_append (&vma_item->reg_list, &new_entry->super);

    return 0;
}
/**
 * Detach a registration from a VMA item. Only the first list entry that
 * refers to the registration is removed and released; if none refers to it
 * the list is left untouched.
 *
 * @param vma_item item whose reg_list is searched
 * @param reg registration to detach
 */
static inline void mca_rcache_base_vma_remove_reg (mca_rcache_base_vma_item_t *vma_item,
                                                   struct mca_rcache_base_registration_t *reg)
{
    mca_rcache_base_vma_reg_list_item_t *cursor;

    OPAL_LIST_FOREACH(cursor, &vma_item->reg_list, mca_rcache_base_vma_reg_list_item_t) {
        if (cursor->reg != reg) {
            continue;
        }

        opal_list_remove_item (&vma_item->reg_list, &cursor->super);
        OBJ_RELEASE(cursor);
        break;
    }
}
/**
 * Append a copy of every entry of one VMA item's registration list to
 * another item's list, preserving order. The registrations themselves are
 * shared, only the list entries are duplicated.
 *
 * @param to item receiving the copied entries
 * @param from item whose entries are copied
 *
 * @retval OPAL_SUCCESS all entries were copied
 * @retval OPAL_ERR_OUT_OF_RESOURCE an entry could not be allocated; entries
 *         copied before the failure remain on the destination list
 */
static inline int mca_rcache_base_vma_copy_reg_list (mca_rcache_base_vma_item_t *to,
                                                     mca_rcache_base_vma_item_t *from)
{
    mca_rcache_base_vma_reg_list_item_t *src;

    OPAL_LIST_FOREACH(src, &from->reg_list, mca_rcache_base_vma_reg_list_item_t) {
        mca_rcache_base_vma_reg_list_item_t *dup = OBJ_NEW(mca_rcache_base_vma_reg_list_item_t);

        if (NULL == dup) {
            return OPAL_ERR_OUT_OF_RESOURCE;
        }

        dup->reg = src->reg;
        opal_list_append (&to->reg_list, &dup->super);
    }

    return OPAL_SUCCESS;
}
/* Returns 1 iff both VMA items are non-NULL and their registration lists
 * hold the same registrations in the same order; returns 0 otherwise. */
static inline int mca_rcache_base_vma_compare_reg_lists (mca_rcache_base_vma_item_t *vma1,
                                                         mca_rcache_base_vma_item_t *vma2)
{
    mca_rcache_base_vma_reg_list_item_t *lhs, *rhs;

    if (NULL == vma1 || NULL == vma2) {
        return 0;
    }

    if (opal_list_get_size (&vma1->reg_list) != opal_list_get_size (&vma2->reg_list)) {
        return 0;
    }

    /* walk both lists in lock step */
    rhs = (mca_rcache_base_vma_reg_list_item_t *) opal_list_get_first(&vma2->reg_list);

    OPAL_LIST_FOREACH(lhs, &vma1->reg_list, mca_rcache_base_vma_reg_list_item_t) {
        if ((void *) rhs == (void *) opal_list_get_end (&vma2->reg_list)) {
            return 0;
        }

        if (lhs->reg != rhs->reg) {
            return 0;
        }

        rhs = (mca_rcache_base_vma_reg_list_item_t *) opal_list_get_next (&rhs->super);
    }

    return 1;
}
/**
 * Initialise the VMA tree state of a module: zeroes the cached byte count,
 * constructs the red-black tree and the VMA list, and installs the
 * start-address comparator on the tree.
 *
 * @param vma_module module to initialise
 *
 * @returns the status reported by opal_rb_tree_init
 */
int mca_rcache_base_vma_tree_init (mca_rcache_base_vma_module_t *vma_module)
{
    int rc;

    vma_module->reg_cur_cache_size = 0;

    OBJ_CONSTRUCT(&vma_module->rb_tree, opal_rb_tree_t);
    OBJ_CONSTRUCT(&vma_module->vma_list, opal_list_t);

    rc = opal_rb_tree_init (&vma_module->rb_tree, mca_rcache_base_vma_tree_node_compare);

    return rc;
}
/*
 * Tear down the VMA tree state of a module by destructing its vma_list and
 * rb_tree members.
 *
 * NOTE(review): opal_rb_tree_init is called again immediately before the
 * destructors run. Presumably this resets the tree to an empty state so the
 * destructor has nothing left to walk -- confirm against opal_rb_tree.c.
 * NOTE(review): nothing here explicitly releases VMA items that are still
 * linked on vma_list -- TODO confirm callers empty the list first.
 */
void mca_rcache_base_vma_tree_finalize (mca_rcache_base_vma_module_t *vma_module)
{
opal_rb_tree_init(&vma_module->rb_tree, mca_rcache_base_vma_tree_node_compare);
OBJ_DESTRUCT(&vma_module->vma_list);
OBJ_DESTRUCT(&vma_module->rb_tree);
}
/**
 * Find a valid registration that covers [base, bound].
 *
 * The VMA item containing base is looked up first; its registration list is
 * then scanned in order. Entries flagged invalid are skipped. The first
 * remaining entry whose bound reaches the requested bound is returned. The
 * scan gives up at the first non-persistent entry that is too short.
 *
 * @param vma_module module whose tree is searched
 * @param base first address of the requested range
 * @param bound last address of the requested range
 *
 * @returns the matching registration, or NULL if there is none
 */
mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_vma_module_t *vma_module,
                                                               unsigned char *base, unsigned char *bound)
{
    mca_rcache_base_vma_reg_list_item_t *entry;
    mca_rcache_base_vma_item_t *hit;

    hit = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
                                                                 mca_rcache_base_vma_tree_node_compare_search);
    if (NULL == hit) {
        return NULL;
    }

    OPAL_LIST_FOREACH(entry, &hit->reg_list, mca_rcache_base_vma_reg_list_item_t) {
        mca_rcache_base_registration_t *candidate = entry->reg;

        if (candidate->flags & MCA_RCACHE_FLAGS_INVALID) {
            continue;
        }

        if (candidate->bound >= bound) {
            return candidate;
        }

        if (!(candidate->flags & MCA_RCACHE_FLAGS_PERSIST)) {
            break;
        }
    }

    return NULL;
}
/* Linear search: returns true iff p is one of the first cnt entries of regs. */
static inline bool is_reg_in_array (mca_rcache_base_registration_t **regs,
                                    int cnt, mca_rcache_base_registration_t *p)
{
    int idx = 0;

    while (idx < cnt) {
        if (p == regs[idx]) {
            return true;
        }
        ++idx;
    }

    return false;
}
/**
 * Collect the valid registrations attached to any VMA item that overlaps
 * [base, bound]. Each registration is reported at most once, and entries
 * flagged invalid are skipped.
 *
 * @param vma_module module whose tree is searched
 * @param base first address of the requested range
 * @param bound last address of the requested range
 * @param regs output array receiving the registrations
 * @param reg_cnt capacity of regs
 *
 * @returns the number of registrations stored in regs (at most reg_cnt);
 *          0 if regs is NULL or reg_cnt is not positive
 */
int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module, unsigned char *base,
                                       unsigned char *bound, mca_rcache_base_registration_t **regs,
                                       int reg_cnt)
{
    int cnt = 0;

    /* without room for at least one result the store below would overrun
     * regs, because the "array full" test only runs after a store */
    if (NULL == regs || reg_cnt <= 0) {
        return 0;
    }

    if (0 == opal_list_get_size(&vma_module->vma_list)) {
        return cnt;
    }

    do {
        mca_rcache_base_vma_item_t *vma;
        mca_rcache_base_vma_reg_list_item_t *vma_item;

        vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
                                                                     mca_rcache_base_vma_tree_node_compare_closest);
        if (NULL == vma) {
            /* base is bigger than any registered memory */
            break;
        }

        if (base < (unsigned char *) vma->start) {
            /* base fell into a gap: jump to the next item and re-check the range */
            base = (unsigned char *) vma->start;
            continue;
        }

        OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
            if ((vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID) ||
                is_reg_in_array (regs, cnt, vma_item->reg)) {
                continue;
            }

            regs[cnt++] = vma_item->reg;
            if (cnt == reg_cnt) {
                return cnt; /* no space left in the provided array */
            }
        }

        base = (unsigned char *) vma->end + 1;
    } while (bound >= base);

    return cnt;
}
/* Returns non-zero iff nbytes more bytes may be cached: either no limit is
 * configured (limit == 0) or the new total stays within the limit. */
static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
{
    if (0 == limit) {
        return 1;
    }

    return (vma_module->reg_cur_cache_size + nbytes <= limit);
}
/* Adjust the module's running count of cached bytes by nbytes.
 * nbytes is unsigned; mca_rcache_base_vma_tree_delete passes the negated size
 * (start - end - 1) so that the addition wraps around into a subtraction. */
static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_module_t *vma_module,
size_t nbytes)
{
vma_module->reg_cur_cache_size += nbytes;
}
/**
 * Insert a registration into the VMA tree.
 *
 * Walks the address range [reg->base, reg->bound] from low to high. Gaps not
 * covered by any existing VMA item get a new item; existing items that only
 * partially overlap the range are split at the range boundaries so that the
 * registration is attached to exactly the items it covers.
 *
 * @param vma_module VMA module whose rb_tree and vma_list are updated
 * @param reg registration to insert
 * @param limit upper bound on reg_cur_cache_size when new items are created;
 * 0 disables the check
 *
 * @retval OPAL_SUCCESS the whole range was processed
 * @retval OPAL_ERR_TEMP_OUT_OF_RESOURCE a new item could not be created; the
 * partially inserted registration is removed again before returning
 *
 * NOTE(review): the return values of mca_rcache_base_vma_add_reg and
 * mca_rcache_base_vma_copy_reg_list are ignored below although both can fail.
 */
int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg, size_t limit)
{
mca_rcache_base_vma_item_t *i;
uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound;
/* start at the item containing begin, or the first item above it */
i = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree,
(void *) begin, mca_rcache_base_vma_tree_node_compare_closest);
if (!i) {
/* nothing at or above begin: use the list sentinel as the cursor */
i = (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list);
}
/* begin advances past each processed piece until the range is consumed */
while (begin <= end) {
mca_rcache_base_vma_item_t *vma = NULL;
/* case 1: no more existing items -- the rest of the range is one new item */
if (opal_list_get_end (&vma_module->vma_list) == &i->super) {
if (mca_rcache_base_vma_can_insert (vma_module, end - begin + 1, limit)) {
vma = mca_rcache_base_vma_new(vma_module, begin, end);
}
/* vma is still NULL if the limit check or the allocation failed */
if (!vma) {
goto remove;
}
mca_rcache_base_vma_update_byte_count (vma_module, end - begin + 1);
opal_list_append(&vma_module->vma_list, &vma->super);
begin = vma->end + 1;
mca_rcache_base_vma_add_reg (vma, reg);
return OPAL_SUCCESS;
}
/* case 2: a gap precedes the current item -- fill it with a new item that
 * ends just before the current item or at the end of the range */
if (i->start > begin) {
uintptr_t tend = (i->start <= end) ? (i->start - 1) : end;
if (mca_rcache_base_vma_can_insert(vma_module, tend - begin + 1, limit)) {
vma = mca_rcache_base_vma_new(vma_module, begin, tend);
}
if (!vma) {
goto remove;
}
mca_rcache_base_vma_update_byte_count (vma_module, tend - begin + 1);
/* insert before */
opal_list_insert_pos(&vma_module->vma_list, &i->super, &vma->super);
i = vma;
begin = vma->end + 1;
mca_rcache_base_vma_add_reg (vma, reg);
/* case 3: the current item starts exactly at begin */
} else if(i->start == begin) {
if (i->end > end) {
/* the item extends past the range: split off the tail [end + 1, i->end]
 * with a copy of the current registrations, then attach reg to the head */
vma = mca_rcache_base_vma_new (vma_module, end + 1, i->end);
if (!vma) {
goto remove;
}
i->end = end;
mca_rcache_base_vma_copy_reg_list (vma, i);
/* add after */
opal_list_insert_pos (&vma_module->vma_list,
opal_list_get_next (&i->super),
&vma->super);
mca_rcache_base_vma_add_reg (i, reg);
begin = end + 1;
} else {
/* the item lies entirely inside the range: attach reg and move on */
mca_rcache_base_vma_add_reg(i, reg);
begin = i->end + 1;
}
/* case 4: begin falls inside the current item -- split it at begin; the new
 * tail item starts at begin and is handled by case 3 on the next iteration */
} else {
vma = mca_rcache_base_vma_new (vma_module, begin, i->end);
if (!vma) {
goto remove;
}
i->end = begin - 1;
mca_rcache_base_vma_copy_reg_list (vma, i);
/* add after */
opal_list_insert_pos (&vma_module->vma_list,
opal_list_get_next (&i->super),
&vma->super);
}
i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super);
}
return OPAL_SUCCESS;
remove:
/* undo: detach reg from every item it was already attached to */
mca_rcache_base_vma_tree_delete (vma_module, reg);
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
}
/**
 * Remove a registration from every VMA item it is attached to, without
 * freeing the registration itself. Items left without registrations are
 * deleted; remaining items are merged with adjacent neighbours that carry an
 * identical registration list.
 *
 * @param vma_module VMA module whose rb_tree and vma_list are updated
 * @param reg registration to remove
 *
 * @retval 0 once the range has been processed
 * @retval OPAL_ERROR if no VMA item contains reg->base
 */
int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg)
{
mca_rcache_base_vma_item_t *vma;
vma = (mca_rcache_base_vma_item_t *)
opal_rb_tree_find_with (&vma_module->rb_tree, reg->base,
mca_rcache_base_vma_tree_node_compare_search);
if (!vma) {
return OPAL_ERROR;
}
/* visit every item that starts at or below reg->bound */
while (vma != (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list)
&& vma->start <= (uintptr_t) reg->bound) {
mca_rcache_base_vma_remove_reg(vma, reg);
if(opal_list_is_empty(&vma->reg_list)) {
/* no registration left on this item: drop it from the tree and the list */
mca_rcache_base_vma_item_t *next =
(mca_rcache_base_vma_item_t *) opal_list_get_next (&vma->super);
opal_rb_tree_delete (&vma_module->rb_tree, vma);
/* start - end - 1 is the negated item size; the unsigned addition in
 * update_byte_count therefore subtracts the size from the byte count */
mca_rcache_base_vma_update_byte_count (vma_module,
vma->start - vma->end - 1);
opal_list_remove_item (&vma_module->vma_list, &vma->super);
OBJ_RELEASE(vma);
vma = next;
} else {
/* the item survives: fold it together with adjacent neighbours whose
 * registration lists are now identical, repeating until nothing merges */
int merged;
do {
mca_rcache_base_vma_item_t *prev = NULL, *next = NULL;
if (opal_list_get_first (&vma_module->vma_list) != &vma->super) {
prev = (mca_rcache_base_vma_item_t *) opal_list_get_prev(vma);
}
merged = 0;
/* merge into the left neighbour: prev absorbs vma's range */
if (prev && vma->start == prev->end + 1 &&
mca_rcache_base_vma_compare_reg_lists(vma, prev)) {
prev->end = vma->end;
opal_list_remove_item(&vma_module->vma_list, &vma->super);
opal_rb_tree_delete(&vma_module->rb_tree, vma);
OBJ_RELEASE(vma);
vma = prev;
merged = 1;
}
if (opal_list_get_last (&vma_module->vma_list) != &vma->super) {
next = (mca_rcache_base_vma_item_t *) opal_list_get_next (vma);
}
/* merge the right neighbour in: vma absorbs next's range */
if (next && vma->end + 1 == next->start &&
mca_rcache_base_vma_compare_reg_lists (vma, next)) {
vma->end = next->end;
opal_list_remove_item(&vma_module->vma_list, &next->super);
opal_rb_tree_delete(&vma_module->rb_tree, next);
OBJ_RELEASE(next);
merged = 1;
}
} while (merged);
vma = (mca_rcache_base_vma_item_t *) opal_list_get_next (vma);
}
}
return 0;
}
/**
 * Dump out rcache entries within a range of memory. Useful for debugging.
 *
 * Every VMA item overlapping [base, base + size - 1] is printed through
 * opal_output, followed by the registrations attached to it.
 *
 * @param vma_module module whose tree is dumped
 * @param base first address of the range
 * @param size length of the range in bytes
 * @param msg optional label appended to the header line; NULL prints nothing
 */
void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_module,
                                          unsigned char *base, size_t size, char *msg)
{
    unsigned char *bound = base + size - 1;
    mca_rcache_base_registration_t *reg;

    if (NULL == msg) {
        msg = "";
    }

    opal_output(0, "Dumping rcache entries: %s", msg);

    if (opal_list_is_empty(&vma_module->vma_list)) {
        opal_output(0, " rcache is empty");
        return;
    }

    do {
        mca_rcache_base_vma_item_t *vma;
        mca_rcache_base_vma_reg_list_item_t *vma_item;

        vma = (mca_rcache_base_vma_item_t *)
            opal_rb_tree_find_with (&vma_module->rb_tree, base,
                                    mca_rcache_base_vma_tree_node_compare_closest);
        if (NULL == vma) {
            /* base is bigger than any registered memory */
            break;
        }

        if (base < (unsigned char *) vma->start) {
            /* base fell into a gap: jump to the next item and re-check the range */
            base = (unsigned char *) vma->start;
            continue;
        }

        /* the casts make each argument match its conversion specifier:
         * %p expects void * and %lu expects unsigned long */
        opal_output(0, " vma: base=%p, bound=%p, size=%lu, number of registrations=%d",
                    (void *) vma->start, (void *) vma->end,
                    (unsigned long) (vma->end - vma->start + 1),
                    (int) opal_list_get_size(&vma->reg_list));

        OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
            reg = vma_item->reg;
            opal_output(0, " reg: base=%p, bound=%p, ref_count=%d, flags=0x%x",
                        (void *) reg->base, (void *) reg->bound, reg->ref_count, reg->flags);
        }

        base = (unsigned char *) vma->end + 1;
    } while (bound >= base);
}