From e6b6f87f08345e8e14d168fb59e9316742137fe9 Mon Sep 17 00:00:00 2001 From: Richard Braun Date: Fri, 7 Dec 2012 20:29:45 +0100 Subject: kern/slab: rework buffer-to-slab lookup Instead of using a red-black tree, rely on the VM system to store kmem specific private data. --- kern/slab.c | 142 +++++++++++++++++++++++++++++------------------------------- kern/slab.h | 7 +-- 2 files changed, 70 insertions(+), 79 deletions(-) diff --git a/kern/slab.c b/kern/slab.c index 9ccbb43..cc0c8a8 100644 --- a/kern/slab.c +++ b/kern/slab.c @@ -52,21 +52,15 @@ * differences are outlined below. * * The per-cache self-scaling hash table for buffer-to-bufctl conversion, - * described in 3.2.3 "Slab Layout for Large Objects", has been replaced by - * a red-black tree storing slabs, sorted by address. The use of a - * self-balancing tree for buffer-to-slab conversions provides a few advantages - * over a hash table. Unlike a hash table, a BST provides a "lookup nearest" - * operation, so obtaining the slab data (whether it is embedded in the slab or - * off slab) from a buffer address simply consists of a "lookup nearest towards - * 0" tree search. Storing slabs instead of buffers also considerably reduces - * the number of elements to retain. Finally, a self-balancing tree is a true - * self-scaling data structure, whereas a hash table requires periodic - * maintenance and complete resizing, which is expensive. The only drawback is - * that releasing a buffer to the slab layer takes logarithmic time instead of - * constant time. But as the data set size is kept reasonable (because slabs - * are stored instead of buffers) and because the CPU pool layer services most - * requests, avoiding many accesses to the slab layer, it is considered an - * acceptable tradeoff. + * described in 3.2.3 "Slab Layout for Large Objects", has been replaced with + * a constant time buffer-to-slab lookup that relies on the VM system. When + * slabs are created, their backing virtual pages are mapped to physical pages + * that are never aliased by other virtual addresses (unless explicitely by + * other kernel code). The lookup operation provides the associated physical + * page descriptor which can store data private to this allocator. The main + * drawback of this method is that it needs to walk the low level page tables, + * but it's expected that these have already been cached by the CPU due to + * prior access by the allocator user, depending on the hardware properties. * * This implementation uses per-cpu pools of objects, which service most * allocation requests. These pools act as caches (but are named differently @@ -448,8 +442,7 @@ static struct kmem_slab * kmem_slab_create(struct kmem_cache *cache, slab = (struct kmem_slab *)(slab_buf + cache->slab_size) - 1; } - list_node_init(&slab->list_node); - rbtree_node_init(&slab->tree_node); + list_node_init(&slab->node); slab->nr_refs = 0; slab->first_free = NULL; slab->addr = slab_buf + color; @@ -521,33 +514,34 @@ static void kmem_slab_destroy(struct kmem_slab *slab, struct kmem_cache *cache) kmem_cache_free(&kmem_slab_cache, (vm_offset_t)slab); } -static inline int kmem_slab_use_tree(int flags) +static inline unsigned long +kmem_slab_buf(const struct kmem_slab *slab) { - return !(flags & KMEM_CF_DIRECT) || (flags & KMEM_CF_VERIFY); + return P2ALIGN((unsigned long)slab->addr, PAGE_SIZE); } -static inline int kmem_slab_cmp_lookup(const void *addr, - const struct rbtree_node *node) +static void +kmem_slab_vmref(struct kmem_slab *slab, size_t size) { - struct kmem_slab *slab; + struct vm_page *page; + unsigned long va, end; - slab = rbtree_entry(node, struct kmem_slab, tree_node); + va = kmem_slab_buf(slab); + end = va + size; - if (addr == slab->addr) - return 0; - else if (addr < slab->addr) - return -1; - else - return 1; + do { + page = vm_kmem_lookup_page(va); + assert(page != NULL); + assert(page->slab_priv == NULL); + page->slab_priv = slab; + va += PAGE_SIZE; + } while (va < end); } -static inline int kmem_slab_cmp_insert(const struct rbtree_node *a, - const struct rbtree_node *b) +static inline int +kmem_slab_lookup_needed(int flags) { - struct kmem_slab *slab; - - slab = rbtree_entry(a, struct kmem_slab, tree_node); - return kmem_slab_cmp_lookup(slab->addr, b); + return !(flags & KMEM_CF_DIRECT) || (flags & KMEM_CF_VERIFY); } #if SLAB_USE_CPU_POOLS @@ -788,7 +782,6 @@ void kmem_cache_init(struct kmem_cache *cache, const char *name, list_node_init(&cache->node); list_init(&cache->partial_slabs); list_init(&cache->free_slabs); - rbtree_init(&cache->active_slabs); cache->obj_size = obj_size; cache->align = align; cache->buf_size = buf_size; @@ -865,10 +858,13 @@ static int kmem_cache_grow(struct kmem_cache *cache) simple_lock(&cache->lock); if (slab != NULL) { - list_insert_head(&cache->free_slabs, &slab->list_node); + list_insert_head(&cache->free_slabs, &slab->node); cache->nr_bufs += cache->bufs_per_slab; cache->nr_slabs++; cache->nr_free_slabs++; + + if (kmem_slab_lookup_needed(cache->flags)) + kmem_slab_vmref(slab, cache->slab_size); } /* @@ -898,8 +894,8 @@ static void kmem_cache_reap(struct kmem_cache *cache) simple_unlock(&cache->lock); while (!list_empty(&dead_slabs)) { - slab = list_first_entry(&dead_slabs, struct kmem_slab, list_node); - list_remove(&slab->list_node); + slab = list_first_entry(&dead_slabs, struct kmem_slab, node); + list_remove(&slab->node); kmem_slab_destroy(slab, cache); nr_free_slabs--; } @@ -918,11 +914,9 @@ static void * kmem_cache_alloc_from_slab(struct kmem_cache *cache) union kmem_bufctl *bufctl; if (!list_empty(&cache->partial_slabs)) - slab = list_first_entry(&cache->partial_slabs, struct kmem_slab, - list_node); + slab = list_first_entry(&cache->partial_slabs, struct kmem_slab, node); else if (!list_empty(&cache->free_slabs)) - slab = list_first_entry(&cache->free_slabs, struct kmem_slab, - list_node); + slab = list_first_entry(&cache->free_slabs, struct kmem_slab, node); else return NULL; @@ -934,7 +928,7 @@ static void * kmem_cache_alloc_from_slab(struct kmem_cache *cache) if (slab->nr_refs == cache->bufs_per_slab) { /* The slab has become complete */ - list_remove(&slab->list_node); + list_remove(&slab->node); if (slab->nr_refs == 1) cache->nr_free_slabs--; @@ -943,15 +937,11 @@ static void * kmem_cache_alloc_from_slab(struct kmem_cache *cache) * The slab has become partial. Insert the new slab at the end of * the list to reduce fragmentation. */ - list_remove(&slab->list_node); - list_insert_tail(&cache->partial_slabs, &slab->list_node); + list_remove(&slab->node); + list_insert_tail(&cache->partial_slabs, &slab->node); cache->nr_free_slabs--; } - if ((slab->nr_refs == 1) && kmem_slab_use_tree(cache->flags)) - rbtree_insert(&cache->active_slabs, &slab->tree_node, - kmem_slab_cmp_insert); - return kmem_bufctl_to_buf(bufctl, cache); } @@ -970,14 +960,14 @@ static void kmem_cache_free_to_slab(struct kmem_cache *cache, void *buf) slab = (struct kmem_slab *)P2END((unsigned long)buf, cache->slab_size) - 1; } else { - struct rbtree_node *node; - - node = rbtree_lookup_nearest(&cache->active_slabs, buf, - kmem_slab_cmp_lookup, RBTREE_LEFT); - assert(node != NULL); - slab = rbtree_entry(node, struct kmem_slab, tree_node); - assert((unsigned long)buf < (P2ALIGN((unsigned long)slab->addr - + cache->slab_size, PAGE_SIZE))); + struct vm_page *page; + + page = vm_kmem_lookup_page((unsigned long)buf); + assert(page != NULL); + slab = page->slab_priv; + assert(slab != NULL); + assert((unsigned long)buf >= kmem_slab_buf(slab)); + assert((unsigned long)buf < (kmem_slab_buf(slab) + cache->slab_size)); } assert(slab->nr_refs >= 1); @@ -988,20 +978,23 @@ static void kmem_cache_free_to_slab(struct kmem_cache *cache, void *buf) slab->nr_refs--; cache->nr_objs--; + /* + * The slab has become free. + */ if (slab->nr_refs == 0) { - /* The slab has become free */ - - if (kmem_slab_use_tree(cache->flags)) - rbtree_remove(&cache->active_slabs, &slab->tree_node); - + /* + * The slab was partial. + */ if (cache->bufs_per_slab > 1) - list_remove(&slab->list_node); + list_remove(&slab->node); - list_insert_head(&cache->free_slabs, &slab->list_node); + list_insert_head(&cache->free_slabs, &slab->node); cache->nr_free_slabs++; } else if (slab->nr_refs == (cache->bufs_per_slab - 1)) { - /* The slab has become partial */ - list_insert_head(&cache->partial_slabs, &slab->list_node); + /* + * The slab has become partial. + */ + list_insert_head(&cache->partial_slabs, &slab->node); } } @@ -1105,22 +1098,23 @@ slab_alloc: static void kmem_cache_free_verify(struct kmem_cache *cache, void *buf) { - struct rbtree_node *node; struct kmem_buftag *buftag; struct kmem_slab *slab; union kmem_bufctl *bufctl; + struct vm_page *page; unsigned char *redzone_byte; unsigned long slabend; - simple_lock(&cache->lock); - node = rbtree_lookup_nearest(&cache->active_slabs, buf, - kmem_slab_cmp_lookup, RBTREE_LEFT); - simple_unlock(&cache->lock); + page = vm_page_lookup_pa(vm_page_direct_pa((unsigned long)buf)); + + if (page == NULL) + kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL); + + slab = page->slab_priv; - if (node == NULL) + if (slab == NULL) kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL); - slab = rbtree_entry(node, struct kmem_slab, tree_node); slabend = P2ALIGN((unsigned long)slab->addr + cache->slab_size, PAGE_SIZE); if ((unsigned long)buf >= slabend) diff --git a/kern/slab.h b/kern/slab.h index bbd9d41..64daa61 100644 --- a/kern/slab.h +++ b/kern/slab.h @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -120,8 +119,7 @@ struct kmem_buftag { * Page-aligned collection of unconstructed buffers. */ struct kmem_slab { - struct list list_node; - struct rbtree_node tree_node; + struct list node; unsigned long nr_refs; union kmem_bufctl *first_free; void *addr; @@ -162,7 +160,7 @@ typedef void (*kmem_slab_free_fn_t)(vm_offset_t, vm_size_t); #define KMEM_CF_NO_CPU_POOL 0x1 /* CPU pool layer disabled */ #define KMEM_CF_SLAB_EXTERNAL 0x2 /* Slab data is off slab */ #define KMEM_CF_VERIFY 0x4 /* Debugging facilities enabled */ -#define KMEM_CF_DIRECT 0x8 /* No buf-to-slab tree lookup */ +#define KMEM_CF_DIRECT 0x8 /* Quick buf-to-slab lookup */ /* * Cache of objects. @@ -185,7 +183,6 @@ struct kmem_cache { struct list node; /* Cache list linkage */ struct list partial_slabs; struct list free_slabs; - struct rbtree active_slabs; int flags; size_t bufctl_dist; /* Distance from buffer to bufctl */ size_t slab_size; -- cgit v1.2.3