Import the slab allocator

As it is intended to completely replace the zone allocator, remove it on the way. So long to the venerable code ! * Makefrag.am (libkernel_a_SOURCES): Add kern/slab.{c,h}, remove kern/kalloc.c and kern/zalloc.{c,h}. * configfrag.ac (SLAB_VERIFY, SLAB_USE_CPU_POOLS): Add defines. * i386/Makefrag.am (libkernel_a_SOURCES): Remove i386/i386/zalloc.h. * i386/configfrag.ac (CPU_L1_SHIFT): Remove define. * include/mach_debug/slab_info.h: New file. * kern/slab.c: Likewise. * kern/slab.h: Likewise. * i386/i386/zalloc.h: Remove file. * include/mach_debug/zone_info.h: Likewise. * kern/kalloc.c: Likewise. * kern/zalloc.c: Likewise. * kern/zalloc.h: Likewise.
author: Richard Braun <rbraun@sceen.net> 2011-12-13 20:27:56 +0000
committer: Richard Braun <rbraun@sceen.net> 2011-12-17 22:12:34 +0000
commit: 7bc54a622e0c57a1085cd2990a1deedc8bd4743d (patch)
tree: 0356aefb0a935c30d295a86cec2386d5197c4754 /kern
parent: d25bd66fe0bd4cddb18890390198c86b9e9b56b4 (diff)
5 files changed, 1798 insertions, 1397 deletions
diff --git a/kern/kalloc.c b/kern/kalloc.c
deleted file mode 100644
index 8256305..0000000
--- a/kern/kalloc.c
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University.
- * Copyright (c) 1993,1994 The University of Utah and
- * the Computer Systems Laboratory (CSL).
- * All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
- * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
- * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
- * THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- *	File:	kern/kalloc.c
- *	Author:	Avadis Tevanian, Jr.
- *	Date:	1985
- *
- *	General kernel memory allocator.  This allocator is designed
- *	to be used by the kernel to manage dynamic memory fast.
- */
-
-#include <mach/machine/vm_types.h>
-#include <mach/vm_param.h>
-
-#include <kern/debug.h>
-#include <kern/zalloc.h>
-#include <kern/kalloc.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_object.h>
-#include <vm/vm_map.h>
-
-
-
-vm_map_t kalloc_map;
-vm_size_t kalloc_map_size = 64 * 1024 * 1024;
-vm_size_t kalloc_max;
-
-/*
- *	All allocations of size less than kalloc_max are rounded to the
- *	next highest power of 2.  This allocator is built on top of
- *	the zone allocator.  A zone is created for each potential size
- *	that we are willing to get in small blocks.
- *
- *	We assume that kalloc_max is not greater than 64K;
- *	thus 16 is a safe array size for k_zone and k_zone_name.
- */
-
-int first_k_zone = -1;
-struct zone *k_zone[16];
-static char *k_zone_name[16] = {
-	"kalloc.1",		"kalloc.2",
-	"kalloc.4",		"kalloc.8",
-	"kalloc.16",		"kalloc.32",
-	"kalloc.64",		"kalloc.128",
-	"kalloc.256",		"kalloc.512",
-	"kalloc.1024",		"kalloc.2048",
-	"kalloc.4096",		"kalloc.8192",
-	"kalloc.16384",		"kalloc.32768"
-};
-
-/*
- *  Max number of elements per zone.  zinit rounds things up correctly
- *  Doing things this way permits each zone to have a different maximum size
- *  based on need, rather than just guessing; it also
- *  means its patchable in case you're wrong!
- */
-unsigned long k_zone_max[16] = {
-      1024,		/*      1 Byte  */
-      1024,		/*      2 Byte  */
-      1024,		/*      4 Byte  */
-      1024,		/*      8 Byte  */
-      1024,		/*     16 Byte  */
-      4096,		/*     32 Byte  */
-      4096,		/*     64 Byte  */
-      4096,		/*    128 Byte  */
-      4096,		/*    256 Byte  */
-      1024,		/*    512 Byte  */
-      1024,		/*   1024 Byte  */
-      1024,		/*   2048 Byte  */
-      1024,		/*   4096 Byte  */
-      4096,		/*   8192 Byte  */
-      64,		/*  16384 Byte  */
-      64,		/*  32768 Byte  */
-};
-
-/*
- *	Initialize the memory allocator.  This should be called only
- *	once on a system wide basis (i.e. first processor to get here
- *	does the initialization).
- *
- *	This initializes all of the zones.
- */
-
-#ifndef NDEBUG
-static int kalloc_init_called;
-#endif
-
-void kalloc_init()
-{
-	vm_offset_t min, max;
-	vm_size_t size;
-	register int i;
-
-	assert (! kalloc_init_called);
-
-	kalloc_map = kmem_suballoc(kernel_map, &min, &max,
-				   kalloc_map_size, FALSE);
-
-	/*
-	 *	Ensure that zones up to size 8192 bytes exist.
-	 *	This is desirable because messages are allocated
-	 *	with kalloc, and messages up through size 8192 are common.
-	 */
-
-	if (PAGE_SIZE < 16*1024)
-		kalloc_max = 16*1024;
-	else
-		kalloc_max = PAGE_SIZE;
-
-	/*
-	 *	Allocate a zone for each size we are going to handle.
-	 *	We specify non-paged memory.
-	 */
-	for (i = 0, size = 1; size < kalloc_max; i++, size <<= 1) {
-		if (size < MINSIZE) {
-			k_zone[i] = 0;
-			continue;
-		}
-		if (size == MINSIZE) {
-			first_k_zone = i;
-		}
-		k_zone[i] = zinit(size, 0, k_zone_max[i] * size, size,
-				  size >= PAGE_SIZE ? ZONE_COLLECTABLE : 0,
-				  k_zone_name[i]);
-	}
-
-#ifndef NDEBUG
-	kalloc_init_called = 1;
-#endif
-}
-
-vm_offset_t kalloc(size)
-	vm_size_t size;
-{
-	register int zindex;
-	register vm_size_t allocsize;
-	vm_offset_t addr;
-
-	/* compute the size of the block that we will actually allocate */
-
-	assert (kalloc_init_called);
-
-	allocsize = size;
-	if (size < kalloc_max) {
-		allocsize = MINSIZE;
-		zindex = first_k_zone;
-		while (allocsize < size) {
-			allocsize <<= 1;
-			zindex++;
-		}
-	}
-
-	/*
-	 * If our size is still small enough, check the queue for that size
-	 * and allocate.
-	 */
-
-	if (allocsize < kalloc_max) {
-		addr = zalloc(k_zone[zindex]);
-	} else {
-		if (kmem_alloc_wired(kalloc_map, &addr, allocsize)
-							!= KERN_SUCCESS)
-			addr = 0;
-	}
-	return(addr);
-}
-
-vm_offset_t kget(size)
-	vm_size_t size;
-{
-	register int zindex;
-	register vm_size_t allocsize;
-	vm_offset_t addr;
-
-	assert (kalloc_init_called);
-
-	/* compute the size of the block that we will actually allocate */
-
-	allocsize = size;
-	if (size < kalloc_max) {
-		allocsize = MINSIZE;
-		zindex = first_k_zone;
-		while (allocsize < size) {
-			allocsize <<= 1;
-			zindex++;
-		}
-	}
-
-	/*
-	 * If our size is still small enough, check the queue for that size
-	 * and allocate.
-	 */
-
-	if (allocsize < kalloc_max) {
-		addr = zget(k_zone[zindex]);
-	} else {
-		/* This will never work, so we might as well panic */
-		panic("kget");
-	}
-	return(addr);
-}
-
-void
-kfree(data, size)
-	vm_offset_t data;
-	vm_size_t size;
-{
-	register int zindex;
-	register vm_size_t freesize;
-
-	assert (kalloc_init_called);
-
-	freesize = size;
-	if (size < kalloc_max) {
-		freesize = MINSIZE;
-		zindex = first_k_zone;
-		while (freesize < size) {
-			freesize <<= 1;
-			zindex++;
-		}
-	}
-
-	if (freesize < kalloc_max) {
-		zfree(k_zone[zindex], data);
-	} else {
-		kmem_free(kalloc_map, data, freesize);
-	}
-}
diff --git a/kern/slab.c b/kern/slab.c
new file mode 100644
index 0000000..38413e8
--- /dev/null
+++ b/kern/slab.c
@@ -0,0 +1,1576 @@
+/*
+ * Copyright (c) 2009, 2010, 2011 Richard Braun.
+ * Copyright (c) 2011 Maksym Planeta.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * Object caching and general purpose memory allocator.
+ *
+ * This allocator is based on the paper "The Slab Allocator: An Object-Caching
+ * Kernel Memory Allocator" by Jeff Bonwick.
+ *
+ * It allows the allocation of objects (i.e. fixed-size typed buffers) from
+ * caches and is efficient in both space and time. This implementation follows
+ * many of the indications from the paper mentioned. The most notable
+ * differences are outlined below.
+ *
+ * The per-cache self-scaling hash table for buffer-to-bufctl conversion,
+ * described in 3.2.3 "Slab Layout for Large Objects", has been replaced by
+ * a red-black tree storing slabs, sorted by address. The use of a
+ * self-balancing tree for buffer-to-slab conversions provides a few advantages
+ * over a hash table. Unlike a hash table, a BST provides a "lookup nearest"
+ * operation, so obtaining the slab data (whether it is embedded in the slab or
+ * off slab) from a buffer address simply consists of a "lookup nearest towards
+ * 0" tree search. Storing slabs instead of buffers also considerably reduces
+ * the number of elements to retain. Finally, a self-balancing tree is a true
+ * self-scaling data structure, whereas a hash table requires periodic
+ * maintenance and complete resizing, which is expensive. The only drawback is
+ * that releasing a buffer to the slab layer takes logarithmic time instead of
+ * constant time. But as the data set size is kept reasonable (because slabs
+ * are stored instead of buffers) and because the CPU pool layer services most
+ * requests, avoiding many accesses to the slab layer, it is considered an
+ * acceptable tradeoff.
+ *
+ * This implementation uses per-cpu pools of objects, which service most
+ * allocation requests. These pools act as caches (but are named differently
+ * to avoid confusion with CPU caches) that reduce contention on multiprocessor
+ * systems. When a pool is empty and cannot provide an object, it is filled by
+ * transferring multiple objects from the slab layer. The symmetric case is
+ * handled likewise.
+ */
+
+#include <string.h>
+#include <kern/assert.h>
+#include <kern/mach_clock.h>
+#include <kern/printf.h>
+#include <kern/slab.h>
+#include <kern/kalloc.h>
+#include <kern/cpu_number.h>
+#include <mach/vm_param.h>
+#include <mach/machine/vm_types.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_types.h>
+#include <sys/types.h>
+
+#ifdef MACH_DEBUG
+#include <mach_debug/slab_info.h>
+#endif
+
+/*
+ * Utility macros.
+ */
+#define ARRAY_SIZE(x)   (sizeof(x) / sizeof((x)[0]))
+#define P2ALIGNED(x, a) (((x) & ((a) - 1)) == 0)
+#define ISP2(x)         P2ALIGNED(x, x)
+#define P2ALIGN(x, a)   ((x) & -(a))
+#define P2ROUND(x, a)   (-(-(x) & -(a)))
+#define P2END(x, a)     (-(~(x) & -(a)))
+#define likely(expr)    __builtin_expect(!!(expr), 1)
+#define unlikely(expr)  __builtin_expect(!!(expr), 0)
+
+/*
+ * Minimum required alignment.
+ */
+#define KMEM_ALIGN_MIN 8
+
+/*
+ * Minimum number of buffers per slab.
+ *
+ * This value is ignored when the slab size exceeds a threshold.
+ */
+#define KMEM_MIN_BUFS_PER_SLAB 8
+
+/*
+ * Special slab size beyond which the minimum number of buffers per slab is
+ * ignored when computing the slab size of a cache.
+ */
+#define KMEM_SLAB_SIZE_THRESHOLD (8 * PAGE_SIZE)
+
+/*
+ * Special buffer size under which slab data is unconditionnally allocated
+ * from its associated slab.
+ */
+#define KMEM_BUF_SIZE_THRESHOLD (PAGE_SIZE / 8)
+
+/*
+ * Time (in seconds) between two garbage collection operations.
+ */
+#define KMEM_GC_INTERVAL (1 * hz)
+
+/*
+ * The transfer size of a CPU pool is computed by dividing the pool size by
+ * this value.
+ */
+#define KMEM_CPU_POOL_TRANSFER_RATIO 2
+
+/*
+ * Redzone guard word.
+ */
+#ifdef __LP64__
+#if _HOST_BIG_ENDIAN
+#define KMEM_REDZONE_WORD 0xfeedfacefeedfaceUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_REDZONE_WORD 0xcefaedfecefaedfeUL
+#endif /* _HOST_BIG_ENDIAN */
+#else /* __LP64__ */
+#if _HOST_BIG_ENDIAN
+#define KMEM_REDZONE_WORD 0xfeedfaceUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_REDZONE_WORD 0xcefaedfeUL
+#endif /* _HOST_BIG_ENDIAN */
+#endif /* __LP64__ */
+
+/*
+ * Redzone byte for padding.
+ */
+#define KMEM_REDZONE_BYTE 0xbb
+
+/*
+ * Size of the VM submap from which default backend functions allocate.
+ */
+#define KMEM_MAP_SIZE (64 * 1024 * 1024)
+
+/*
+ * Shift for the first kalloc cache size.
+ */
+#define KALLOC_FIRST_SHIFT 5
+
+/*
+ * Number of caches backing general purpose allocations.
+ */
+#define KALLOC_NR_CACHES 13
+
+/*
+ * Size of the VM submap for general purpose allocations.
+ */
+#define KALLOC_MAP_SIZE (64 * 1024 * 1024)
+
+/*
+ * Values the buftag state member can take.
+ */
+#ifdef __LP64__
+#if _HOST_BIG_ENDIAN
+#define KMEM_BUFTAG_ALLOC   0xa110c8eda110c8edUL
+#define KMEM_BUFTAG_FREE    0xf4eeb10cf4eeb10cUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_BUFTAG_ALLOC   0xedc810a1edc810a1UL
+#define KMEM_BUFTAG_FREE    0x0cb1eef40cb1eef4UL
+#endif /* _HOST_BIG_ENDIAN */
+#else /* __LP64__ */
+#if _HOST_BIG_ENDIAN
+#define KMEM_BUFTAG_ALLOC   0xa110c8edUL
+#define KMEM_BUFTAG_FREE    0xf4eeb10cUL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_BUFTAG_ALLOC   0xedc810a1UL
+#define KMEM_BUFTAG_FREE    0x0cb1eef4UL
+#endif /* _HOST_BIG_ENDIAN */
+#endif /* __LP64__ */
+
+/*
+ * Free and uninitialized patterns.
+ *
+ * These values are unconditionnally 64-bit wide since buffers are at least
+ * 8-byte aligned.
+ */
+#if _HOST_BIG_ENDIAN
+#define KMEM_FREE_PATTERN   0xdeadbeefdeadbeefULL
+#define KMEM_UNINIT_PATTERN 0xbaddcafebaddcafeULL
+#else /* _HOST_BIG_ENDIAN */
+#define KMEM_FREE_PATTERN   0xefbeaddeefbeaddeULL
+#define KMEM_UNINIT_PATTERN 0xfecaddbafecaddbaULL
+#endif /* _HOST_BIG_ENDIAN */
+
+/*
+ * Cache flags.
+ *
+ * The flags don't change once set and can be tested without locking.
+ */
+#define KMEM_CF_NO_CPU_POOL     0x01    /* CPU pool layer disabled */
+#define KMEM_CF_SLAB_EXTERNAL   0x02    /* Slab data is off slab */
+#define KMEM_CF_NO_RECLAIM      0x04    /* Slabs are not reclaimable */
+#define KMEM_CF_VERIFY          0x08    /* Debugging facilities enabled */
+#define KMEM_CF_DIRECT          0x10    /* No buf-to-slab tree lookup */
+
+/*
+ * Options for kmem_cache_alloc_verify().
+ */
+#define KMEM_AV_NOCONSTRUCT 0
+#define KMEM_AV_CONSTRUCT   1
+
+/*
+ * Error codes for kmem_cache_error().
+ */
+#define KMEM_ERR_INVALID    0   /* Invalid address being freed */
+#define KMEM_ERR_DOUBLEFREE 1   /* Freeing already free address */
+#define KMEM_ERR_BUFTAG     2   /* Invalid buftag content */
+#define KMEM_ERR_MODIFIED   3   /* Buffer modified while free */
+#define KMEM_ERR_REDZONE    4   /* Redzone violation */
+
+#if SLAB_USE_CPU_POOLS
+/*
+ * Available CPU pool types.
+ *
+ * For each entry, the CPU pool size applies from the entry buf_size
+ * (excluded) up to (and including) the buf_size of the preceding entry.
+ *
+ * See struct kmem_cpu_pool_type for a description of the values.
+ */
+static struct kmem_cpu_pool_type kmem_cpu_pool_types[] = {
+    {  32768,   1, 0,           NULL },
+    {   4096,   8, CPU_L1_SIZE, NULL },
+    {    256,  64, CPU_L1_SIZE, NULL },
+    {      0, 128, CPU_L1_SIZE, NULL }
+};
+
+/*
+ * Caches where CPU pool arrays are allocated from.
+ */
+static struct kmem_cache kmem_cpu_array_caches[ARRAY_SIZE(kmem_cpu_pool_types)];
+#endif /* SLAB_USE_CPU_POOLS */
+
+/*
+ * Cache for off slab data.
+ */
+static struct kmem_cache kmem_slab_cache;
+
+/*
+ * General purpose caches array.
+ */
+static struct kmem_cache kalloc_caches[KALLOC_NR_CACHES];
+
+/*
+ * List of all caches managed by the allocator.
+ */
+static struct list kmem_cache_list;
+static unsigned int kmem_nr_caches;
+static simple_lock_data_t __attribute__((used)) kmem_cache_list_lock;
+
+/*
+ * VM submap for slab caches (except general purpose allocations).
+ */
+static struct vm_map kmem_map_store;
+vm_map_t kmem_map = &kmem_map_store;
+
+/*
+ * VM submap for general purpose allocations.
+ */
+static struct vm_map kalloc_map_store;
+vm_map_t kalloc_map = &kalloc_map_store;
+
+/*
+ * Time of the last memory reclaim, in clock ticks.
+ */
+static unsigned int kmem_gc_last_tick;
+
+#define kmem_error(format, ...)                         \
+    printf("mem: error: %s(): " format "\n", __func__,  \
+           ## __VA_ARGS__)
+
+#define kmem_warn(format, ...)                              \
+    printf("mem: warning: %s(): " format "\n", __func__,    \
+           ## __VA_ARGS__)
+
+#define kmem_print(format, ...) \
+    printf(format "\n", ## __VA_ARGS__)
+
+static void kmem_cache_error(struct kmem_cache *cache, void *buf, int error,
+                             void *arg);
+static void * kmem_cache_alloc_from_slab(struct kmem_cache *cache);
+static void kmem_cache_free_to_slab(struct kmem_cache *cache, void *buf);
+
+static void * kmem_buf_verify_bytes(void *buf, void *pattern, size_t size)
+{
+    char *ptr, *pattern_ptr, *end;
+
+    end = buf + size;
+
+    for (ptr = buf, pattern_ptr = pattern; ptr < end; ptr++, pattern_ptr++)
+        if (*ptr != *pattern_ptr)
+            return ptr;
+
+    return NULL;
+}
+
+static void * kmem_buf_verify(void *buf, uint64_t pattern, vm_size_t size)
+{
+    uint64_t *ptr, *end;
+
+    assert(P2ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+    assert(P2ALIGNED(size, sizeof(uint64_t)));
+
+    end = buf + size;
+
+    for (ptr = buf; ptr < end; ptr++)
+        if (*ptr != pattern)
+            return kmem_buf_verify_bytes(ptr, &pattern, sizeof(pattern));
+
+    return NULL;
+}
+
+static void kmem_buf_fill(void *buf, uint64_t pattern, size_t size)
+{
+    uint64_t *ptr, *end;
+
+    assert(P2ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+    assert(P2ALIGNED(size, sizeof(uint64_t)));
+
+    end = buf + size;
+
+    for (ptr = buf; ptr < end; ptr++)
+        *ptr = pattern;
+}
+
+static void * kmem_buf_verify_fill(void *buf, uint64_t old, uint64_t new,
+                                   size_t size)
+{
+    uint64_t *ptr, *end;
+
+    assert(P2ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+    assert(P2ALIGNED(size, sizeof(uint64_t)));
+
+    end = buf + size;
+
+    for (ptr = buf; ptr < end; ptr++) {
+        if (*ptr != old)
+            return kmem_buf_verify_bytes(ptr, &old, sizeof(old));
+
+        *ptr = new;
+    }
+
+    return NULL;
+}
+
+static inline union kmem_bufctl *
+kmem_buf_to_bufctl(void *buf, struct kmem_cache *cache)
+{
+    return (union kmem_bufctl *)(buf + cache->bufctl_dist);
+}
+
+static inline struct kmem_buftag *
+kmem_buf_to_buftag(void *buf, struct kmem_cache *cache)
+{
+    return (struct kmem_buftag *)(buf + cache->buftag_dist);
+}
+
+static inline void * kmem_bufctl_to_buf(union kmem_bufctl *bufctl,
+                                        struct kmem_cache *cache)
+{
+    return (void *)bufctl - cache->bufctl_dist;
+}
+
+static vm_offset_t kmem_pagealloc(vm_size_t size)
+{
+    vm_offset_t addr;
+    kern_return_t kr;
+
+    kr = kmem_alloc_wired(kmem_map, &addr, size);
+
+    if (kr != KERN_SUCCESS)
+        return 0;
+
+    return addr;
+}
+
+static void kmem_pagefree(vm_offset_t ptr, vm_size_t size)
+{
+    kmem_free(kmem_map, ptr, size);
+}
+
+static void kmem_slab_create_verify(struct kmem_slab *slab,
+                                    struct kmem_cache *cache)
+{
+    struct kmem_buftag *buftag;
+    size_t buf_size;
+    unsigned long buffers;
+    void *buf;
+
+    buf_size = cache->buf_size;
+    buf = slab->addr;
+    buftag = kmem_buf_to_buftag(buf, cache);
+
+    for (buffers = cache->bufs_per_slab; buffers != 0; buffers--) {
+        kmem_buf_fill(buf, KMEM_FREE_PATTERN, cache->bufctl_dist);
+        buftag->state = KMEM_BUFTAG_FREE;
+        buf += buf_size;
+        buftag = kmem_buf_to_buftag(buf, cache);
+    }
+}
+
+/*
+ * Create an empty slab for a cache.
+ *
+ * The caller must drop all locks before calling this function.
+ */
+static struct kmem_slab * kmem_slab_create(struct kmem_cache *cache,
+                                           size_t color)
+{
+    struct kmem_slab *slab;
+    union kmem_bufctl *bufctl;
+    size_t buf_size;
+    unsigned long buffers;
+    void *slab_buf;
+
+    if (cache->slab_alloc_fn == NULL)
+        slab_buf = (void *)kmem_pagealloc(cache->slab_size);
+    else
+        slab_buf = (void *)cache->slab_alloc_fn(cache->slab_size);
+
+    if (slab_buf == NULL)
+        return NULL;
+
+    if (cache->flags & KMEM_CF_SLAB_EXTERNAL) {
+        assert(!(cache->flags & KMEM_CF_NO_RECLAIM));
+        slab = (struct kmem_slab *)kmem_cache_alloc(&kmem_slab_cache);
+
+        if (slab == NULL) {
+            if (cache->slab_free_fn == NULL)
+                kmem_pagefree((vm_offset_t)slab_buf, cache->slab_size);
+            else
+                cache->slab_free_fn((vm_offset_t)slab_buf, cache->slab_size);
+
+            return NULL;
+        }
+    } else {
+        slab = (struct kmem_slab *)(slab_buf + cache->slab_size) - 1;
+    }
+
+    list_node_init(&slab->list_node);
+    rbtree_node_init(&slab->tree_node);
+    slab->nr_refs = 0;
+    slab->first_free = NULL;
+    slab->addr = slab_buf + color;
+
+    buf_size = cache->buf_size;
+    bufctl = kmem_buf_to_bufctl(slab->addr, cache);
+
+    for (buffers = cache->bufs_per_slab; buffers != 0; buffers--) {
+        bufctl->next = slab->first_free;
+        slab->first_free = bufctl;
+        bufctl = (union kmem_bufctl *)((void *)bufctl + buf_size);
+    }
+
+    if (cache->flags & KMEM_CF_VERIFY)
+        kmem_slab_create_verify(slab, cache);
+
+    return slab;
+}
+
+static void kmem_slab_destroy_verify(struct kmem_slab *slab,
+                                     struct kmem_cache *cache)
+{
+    struct kmem_buftag *buftag;
+    size_t buf_size;
+    unsigned long buffers;
+    void *buf, *addr;
+
+    buf_size = cache->buf_size;
+    buf = slab->addr;
+    buftag = kmem_buf_to_buftag(buf, cache);
+
+    for (buffers = cache->bufs_per_slab; buffers != 0; buffers--) {
+        if (buftag->state != KMEM_BUFTAG_FREE)
+            kmem_cache_error(cache, buf, KMEM_ERR_BUFTAG, buftag);
+
+        addr = kmem_buf_verify(buf, KMEM_FREE_PATTERN, cache->bufctl_dist);
+
+        if (addr != NULL)
+            kmem_cache_error(cache, buf, KMEM_ERR_MODIFIED, addr);
+
+        buf += buf_size;
+        buftag = kmem_buf_to_buftag(buf, cache);
+    }
+}
+
+/*
+ * Destroy a slab.
+ *
+ * The caller must drop all locks before calling this function.
+ */
+static void kmem_slab_destroy(struct kmem_slab *slab, struct kmem_cache *cache)
+{
+    vm_offset_t slab_buf;
+
+    assert(slab->nr_refs == 0);
+    assert(slab->first_free != NULL);
+    assert(!(cache->flags & KMEM_CF_NO_RECLAIM));
+
+    if (cache->flags & KMEM_CF_VERIFY)
+        kmem_slab_destroy_verify(slab, cache);
+
+    slab_buf = (vm_offset_t)P2ALIGN((unsigned long)slab->addr, PAGE_SIZE);
+
+    if (cache->slab_free_fn == NULL)
+        kmem_pagefree(slab_buf, cache->slab_size);
+    else
+        cache->slab_free_fn(slab_buf, cache->slab_size);
+
+    if (cache->flags & KMEM_CF_SLAB_EXTERNAL)
+        kmem_cache_free(&kmem_slab_cache, (vm_offset_t)slab);
+}
+
+static inline int kmem_slab_use_tree(int flags)
+{
+    return !(flags & KMEM_CF_DIRECT) || (flags & KMEM_CF_VERIFY);
+}
+
+static inline int kmem_slab_cmp_lookup(const void *addr,
+                                       const struct rbtree_node *node)
+{
+    struct kmem_slab *slab;
+
+    slab = rbtree_entry(node, struct kmem_slab, tree_node);
+
+    if (addr == slab->addr)
+        return 0;
+    else if (addr < slab->addr)
+        return -1;
+    else
+        return 1;
+}
+
+static inline int kmem_slab_cmp_insert(const struct rbtree_node *a,
+                                       const struct rbtree_node *b)
+{
+    struct kmem_slab *slab;
+
+    slab = rbtree_entry(a, struct kmem_slab, tree_node);
+    return kmem_slab_cmp_lookup(slab->addr, b);
+}
+
+#if SLAB_USE_CPU_POOLS
+static void kmem_cpu_pool_init(struct kmem_cpu_pool *cpu_pool,
+                               struct kmem_cache *cache)
+{
+    simple_lock_init(&cpu_pool->lock);
+    cpu_pool->flags = cache->flags;
+    cpu_pool->size = 0;
+    cpu_pool->transfer_size = 0;
+    cpu_pool->nr_objs = 0;
+    cpu_pool->array = NULL;
+}
+
+/*
+ * Return a CPU pool.
+ *
+ * This function will generally return the pool matching the CPU running the
+ * calling thread. Because of context switches and thread migration, the
+ * caller might be running on another processor after this function returns.
+ * Although not optimal, this should rarely happen, and it doesn't affect the
+ * allocator operations in any other way, as CPU pools are always valid, and
+ * their access is serialized by a lock.
+ */
+static inline struct kmem_cpu_pool * kmem_cpu_pool_get(struct kmem_cache *cache)
+{
+    return &cache->cpu_pools[cpu_number()];
+}
+
+static inline void kmem_cpu_pool_build(struct kmem_cpu_pool *cpu_pool,
+                                       struct kmem_cache *cache, void **array)
+{
+    cpu_pool->size = cache->cpu_pool_type->array_size;
+    cpu_pool->transfer_size = (cpu_pool->size
+                               + KMEM_CPU_POOL_TRANSFER_RATIO - 1)
+                              / KMEM_CPU_POOL_TRANSFER_RATIO;
+    cpu_pool->array = array;
+}
+
+static inline void * kmem_cpu_pool_pop(struct kmem_cpu_pool *cpu_pool)
+{
+    cpu_pool->nr_objs--;
+    return cpu_pool->array[cpu_pool->nr_objs];
+}
+
+static inline void kmem_cpu_pool_push(struct kmem_cpu_pool *cpu_pool, void *obj)
+{
+    cpu_pool->array[cpu_pool->nr_objs] = obj;
+    cpu_pool->nr_objs++;
+}
+
+static int kmem_cpu_pool_fill(struct kmem_cpu_pool *cpu_pool,
+                              struct kmem_cache *cache)
+{
+    void *obj;
+    int i;
+
+    simple_lock(&cache->lock);
+
+    for (i = 0; i < cpu_pool->transfer_size; i++) {
+        obj = kmem_cache_alloc_from_slab(cache);
+
+        if (obj == NULL)
+            break;
+
+        kmem_cpu_pool_push(cpu_pool, obj);
+    }
+
+    simple_unlock(&cache->lock);
+
+    return i;
+}
+
+static void kmem_cpu_pool_drain(struct kmem_cpu_pool *cpu_pool,
+                                struct kmem_cache *cache)
+{
+    void *obj;
+    int i;
+
+    simple_lock(&cache->lock);
+
+    for (i = cpu_pool->transfer_size; i > 0; i--) {
+        obj = kmem_cpu_pool_pop(cpu_pool);
+        kmem_cache_free_to_slab(cache, obj);
+    }
+
+    simple_unlock(&cache->lock);
+}
+#endif /* SLAB_USE_CPU_POOLS */
+
+static void kmem_cache_error(struct kmem_cache *cache, void *buf, int error,
+                             void *arg)
+{
+    struct kmem_buftag *buftag;
+
+    kmem_error("cache: %s, buffer: %p", cache->name, (void *)buf);
+
+    switch(error) {
+    case KMEM_ERR_INVALID:
+        kmem_error("freeing invalid address");
+        break;
+    case KMEM_ERR_DOUBLEFREE:
+        kmem_error("attempting to free the same address twice");
+        break;
+    case KMEM_ERR_BUFTAG:
+        buftag = arg;
+        kmem_error("invalid buftag content, buftag state: %p",
+                   (void *)buftag->state);
+        break;
+    case KMEM_ERR_MODIFIED:
+        kmem_error("free buffer modified, fault address: %p, "
+                   "offset in buffer: %td", arg, arg - buf);
+        break;
+    case KMEM_ERR_REDZONE:
+        kmem_error("write beyond end of buffer, fault address: %p, "
+                   "offset in buffer: %td", arg, arg - buf);
+        break;
+    default:
+        kmem_error("unknown error");
+    }
+
+    /*
+     * Never reached.
+     */
+}
+
+/*
+ * Compute an appropriate slab size for the given cache.
+ *
+ * Once the slab size is known, this function sets the related properties
+ * (buffers per slab and maximum color). It can also set the KMEM_CF_DIRECT
+ * and/or KMEM_CF_SLAB_EXTERNAL flags depending on the resulting layout.
+ */
+static void kmem_cache_compute_sizes(struct kmem_cache *cache, int flags)
+{
+    size_t i, buffers, buf_size, slab_size, free_slab_size, optimal_size;
+    size_t waste, waste_min;
+    int embed, optimal_embed = optimal_embed;
+
+    buf_size = cache->buf_size;
+
+    if (buf_size < KMEM_BUF_SIZE_THRESHOLD)
+        flags |= KMEM_CACHE_NOOFFSLAB;
+
+    i = 0;
+    waste_min = (size_t)-1;
+
+    do {
+        i++;
+        slab_size = P2ROUND(i * buf_size, PAGE_SIZE);
+        free_slab_size = slab_size;
+
+        if (flags & KMEM_CACHE_NOOFFSLAB)
+            free_slab_size -= sizeof(struct kmem_slab);
+
+        buffers = free_slab_size / buf_size;
+        waste = free_slab_size % buf_size;
+
+        if (buffers > i)
+            i = buffers;
+
+        if (flags & KMEM_CACHE_NOOFFSLAB)
+            embed = 1;
+        else if (sizeof(struct kmem_slab) <= waste) {
+            embed = 1;
+            waste -= sizeof(struct kmem_slab);
+        } else {
+            embed = 0;
+        }
+
+        if (waste <= waste_min) {
+            waste_min = waste;
+            optimal_size = slab_size;
+            optimal_embed = embed;
+        }
+    } while ((buffers < KMEM_MIN_BUFS_PER_SLAB)
+             && (slab_size < KMEM_SLAB_SIZE_THRESHOLD));
+
+    assert(!(flags & KMEM_CACHE_NOOFFSLAB) || optimal_embed);
+
+    cache->slab_size = optimal_size;
+    slab_size = cache->slab_size - (optimal_embed
+                ? sizeof(struct kmem_slab)
+                : 0);
+    cache->bufs_per_slab = slab_size / buf_size;
+    cache->color_max = slab_size % buf_size;
+
+    if (cache->color_max >= PAGE_SIZE)
+        cache->color_max = PAGE_SIZE - 1;
+
+    if (optimal_embed) {
+        if (cache->slab_size == PAGE_SIZE)
+            cache->flags |= KMEM_CF_DIRECT;
+    } else {
+        cache->flags |= KMEM_CF_SLAB_EXTERNAL;
+    }
+}
+
+void kmem_cache_init(struct kmem_cache *cache, const char *name,
+                     size_t obj_size, size_t align, kmem_cache_ctor_t ctor,
+                     kmem_slab_alloc_fn_t slab_alloc_fn,
+                     kmem_slab_free_fn_t slab_free_fn, int flags)
+{
+#if SLAB_USE_CPU_POOLS
+    struct kmem_cpu_pool_type *cpu_pool_type;
+    size_t i;
+#endif /* SLAB_USE_CPU_POOLS */
+    size_t buf_size;
+
+#if SLAB_VERIFY
+    cache->flags = KMEM_CF_VERIFY;
+#else /* SLAB_VERIFY */
+    cache->flags = 0;
+#endif /* SLAB_VERIFY */
+
+    if (flags & KMEM_CACHE_NOCPUPOOL)
+        cache->flags |= KMEM_CF_NO_CPU_POOL;
+
+    if (flags & KMEM_CACHE_NORECLAIM) {
+        assert(slab_free_fn == NULL);
+        flags |= KMEM_CACHE_NOOFFSLAB;
+        cache->flags |= KMEM_CF_NO_RECLAIM;
+    }
+
+    if (flags & KMEM_CACHE_VERIFY)
+        cache->flags |= KMEM_CF_VERIFY;
+
+    if (align < KMEM_ALIGN_MIN)
+        align = KMEM_ALIGN_MIN;
+
+    assert(obj_size > 0);
+    assert(ISP2(align));
+    assert(align < PAGE_SIZE);
+
+    buf_size = P2ROUND(obj_size, align);
+
+    simple_lock_init(&cache->lock);
+    list_node_init(&cache->node);
+    list_init(&cache->partial_slabs);
+    list_init(&cache->free_slabs);
+    rbtree_init(&cache->active_slabs);
+    cache->obj_size = obj_size;
+    cache->align = align;
+    cache->buf_size = buf_size;
+    cache->bufctl_dist = buf_size - sizeof(union kmem_bufctl);
+    cache->color = 0;
+    cache->nr_objs = 0;
+    cache->nr_bufs = 0;
+    cache->nr_slabs = 0;
+    cache->nr_free_slabs = 0;
+    cache->ctor = ctor;
+    cache->slab_alloc_fn = slab_alloc_fn;
+    cache->slab_free_fn = slab_free_fn;
+    strncpy(cache->name, name, sizeof(cache->name));
+    cache->name[sizeof(cache->name) - 1] = '\0';
+    cache->buftag_dist = 0;
+    cache->redzone_pad = 0;
+
+    if (cache->flags & KMEM_CF_VERIFY) {
+        cache->bufctl_dist = buf_size;
+        cache->buftag_dist = cache->bufctl_dist + sizeof(union kmem_bufctl);
+        cache->redzone_pad = cache->bufctl_dist - cache->obj_size;
+        buf_size += sizeof(union kmem_bufctl) + sizeof(struct kmem_buftag);
+        buf_size = P2ROUND(buf_size, align);
+        cache->buf_size = buf_size;
+    }
+
+    kmem_cache_compute_sizes(cache, flags);
+
+#if SLAB_USE_CPU_POOLS
+    for (cpu_pool_type = kmem_cpu_pool_types;
+         buf_size <= cpu_pool_type->buf_size;
+         cpu_pool_type++);
+
+    cache->cpu_pool_type = cpu_pool_type;
+
+    for (i = 0; i < ARRAY_SIZE(cache->cpu_pools); i++)
+        kmem_cpu_pool_init(&cache->cpu_pools[i], cache);
+#endif /* SLAB_USE_CPU_POOLS */
+
+    simple_lock(&kmem_cache_list_lock);
+    list_insert_tail(&kmem_cache_list, &cache->node);
+    kmem_nr_caches++;
+    simple_unlock(&kmem_cache_list_lock);
+}
+
+static inline int kmem_cache_empty(struct kmem_cache *cache)
+{
+    return cache->nr_objs == cache->nr_bufs;
+}
+
+static int kmem_cache_grow(struct kmem_cache *cache)
+{
+    struct kmem_slab *slab;
+    size_t color;
+    int empty;
+
+    simple_lock(&cache->lock);
+
+    if (!kmem_cache_empty(cache)) {
+        simple_unlock(&cache->lock);
+        return 1;
+    }
+
+    color = cache->color;
+    cache->color += cache->align;
+
+    if (cache->color > cache->color_max)
+        cache->color = 0;
+
+    simple_unlock(&cache->lock);
+
+    slab = kmem_slab_create(cache, color);
+
+    simple_lock(&cache->lock);
+
+    if (slab != NULL) {
+        list_insert_tail(&cache->free_slabs, &slab->list_node);
+        cache->nr_bufs += cache->bufs_per_slab;
+        cache->nr_slabs++;
+        cache->nr_free_slabs++;
+    }
+
+    /*
+     * Even if our slab creation failed, another thread might have succeeded
+     * in growing the cache.
+     */
+    empty = kmem_cache_empty(cache);
+
+    simple_unlock(&cache->lock);
+
+    return !empty;
+}
+
+static void kmem_cache_reap(struct kmem_cache *cache)
+{
+    struct kmem_slab *slab;
+    struct list dead_slabs;
+
+    if (cache->flags & KMEM_CF_NO_RECLAIM)
+        return;
+
+    list_init(&dead_slabs);
+
+    simple_lock(&cache->lock);
+
+    while (!list_empty(&cache->free_slabs)) {
+        slab = list_first_entry(&cache->free_slabs, struct kmem_slab,
+                                list_node);
+        list_remove(&slab->list_node);
+        list_insert(&dead_slabs, &slab->list_node);
+        cache->nr_bufs -= cache->bufs_per_slab;
+        cache->nr_slabs--;
+        cache->nr_free_slabs--;
+    }
+
+    simple_unlock(&cache->lock);
+
+    while (!list_empty(&dead_slabs)) {
+        slab = list_first_entry(&dead_slabs, struct kmem_slab, list_node);
+        list_remove(&slab->list_node);
+        kmem_slab_destroy(slab, cache);
+    }
+}
+
+/*
+ * Allocate a raw (unconstructed) buffer from the slab layer of a cache.
+ *
+ * The cache must be locked before calling this function.
+ */
+static void * kmem_cache_alloc_from_slab(struct kmem_cache *cache)
+{
+    struct kmem_slab *slab;
+    union kmem_bufctl *bufctl;
+
+    if (!list_empty(&cache->partial_slabs))
+        slab = list_first_entry(&cache->partial_slabs, struct kmem_slab,
+                                list_node);
+    else if (!list_empty(&cache->free_slabs))
+        slab = list_first_entry(&cache->free_slabs, struct kmem_slab,
+                                list_node);
+    else
+        return NULL;
+
+    bufctl = slab->first_free;
+    assert(bufctl != NULL);
+    slab->first_free = bufctl->next;
+    slab->nr_refs++;
+    cache->nr_objs++;
+
+    /*
+     * The slab has become complete.
+     */
+    if (slab->nr_refs == cache->bufs_per_slab) {
+        list_remove(&slab->list_node);
+
+        if (slab->nr_refs == 1)
+            cache->nr_free_slabs--;
+    } else if (slab->nr_refs == 1) {
+        /*
+         * The slab has become partial.
+         */
+        list_remove(&slab->list_node);
+        list_insert_tail(&cache->partial_slabs, &slab->list_node);
+        cache->nr_free_slabs--;
+    } else if (!list_singular(&cache->partial_slabs)) {
+        struct list *node;
+        struct kmem_slab *tmp;
+
+        /*
+         * The slab remains partial. If there are more than one partial slabs,
+         * maintain the list sorted.
+         */
+
+        assert(slab->nr_refs > 1);
+
+        for (node = list_prev(&slab->list_node);
+             !list_end(&cache->partial_slabs, node);
+             node = list_prev(node)) {
+            tmp = list_entry(node, struct kmem_slab, list_node);
+
+            if (tmp->nr_refs >= slab->nr_refs)
+                break;
+        }
+
+        /*
+         * If the direct neighbor was found, the list is already sorted.
+         * If no slab was found, the slab is inserted at the head of the list.
+         */
+        if (node != list_prev(&slab->list_node)) {
+            list_remove(&slab->list_node);
+            list_insert_after(node, &slab->list_node);
+        }
+    }
+
+    if ((slab->nr_refs == 1) && kmem_slab_use_tree(cache->flags))
+        rbtree_insert(&cache->active_slabs, &slab->tree_node,
+                      kmem_slab_cmp_insert);
+
+    return kmem_bufctl_to_buf(bufctl, cache);
+}
+
+/*
+ * Release a buffer to the slab layer of a cache.
+ *
+ * The cache must be locked before calling this function.
+ */
+static void kmem_cache_free_to_slab(struct kmem_cache *cache, void *buf)
+{
+    struct kmem_slab *slab;
+    union kmem_bufctl *bufctl;
+
+    if (cache->flags & KMEM_CF_DIRECT) {
+        assert(cache->slab_size == PAGE_SIZE);
+        slab = (struct kmem_slab *)P2END((unsigned long)buf, cache->slab_size)
+               - 1;
+    } else {
+        struct rbtree_node *node;
+
+        node = rbtree_lookup_nearest(&cache->active_slabs, buf,
+                                     kmem_slab_cmp_lookup, RBTREE_LEFT);
+        assert(node != NULL);
+        slab = rbtree_entry(node, struct kmem_slab, tree_node);
+        assert((unsigned long)buf < (P2ALIGN((unsigned long)slab->addr
+                                             + cache->slab_size, PAGE_SIZE)));
+    }
+
+    assert(slab->nr_refs >= 1);
+    assert(slab->nr_refs <= cache->bufs_per_slab);
+    bufctl = kmem_buf_to_bufctl(buf, cache);
+    bufctl->next = slab->first_free;
+    slab->first_free = bufctl;
+    slab->nr_refs--;
+    cache->nr_objs--;
+
+    /*
+     * The slab has become free.
+     */
+    if (slab->nr_refs == 0) {
+        if (kmem_slab_use_tree(cache->flags))
+            rbtree_remove(&cache->active_slabs, &slab->tree_node);
+
+        /*
+         * The slab was partial.
+         */
+        if (cache->bufs_per_slab > 1)
+            list_remove(&slab->list_node);
+
+        list_insert_tail(&cache->free_slabs, &slab->list_node);
+        cache->nr_free_slabs++;
+    } else if (slab->nr_refs == (cache->bufs_per_slab - 1)) {
+        /*
+         * The slab has become partial.
+         */
+        list_insert(&cache->partial_slabs, &slab->list_node);
+    } else if (!list_singular(&cache->partial_slabs)) {
+        struct list *node;
+        struct kmem_slab *tmp;
+
+        /*
+         * The slab remains partial. If there are more than one partial slabs,
+         * maintain the list sorted.
+         */
+
+        assert(slab->nr_refs > 0);
+
+        for (node = list_next(&slab->list_node);
+             !list_end(&cache->partial_slabs, node);
+             node = list_next(node)) {
+            tmp = list_entry(node, struct kmem_slab, list_node);
+
+            if (tmp->nr_refs <= slab->nr_refs)
+                break;
+        }
+
+        /*
+         * If the direct neighbor was found, the list is already sorted.
+         * If no slab was found, the slab is inserted at the tail of the list.
+         */
+        if (node != list_next(&slab->list_node)) {
+            list_remove(&slab->list_node);
+            list_insert_before(node, &slab->list_node);
+        }
+    }
+}
+
+static void kmem_cache_alloc_verify(struct kmem_cache *cache, void *buf,
+                                    int construct)
+{
+    struct kmem_buftag *buftag;
+    union kmem_bufctl *bufctl;
+    void *addr;
+
+    buftag = kmem_buf_to_buftag(buf, cache);
+
+    if (buftag->state != KMEM_BUFTAG_FREE)
+        kmem_cache_error(cache, buf, KMEM_ERR_BUFTAG, buftag);
+
+    addr = kmem_buf_verify_fill(buf, KMEM_FREE_PATTERN, KMEM_UNINIT_PATTERN,
+                                cache->bufctl_dist);
+
+    if (addr != NULL)
+        kmem_cache_error(cache, buf, KMEM_ERR_MODIFIED, addr);
+
+    addr = buf + cache->obj_size;
+    memset(addr, KMEM_REDZONE_BYTE, cache->redzone_pad);
+
+    bufctl = kmem_buf_to_bufctl(buf, cache);
+    bufctl->redzone = KMEM_REDZONE_WORD;
+    buftag->state = KMEM_BUFTAG_ALLOC;
+
+    if (construct && (cache->ctor != NULL))
+        cache->ctor(buf);
+}
+
+vm_offset_t kmem_cache_alloc(struct kmem_cache *cache)
+{
+    int filled;
+    void *buf;
+
+#if SLAB_USE_CPU_POOLS
+    struct kmem_cpu_pool *cpu_pool;
+
+    cpu_pool = kmem_cpu_pool_get(cache);
+
+    if (cpu_pool->flags & KMEM_CF_NO_CPU_POOL)
+        goto slab_alloc;
+
+    simple_lock(&cpu_pool->lock);
+
+fast_alloc:
+    if (likely(cpu_pool->nr_objs > 0)) {
+        buf = kmem_cpu_pool_pop(cpu_pool);
+        simple_unlock(&cpu_pool->lock);
+
+        if (cpu_pool->flags & KMEM_CF_VERIFY)
+            kmem_cache_alloc_verify(cache, buf, KMEM_AV_CONSTRUCT);
+
+        return (vm_offset_t)buf;
+    }
+
+    if (cpu_pool->array != NULL) {
+        filled = kmem_cpu_pool_fill(cpu_pool, cache);
+
+        if (!filled) {
+            simple_unlock(&cpu_pool->lock);
+
+            filled = kmem_cache_grow(cache);
+
+            if (!filled)
+                return 0;
+
+            simple_lock(&cpu_pool->lock);
+        }
+
+        goto fast_alloc;
+    }
+
+    simple_unlock(&cpu_pool->lock);
+#endif /* SLAB_USE_CPU_POOLS */
+
+slab_alloc:
+    simple_lock(&cache->lock);
+    buf = kmem_cache_alloc_from_slab(cache);
+    simple_unlock(&cache->lock);
+
+    if (buf == NULL) {
+        filled = kmem_cache_grow(cache);
+
+        if (!filled)
+            return 0;
+
+        goto slab_alloc;
+    }
+
+    if (cache->flags & KMEM_CF_VERIFY)
+        kmem_cache_alloc_verify(cache, buf, KMEM_AV_NOCONSTRUCT);
+
+    if (cache->ctor != NULL)
+        cache->ctor(buf);
+
+    return (vm_offset_t)buf;
+}
+
+static void kmem_cache_free_verify(struct kmem_cache *cache, void *buf)
+{
+    struct rbtree_node *node;
+    struct kmem_buftag *buftag;
+    struct kmem_slab *slab;
+    union kmem_bufctl *bufctl;
+    unsigned char *redzone_byte;
+    unsigned long slabend;
+
+    simple_lock(&cache->lock);
+    node = rbtree_lookup_nearest(&cache->active_slabs, buf,
+                                 kmem_slab_cmp_lookup, RBTREE_LEFT);
+    simple_unlock(&cache->lock);
+
+    if (node == NULL)
+        kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL);
+
+    slab = rbtree_entry(node, struct kmem_slab, tree_node);
+    slabend = P2ALIGN((unsigned long)slab->addr + cache->slab_size, PAGE_SIZE);
+
+    if ((unsigned long)buf >= slabend)
+        kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL);
+
+    if ((((unsigned long)buf - (unsigned long)slab->addr) % cache->buf_size)
+        != 0)
+        kmem_cache_error(cache, buf, KMEM_ERR_INVALID, NULL);
+
+    /*
+     * As the buffer address is valid, accessing its buftag is safe.
+     */
+    buftag = kmem_buf_to_buftag(buf, cache);
+
+    if (buftag->state != KMEM_BUFTAG_ALLOC) {
+        if (buftag->state == KMEM_BUFTAG_FREE)
+            kmem_cache_error(cache, buf, KMEM_ERR_DOUBLEFREE, NULL);
+        else
+            kmem_cache_error(cache, buf, KMEM_ERR_BUFTAG, buftag);
+    }
+
+    redzone_byte = buf + cache->obj_size;
+    bufctl = kmem_buf_to_bufctl(buf, cache);
+
+    while (redzone_byte < (unsigned char *)bufctl) {
+        if (*redzone_byte != KMEM_REDZONE_BYTE)
+            kmem_cache_error(cache, buf, KMEM_ERR_REDZONE, redzone_byte);
+
+        redzone_byte++;
+    }
+
+    if (bufctl->redzone != KMEM_REDZONE_WORD) {
+        unsigned long word;
+
+        word = KMEM_REDZONE_WORD;
+        redzone_byte = kmem_buf_verify_bytes(&bufctl->redzone, &word,
+                                             sizeof(bufctl->redzone));
+        kmem_cache_error(cache, buf, KMEM_ERR_REDZONE, redzone_byte);
+    }
+
+    kmem_buf_fill(buf, KMEM_FREE_PATTERN, cache->bufctl_dist);
+    buftag->state = KMEM_BUFTAG_FREE;
+}
+
+void kmem_cache_free(struct kmem_cache *cache, vm_offset_t obj)
+{
+#if SLAB_USE_CPU_POOLS
+    struct kmem_cpu_pool *cpu_pool;
+    void **array;
+
+    cpu_pool = kmem_cpu_pool_get(cache);
+
+    if (cpu_pool->flags & KMEM_CF_VERIFY) {
+#else /* SLAB_USE_CPU_POOLS */
+    if (cache->flags & KMEM_CF_VERIFY) {
+#endif /* SLAB_USE_CPU_POOLS */
+        kmem_cache_free_verify(cache, (void *)obj);
+    }
+
+#if SLAB_USE_CPU_POOLS
+    if (cpu_pool->flags & KMEM_CF_NO_CPU_POOL)
+        goto slab_free;
+
+    simple_lock(&cpu_pool->lock);
+
+fast_free:
+    if (likely(cpu_pool->nr_objs < cpu_pool->size)) {
+        kmem_cpu_pool_push(cpu_pool, (void *)obj);
+        simple_unlock(&cpu_pool->lock);
+        return;
+    }
+
+    if (cpu_pool->array != NULL) {
+        kmem_cpu_pool_drain(cpu_pool, cache);
+        goto fast_free;
+    }
+
+    simple_unlock(&cpu_pool->lock);
+
+    array = (void *)kmem_cache_alloc(cache->cpu_pool_type->array_cache);
+
+    if (array != NULL) {
+        simple_lock(&cpu_pool->lock);
+
+        /*
+         * Another thread may have built the CPU pool while the lock was
+         * dropped.
+         */
+        if (cpu_pool->array != NULL) {
+            simple_unlock(&cpu_pool->lock);
+            kmem_cache_free(cache->cpu_pool_type->array_cache,
+                            (vm_offset_t)array);
+            goto fast_free;
+        }
+
+        kmem_cpu_pool_build(cpu_pool, cache, array);
+        goto fast_free;
+    }
+
+slab_free:
+#endif /* SLAB_USE_CPU_POOLS */
+
+    kmem_cache_free_to_slab(cache, (void *)obj);
+}
+
+void slab_collect(void)
+{
+    struct kmem_cache *cache;
+
+    if (sched_tick <= (kmem_gc_last_tick + KMEM_GC_INTERVAL))
+        return;
+
+    kmem_gc_last_tick = sched_tick;
+
+    simple_lock(&mem_cache_list_lock);
+
+    list_for_each_entry(&kmem_cache_list, cache, node)
+        kmem_cache_reap(cache);
+
+    simple_unlock(&mem_cache_list_lock);
+}
+
+void slab_bootstrap(void)
+{
+    /* Make sure a bufctl can always be stored in a buffer */
+    assert(sizeof(union kmem_bufctl) <= KMEM_ALIGN_MIN);
+
+    list_init(&kmem_cache_list);
+    simple_lock_init(&kmem_cache_list_lock);
+}
+
+void slab_init(void)
+{
+    vm_offset_t min, max;
+
+#if SLAB_USE_CPU_POOLS
+    struct kmem_cpu_pool_type *cpu_pool_type;
+    char name[KMEM_CACHE_NAME_SIZE];
+    size_t i, size;
+#endif /* SLAB_USE_CPU_POOLS */
+
+    kmem_submap(kmem_map, kernel_map, &min, &max, KMEM_MAP_SIZE, FALSE);
+
+#if SLAB_USE_CPU_POOLS
+    for (i = 0; i < ARRAY_SIZE(kmem_cpu_pool_types); i++) {
+        cpu_pool_type = &kmem_cpu_pool_types[i];
+        cpu_pool_type->array_cache = &kmem_cpu_array_caches[i];
+        sprintf(name, "kmem_cpu_array_%d", cpu_pool_type->array_size);
+        size = sizeof(void *) * cpu_pool_type->array_size;
+        kmem_cache_init(cpu_pool_type->array_cache, name, size,
+                        cpu_pool_type->array_align, NULL, NULL, NULL, 0);
+    }
+#endif /* SLAB_USE_CPU_POOLS */
+
+    /*
+     * Prevent off slab data for the slab cache to avoid infinite recursion.
+     */
+    kmem_cache_init(&kmem_slab_cache, "kmem_slab", sizeof(struct kmem_slab),
+                    0, NULL, NULL, NULL, KMEM_CACHE_NOOFFSLAB);
+}
+
+static vm_offset_t kalloc_pagealloc(vm_size_t size)
+{
+    vm_offset_t addr;
+    kern_return_t kr;
+
+    kr = kmem_alloc_wired(kalloc_map, &addr, size);
+
+    if (kr != KERN_SUCCESS)
+        return 0;
+
+    return addr;
+}
+
+static void kalloc_pagefree(vm_offset_t ptr, vm_size_t size)
+{
+    kmem_free(kalloc_map, ptr, size);
+}
+
+void kalloc_init(void)
+{
+    char name[KMEM_CACHE_NAME_SIZE];
+    size_t i, size;
+    vm_offset_t min, max;
+
+    kmem_submap(kalloc_map, kernel_map, &min, &max, KALLOC_MAP_SIZE, FALSE);
+
+    size = 1 << KALLOC_FIRST_SHIFT;
+
+    for (i = 0; i < ARRAY_SIZE(kalloc_caches); i++) {
+        sprintf(name, "kalloc_%u", size);
+        kmem_cache_init(&kalloc_caches[i], name, size, 0, NULL,
+                        kalloc_pagealloc, kalloc_pagefree, 0);
+        size <<= 1;
+    }
+}
+
+/*
+ * Return the kalloc cache index matching the given allocation size, which
+ * must be strictly greater than 0.
+ */
+static inline size_t kalloc_get_index(unsigned long size)
+{
+    assert(size != 0);
+
+    size = (size - 1) >> KALLOC_FIRST_SHIFT;
+
+    if (size == 0)
+        return 0;
+    else
+        return (sizeof(long) * 8) - __builtin_clzl(size);
+}
+
+static void kalloc_verify(struct kmem_cache *cache, void *buf, size_t size)
+{
+    size_t redzone_size;
+    void *redzone;
+
+    assert(size <= cache->obj_size);
+
+    redzone = buf + size;
+    redzone_size = cache->obj_size - size;
+    memset(redzone, KMEM_REDZONE_BYTE, redzone_size);
+}
+
+vm_offset_t kalloc(vm_size_t size)
+{
+    size_t index;
+    void *buf;
+
+    if (size == 0)
+        return 0;
+
+    index = kalloc_get_index(size);
+
+    if (index < ARRAY_SIZE(kalloc_caches)) {
+        struct kmem_cache *cache;
+
+        cache = &kalloc_caches[index];
+        buf = (void *)kmem_cache_alloc(cache);
+
+        if ((buf != 0) && (cache->flags & KMEM_CF_VERIFY))
+            kalloc_verify(cache, buf, size);
+    } else
+        buf = (void *)kalloc_pagealloc(size);
+
+    return (vm_offset_t)buf;
+}
+
+static void kfree_verify(struct kmem_cache *cache, void *buf, size_t size)
+{
+    unsigned char *redzone_byte, *redzone_end;
+
+    assert(size <= cache->obj_size);
+
+    redzone_byte = buf + size;
+    redzone_end = buf + cache->obj_size;
+
+    while (redzone_byte < redzone_end) {
+        if (*redzone_byte != KMEM_REDZONE_BYTE)
+            kmem_cache_error(cache, buf, KMEM_ERR_REDZONE, redzone_byte);
+
+        redzone_byte++;
+    }
+}
+
+void kfree(vm_offset_t data, vm_size_t size)
+{
+    size_t index;
+
+    if ((data == 0) || (size == 0))
+        return;
+
+    index = kalloc_get_index(size);
+
+    if (index < ARRAY_SIZE(kalloc_caches)) {
+        struct kmem_cache *cache;
+
+        cache = &kalloc_caches[index];
+
+        if (cache->flags & KMEM_CF_VERIFY)
+            kfree_verify(cache, (void *)data, size);
+
+        kmem_cache_free(cache, data);
+    } else {
+        kalloc_pagefree(data, size);
+    }
+}
+
+#if MACH_DEBUG
+kern_return_t host_slab_info(host_t host, cache_info_array_t *infop,
+                             unsigned int *infoCntp)
+{
+    struct kmem_cache *cache;
+    cache_info_t *info;
+    unsigned int i, nr_caches;
+    vm_size_t info_size = info_size;
+    kern_return_t kr;
+
+    if (host == HOST_NULL)
+        return KERN_INVALID_HOST;
+
+    /*
+     * Assume the cache list is unaltered once the kernel is ready.
+     */
+
+    simple_lock(&mem_cache_list_lock);
+    nr_caches = kmem_nr_caches;
+    simple_unlock(&mem_cache_list_lock);
+
+    if (nr_caches <= *infoCntp)
+        info = *infop;
+    else {
+        vm_offset_t info_addr;
+
+        info_size = round_page(nr_caches * sizeof(*info));
+        kr = kmem_alloc_pageable(ipc_kernel_map, &info_addr, info_size);
+
+        if (kr != KERN_SUCCESS)
+            return kr;
+
+        info = (cache_info_t *)info_addr;
+    }
+
+    if (info == NULL)
+        return KERN_RESOURCE_SHORTAGE;
+
+    i = 0;
+
+    list_for_each_entry(&kmem_cache_list, cache, node) {
+        simple_lock(&cache_lock);
+        info[i].flags = ((cache->flags & KMEM_CF_NO_CPU_POOL)
+                         ? CACHE_FLAGS_NO_CPU_POOL : 0)
+                        | ((cache->flags & KMEM_CF_SLAB_EXTERNAL)
+                           ? CACHE_FLAGS_SLAB_EXTERNAL : 0)
+                        | ((cache->flags & KMEM_CF_NO_RECLAIM)
+                           ? CACHE_FLAGS_NO_RECLAIM : 0)
+                        | ((cache->flags & KMEM_CF_VERIFY)
+                           ? CACHE_FLAGS_VERIFY : 0)
+                        | ((cache->flags & KMEM_CF_DIRECT)
+                           ? CACHE_FLAGS_DIRECT : 0);
+#if SLAB_USE_CPU_POOLS
+        info[i].cpu_pool_size = cache->cpu_pool_type->array_size;
+#else /* SLAB_USE_CPU_POOLS */
+        info[i].cpu_pool_size = 0;
+#endif /* SLAB_USE_CPU_POOLS */
+        info[i].obj_size = cache->obj_size;
+        info[i].align = cache->align;
+        info[i].buf_size = cache->buf_size;
+        info[i].slab_size = cache->slab_size;
+        info[i].bufs_per_slab = cache->bufs_per_slab;
+        info[i].nr_objs = cache->nr_objs;
+        info[i].nr_bufs = cache->nr_bufs;
+        info[i].nr_slabs = cache->nr_slabs;
+        info[i].nr_free_slabs = cache->nr_free_slabs;
+        strncpy(info[i].name, cache->name, sizeof(info[i].name));
+        info[i].name[sizeof(info[i].name) - 1] = '\0';
+        simple_unlock(&cache->lock);
+
+        i++;
+    }
+
+    if (info != *infop) {
+        vm_map_copy_t copy;
+        vm_size_t used;
+
+        used = nr_caches * sizeof(*info);
+
+        if (used != info_size)
+            memset((char *)info + used, 0, info_size - used);
+
+        kr = vm_map_copyin(ipc_kernel_map, (vm_offset_t)info, used, TRUE,
+                           &copy);
+
+        assert(kr == KERN_SUCCESS);
+        *infop = (cache_info_t *)copy;
+    }
+
+    *infoCntp = nr_caches;
+
+    return KERN_SUCCESS;
+}
+#endif /* MACH_DEBUG */
diff --git a/kern/slab.h b/kern/slab.h
new file mode 100644
index 0000000..14c820b
--- /dev/null
+++ b/kern/slab.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2009, 2010, 2011 Richard Braun.
+ * Copyright (c) 2011 Maksym Planeta.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _KERN_SLAB_H
+#define _KERN_SLAB_H
+
+#include <kern/lock.h>
+#include <kern/list.h>
+#include <kern/rbtree.h>
+#include <mach/machine/vm_types.h>
+#include <sys/types.h>
+#include <vm/vm_types.h>
+
+#if SLAB_USE_CPU_POOLS
+/*
+ * L1 cache line size.
+ */
+#define CPU_L1_SIZE (1 << CPU_L1_SHIFT)
+
+/*
+ * Per-processor cache of pre-constructed objects.
+ *
+ * The flags member is a read-only CPU-local copy of the parent cache flags.
+ */
+struct kmem_cpu_pool {
+    simple_lock_data_t lock;
+    int flags;
+    int size;
+    int transfer_size;
+    int nr_objs;
+    void **array;
+} __attribute__((aligned(CPU_L1_SIZE)));
+
+/*
+ * When a cache is created, its CPU pool type is determined from the buffer
+ * size. For small buffer sizes, many objects can be cached in a CPU pool.
+ * Conversely, for large buffer sizes, this would incur much overhead, so only
+ * a few objects are stored in a CPU pool.
+ */
+struct kmem_cpu_pool_type {
+    size_t buf_size;
+    int array_size;
+    size_t array_align;
+    struct kmem_cache *array_cache;
+};
+#endif /* SLAB_USE_CPU_POOLS */
+
+/*
+ * Buffer descriptor.
+ *
+ * For normal caches (i.e. without SLAB_CF_VERIFY), bufctls are located at the
+ * end of (but inside) each buffer. If SLAB_CF_VERIFY is set, bufctls are
+ * located after each buffer.
+ *
+ * When an object is allocated to a client, its bufctl isn't used. This memory
+ * is instead used for redzoning if cache debugging is in effect.
+ */
+union kmem_bufctl {
+    union kmem_bufctl *next;
+    unsigned long redzone;
+};
+
+/*
+ * Buffer tag.
+ *
+ * This structure is only used for SLAB_CF_VERIFY caches. It is located after
+ * the bufctl and includes information about the state of the buffer it
+ * describes (allocated or not). It should be thought of as a debugging
+ * extension of the bufctl.
+ */
+struct kmem_buftag {
+    unsigned long state;
+};
+
+/*
+ * Page-aligned collection of unconstructed buffers.
+ */
+struct kmem_slab {
+    struct list list_node;
+    struct rbtree_node tree_node;
+    unsigned long nr_refs;
+    union kmem_bufctl *first_free;
+    void *addr;
+};
+
+/*
+ * Type for constructor functions.
+ *
+ * The pre-constructed state of an object is supposed to include only
+ * elements such as e.g. linked lists, locks, reference counters. Therefore
+ * constructors are expected to 1) never fail and 2) not need any
+ * user-provided data. The first constraint implies that object construction
+ * never performs dynamic resource allocation, which also means there is no
+ * need for destructors.
+ */
+typedef void (*kmem_cache_ctor_t)(void *obj);
+
+/*
+ * Types for slab allocation/free functions.
+ *
+ * All addresses and sizes must be page-aligned.
+ */
+typedef vm_offset_t (*kmem_slab_alloc_fn_t)(vm_size_t);
+typedef void (*kmem_slab_free_fn_t)(vm_offset_t, vm_size_t);
+
+/*
+ * Cache name buffer size.
+ */
+#define KMEM_CACHE_NAME_SIZE 32
+
+/*
+ * Cache of objects.
+ *
+ * Locking order : cpu_pool -> cache. CPU pools locking is ordered by CPU ID.
+ *
+ * The partial slabs list is sorted by slab references. Slabs with a high
+ * number of references are placed first on the list to reduce fragmentation.
+ * Sorting occurs at insertion/removal of buffers in a slab. As the list
+ * is maintained sorted, and the number of references only changes by one,
+ * this is a very cheap operation in the average case and the worst (linear)
+ * case is very unlikely.
+ */
+struct kmem_cache {
+#if SLAB_USE_CPU_POOLS
+    /* CPU pool layer */
+    struct kmem_cpu_pool cpu_pools[NCPUS];
+    struct kmem_cpu_pool_type *cpu_pool_type;
+#endif /* SLAB_USE_CPU_POOLS */
+
+    /* Slab layer */
+    simple_lock_data_t lock;
+    struct list node;   /* Cache list linkage */
+    struct list partial_slabs;
+    struct list free_slabs;
+    struct rbtree active_slabs;
+    int flags;
+    size_t obj_size;    /* User-provided size */
+    size_t align;
+    size_t buf_size;    /* Aligned object size  */
+    size_t bufctl_dist; /* Distance from buffer to bufctl   */
+    size_t slab_size;
+    size_t color;
+    size_t color_max;
+    unsigned long bufs_per_slab;
+    unsigned long nr_objs;  /* Number of allocated objects */
+    unsigned long nr_bufs;  /* Total number of buffers */
+    unsigned long nr_slabs;
+    unsigned long nr_free_slabs;
+    kmem_cache_ctor_t ctor;
+    kmem_slab_alloc_fn_t slab_alloc_fn;
+    kmem_slab_free_fn_t slab_free_fn;
+    char name[KMEM_CACHE_NAME_SIZE];
+    size_t buftag_dist; /* Distance from buffer to buftag */
+    size_t redzone_pad; /* Bytes from end of object to redzone word */
+};
+
+/*
+ * Mach-style declarations for struct kmem_cache.
+ */
+typedef struct kmem_cache *kmem_cache_t;
+#define KMEM_CACHE_NULL ((kmem_cache_t) 0)
+
+/*
+ * VM submap for slab allocations.
+ */
+extern vm_map_t kmem_map;
+
+/*
+ * Cache initialization flags.
+ */
+#define KMEM_CACHE_NOCPUPOOL    0x1 /* Don't use the per-cpu pools */
+#define KMEM_CACHE_NOOFFSLAB    0x2 /* Don't allocate external slab data */
+#define KMEM_CACHE_NORECLAIM    0x4 /* Never give slabs back to their source,
+                                       implies KMEM_CACHE_NOOFFSLAB */
+#define KMEM_CACHE_VERIFY       0x8 /* Use debugging facilities */
+
+/*
+ * Initialize a cache.
+ */
+void kmem_cache_init(struct kmem_cache *cache, const char *name,
+                     size_t obj_size, size_t align, kmem_cache_ctor_t ctor,
+                     kmem_slab_alloc_fn_t slab_alloc_fn,
+                     kmem_slab_free_fn_t slab_free_fn, int flags);
+
+/*
+ * Allocate an object from a cache.
+ */
+vm_offset_t kmem_cache_alloc(struct kmem_cache *cache);
+
+/*
+ * Release an object to its cache.
+ */
+void kmem_cache_free(struct kmem_cache *cache, vm_offset_t obj);
+
+/*
+ * Initialize the memory allocator module.
+ */
+void slab_bootstrap(void);
+void slab_init(void);
+
+/*
+ * Release free slabs to the VM system.
+ */
+void slab_collect(void);
+
+#endif /* _KERN_SLAB_H */
diff --git a/kern/zalloc.c b/kern/zalloc.c
deleted file mode 100644
index 43836a6..0000000
--- a/kern/zalloc.c
+++ /dev/null
@@ -1,1007 +0,0 @@
-/*
- * Mach Operating System
- * Copyright (c) 1993-1987 Carnegie Mellon University.
- * Copyright (c) 1993,1994 The University of Utah and
- * the Computer Systems Laboratory (CSL).
- * All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
- * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
- * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
- * THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- *	File:	kern/zalloc.c
- *	Author:	Avadis Tevanian, Jr.
- *
- *	Zone-based memory allocator.  A zone is a collection of fixed size
- *	data blocks for which quick allocation/deallocation is possible.
- */
-
-#include <string.h>
-
-#include <kern/debug.h>
-#include <kern/macro_help.h>
-#include <kern/printf.h>
-#include <kern/mach_clock.h>
-#include <kern/sched.h>
-#include <kern/zalloc.h>
-#include <mach/vm_param.h>
-#include <vm/vm_kern.h>
-#include <machine/machspl.h>
-
-#if	MACH_DEBUG
-#include <mach/kern_return.h>
-#include <mach/machine/vm_types.h>
-#include <mach_debug/zone_info.h>
-#include <kern/host.h>
-#include <vm/vm_map.h>
-#include <vm/vm_user.h>
-#include <vm/vm_kern.h>
-#endif
-
-#define ADD_TO_ZONE(zone, element)					\
-MACRO_BEGIN								\
-		*((vm_offset_t *)(element)) = (zone)->free_elements;	\
-		(zone)->free_elements = (vm_offset_t) (element);	\
-		zone_count_down(zone);					\
-MACRO_END
-
-#define REMOVE_FROM_ZONE(zone, ret, type)				\
-MACRO_BEGIN								\
-	(ret) = (type) (zone)->free_elements;				\
-	if ((ret) != (type) 0) {					\
-		zone_count_up(zone);					\
-		(zone)->free_elements = *((vm_offset_t *)(ret));	\
-	}								\
-MACRO_END
-
-#define ALIGN_SIZE_UP(size, align) \
-((size) = (((size) + ((align) - 1)) & ~((align) - 1)))
-
-/*
- * Support for garbage collection of unused zone pages:
- */
-
-struct zone_page_table_entry {
-	struct	zone_page_table_entry	*next;
-	short 	in_free_list;
-	short	alloc_count;
-};
-
-extern struct zone_page_table_entry * zone_page_table;
-extern vm_offset_t zone_map_min_address;
-
-#define lock_zone_page_table() simple_lock(&zone_page_table_lock)
-#define unlock_zone_page_table() simple_unlock(&zone_page_table_lock)
-
-#define	zone_page(addr) \
-    (&(zone_page_table[(atop(((vm_offset_t)addr) - zone_map_min_address))]))
-
-
-extern void		zone_page_alloc(vm_offset_t, vm_size_t);
-extern void		zone_page_dealloc(vm_offset_t, vm_size_t);
-extern void		zone_page_in_use(vm_offset_t, vm_size_t);
-extern void		zone_page_free(vm_offset_t, vm_size_t);
-
-zone_t		zone_zone;	/* this is the zone containing other zones */
-
-boolean_t	zone_ignore_overflow = TRUE;
-
-vm_map_t	zone_map = VM_MAP_NULL;
-vm_size_t	zone_map_size = 64 * 1024 * 1024;
-
-/*
- *	The VM system gives us an initial chunk of memory.
- *	It has to be big enough to allocate the zone_zone
- *	and some initial kernel data structures, like kernel maps.
- *	It is advantageous to make it bigger than really necessary,
- *	because this memory is more efficient than normal kernel
- *	virtual memory.  (It doesn't have vm_page structures backing it
- *	and it may have other machine-dependent advantages.)
- *	So for best performance, zdata_size should approximate
- *	the amount of memory you expect the zone system to consume.
- */
-
-vm_offset_t	zdata;
-vm_size_t	zdata_size = 420 * 1024;
-
-#define zone_lock(zone)					\
-MACRO_BEGIN						\
-	if (zone->type & ZONE_PAGEABLE) {		\
-		lock_write(&zone->complex_lock);	\
-	} else {					\
-		simple_lock(&zone->lock);		\
-	}						\
-MACRO_END
-
-#define zone_unlock(zone)				\
-MACRO_BEGIN						\
-	if (zone->type & ZONE_PAGEABLE) {		\
-		lock_done(&zone->complex_lock);		\
-	} else {					\
-		simple_unlock(&zone->lock);		\
-	}						\
-MACRO_END
-
-#define zone_lock_init(zone)				\
-MACRO_BEGIN						\
-	if (zone->type & ZONE_PAGEABLE) {		\
-		lock_init(&zone->complex_lock, TRUE);	\
-	} else {					\
-		simple_lock_init(&zone->lock);		\
-	}						\
-MACRO_END
-
-static vm_offset_t zget_space(vm_offset_t size, vm_size_t align);
-
-decl_simple_lock_data(,zget_space_lock)
-vm_offset_t zalloc_next_space;
-vm_offset_t zalloc_end_of_space;
-vm_size_t zalloc_wasted_space;
-
-/*
- *	Garbage collection map information
- */
-decl_simple_lock_data(,zone_page_table_lock)
-struct zone_page_table_entry *	zone_page_table;
-vm_offset_t			zone_map_min_address;
-vm_offset_t			zone_map_max_address;
-int				zone_pages;
-
-extern void zone_page_init(vm_offset_t, vm_size_t, int);
-
-#define	ZONE_PAGE_USED  0
-#define ZONE_PAGE_UNUSED -1
-
-
-/*
- *	Protects first_zone, last_zone, num_zones,
- *	and the next_zone field of zones.
- */
-decl_simple_lock_data(,all_zones_lock)
-zone_t			first_zone;
-zone_t			*last_zone;
-int			num_zones;
-
-/*
- *	zinit initializes a new zone.  The zone data structures themselves
- *	are stored in a zone, which is initially a static structure that
- *	is initialized by zone_init.
- */
-zone_t zinit(size, align, max, alloc, memtype, name)
-	vm_size_t	size;		/* the size of an element */
-	vm_size_t	align;		/* alignment of elements */
-	vm_size_t	max;		/* maximum memory to use */
-	vm_size_t	alloc;		/* allocation size */
-	unsigned int	memtype;	/* flags specifying type of memory */
-	char		*name;		/* a name for the zone */
-{
-	register zone_t		z;
-
-	if (zone_zone == ZONE_NULL)
-		z = (zone_t) zget_space(sizeof(struct zone), 0);
-	else
-		z = (zone_t) zalloc(zone_zone);
-	if (z == ZONE_NULL)
-		panic("zinit");
- 	if (alloc == 0)
-		alloc = PAGE_SIZE;
-
-	if (size == 0)
-		size = sizeof(z->free_elements);
-	/*
-	 *	Round off all the parameters appropriately.
-	 */
-
-	if ((max = round_page(max)) < (alloc = round_page(alloc)))
-		max = alloc;
-
-	if (align > 0) {
-		if (PAGE_SIZE % align || align % sizeof(z->free_elements))
-			panic("zinit");
-		ALIGN_SIZE_UP(size, align);
-	}
-
-	z->free_elements = 0;
-	z->cur_size = 0;
-	z->max_size = max;
-	z->elem_size = ((size-1) + sizeof(z->free_elements)) -
-			((size-1) % sizeof(z->free_elements));
-	z->align = align;
-
-	z->alloc_size = alloc;
-	z->type = memtype;
-	z->zone_name = name;
-#ifdef ZONE_COUNT
-	z->count = 0;
-#endif
-	z->doing_alloc = FALSE;
-	zone_lock_init(z);
-
-	/*
-	 *	Add the zone to the all-zones list.
-	 */
-
-	z->next_zone = ZONE_NULL;
-	simple_lock(&all_zones_lock);
-	*last_zone = z;
-	last_zone = &z->next_zone;
-	num_zones++;
-	simple_unlock(&all_zones_lock);
-
-	return(z);
-}
-
-/*
- *	Cram the given memory into the specified zone.
- */
-void zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
-{
-	register vm_size_t	elem_size;
-
-	if (newmem == (vm_offset_t) 0) {
-		panic("zcram - memory at zero");
-	}
-	elem_size = zone->elem_size;
-
-	zone_lock(zone);
-	while (size >= elem_size) {
-		ADD_TO_ZONE(zone, newmem);
-		zone_page_alloc(newmem, elem_size);
-		zone_count_up(zone);	/* compensate for ADD_TO_ZONE */
-		size -= elem_size;
-		newmem += elem_size;
-		zone->cur_size += elem_size;
-	}
-	zone_unlock(zone);
-}
-
-/*
- * Contiguous space allocator for non-paged zones. Allocates "size" amount
- * of memory from zone_map.
- */
-
-static vm_offset_t zget_space(vm_offset_t size, vm_size_t align)
-{
-	vm_offset_t	new_space = 0;
-	vm_offset_t	result;
-	vm_size_t	space_to_add = 0; /*'=0' to quiet gcc warnings */
-
-	simple_lock(&zget_space_lock);
-	if (align > 0) {
-		assert(align < PAGE_SIZE);
-		ALIGN_SIZE_UP(zalloc_next_space, align);
-	}
-
-	while ((zalloc_next_space + size) > zalloc_end_of_space) {
-		/*
-		 *	Add at least one page to allocation area.
-		 */
-
-		space_to_add = round_page(size);
-
-		if (new_space == 0) {
-			/*
-			 *	Memory cannot be wired down while holding
-			 *	any locks that the pageout daemon might
-			 *	need to free up pages.  [Making the zget_space
-			 *	lock a complex lock does not help in this
-			 *	regard.]
-			 *
-			 *	Unlock and allocate memory.  Because several
-			 *	threads might try to do this at once, don't
-			 *	use the memory before checking for available
-			 *	space again.
-			 */
-
-			simple_unlock(&zget_space_lock);
-
-			if (kmem_alloc_wired(zone_map,
-					     &new_space, space_to_add)
-							!= KERN_SUCCESS)
-				return(0);
-			zone_page_init(new_space, space_to_add,
-							ZONE_PAGE_USED);
-			simple_lock(&zget_space_lock);
-			if (align > 0)
-				ALIGN_SIZE_UP(zalloc_next_space, align);
-			continue;
-		}
-
-
-		/*
-	  	 *	Memory was allocated in a previous iteration.
-		 *
-		 *	Check whether the new region is contiguous
-		 *	with the old one.
-		 */
-
-		if (new_space != zalloc_end_of_space) {
-			/*
-			 *	Throw away the remainder of the
-			 *	old space, and start a new one.
-			 */
-			zalloc_wasted_space +=
-				zalloc_end_of_space - zalloc_next_space;
-			zalloc_next_space = new_space;
-		}
-
-		zalloc_end_of_space = new_space + space_to_add;
-
-		new_space = 0;
-	}
-	result = zalloc_next_space;
-	zalloc_next_space += size;
-	simple_unlock(&zget_space_lock);
-
-	if (new_space != 0)
-		kmem_free(zone_map, new_space, space_to_add);
-
-	return(result);
-}
-
-
-/*
- *	Initialize the "zone of zones" which uses fixed memory allocated
- *	earlier in memory initialization.  zone_bootstrap is called
- *	before zone_init.
- */
-void zone_bootstrap(void)
-{
-	simple_lock_init(&all_zones_lock);
-	first_zone = ZONE_NULL;
-	last_zone = &first_zone;
-	num_zones = 0;
-
-	simple_lock_init(&zget_space_lock);
-	zalloc_next_space = zdata;
-	zalloc_end_of_space = zdata + zdata_size;
-	zalloc_wasted_space = 0;
-
-	zone_zone = ZONE_NULL;
-	zone_zone = zinit(sizeof(struct zone), 0, 128 * sizeof(struct zone),
-			  sizeof(struct zone), 0, "zones");
-}
-
-void zone_init(void)
-{
-	vm_offset_t	zone_min;
-	vm_offset_t	zone_max;
-
-	vm_size_t	zone_table_size;
-
-	zone_map = kmem_suballoc(kernel_map, &zone_min, &zone_max,
-				 zone_map_size, FALSE);
-
-	/*
-	 * Setup garbage collection information:
-	 */
-
- 	zone_table_size = atop(zone_max - zone_min) *
-				sizeof(struct zone_page_table_entry);
-	if (kmem_alloc_wired(zone_map, (vm_offset_t *) &zone_page_table,
-			     zone_table_size) != KERN_SUCCESS)
-		panic("zone_init");
-	zone_min = (vm_offset_t)zone_page_table + round_page(zone_table_size);
-	zone_pages = atop(zone_max - zone_min);
-	zone_map_min_address = zone_min;
-	zone_map_max_address = zone_max;
-	simple_lock_init(&zone_page_table_lock);
-	zone_page_init(zone_min, zone_max - zone_min, ZONE_PAGE_UNUSED);
-}
-
-
-/*
- *	zalloc returns an element from the specified zone.
- */
-vm_offset_t zalloc(zone_t zone)
-{
-	vm_offset_t	addr;
-
-	if (zone == ZONE_NULL)
-		panic ("zalloc: null zone");
-
-	check_simple_locks();
-
-	zone_lock(zone);
-	REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
-	while (addr == 0) {
-		/*
- 		 *	If nothing was there, try to get more
-		 */
-		if (zone->doing_alloc) {
-			/*
-			 *	Someone is allocating memory for this zone.
-			 *	Wait for it to show up, then try again.
-			 */
-			assert_wait((event_t)&zone->doing_alloc, TRUE);
-			/* XXX say wakeup needed */
-			zone_unlock(zone);
-			thread_block((void (*)()) 0);
-			zone_lock(zone);
-		}
-		else {
-			if ((zone->cur_size + (zone->type & ZONE_PAGEABLE ?
-				zone->alloc_size : zone->elem_size)) >
-			    zone->max_size) {
-				if (zone->type & ZONE_EXHAUSTIBLE)
-					break;
-				/*
-				 * Printf calls logwakeup, which calls
-				 * select_wakeup which will do a zfree
-				 * (which tries to take the select_zone
-				 * lock... Hang.  Release the lock now
-				 * so it can be taken again later.
-				 * NOTE: this used to be specific to
-				 * the select_zone, but for
-				 * cleanliness, we just unlock all
-				 * zones before this.
-				 */
-				if (!(zone->type & ZONE_FIXED)) {
-					/*
-					 * We're willing to overflow certain
-					 * zones, but not without complaining.
-					 *
-					 * This is best used in conjunction
-					 * with the collecatable flag. What we
-					 * want is an assurance we can get the
-					 * memory back, assuming there's no
-					 * leak.
-					 */
-					zone->max_size += (zone->max_size >> 1);
-				} else if (!zone_ignore_overflow) {
-					zone_unlock(zone);
-					printf("zone \"%s\" empty.\n",
-						zone->zone_name);
-					panic("zalloc: zone %s exhausted",
-					      zone->zone_name);
-				}
-			}
-
-			if (zone->type & ZONE_PAGEABLE)
-				zone->doing_alloc = TRUE;
-			zone_unlock(zone);
-
-			if (zone->type & ZONE_PAGEABLE) {
-				if (kmem_alloc_pageable(zone_map, &addr,
-							zone->alloc_size)
-							!= KERN_SUCCESS)
-					panic("zalloc: no pageable memory for zone %s",
-					      zone->zone_name);
-				zcram(zone, addr, zone->alloc_size);
-				zone_lock(zone);
-				zone->doing_alloc = FALSE;
-				/* XXX check before doing this */
-				thread_wakeup((event_t)&zone->doing_alloc);
-
-				REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
-			} else  if (zone->type & ZONE_COLLECTABLE) {
-				if (kmem_alloc_wired(zone_map,
-						     &addr, zone->alloc_size)
-							!= KERN_SUCCESS)
-					panic("zalloc: no wired memory for zone %s",
-					      zone->zone_name);
-				zone_page_init(addr, zone->alloc_size,
-							ZONE_PAGE_USED);
-				zcram(zone, addr, zone->alloc_size);
-				zone_lock(zone);
-				REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
-			} else {
-				addr = zget_space(zone->elem_size, zone->align);
-				if (addr == 0)
-					panic("zalloc: no memory for zone %s",
-					      zone->zone_name);
-
-				zone_lock(zone);
-				zone_count_up(zone);
-				zone->cur_size += zone->elem_size;
-				zone_unlock(zone);
-				zone_page_alloc(addr, zone->elem_size);
-				return(addr);
-			}
-		}
-	}
-
-	zone_unlock(zone);
-	return(addr);
-}
-
-
-/*
- *	zget returns an element from the specified zone
- *	and immediately returns nothing if there is nothing there.
- *
- *	This form should be used when you can not block (like when
- *	processing an interrupt).
- */
-vm_offset_t zget(zone_t zone)
-{
-	register vm_offset_t	addr;
-
-	if (zone == ZONE_NULL)
-		panic ("zalloc: null zone");
-
-	zone_lock(zone);
-	REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
-	zone_unlock(zone);
-
-	return(addr);
-}
-
-boolean_t zone_check = FALSE;
-
-void zfree(zone_t zone, vm_offset_t elem)
-{
-	zone_lock(zone);
-	if (zone_check) {
-		vm_offset_t this;
-
-		/* check the zone's consistency */
-
-		for (this = zone->free_elements;
-		     this != 0;
-		     this = * (vm_offset_t *) this)
-			if (this == elem)
-				panic("zfree");
-	}
-	ADD_TO_ZONE(zone, elem);
-	zone_unlock(zone);
-}
-
-/*
- *  Zone garbage collection subroutines
- *
- *  These routines have in common the modification of entries in the
- *  zone_page_table.  The latter contains one entry for every page
- *  in the zone_map.
- *
- *  For each page table entry in the given range:
- *
- *	zone_page_in_use        - decrements in_free_list
- *	zone_page_free          - increments in_free_list
- *	zone_page_init          - initializes in_free_list and alloc_count
- *	zone_page_alloc         - increments alloc_count
- *	zone_page_dealloc       - decrements alloc_count
- *	zone_add_free_page_list - adds the page to the free list
- *
- *  Two counts are maintained for each page, the in_free_list count and
- *  alloc_count.  The alloc_count is how many zone elements have been
- *  allocated from a page.  (Note that the page could contain elements
- *  that span page boundaries.  The count includes these elements so
- *  one element may be counted in two pages.) In_free_list is a count
- *  of how many zone elements are currently free.  If in_free_list is
- *  equal to alloc_count then the page is eligible for garbage
- *  collection.
- *
- *  Alloc_count and in_free_list are initialized to the correct values
- *  for a particular zone when a page is zcram'ed into a zone.  Subsequent
- *  gets and frees of zone elements will call zone_page_in_use and
- *  zone_page_free which modify the in_free_list count.  When the zones
- *  garbage collector runs it will walk through a zones free element list,
- *  remove the elements that reside on collectable pages, and use
- *  zone_add_free_page_list to create a list of pages to be collected.
- */
-
-void zone_page_in_use(addr, size)
-vm_offset_t	addr;
-vm_size_t	size;
-{
-	int i, j;
-	if ((addr < zone_map_min_address) ||
-	    (addr+size > zone_map_max_address)) return;
-	i = atop(addr-zone_map_min_address);
-	j = atop((addr+size-1) - zone_map_min_address);
-	lock_zone_page_table();
-	for (; i <= j; i++) {
-		zone_page_table[i].in_free_list--;
-	}
-	unlock_zone_page_table();
-}
-
-void zone_page_free(addr, size)
-vm_offset_t	addr;
-vm_size_t	size;
-{
-	int i, j;
-	if ((addr < zone_map_min_address) ||
-	    (addr+size > zone_map_max_address)) return;
-	i = atop(addr-zone_map_min_address);
-	j = atop((addr+size-1) - zone_map_min_address);
-	lock_zone_page_table();
-	for (; i <= j; i++) {
-		/* Set in_free_list to (ZONE_PAGE_USED + 1) if
-		 * it was previously set to ZONE_PAGE_UNUSED.
-		 */
-		if (zone_page_table[i].in_free_list == ZONE_PAGE_UNUSED) {
-			zone_page_table[i].in_free_list = 1;
-		} else {
-			zone_page_table[i].in_free_list++;
-		}
-	}
-	unlock_zone_page_table();
-}
-
-void zone_page_init(addr, size, value)
-
-vm_offset_t	addr;
-vm_size_t	size;
-int		value;
-{
-	int i, j;
-	if ((addr < zone_map_min_address) ||
-	    (addr+size > zone_map_max_address)) return;
-	i = atop(addr-zone_map_min_address);
-	j = atop((addr+size-1) - zone_map_min_address);
-	lock_zone_page_table();
-	for (; i <= j; i++) {
-		zone_page_table[i].alloc_count = value;
-		zone_page_table[i].in_free_list = 0;
-	}
-	unlock_zone_page_table();
-}
-
-void zone_page_alloc(addr, size)
-vm_offset_t	addr;
-vm_size_t	size;
-{
-	int i, j;
-	if ((addr < zone_map_min_address) ||
-	    (addr+size > zone_map_max_address)) return;
-	i = atop(addr-zone_map_min_address);
-	j = atop((addr+size-1) - zone_map_min_address);
-	lock_zone_page_table();
-	for (; i <= j; i++) {
-		/* Set alloc_count to (ZONE_PAGE_USED + 1) if
-		 * it was previously set to ZONE_PAGE_UNUSED.
-		 */
-		if (zone_page_table[i].alloc_count == ZONE_PAGE_UNUSED) {
-			zone_page_table[i].alloc_count = 1;
-		} else {
-			zone_page_table[i].alloc_count++;
-		}
-	}
-	unlock_zone_page_table();
-}
-
-void zone_page_dealloc(addr, size)
-vm_offset_t	addr;
-vm_size_t	size;
-{
-	int i, j;
-	if ((addr < zone_map_min_address) ||
-	    (addr+size > zone_map_max_address)) return;
-	i = atop(addr-zone_map_min_address);
-	j = atop((addr+size-1) - zone_map_min_address);
-	lock_zone_page_table();
-	for (; i <= j; i++) {
-		zone_page_table[i].alloc_count--;
-	}
-	unlock_zone_page_table();
-}
-
-void
-zone_add_free_page_list(free_list, addr, size)
-	struct zone_page_table_entry	**free_list;
-	vm_offset_t	addr;
-	vm_size_t	size;
-{
-	int i, j;
-	if ((addr < zone_map_min_address) ||
-	    (addr+size > zone_map_max_address)) return;
-	i = atop(addr-zone_map_min_address);
-	j = atop((addr+size-1) - zone_map_min_address);
-	lock_zone_page_table();
-	for (; i <= j; i++) {
-		if (zone_page_table[i].alloc_count == 0) {
-			zone_page_table[i].next = *free_list;
-			*free_list = &zone_page_table[i];
-			zone_page_table[i].alloc_count  = ZONE_PAGE_UNUSED;
-			zone_page_table[i].in_free_list = 0;
-		}
-	}
-	unlock_zone_page_table();
-}
-
-
-/* This is used for walking through a zone's free element list.
- */
-struct zone_free_entry {
-	struct zone_free_entry * next;
-};
-
-
-/*	Zone garbage collection
- *
- *	zone_gc will walk through all the free elements in all the
- *	zones that are marked collectable looking for reclaimable
- *	pages.  zone_gc is called by consider_zone_gc when the system
- *	begins to run out of memory.
- */
-static void zone_gc(void)
-{
-	int		max_zones;
-	zone_t		z;
-	int		i;
-	register spl_t	s;
-	struct zone_page_table_entry	*freep;
-	struct zone_page_table_entry	*zone_free_page_list;
-
-	simple_lock(&all_zones_lock);
-	max_zones = num_zones;
-	z = first_zone;
-	simple_unlock(&all_zones_lock);
-
-	zone_free_page_list = (struct zone_page_table_entry *) 0;
-
-	for (i = 0; i < max_zones; i++) {
-		struct zone_free_entry * last;
-		struct zone_free_entry * elt;
-		assert(z != ZONE_NULL);
-	/* run this at splhigh so that interupt routines that use zones
-	   can not interupt while their zone is locked */
-		s=splhigh();
-		zone_lock(z);
-
-		if ((z->type & (ZONE_PAGEABLE|ZONE_COLLECTABLE)) == ZONE_COLLECTABLE) {
-
-		    /* Count the free elements in each page.  This loop
-		     * requires that all in_free_list entries are zero.
-		     */
-		    elt = (struct zone_free_entry *)(z->free_elements);
-		    while ((elt != (struct zone_free_entry *)0)) {
-			   zone_page_free((vm_offset_t)elt, z->elem_size);
-			   elt = elt->next;
-		    }
-
-		    /* Now determine which elements should be removed
-		     * from the free list and, after all the elements
-		     * on a page have been removed, add the element's
-		     * page to a list of pages to be freed.
-		     */
-		    elt = (struct zone_free_entry *)(z->free_elements);
-		    last = elt;
-		    while ((elt != (struct zone_free_entry *)0)) {
-			if (((vm_offset_t)elt>=zone_map_min_address)&&
-			    ((vm_offset_t)elt<=zone_map_max_address)&&
-			    (zone_page(elt)->in_free_list ==
-			     zone_page(elt)->alloc_count)) {
-
-			    z->cur_size -= z->elem_size;
-			    zone_page_in_use((vm_offset_t)elt, z->elem_size);
-			    zone_page_dealloc((vm_offset_t)elt, z->elem_size);
-			    if (zone_page(elt)->alloc_count == 0 ||
-			      zone_page(elt+(z->elem_size-1))->alloc_count==0) {
-				    zone_add_free_page_list(
-					    &zone_free_page_list,
-					    (vm_offset_t)elt, z->elem_size);
-			    }
-
-
-			    if (elt == last) {
-				elt = elt->next;
-				z->free_elements =(vm_offset_t)elt;
-				last = elt;
-			    } else {
-				last->next = elt->next;
-				elt = elt->next;
-			    }
-			} else {
-			    /* This element is not eligible for collection
-			     * so clear in_free_list in preparation for a
-			     * subsequent garbage collection pass.
-			     */
-			    if (((vm_offset_t)elt>=zone_map_min_address)&&
-				((vm_offset_t)elt<=zone_map_max_address)) {
-				zone_page(elt)->in_free_list = 0;
-			    }
-			    last = elt;
-			    elt = elt->next;
-			}
-		    }
-		}
-		zone_unlock(z);
-		splx(s);
-		simple_lock(&all_zones_lock);
-		z = z->next_zone;
-		simple_unlock(&all_zones_lock);
-	}
-
-	for (freep = zone_free_page_list; freep != 0; freep = freep->next) {
-		vm_offset_t	free_addr;
-
-		free_addr = zone_map_min_address +
-			PAGE_SIZE * (freep - zone_page_table);
-
-		/* Hack Hack */
-		/* Needed to make vm_map_delete's vm_map_clip_end always be
-		 * able to get an element without having to call zget_space and
-		 * hang because zone_map is already locked by vm_map_delete */
-
-		extern zone_t		vm_map_kentry_zone;	/* zone for kernel entry structures */
-		vm_offset_t entry1 = zalloc(vm_map_kentry_zone),
-			    entry2 = zalloc(vm_map_kentry_zone);
-		zfree(vm_map_kentry_zone, entry1);
-		zfree(vm_map_kentry_zone, entry2);
-
-		kmem_free(zone_map, free_addr, PAGE_SIZE);
-	}
-}
-
-boolean_t zone_gc_allowed = TRUE;
-unsigned zone_gc_last_tick = 0;
-unsigned zone_gc_max_rate = 0;		/* in ticks */
-
-/*
- *	consider_zone_gc:
- *
- *	Called by the pageout daemon when the system needs more free pages.
- */
-
-void
-consider_zone_gc(void)
-{
-	/*
-	 *	By default, don't attempt zone GC more frequently
-	 *	than once a second.
-	 */
-
-	if (zone_gc_max_rate == 0)
-		zone_gc_max_rate = hz;
-
-	if (zone_gc_allowed &&
-	    (sched_tick > (zone_gc_last_tick + zone_gc_max_rate))) {
-		zone_gc_last_tick = sched_tick;
-		zone_gc();
-	}
-}
-
-#if	MACH_DEBUG
-kern_return_t host_zone_info(host, namesp, namesCntp, infop, infoCntp)
-	host_t		host;
-	zone_name_array_t *namesp;
-	unsigned int	*namesCntp;
-	zone_info_array_t *infop;
-	unsigned int	*infoCntp;
-{
-	zone_name_t	*names;
-	vm_offset_t	names_addr;
-	vm_size_t	names_size = 0; /*'=0' to quiet gcc warnings */
-	zone_info_t	*info;
-	vm_offset_t	info_addr;
-	vm_size_t	info_size = 0; /*'=0' to quiet gcc warnings */
-	unsigned int	max_zones, i;
-	zone_t		z;
-	kern_return_t	kr;
-
-	if (host == HOST_NULL)
-		return KERN_INVALID_HOST;
-
-	/*
-	 *	We assume that zones aren't freed once allocated.
-	 *	We won't pick up any zones that are allocated later.
-	 */
-
-	simple_lock(&all_zones_lock);
-	max_zones = num_zones;
-	z = first_zone;
-	simple_unlock(&all_zones_lock);
-
-	if (max_zones <= *namesCntp) {
-		/* use in-line memory */
-
-		names = *namesp;
-	} else {
-		names_size = round_page(max_zones * sizeof *names);
-		kr = kmem_alloc_pageable(ipc_kernel_map,
-					 &names_addr, names_size);
-		if (kr != KERN_SUCCESS)
-			return kr;
-
-		names = (zone_name_t *) names_addr;
-	}
-
-	if (max_zones <= *infoCntp) {
-		/* use in-line memory */
-
-		info = *infop;
-	} else {
-		info_size = round_page(max_zones * sizeof *info);
-		kr = kmem_alloc_pageable(ipc_kernel_map,
-					 &info_addr, info_size);
-		if (kr != KERN_SUCCESS) {
-			if (names != *namesp)
-				kmem_free(ipc_kernel_map,
-					  names_addr, names_size);
-			return kr;
-		}
-
-		info = (zone_info_t *) info_addr;
-	}
-
-	for (i = 0; i < max_zones; i++) {
-		zone_name_t *zn = &names[i];
-		zone_info_t *zi = &info[i];
-		struct zone zcopy;
-
-		assert(z != ZONE_NULL);
-
-		zone_lock(z);
-		zcopy = *z;
-		zone_unlock(z);
-
-		simple_lock(&all_zones_lock);
-		z = z->next_zone;
-		simple_unlock(&all_zones_lock);
-
-		/* assuming here the name data is static */
-		(void) strncpy(zn->zn_name, zcopy.zone_name,
-			       sizeof zn->zn_name);
-
-#ifdef ZONE_COUNT
-		zi->zi_count = zcopy.count;
-#else
-		zi->zi_count = 0;
-#endif
-		zi->zi_cur_size = zcopy.cur_size;
-		zi->zi_max_size = zcopy.max_size;
-		zi->zi_elem_size = zcopy.elem_size;
-		zi->zi_alloc_size = zcopy.alloc_size;
-		zi->zi_pageable = (zcopy.type & ZONE_PAGEABLE) != 0;
-		zi->zi_exhaustible = (zcopy.type & ZONE_EXHAUSTIBLE) != 0;
-		zi->zi_collectable = (zcopy.type & ZONE_COLLECTABLE) != 0;
-	}
-
-	if (names != *namesp) {
-		vm_size_t used;
-		vm_map_copy_t copy;
-
-		used = max_zones * sizeof *names;
-
-		if (used != names_size)
-			memset((char *) (names_addr + used), 0, names_size - used);
-
-		kr = vm_map_copyin(ipc_kernel_map, names_addr, names_size,
-				   TRUE, &copy);
-		assert(kr == KERN_SUCCESS);
-
-		*namesp = (zone_name_t *) copy;
-	}
-	*namesCntp = max_zones;
-
-	if (info != *infop) {
-		vm_size_t used;
-		vm_map_copy_t copy;
-
-		used = max_zones * sizeof *info;
-
-		if (used != info_size)
-			memset((char *) (info_addr + used), 0, info_size - used);
-
-		kr = vm_map_copyin(ipc_kernel_map, info_addr, info_size,
-				   TRUE, &copy);
-		assert(kr == KERN_SUCCESS);
-
-		*infop = (zone_info_t *) copy;
-	}
-	*infoCntp = max_zones;
-
-	return KERN_SUCCESS;
-}
-#endif	/* MACH_DEBUG */
diff --git a/kern/zalloc.h b/kern/zalloc.h
deleted file mode 100644
index f1a1850..0000000
--- a/kern/zalloc.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Mach Operating System
- * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University.
- * Copyright (c) 1993,1994 The University of Utah and
- * the Computer Systems Laboratory (CSL).
- * All rights reserved.
- *
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- *
- * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF
- * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY
- * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF
- * THIS SOFTWARE.
- *
- * Carnegie Mellon requests users of this software to return to
- *
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- *
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/*
- *	File:	zalloc.h
- *	Author:	Avadis Tevanian, Jr.
- *	Date:	 1985
- *
- */
-
-#ifndef	_KERN_ZALLOC_H_
-#define _KERN_ZALLOC_H_
-
-#include <mach/machine/vm_types.h>
-#include <kern/macro_help.h>
-#include <kern/lock.h>
-#include <kern/queue.h>
-#include <machine/zalloc.h>
-
-/*
- *	A zone is a collection of fixed size blocks for which there
- *	is fast allocation/deallocation access.  Kernel routines can
- *	use zones to manage data structures dynamically, creating a zone
- *	for each type of data structure to be managed.
- *
- */
-
-struct zone {
-	decl_simple_lock_data(,lock)	/* generic lock */
-#ifdef ZONE_COUNT
-	int		count;		/* Number of elements used now */
-#endif
-	vm_offset_t	free_elements;
-	vm_size_t	cur_size;	/* current memory utilization */
-	vm_size_t	max_size;	/* how large can this zone grow */
-	vm_size_t	elem_size;	/* size of an element */
-	vm_size_t	align;		/* alignment of elements */
-	vm_size_t	alloc_size;	/* size used for more memory */
-	boolean_t	doing_alloc;	/* is zone expanding now? */
-	char		*zone_name;	/* a name for the zone */
-	unsigned int	type;		/* type of memory */
-	lock_data_t	complex_lock;	/* Lock for pageable zones */
-	struct zone	*next_zone;	/* Link for all-zones list */
-};
-typedef struct zone *zone_t;
-
-#define		ZONE_NULL	((zone_t) 0)
-
-/* Exported to everyone */
-zone_t		zinit(vm_size_t size, vm_size_t align, vm_size_t max,
-		      vm_size_t alloc, unsigned int memtype, char *name);
-vm_offset_t	zalloc(zone_t zone);
-vm_offset_t	zget(zone_t zone);
-void		zfree(zone_t zone, vm_offset_t elem);
-void		zcram(zone_t zone, vm_offset_t newmem, vm_size_t size);
-
-/* Exported only to vm/vm_init.c */
-void		zone_bootstrap(void);
-void		zone_init(void);
-
-/* Exported only to vm/vm_pageout.c */
-void		consider_zone_gc(void);
-
-
-/* Memory type bits for zones */
-#define ZONE_PAGEABLE		0x00000001
-#define ZONE_COLLECTABLE	0x00000002	/* Garbage-collect this zone when memory runs low */
-#define ZONE_EXHAUSTIBLE	0x00000004	/* zalloc() on this zone is allowed to fail */
-#define ZONE_FIXED		0x00000008	/* Panic if zone is exhausted (XXX) */
-
-/* Machine-dependent code can provide additional memory types.  */
-#define ZONE_MACHINE_TYPES	0xffff0000
-
-
-#ifdef ZONE_COUNT
-#define zone_count_up(zone) ((zone)->count++)
-#define zone_count_down(zone) ((zone)->count--)
-#else
-#define zone_count_up(zone)
-#define zone_count_down(zone)
-#endif
-
-
-
-/* These quick inline versions only work for small, nonpageable zones (currently).  */
-
-static __inline vm_offset_t ZALLOC(zone_t zone)
-{
-	simple_lock(&zone->lock);
-	if (zone->free_elements == 0) {
-		simple_unlock(&zone->lock);
-		return zalloc(zone);
-	} else {
-		vm_offset_t element = zone->free_elements;
-		zone->free_elements = *((vm_offset_t *)(element));
-		zone_count_up(zone);
-		simple_unlock(&zone->lock);
-		return element;
-	}
-}
-
-static __inline void ZFREE(zone_t zone, vm_offset_t element)
-{
-	*((vm_offset_t *)(element)) = zone->free_elements;
-	zone->free_elements = (vm_offset_t) (element);
-	zone_count_down(zone);
-}
-
-
-
-#endif	/* _KERN_ZALLOC_H_ */
author	Richard Braun <rbraun@sceen.net>	2011-12-13 20:27:56 +0000
committer	Richard Braun <rbraun@sceen.net>	2011-12-17 22:12:34 +0000
commit	7bc54a622e0c57a1085cd2990a1deedc8bd4743d (patch)
tree	0356aefb0a935c30d295a86cec2386d5197c4754 /kern
parent	d25bd66fe0bd4cddb18890390198c86b9e9b56b4 (diff)