aboutsummaryrefslogtreecommitdiff
path: root/mm/slab.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/slab.c')
-rw-r--r--mm/slab.c3060
1 files changed, 3060 insertions, 0 deletions
diff --git a/mm/slab.c b/mm/slab.c
new file mode 100644
index 00000000000..ec660d85ddd
--- /dev/null
+++ b/mm/slab.c
@@ -0,0 +1,3060 @@
+/*
+ * linux/mm/slab.c
+ * Written by Mark Hemment, 1996/97.
+ * (markhe@nextd.demon.co.uk)
+ *
+ * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
+ *
+ * Major cleanup, different bufctl logic, per-cpu arrays
+ * (c) 2000 Manfred Spraul
+ *
+ * Cleanup, make the head arrays unconditional, preparation for NUMA
+ * (c) 2002 Manfred Spraul
+ *
+ * An implementation of the Slab Allocator as described in outline in;
+ * UNIX Internals: The New Frontiers by Uresh Vahalia
+ * Pub: Prentice Hall ISBN 0-13-101908-2
+ * or with a little more detail in;
+ * The Slab Allocator: An Object-Caching Kernel Memory Allocator
+ * Jeff Bonwick (Sun Microsystems).
+ * Presented at: USENIX Summer 1994 Technical Conference
+ *
+ * The memory is organized in caches, one cache for each object type.
+ * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
+ * Each cache consists out of many slabs (they are small (usually one
+ * page long) and always contiguous), and each slab contains multiple
+ * initialized objects.
+ *
+ * This means, that your constructor is used only for newly allocated
+ * slabs and you must pass objects with the same intializations to
+ * kmem_cache_free.
+ *
+ * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
+ * normal). If you need a special memory type, then must create a new
+ * cache for that memory type.
+ *
+ * In order to reduce fragmentation, the slabs are sorted in 3 groups:
+ * full slabs with 0 free objects
+ * partial slabs
+ * empty slabs with no allocated objects
+ *
+ * If partial slabs exist, then new allocations come from these slabs,
+ * otherwise from empty slabs or new slabs are allocated.
+ *
+ * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
+ * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
+ *
+ * Each cache has a short per-cpu head array, most allocs
+ * and frees go into that array, and if that array overflows, then 1/2
+ * of the entries in the array are given back into the global cache.
+ * The head array is strictly LIFO and should improve the cache hit rates.
+ * On SMP, it additionally reduces the spinlock operations.
+ *
+ * The c_cpuarray may not be read with enabled local interrupts -
+ * it's changed with a smp_call_function().
+ *
+ * SMP synchronization:
+ * constructors and destructors are called without any locking.
+ * Several members in kmem_cache_t and struct slab never change, they
+ * are accessed without any locking.
+ * The per-cpu arrays are never accessed from the wrong cpu, no locking,
+ * and local interrupts are disabled so slab code is preempt-safe.
+ * The non-constant members are protected with a per-cache irq spinlock.
+ *
+ * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
+ * in 2000 - many ideas in the current implementation are derived from
+ * his patch.
+ *
+ * Further notes from the original documentation:
+ *
+ * 11 April '97. Started multi-threading - markhe
+ * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
+ * The sem is only needed when accessing/extending the cache-chain, which
+ * can never happen inside an interrupt (kmem_cache_create(),
+ * kmem_cache_shrink() and kmem_cache_reap()).
+ *
+ * At present, each engine can be growing a cache. This should be blocked.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/cpu.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+
+/*
+ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
+ * SLAB_RED_ZONE & SLAB_POISON.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * STATS - 1 to collect stats for /proc/slabinfo.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+ */
+
+#ifdef CONFIG_DEBUG_SLAB
+#define DEBUG 1
+#define STATS 1
+#define FORCED_DEBUG 1
+#else
+#define DEBUG 0
+#define STATS 0
+#define FORCED_DEBUG 0
+#endif
+
+
+/* Shouldn't this be in a header file somewhere? */
+#define BYTES_PER_WORD sizeof(void *)
+
+#ifndef cache_line_size
+#define cache_line_size() L1_CACHE_BYTES
+#endif
+
+#ifndef ARCH_KMALLOC_MINALIGN
+/*
+ * Enforce a minimum alignment for the kmalloc caches.
+ * Usually, the kmalloc caches are cache_line_size() aligned, except when
+ * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
+ * Some archs want to perform DMA into kmalloc caches and need a guaranteed
+ * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
+ * Note that this flag disables some debug features.
+ */
+#define ARCH_KMALLOC_MINALIGN 0
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+/*
+ * Enforce a minimum alignment for all caches.
+ * Intended for archs that get misalignment faults even for BYTES_PER_WORD
+ * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
+ * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
+ * some debug features.
+ */
+#define ARCH_SLAB_MINALIGN 0
+#endif
+
+#ifndef ARCH_KMALLOC_FLAGS
+#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
+#endif
+
+/* Legal flag mask for kmem_cache_create(). */
+#if DEBUG
+# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
+ SLAB_POISON | SLAB_HWCACHE_ALIGN | \
+ SLAB_NO_REAP | SLAB_CACHE_DMA | \
+ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+ SLAB_DESTROY_BY_RCU)
+#else
+# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+ SLAB_DESTROY_BY_RCU)
+#endif
+
+/*
+ * kmem_bufctl_t:
+ *
+ * Bufctl's are used for linking objs within a slab
+ * linked offsets.
+ *
+ * This implementation relies on "struct page" for locating the cache &
+ * slab an object belongs to.
+ * This allows the bufctl structure to be small (one int), but limits
+ * the number of objects a slab (not a cache) can contain when off-slab
+ * bufctls are used. The limit is the size of the largest general cache
+ * that does not use off-slab slabs.
+ * For 32bit archs with 4 kB pages, is this 56.
+ * This is not serious, as it is only for large objects, when it is unwise
+ * to have too many per slab.
+ * Note: This limit can be raised by introducing a general cache whose size
+ * is less than 512 (PAGE_SIZE<<3), but greater than 256.
+ */
+
+#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
+#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
+#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2)
+
+/* Max number of objs-per-slab for caches which use off-slab slabs.
+ * Needed to avoid a possible looping condition in cache_grow().
+ */
+static unsigned long offslab_limit;
+
+/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from an general cache.
+ * Slabs are chained into three list: fully used, partial, fully free slabs.
+ */
+struct slab {
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+};
+
+/*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU. This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking. We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ *
+ * We assume struct slab_rcu can overlay struct slab when destroying.
+ */
+struct slab_rcu {
+ struct rcu_head head;
+ kmem_cache_t *cachep;
+ void *addr;
+};
+
+/*
+ * struct array_cache
+ *
+ * Per cpu structures
+ * Purpose:
+ * - LIFO ordering, to hand out cache-warm objects from _alloc
+ * - reduce the number of linked list operations
+ * - reduce spinlock operations
+ *
+ * The limit is stored in the per-cpu structure to reduce the data cache
+ * footprint.
+ *
+ */
+struct array_cache {
+ unsigned int avail;
+ unsigned int limit;
+ unsigned int batchcount;
+ unsigned int touched;
+};
+
+/* bootstrap: The caches do not work without cpuarrays anymore,
+ * but the cpuarrays are allocated from the generic caches...
+ */
+#define BOOT_CPUCACHE_ENTRIES 1
+struct arraycache_init {
+ struct array_cache cache;
+ void * entries[BOOT_CPUCACHE_ENTRIES];
+};
+
+/*
+ * The slab lists of all objects.
+ * Hopefully reduce the internal fragmentation
+ * NUMA: The spinlock could be moved from the kmem_cache_t
+ * into this structure, too. Figure out what causes
+ * fewer cross-node spinlock operations.
+ */
+struct kmem_list3 {
+ struct list_head slabs_partial; /* partial list first, better asm code */
+ struct list_head slabs_full;
+ struct list_head slabs_free;
+ unsigned long free_objects;
+ int free_touched;
+ unsigned long next_reap;
+ struct array_cache *shared;
+};
+
+#define LIST3_INIT(parent) \
+ { \
+ .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \
+ .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \
+ .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \
+ }
+#define list3_data(cachep) \
+ (&(cachep)->lists)
+
+/* NUMA: per-node */
+#define list3_data_ptr(cachep, ptr) \
+ list3_data(cachep)
+
+/*
+ * kmem_cache_t
+ *
+ * manages a cache.
+ */
+
+struct kmem_cache_s {
+/* 1) per-cpu data, touched during every alloc/free */
+ struct array_cache *array[NR_CPUS];
+ unsigned int batchcount;
+ unsigned int limit;
+/* 2) touched by every alloc & free from the backend */
+ struct kmem_list3 lists;
+ /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */
+ unsigned int objsize;
+ unsigned int flags; /* constant flags */
+ unsigned int num; /* # of objs per slab */
+ unsigned int free_limit; /* upper limit of objects in the lists */
+ spinlock_t spinlock;
+
+/* 3) cache_grow/shrink */
+ /* order of pgs per slab (2^n) */
+ unsigned int gfporder;
+
+ /* force GFP flags, e.g. GFP_DMA */
+ unsigned int gfpflags;
+
+ size_t colour; /* cache colouring range */
+ unsigned int colour_off; /* colour offset */
+ unsigned int colour_next; /* cache colouring */
+ kmem_cache_t *slabp_cache;
+ unsigned int slab_size;
+ unsigned int dflags; /* dynamic flags */
+
+ /* constructor func */
+ void (*ctor)(void *, kmem_cache_t *, unsigned long);
+
+ /* de-constructor func */
+ void (*dtor)(void *, kmem_cache_t *, unsigned long);
+
+/* 4) cache creation/removal */
+ const char *name;
+ struct list_head next;
+
+/* 5) statistics */
+#if STATS
+ unsigned long num_active;
+ unsigned long num_allocations;
+ unsigned long high_mark;
+ unsigned long grown;
+ unsigned long reaped;
+ unsigned long errors;
+ unsigned long max_freeable;
+ unsigned long node_allocs;
+ atomic_t allochit;
+ atomic_t allocmiss;
+ atomic_t freehit;
+ atomic_t freemiss;
+#endif
+#if DEBUG
+ int dbghead;
+ int reallen;
+#endif
+};
+
+#define CFLGS_OFF_SLAB (0x80000000UL)
+#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
+
+#define BATCHREFILL_LIMIT 16
+/* Optimization question: fewer reaps means less
+ * probability for unnessary cpucache drain/refill cycles.
+ *
+ * OTHO the cpuarrays can contain lots of objects,
+ * which could lock up otherwise freeable slabs.
+ */
+#define REAPTIMEOUT_CPUC (2*HZ)
+#define REAPTIMEOUT_LIST3 (4*HZ)
+
+#if STATS
+#define STATS_INC_ACTIVE(x) ((x)->num_active++)
+#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
+#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
+#define STATS_INC_GROWN(x) ((x)->grown++)
+#define STATS_INC_REAPED(x) ((x)->reaped++)
+#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \
+ (x)->high_mark = (x)->num_active; \
+ } while (0)
+#define STATS_INC_ERR(x) ((x)->errors++)
+#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
+#define STATS_SET_FREEABLE(x, i) \
+ do { if ((x)->max_freeable < i) \
+ (x)->max_freeable = i; \
+ } while (0)
+
+#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
+#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
+#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
+#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
+#else
+#define STATS_INC_ACTIVE(x) do { } while (0)
+#define STATS_DEC_ACTIVE(x) do { } while (0)
+#define STATS_INC_ALLOCED(x) do { } while (0)
+#define STATS_INC_GROWN(x) do { } while (0)
+#define STATS_INC_REAPED(x) do { } while (0)
+#define STATS_SET_HIGH(x) do { } while (0)
+#define STATS_INC_ERR(x) do { } while (0)
+#define STATS_INC_NODEALLOCS(x) do { } while (0)
+#define STATS_SET_FREEABLE(x, i) \
+ do { } while (0)
+
+#define STATS_INC_ALLOCHIT(x) do { } while (0)
+#define STATS_INC_ALLOCMISS(x) do { } while (0)
+#define STATS_INC_FREEHIT(x) do { } while (0)
+#define STATS_INC_FREEMISS(x) do { } while (0)
+#endif
+
+#if DEBUG
+/* Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
+#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */
+
+/* ...and for poisoning */
+#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
+#define POISON_FREE 0x6b /* for use-after-free poisoning */
+#define POISON_END 0xa5 /* end-byte of poisoning */
+
+/* memory layout of objects:
+ * 0 : objp
+ * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
+ * the end of an object is aligned with the end of the real
+ * allocation. Catches writes behind the end of the allocation.
+ * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
+ * redzone word.
+ * cachep->dbghead: The real object.
+ * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
+ * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
+ */
+static int obj_dbghead(kmem_cache_t *cachep)
+{
+ return cachep->dbghead;
+}
+
+static int obj_reallen(kmem_cache_t *cachep)
+{
+ return cachep->reallen;
+}
+
+static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
+{
+ BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+ return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
+}
+
+static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
+{
+ BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
+ if (cachep->flags & SLAB_STORE_USER)
+ return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
+ return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
+}
+
+static void **dbg_userword(kmem_cache_t *cachep, void *objp)
+{
+ BUG_ON(!(cachep->flags & SLAB_STORE_USER));
+ return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
+}
+
+#else
+
+#define obj_dbghead(x) 0
+#define obj_reallen(cachep) (cachep->objsize)
+#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;})
+#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;})
+#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
+
+#endif
+
+/*
+ * Maximum size of an obj (in 2^order pages)
+ * and absolute limit for the gfp order.
+ */
+#if defined(CONFIG_LARGE_ALLOCS)
+#define MAX_OBJ_ORDER 13 /* up to 32Mb */
+#define MAX_GFP_ORDER 13 /* up to 32Mb */
+#elif defined(CONFIG_MMU)
+#define MAX_OBJ_ORDER 5 /* 32 pages */
+#define MAX_GFP_ORDER 5 /* 32 pages */
+#else
+#define MAX_OBJ_ORDER 8 /* up to 1Mb */
+#define MAX_GFP_ORDER 8 /* up to 1Mb */
+#endif
+
+/*
+ * Do not go above this order unless 0 objects fit into the slab.
+ */
+#define BREAK_GFP_ORDER_HI 1
+#define BREAK_GFP_ORDER_LO 0
+static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+
+/* Macros for storing/retrieving the cachep and or slab from the
+ * global 'mem_map'. These are used to find the slab an obj belongs to.
+ * With kfree(), these are used to find the cache which an obj belongs to.
+ */
+#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x))
+#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next)
+#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x))
+#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev)
+
+/* These are the default caches for kmalloc. Custom caches can have other sizes. */
+struct cache_sizes malloc_sizes[] = {
+#define CACHE(x) { .cs_size = (x) },
+#include <linux/kmalloc_sizes.h>
+ CACHE(ULONG_MAX)
+#undef CACHE
+};
+EXPORT_SYMBOL(malloc_sizes);
+
+/* Must match cache_sizes above. Out of line to keep cache footprint low. */
+struct cache_names {
+ char *name;
+ char *name_dma;
+};
+
+static struct cache_names __initdata cache_names[] = {
+#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
+#include <linux/kmalloc_sizes.h>
+ { NULL, }
+#undef CACHE
+};
+
+static struct arraycache_init initarray_cache __initdata =
+ { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+static struct arraycache_init initarray_generic =
+ { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+
+/* internal cache of cache description objs */
+static kmem_cache_t cache_cache = {
+ .lists = LIST3_INIT(cache_cache.lists),
+ .batchcount = 1,
+ .limit = BOOT_CPUCACHE_ENTRIES,
+ .objsize = sizeof(kmem_cache_t),
+ .flags = SLAB_NO_REAP,
+ .spinlock = SPIN_LOCK_UNLOCKED,
+ .name = "kmem_cache",
+#if DEBUG
+ .reallen = sizeof(kmem_cache_t),
+#endif
+};
+
+/* Guard access to the cache-chain. */
+static struct semaphore cache_chain_sem;
+static struct list_head cache_chain;
+
+/*
+ * vm_enough_memory() looks at this to determine how many
+ * slab-allocated pages are possibly freeable under pressure
+ *
+ * SLAB_RECLAIM_ACCOUNT turns this on per-slab
+ */
+atomic_t slab_reclaim_pages;
+EXPORT_SYMBOL(slab_reclaim_pages);
+
+/*
+ * chicken and egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ */
+static enum {
+ NONE,
+ PARTIAL,
+ FULL
+} g_cpucache_up;
+
+static DEFINE_PER_CPU(struct work_struct, reap_work);
+
+static void free_block(kmem_cache_t* cachep, void** objpp, int len);
+static void enable_cpucache (kmem_cache_t *cachep);
+static void cache_reap (void *unused);
+
+static inline void **ac_entry(struct array_cache *ac)
+{
+ return (void**)(ac+1);
+}
+
+static inline struct array_cache *ac_data(kmem_cache_t *cachep)
+{
+ return cachep->array[smp_processor_id()];
+}
+
+static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags)
+{
+ struct cache_sizes *csizep = malloc_sizes;
+
+#if DEBUG
+ /* This happens if someone tries to call
+ * kmem_cache_create(), or __kmalloc(), before
+ * the generic caches are initialized.
+ */
+ BUG_ON(csizep->cs_cachep == NULL);
+#endif
+ while (size > csizep->cs_size)
+ csizep++;
+
+ /*
+ * Really subtile: The last entry with cs->cs_size==ULONG_MAX
+ * has cs_{dma,}cachep==NULL. Thus no special case
+ * for large kmalloc calls required.
+ */
+ if (unlikely(gfpflags & GFP_DMA))
+ return csizep->cs_dmacachep;
+ return csizep->cs_cachep;
+}
+
+/* Cal the num objs, wastage, and bytes left over for a given slab size. */
+static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
+ int flags, size_t *left_over, unsigned int *num)
+{
+ int i;
+ size_t wastage = PAGE_SIZE<<gfporder;
+ size_t extra = 0;
+ size_t base = 0;
+
+ if (!(flags & CFLGS_OFF_SLAB)) {
+ base = sizeof(struct slab);
+ extra = sizeof(kmem_bufctl_t);
+ }
+ i = 0;
+ while (i*size + ALIGN(base+i*extra, align) <= wastage)
+ i++;
+ if (i > 0)
+ i--;
+
+ if (i > SLAB_LIMIT)
+ i = SLAB_LIMIT;
+
+ *num = i;
+ wastage -= i*size;
+ wastage -= ALIGN(base+i*extra, align);
+ *left_over = wastage;
+}
+
+#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
+
+static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
+{
+ printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
+ function, cachep->name, msg);
+ dump_stack();
+}
+
+/*
+ * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
+ * via the workqueue/eventd.
+ * Add the CPU number into the expiration time to minimize the possibility of
+ * the CPUs getting into lockstep and contending for the global cache chain
+ * lock.
+ */
+static void __devinit start_cpu_timer(int cpu)
+{
+ struct work_struct *reap_work = &per_cpu(reap_work, cpu);
+
+ /*
+ * When this gets called from do_initcalls via cpucache_init(),
+ * init_workqueues() has already run, so keventd will be setup
+ * at that time.
+ */
+ if (keventd_up() && reap_work->func == NULL) {
+ INIT_WORK(reap_work, cache_reap, NULL);
+ schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
+ }
+}
+
+static struct array_cache *alloc_arraycache(int cpu, int entries,
+ int batchcount)
+{
+ int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
+ struct array_cache *nc = NULL;
+
+ if (cpu != -1) {
+ kmem_cache_t *cachep;
+ cachep = kmem_find_general_cachep(memsize, GFP_KERNEL);
+ if (cachep)
+ nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu));
+ }
+ if (!nc)
+ nc = kmalloc(memsize, GFP_KERNEL);
+ if (nc) {
+ nc->avail = 0;
+ nc->limit = entries;
+ nc->batchcount = batchcount;
+ nc->touched = 0;
+ }
+ return nc;
+}
+
+static int __devinit cpuup_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ kmem_cache_t* cachep;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ down(&cache_chain_sem);
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct array_cache *nc;
+
+ nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
+ if (!nc)
+ goto bad;
+
+ spin_lock_irq(&cachep->spinlock);
+ cachep->array[cpu] = nc;
+ cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
+ + cachep->num;
+ spin_unlock_irq(&cachep->spinlock);
+
+ }
+ up(&cache_chain_sem);
+ break;
+ case CPU_ONLINE:
+ start_cpu_timer(cpu);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ /* fall thru */
+ case CPU_UP_CANCELED:
+ down(&cache_chain_sem);
+
+ list_for_each_entry(cachep, &cache_chain, next) {
+ struct array_cache *nc;
+
+ spin_lock_irq(&cachep->spinlock);
+ /* cpu is dead; no one can alloc from it. */
+ nc = cachep->array[cpu];
+ cachep->array[cpu] = NULL;
+ cachep->free_limit -= cachep->batchcount;
+ free_block(cachep, ac_entry(nc), nc->avail);
+ spin_unlock_irq(&cachep->spinlock);
+ kfree(nc);
+ }
+ up(&cache_chain_sem);
+ break;
+#endif
+ }
+ return NOTIFY_OK;
+bad:
+ up(&cache_chain_sem);
+ return NOTIFY_BAD;
+}
+
+static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
+
+/* Initialisation.
+ * Called after the gfp() functions have been enabled, and before smp_init().
+ */
+void __init kmem_cache_init(void)
+{
+ size_t left_over;
+ struct cache_sizes *sizes;
+ struct cache_names *names;
+
+ /*
+ * Fragmentation resistance on low memory - only use bigger
+ * page orders on machines with more than 32MB of memory.
+ */
+ if (num_physpages > (32 << 20) >> PAGE_SHIFT)
+ slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+
+
+ /* Bootstrap is tricky, because several objects are allocated
+ * from caches that do not exist yet:
+ * 1) initialize the cache_cache cache: it contains the kmem_cache_t
+ * structures of all caches, except cache_cache itself: cache_cache
+ * is statically allocated.
+ * Initially an __init data area is used for the head array, it's
+ * replaced with a kmalloc allocated array at the end of the bootstrap.
+ * 2) Create the first kmalloc cache.
+ * The kmem_cache_t for the new cache is allocated normally. An __init
+ * data area is used for the head array.
+ * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
+ * 4) Replace the __init data head arrays for cache_cache and the first
+ * kmalloc cache with kmalloc allocated arrays.
+ * 5) Resize the head arrays of the kmalloc caches to their final sizes.
+ */
+
+ /* 1) create the cache_cache */
+ init_MUTEX(&cache_chain_sem);
+ INIT_LIST_HEAD(&cache_chain);
+ list_add(&cache_cache.next, &cache_chain);
+ cache_cache.colour_off = cache_line_size();
+ cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+
+ cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
+
+ cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
+ &left_over, &cache_cache.num);
+ if (!cache_cache.num)
+ BUG();
+
+ cache_cache.colour = left_over/cache_cache.colour_off;
+ cache_cache.colour_next = 0;
+ cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
+ sizeof(struct slab), cache_line_size());
+
+ /* 2+3) create the kmalloc caches */
+ sizes = malloc_sizes;
+ names = cache_names;
+
+ while (sizes->cs_size != ULONG_MAX) {
+ /* For performance, all the general caches are L1 aligned.
+ * This should be particularly beneficial on SMP boxes, as it
+ * eliminates "false sharing".
+ * Note for systems short on memory removing the alignment will
+ * allow tighter packing of the smaller caches. */
+ sizes->cs_cachep = kmem_cache_create(names->name,
+ sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
+ /* Inc off-slab bufctl limit until the ceiling is hit. */
+ if (!(OFF_SLAB(sizes->cs_cachep))) {
+ offslab_limit = sizes->cs_size-sizeof(struct slab);
+ offslab_limit /= sizeof(kmem_bufctl_t);
+ }
+
+ sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
+ sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
+ NULL, NULL);
+
+ sizes++;
+ names++;
+ }
+ /* 4) Replace the bootstrap head arrays */
+ {
+ void * ptr;
+
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ local_irq_disable();
+ BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
+ memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
+ cache_cache.array[smp_processor_id()] = ptr;
+ local_irq_enable();
+
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ local_irq_disable();
+ BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
+ memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
+ sizeof(struct arraycache_init));
+ malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+ local_irq_enable();
+ }
+
+ /* 5) resize the head arrays to their final sizes */
+ {
+ kmem_cache_t *cachep;
+ down(&cache_chain_sem);
+ list_for_each_entry(cachep, &cache_chain, next)
+ enable_cpucache(cachep);
+ up(&cache_chain_sem);
+ }
+
+ /* Done! */
+ g_cpucache_up = FULL;
+
+ /* Register a cpu startup notifier callback
+ * that initializes ac_data for all new cpus
+ */
+ register_cpu_notifier(&cpucache_notifier);
+
+
+ /* The reap timers are started later, with a module init call:
+ * That part of the kernel is not yet operational.
+ */
+}
+
+static int __init cpucache_init(void)
+{
+ int cpu;
+
+ /*
+ * Register the timers that return unneeded
+ * pages to gfp.
+ */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (cpu_online(cpu))
+ start_cpu_timer(cpu);
+ }
+
+ return 0;
+}
+
+__initcall(cpucache_init);
+
+/*
+ * Interface to system's page allocator. No need to hold the cache-lock.
+ *
+ * If we requested dmaable memory, we will get it. Even if we
+ * did not request dmaable memory, we might get it, but that
+ * would be relatively rare and ignorable.
+ */
+static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+{
+ struct page *page;
+ void *addr;
+ int i;
+
+ flags |= cachep->gfpflags;
+ if (likely(nodeid == -1)) {
+ page = alloc_pages(flags, cachep->gfporder);
+ } else {
+ page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+ }
+ if (!page)
+ return NULL;
+ addr = page_address(page);
+
+ i = (1 << cachep->gfporder);
+ if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+ atomic_add(i, &slab_reclaim_pages);
+ add_page_state(nr_slab, i);
+ while (i--) {
+ SetPageSlab(page);
+ page++;
+ }
+ return addr;
+}
+
+/*
+ * Interface to system's page release.
+ */
+static void kmem_freepages(kmem_cache_t *cachep, void *addr)
+{
+ unsigned long i = (1<<cachep->gfporder);
+ struct page *page = virt_to_page(addr);
+ const unsigned long nr_freed = i;
+
+ while (i--) {
+ if (!TestClearPageSlab(page))
+ BUG();
+ page++;
+ }
+ sub_page_state(nr_slab, nr_freed);
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += nr_freed;
+ free_pages((unsigned long)addr, cachep->gfporder);
+ if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+ atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
+}
+
+static void kmem_rcu_free(struct rcu_head *head)
+{
+ struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
+ kmem_cache_t *cachep = slab_rcu->cachep;
+
+ kmem_freepages(cachep, slab_rcu->addr);
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->slabp_cache, slab_rcu);
+}
+
+#if DEBUG
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
+ unsigned long caller)
+{
+ int size = obj_reallen(cachep);
+
+ addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
+
+ if (size < 5*sizeof(unsigned long))
+ return;
+
+ *addr++=0x12345678;
+ *addr++=caller;
+ *addr++=smp_processor_id();
+ size -= 3*sizeof(unsigned long);
+ {
+ unsigned long *sptr = &caller;
+ unsigned long svalue;
+
+ while (!kstack_end(sptr)) {
+ svalue = *sptr++;
+ if (kernel_text_address(svalue)) {
+ *addr++=svalue;
+ size -= sizeof(unsigned long);
+ if (size <= sizeof(unsigned long))
+ break;
+ }
+ }
+
+ }
+ *addr++=0x87654321;
+}
+#endif
+
+static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
+{
+ int size = obj_reallen(cachep);
+ addr = &((char*)addr)[obj_dbghead(cachep)];
+
+ memset(addr, val, size);
+ *(unsigned char *)(addr+size-1) = POISON_END;
+}
+
+static void dump_line(char *data, int offset, int limit)
+{
+ int i;
+ printk(KERN_ERR "%03x:", offset);
+ for (i=0;i<limit;i++) {
+ printk(" %02x", (unsigned char)data[offset+i]);
+ }
+ printk("\n");
+}
+#endif
+
+#if DEBUG
+
+static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
+{
+ int i, size;
+ char *realobj;
+
+ if (cachep->flags & SLAB_RED_ZONE) {
+ printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
+ }
+
+ if (cachep->flags & SLAB_STORE_USER) {
+ printk(KERN_ERR "Last user: [<%p>]",
+ *dbg_userword(cachep, objp));
+ print_symbol("(%s)",
+ (unsigned long)*dbg_userword(cachep, objp));
+ printk("\n");
+ }
+ realobj = (char*)objp+obj_dbghead(cachep);
+ size = obj_reallen(cachep);
+ for (i=0; i<size && lines;i+=16, lines--) {
+ int limit;
+ limit = 16;
+ if (i+limit > size)
+ limit = size-i;
+ dump_line(realobj, i, limit);
+ }
+}
+
+static void check_poison_obj(kmem_cache_t *cachep, void *objp)
+{
+ char *realobj;
+ int size, i;
+ int lines = 0;
+
+ realobj = (char*)objp+obj_dbghead(cachep);
+ size = obj_reallen(cachep);
+
+ for (i=0;i<size;i++) {
+ char exp = POISON_FREE;
+ if (i == size-1)
+ exp = POISON_END;
+ if (realobj[i] != exp) {
+ int limit;
+ /* Mismatch ! */
+ /* Print header */
+ if (lines == 0) {
+ printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
+ realobj, size);
+ print_objinfo(cachep, objp, 0);
+ }
+ /* Hexdump the affected line */
+ i = (i/16)*16;
+ limit = 16;
+ if (i+limit > size)
+ limit = size-i;
+ dump_line(realobj, i, limit);
+ i += 16;
+ lines++;
+ /* Limit to 5 lines */
+ if (lines > 5)
+ break;
+ }
+ }
+ if (lines != 0) {
+ /* Print some data about the neighboring objects, if they
+ * exist:
+ */
+ struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ int objnr;
+
+ objnr = (objp-slabp->s_mem)/cachep->objsize;
+ if (objnr) {
+ objp = slabp->s_mem+(objnr-1)*cachep->objsize;
+ realobj = (char*)objp+obj_dbghead(cachep);
+ printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
+ realobj, size);
+ print_objinfo(cachep, objp, 2);
+ }
+ if (objnr+1 < cachep->num) {
+ objp = slabp->s_mem+(objnr+1)*cachep->objsize;
+ realobj = (char*)objp+obj_dbghead(cachep);
+ printk(KERN_ERR "Next obj: start=%p, len=%d\n",
+ realobj, size);
+ print_objinfo(cachep, objp, 2);
+ }
+ }
+}
+#endif
+
+/* Destroy all the objs in a slab, and release the mem back to the system.
+ * Before calling the slab must have been unlinked from the cache.
+ * The cache-lock is not held/needed.
+ */
+static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
+{
+ void *addr = slabp->s_mem - slabp->colouroff;
+
+#if DEBUG
+ int i;
+ for (i = 0; i < cachep->num; i++) {
+ void *objp = slabp->s_mem + cachep->objsize * i;
+
+ if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
+ kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
+ else
+ check_poison_obj(cachep, objp);
+#else
+ check_poison_obj(cachep, objp);
+#endif
+ }
+ if (cachep->flags & SLAB_RED_ZONE) {
+ if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "start of a freed object "
+ "was overwritten");
+ if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
+ slab_error(cachep, "end of a freed object "
+ "was overwritten");
+ }
+ if (cachep->dtor && !(cachep->flags & SLAB_POISON))
+ (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
+ }
+#else
+ if (cachep->dtor) {
+ int i;
+ for (i = 0; i < cachep->num; i++) {
+ void* objp = slabp->s_mem+cachep->objsize*i;
+ (cachep->dtor)(objp, cachep, 0);
+ }
+ }
+#endif
+
+ if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
+ struct slab_rcu *slab_rcu;
+
+ slab_rcu