diff options
Diffstat (limited to 'mm/slab.c')
-rw-r--r-- | mm/slab.c | 3060 |
1 files changed, 3060 insertions, 0 deletions
diff --git a/mm/slab.c b/mm/slab.c new file mode 100644 index 00000000000..ec660d85ddd --- /dev/null +++ b/mm/slab.c @@ -0,0 +1,3060 @@ +/* + * linux/mm/slab.c + * Written by Mark Hemment, 1996/97. + * (markhe@nextd.demon.co.uk) + * + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli + * + * Major cleanup, different bufctl logic, per-cpu arrays + * (c) 2000 Manfred Spraul + * + * Cleanup, make the head arrays unconditional, preparation for NUMA + * (c) 2002 Manfred Spraul + * + * An implementation of the Slab Allocator as described in outline in; + * UNIX Internals: The New Frontiers by Uresh Vahalia + * Pub: Prentice Hall ISBN 0-13-101908-2 + * or with a little more detail in; + * The Slab Allocator: An Object-Caching Kernel Memory Allocator + * Jeff Bonwick (Sun Microsystems). + * Presented at: USENIX Summer 1994 Technical Conference + * + * The memory is organized in caches, one cache for each object type. + * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) + * Each cache consists out of many slabs (they are small (usually one + * page long) and always contiguous), and each slab contains multiple + * initialized objects. + * + * This means, that your constructor is used only for newly allocated + * slabs and you must pass objects with the same intializations to + * kmem_cache_free. + * + * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, + * normal). If you need a special memory type, then must create a new + * cache for that memory type. + * + * In order to reduce fragmentation, the slabs are sorted in 3 groups: + * full slabs with 0 free objects + * partial slabs + * empty slabs with no allocated objects + * + * If partial slabs exist, then new allocations come from these slabs, + * otherwise from empty slabs or new slabs are allocated. + * + * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache + * during kmem_cache_destroy(). The caller must prevent concurrent allocs. + * + * Each cache has a short per-cpu head array, most allocs + * and frees go into that array, and if that array overflows, then 1/2 + * of the entries in the array are given back into the global cache. + * The head array is strictly LIFO and should improve the cache hit rates. + * On SMP, it additionally reduces the spinlock operations. + * + * The c_cpuarray may not be read with enabled local interrupts - + * it's changed with a smp_call_function(). + * + * SMP synchronization: + * constructors and destructors are called without any locking. + * Several members in kmem_cache_t and struct slab never change, they + * are accessed without any locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * and local interrupts are disabled so slab code is preempt-safe. + * The non-constant members are protected with a per-cache irq spinlock. + * + * Many thanks to Mark Hemment, who wrote another per-cpu slab patch + * in 2000 - many ideas in the current implementation are derived from + * his patch. + * + * Further notes from the original documentation: + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the semaphore 'cache_chain_sem'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * + * At present, each engine can be growing a cache. This should be blocked. + * + */ + +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/cache.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/seq_file.h> +#include <linux/notifier.h> +#include <linux/kallsyms.h> +#include <linux/cpu.h> +#include <linux/sysctl.h> +#include <linux/module.h> +#include <linux/rcupdate.h> + +#include <asm/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/page.h> + +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, + * SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define DEBUG 1 +#define STATS 1 +#define FORCED_DEBUG 1 +#else +#define DEBUG 0 +#define STATS 0 +#define FORCED_DEBUG 0 +#endif + + +/* Shouldn't this be in a header file somewhere? */ +#define BYTES_PER_WORD sizeof(void *) + +#ifndef cache_line_size +#define cache_line_size() L1_CACHE_BYTES +#endif + +#ifndef ARCH_KMALLOC_MINALIGN +/* + * Enforce a minimum alignment for the kmalloc caches. + * Usually, the kmalloc caches are cache_line_size() aligned, except when + * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. + * Some archs want to perform DMA into kmalloc caches and need a guaranteed + * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. + * Note that this flag disables some debug features. + */ +#define ARCH_KMALLOC_MINALIGN 0 +#endif + +#ifndef ARCH_SLAB_MINALIGN +/* + * Enforce a minimum alignment for all caches. + * Intended for archs that get misalignment faults even for BYTES_PER_WORD + * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. + * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables + * some debug features. + */ +#define ARCH_SLAB_MINALIGN 0 +#endif + +#ifndef ARCH_KMALLOC_FLAGS +#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN +#endif + +/* Legal flag mask for kmem_cache_create(). */ +#if DEBUG +# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_HWCACHE_ALIGN | \ + SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU) +#else +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ + SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU) +#endif + +/* + * kmem_bufctl_t: + * + * Bufctl's are used for linking objs within a slab + * linked offsets. + * + * This implementation relies on "struct page" for locating the cache & + * slab an object belongs to. + * This allows the bufctl structure to be small (one int), but limits + * the number of objects a slab (not a cache) can contain when off-slab + * bufctls are used. The limit is the size of the largest general cache + * that does not use off-slab slabs. + * For 32bit archs with 4 kB pages, is this 56. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. + */ + +#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) +#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) + +/* Max number of objs-per-slab for caches which use off-slab slabs. + * Needed to avoid a possible looping condition in cache_grow(). + */ +static unsigned long offslab_limit; + +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; +}; + +/* + * struct slab_rcu + * + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to + * arrange for kmem_freepages to be called via RCU. This is useful if + * we need to approach a kernel structure obliquely, from its address + * obtained without the usual locking. We can lock the structure to + * stabilize it and check it's still at the given address, only if we + * can be sure that the memory has not been meanwhile reused for some + * other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address. + * + * We assume struct slab_rcu can overlay struct slab when destroying. + */ +struct slab_rcu { + struct rcu_head head; + kmem_cache_t *cachep; + void *addr; +}; + +/* + * struct array_cache + * + * Per cpu structures + * Purpose: + * - LIFO ordering, to hand out cache-warm objects from _alloc + * - reduce the number of linked list operations + * - reduce spinlock operations + * + * The limit is stored in the per-cpu structure to reduce the data cache + * footprint. + * + */ +struct array_cache { + unsigned int avail; + unsigned int limit; + unsigned int batchcount; + unsigned int touched; +}; + +/* bootstrap: The caches do not work without cpuarrays anymore, + * but the cpuarrays are allocated from the generic caches... + */ +#define BOOT_CPUCACHE_ENTRIES 1 +struct arraycache_init { + struct array_cache cache; + void * entries[BOOT_CPUCACHE_ENTRIES]; +}; + +/* + * The slab lists of all objects. + * Hopefully reduce the internal fragmentation + * NUMA: The spinlock could be moved from the kmem_cache_t + * into this structure, too. Figure out what causes + * fewer cross-node spinlock operations. + */ +struct kmem_list3 { + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + int free_touched; + unsigned long next_reap; + struct array_cache *shared; +}; + +#define LIST3_INIT(parent) \ + { \ + .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ + .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ + .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ + } +#define list3_data(cachep) \ + (&(cachep)->lists) + +/* NUMA: per-node */ +#define list3_data_ptr(cachep, ptr) \ + list3_data(cachep) + +/* + * kmem_cache_t + * + * manages a cache. + */ + +struct kmem_cache_s { +/* 1) per-cpu data, touched during every alloc/free */ + struct array_cache *array[NR_CPUS]; + unsigned int batchcount; + unsigned int limit; +/* 2) touched by every alloc & free from the backend */ + struct kmem_list3 lists; + /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */ + unsigned int objsize; + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + unsigned int free_limit; /* upper limit of objects in the lists */ + spinlock_t spinlock; + +/* 3) cache_grow/shrink */ + /* order of pgs per slab (2^n) */ + unsigned int gfporder; + + /* force GFP flags, e.g. GFP_DMA */ + unsigned int gfpflags; + + size_t colour; /* cache colouring range */ + unsigned int colour_off; /* colour offset */ + unsigned int colour_next; /* cache colouring */ + kmem_cache_t *slabp_cache; + unsigned int slab_size; + unsigned int dflags; /* dynamic flags */ + + /* constructor func */ + void (*ctor)(void *, kmem_cache_t *, unsigned long); + + /* de-constructor func */ + void (*dtor)(void *, kmem_cache_t *, unsigned long); + +/* 4) cache creation/removal */ + const char *name; + struct list_head next; + +/* 5) statistics */ +#if STATS + unsigned long num_active; + unsigned long num_allocations; + unsigned long high_mark; + unsigned long grown; + unsigned long reaped; + unsigned long errors; + unsigned long max_freeable; + unsigned long node_allocs; + atomic_t allochit; + atomic_t allocmiss; + atomic_t freehit; + atomic_t freemiss; +#endif +#if DEBUG + int dbghead; + int reallen; +#endif +}; + +#define CFLGS_OFF_SLAB (0x80000000UL) +#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) + +#define BATCHREFILL_LIMIT 16 +/* Optimization question: fewer reaps means less + * probability for unnessary cpucache drain/refill cycles. + * + * OTHO the cpuarrays can contain lots of objects, + * which could lock up otherwise freeable slabs. + */ +#define REAPTIMEOUT_CPUC (2*HZ) +#define REAPTIMEOUT_LIST3 (4*HZ) + +#if STATS +#define STATS_INC_ACTIVE(x) ((x)->num_active++) +#define STATS_DEC_ACTIVE(x) ((x)->num_active--) +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_INC_REAPED(x) ((x)->reaped++) +#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) +#define STATS_INC_ERR(x) ((x)->errors++) +#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) +#define STATS_SET_FREEABLE(x, i) \ + do { if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) + +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) +#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) +#else +#define STATS_INC_ACTIVE(x) do { } while (0) +#define STATS_DEC_ACTIVE(x) do { } while (0) +#define STATS_INC_ALLOCED(x) do { } while (0) +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_INC_REAPED(x) do { } while (0) +#define STATS_SET_HIGH(x) do { } while (0) +#define STATS_INC_ERR(x) do { } while (0) +#define STATS_INC_NODEALLOCS(x) do { } while (0) +#define STATS_SET_FREEABLE(x, i) \ + do { } while (0) + +#define STATS_INC_ALLOCHIT(x) do { } while (0) +#define STATS_INC_ALLOCMISS(x) do { } while (0) +#define STATS_INC_FREEHIT(x) do { } while (0) +#define STATS_INC_FREEMISS(x) do { } while (0) +#endif + +#if DEBUG +/* Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. + */ +#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ +#define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ + +/* ...and for poisoning */ +#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ +#define POISON_FREE 0x6b /* for use-after-free poisoning */ +#define POISON_END 0xa5 /* end-byte of poisoning */ + +/* memory layout of objects: + * 0 : objp + * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that + * the end of an object is aligned with the end of the real + * allocation. Catches writes behind the end of the allocation. + * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: + * redzone word. + * cachep->dbghead: The real object. + * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + */ +static int obj_dbghead(kmem_cache_t *cachep) +{ + return cachep->dbghead; +} + +static int obj_reallen(kmem_cache_t *cachep) +{ + return cachep->reallen; +} + +static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); +} + +static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + if (cachep->flags & SLAB_STORE_USER) + return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); + return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); +} + +static void **dbg_userword(kmem_cache_t *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_STORE_USER)); + return (void**)(objp+cachep->objsize-BYTES_PER_WORD); +} + +#else + +#define obj_dbghead(x) 0 +#define obj_reallen(cachep) (cachep->objsize) +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) +#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) + +#endif + +/* + * Maximum size of an obj (in 2^order pages) + * and absolute limit for the gfp order. + */ +#if defined(CONFIG_LARGE_ALLOCS) +#define MAX_OBJ_ORDER 13 /* up to 32Mb */ +#define MAX_GFP_ORDER 13 /* up to 32Mb */ +#elif defined(CONFIG_MMU) +#define MAX_OBJ_ORDER 5 /* 32 pages */ +#define MAX_GFP_ORDER 5 /* 32 pages */ +#else +#define MAX_OBJ_ORDER 8 /* up to 1Mb */ +#define MAX_GFP_ORDER 8 /* up to 1Mb */ +#endif + +/* + * Do not go above this order unless 0 objects fit into the slab. + */ +#define BREAK_GFP_ORDER_HI 1 +#define BREAK_GFP_ORDER_LO 0 +static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; + +/* Macros for storing/retrieving the cachep and or slab from the + * global 'mem_map'. These are used to find the slab an obj belongs to. + * With kfree(), these are used to find the cache which an obj belongs to. + */ +#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) +#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) +#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) +#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) + +/* These are the default caches for kmalloc. Custom caches can have other sizes. */ +struct cache_sizes malloc_sizes[] = { +#define CACHE(x) { .cs_size = (x) }, +#include <linux/kmalloc_sizes.h> + CACHE(ULONG_MAX) +#undef CACHE +}; +EXPORT_SYMBOL(malloc_sizes); + +/* Must match cache_sizes above. Out of line to keep cache footprint low. */ +struct cache_names { + char *name; + char *name_dma; +}; + +static struct cache_names __initdata cache_names[] = { +#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, +#include <linux/kmalloc_sizes.h> + { NULL, } +#undef CACHE +}; + +static struct arraycache_init initarray_cache __initdata = + { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; +static struct arraycache_init initarray_generic = + { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + +/* internal cache of cache description objs */ +static kmem_cache_t cache_cache = { + .lists = LIST3_INIT(cache_cache.lists), + .batchcount = 1, + .limit = BOOT_CPUCACHE_ENTRIES, + .objsize = sizeof(kmem_cache_t), + .flags = SLAB_NO_REAP, + .spinlock = SPIN_LOCK_UNLOCKED, + .name = "kmem_cache", +#if DEBUG + .reallen = sizeof(kmem_cache_t), +#endif +}; + +/* Guard access to the cache-chain. */ +static struct semaphore cache_chain_sem; +static struct list_head cache_chain; + +/* + * vm_enough_memory() looks at this to determine how many + * slab-allocated pages are possibly freeable under pressure + * + * SLAB_RECLAIM_ACCOUNT turns this on per-slab + */ +atomic_t slab_reclaim_pages; +EXPORT_SYMBOL(slab_reclaim_pages); + +/* + * chicken and egg problem: delay the per-cpu array allocation + * until the general caches are up. + */ +static enum { + NONE, + PARTIAL, + FULL +} g_cpucache_up; + +static DEFINE_PER_CPU(struct work_struct, reap_work); + +static void free_block(kmem_cache_t* cachep, void** objpp, int len); +static void enable_cpucache (kmem_cache_t *cachep); +static void cache_reap (void *unused); + +static inline void **ac_entry(struct array_cache *ac) +{ + return (void**)(ac+1); +} + +static inline struct array_cache *ac_data(kmem_cache_t *cachep) +{ + return cachep->array[smp_processor_id()]; +} + +static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) +{ + struct cache_sizes *csizep = malloc_sizes; + +#if DEBUG + /* This happens if someone tries to call + * kmem_cache_create(), or __kmalloc(), before + * the generic caches are initialized. + */ + BUG_ON(csizep->cs_cachep == NULL); +#endif + while (size > csizep->cs_size) + csizep++; + + /* + * Really subtile: The last entry with cs->cs_size==ULONG_MAX + * has cs_{dma,}cachep==NULL. Thus no special case + * for large kmalloc calls required. + */ + if (unlikely(gfpflags & GFP_DMA)) + return csizep->cs_dmacachep; + return csizep->cs_cachep; +} + +/* Cal the num objs, wastage, and bytes left over for a given slab size. */ +static void cache_estimate(unsigned long gfporder, size_t size, size_t align, + int flags, size_t *left_over, unsigned int *num) +{ + int i; + size_t wastage = PAGE_SIZE<<gfporder; + size_t extra = 0; + size_t base = 0; + + if (!(flags & CFLGS_OFF_SLAB)) { + base = sizeof(struct slab); + extra = sizeof(kmem_bufctl_t); + } + i = 0; + while (i*size + ALIGN(base+i*extra, align) <= wastage) + i++; + if (i > 0) + i--; + + if (i > SLAB_LIMIT) + i = SLAB_LIMIT; + + *num = i; + wastage -= i*size; + wastage -= ALIGN(base+i*extra, align); + *left_over = wastage; +} + +#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) + +static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) +{ + printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + function, cachep->name, msg); + dump_stack(); +} + +/* + * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz + * via the workqueue/eventd. + * Add the CPU number into the expiration time to minimize the possibility of + * the CPUs getting into lockstep and contending for the global cache chain + * lock. + */ +static void __devinit start_cpu_timer(int cpu) +{ + struct work_struct *reap_work = &per_cpu(reap_work, cpu); + + /* + * When this gets called from do_initcalls via cpucache_init(), + * init_workqueues() has already run, so keventd will be setup + * at that time. + */ + if (keventd_up() && reap_work->func == NULL) { + INIT_WORK(reap_work, cache_reap, NULL); + schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); + } +} + +static struct array_cache *alloc_arraycache(int cpu, int entries, + int batchcount) +{ + int memsize = sizeof(void*)*entries+sizeof(struct array_cache); + struct array_cache *nc = NULL; + + if (cpu != -1) { + kmem_cache_t *cachep; + cachep = kmem_find_general_cachep(memsize, GFP_KERNEL); + if (cachep) + nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu)); + } + if (!nc) + nc = kmalloc(memsize, GFP_KERNEL); + if (nc) { + nc->avail = 0; + nc->limit = entries; + nc->batchcount = batchcount; + nc->touched = 0; + } + return nc; +} + +static int __devinit cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + kmem_cache_t* cachep; + + switch (action) { + case CPU_UP_PREPARE: + down(&cache_chain_sem); + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + + nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount); + if (!nc) + goto bad; + + spin_lock_irq(&cachep->spinlock); + cachep->array[cpu] = nc; + cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + + cachep->num; + spin_unlock_irq(&cachep->spinlock); + + } + up(&cache_chain_sem); + break; + case CPU_ONLINE: + start_cpu_timer(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + /* fall thru */ + case CPU_UP_CANCELED: + down(&cache_chain_sem); + + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + + spin_lock_irq(&cachep->spinlock); + /* cpu is dead; no one can alloc from it. */ + nc = cachep->array[cpu]; + cachep->array[cpu] = NULL; + cachep->free_limit -= cachep->batchcount; + free_block(cachep, ac_entry(nc), nc->avail); + spin_unlock_irq(&cachep->spinlock); + kfree(nc); + } + up(&cache_chain_sem); + break; +#endif + } + return NOTIFY_OK; +bad: + up(&cache_chain_sem); + return NOTIFY_BAD; +} + +static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; + +/* Initialisation. + * Called after the gfp() functions have been enabled, and before smp_init(). + */ +void __init kmem_cache_init(void) +{ + size_t left_over; + struct cache_sizes *sizes; + struct cache_names *names; + + /* + * Fragmentation resistance on low memory - only use bigger + * page orders on machines with more than 32MB of memory. + */ + if (num_physpages > (32 << 20) >> PAGE_SHIFT) + slab_break_gfp_order = BREAK_GFP_ORDER_HI; + + + /* Bootstrap is tricky, because several objects are allocated + * from caches that do not exist yet: + * 1) initialize the cache_cache cache: it contains the kmem_cache_t + * structures of all caches, except cache_cache itself: cache_cache + * is statically allocated. + * Initially an __init data area is used for the head array, it's + * replaced with a kmalloc allocated array at the end of the bootstrap. + * 2) Create the first kmalloc cache. + * The kmem_cache_t for the new cache is allocated normally. An __init + * data area is used for the head array. + * 3) Create the remaining kmalloc caches, with minimally sized head arrays. + * 4) Replace the __init data head arrays for cache_cache and the first + * kmalloc cache with kmalloc allocated arrays. + * 5) Resize the head arrays of the kmalloc caches to their final sizes. + */ + + /* 1) create the cache_cache */ + init_MUTEX(&cache_chain_sem); + INIT_LIST_HEAD(&cache_chain); + list_add(&cache_cache.next, &cache_chain); + cache_cache.colour_off = cache_line_size(); + cache_cache.array[smp_processor_id()] = &initarray_cache.cache; + + cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); + + cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, + &left_over, &cache_cache.num); + if (!cache_cache.num) + BUG(); + + cache_cache.colour = left_over/cache_cache.colour_off; + cache_cache.colour_next = 0; + cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + + sizeof(struct slab), cache_line_size()); + + /* 2+3) create the kmalloc caches */ + sizes = malloc_sizes; + names = cache_names; + + while (sizes->cs_size != ULONG_MAX) { + /* For performance, all the general caches are L1 aligned. + * This should be particularly beneficial on SMP boxes, as it + * eliminates "false sharing". + * Note for systems short on memory removing the alignment will + * allow tighter packing of the smaller caches. */ + sizes->cs_cachep = kmem_cache_create(names->name, + sizes->cs_size, ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); + + /* Inc off-slab bufctl limit until the ceiling is hit. */ + if (!(OFF_SLAB(sizes->cs_cachep))) { + offslab_limit = sizes->cs_size-sizeof(struct slab); + offslab_limit /= sizeof(kmem_bufctl_t); + } + + sizes->cs_dmacachep = kmem_cache_create(names->name_dma, + sizes->cs_size, ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), + NULL, NULL); + + sizes++; + names++; + } + /* 4) Replace the bootstrap head arrays */ + { + void * ptr; + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + local_irq_disable(); + BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); + memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); + cache_cache.array[smp_processor_id()] = ptr; + local_irq_enable(); + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + local_irq_disable(); + BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); + memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), + sizeof(struct arraycache_init)); + malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; + local_irq_enable(); + } + + /* 5) resize the head arrays to their final sizes */ + { + kmem_cache_t *cachep; + down(&cache_chain_sem); + list_for_each_entry(cachep, &cache_chain, next) + enable_cpucache(cachep); + up(&cache_chain_sem); + } + + /* Done! */ + g_cpucache_up = FULL; + + /* Register a cpu startup notifier callback + * that initializes ac_data for all new cpus + */ + register_cpu_notifier(&cpucache_notifier); + + + /* The reap timers are started later, with a module init call: + * That part of the kernel is not yet operational. + */ +} + +static int __init cpucache_init(void) +{ + int cpu; + + /* + * Register the timers that return unneeded + * pages to gfp. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (cpu_online(cpu)) + start_cpu_timer(cpu); + } + + return 0; +} + +__initcall(cpucache_init); + +/* + * Interface to system's page allocator. No need to hold the cache-lock. + * + * If we requested dmaable memory, we will get it. Even if we + * did not request dmaable memory, we might get it, but that + * would be relatively rare and ignorable. + */ +static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid) +{ + struct page *page; + void *addr; + int i; + + flags |= cachep->gfpflags; + if (likely(nodeid == -1)) { + page = alloc_pages(flags, cachep->gfporder); + } else { + page = alloc_pages_node(nodeid, flags, cachep->gfporder); + } + if (!page) + return NULL; + addr = page_address(page); + + i = (1 << cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + atomic_add(i, &slab_reclaim_pages); + add_page_state(nr_slab, i); + while (i--) { + SetPageSlab(page); + page++; + } + return addr; +} + +/* + * Interface to system's page release. + */ +static void kmem_freepages(kmem_cache_t *cachep, void *addr) +{ + unsigned long i = (1<<cachep->gfporder); + struct page *page = virt_to_page(addr); + const unsigned long nr_freed = i; + + while (i--) { + if (!TestClearPageSlab(page)) + BUG(); + page++; + } + sub_page_state(nr_slab, nr_freed); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += nr_freed; + free_pages((unsigned long)addr, cachep->gfporder); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct slab_rcu *slab_rcu = (struct slab_rcu *) head; + kmem_cache_t *cachep = slab_rcu->cachep; + + kmem_freepages(cachep, slab_rcu->addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slab_rcu); +} + +#if DEBUG + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, + unsigned long caller) +{ + int size = obj_reallen(cachep); + + addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; + + if (size < 5*sizeof(unsigned long)) + return; + + *addr++=0x12345678; + *addr++=caller; + *addr++=smp_processor_id(); + size -= 3*sizeof(unsigned long); + { + unsigned long *sptr = &caller; + unsigned long svalue; + + while (!kstack_end(sptr)) { + svalue = *sptr++; + if (kernel_text_address(svalue)) { + *addr++=svalue; + size -= sizeof(unsigned long); + if (size <= sizeof(unsigned long)) + break; + } + } + + } + *addr++=0x87654321; +} +#endif + +static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) +{ + int size = obj_reallen(cachep); + addr = &((char*)addr)[obj_dbghead(cachep)]; + + memset(addr, val, size); + *(unsigned char *)(addr+size-1) = POISON_END; +} + +static void dump_line(char *data, int offset, int limit) +{ + int i; + printk(KERN_ERR "%03x:", offset); + for (i=0;i<limit;i++) { + printk(" %02x", (unsigned char)data[offset+i]); + } + printk("\n"); +} +#endif + +#if DEBUG + +static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) +{ + int i, size; + char *realobj; + + if (cachep->flags & SLAB_RED_ZONE) { + printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + + if (cachep->flags & SLAB_STORE_USER) { + printk(KERN_ERR "Last user: [<%p>]", + *dbg_userword(cachep, objp)); + print_symbol("(%s)", + (unsigned long)*dbg_userword(cachep, objp)); + printk("\n"); + } + realobj = (char*)objp+obj_dbghead(cachep); + size = obj_reallen(cachep); + for (i=0; i<size && lines;i+=16, lines--) { + int limit; + limit = 16; + if (i+limit > size) + limit = size-i; + dump_line(realobj, i, limit); + } +} + +static void check_poison_obj(kmem_cache_t *cachep, void *objp) +{ + char *realobj; + int size, i; + int lines = 0; + + realobj = (char*)objp+obj_dbghead(cachep); + size = obj_reallen(cachep); + + for (i=0;i<size;i++) { + char exp = POISON_FREE; + if (i == size-1) + exp = POISON_END; + if (realobj[i] != exp) { + int limit; + /* Mismatch ! */ + /* Print header */ + if (lines == 0) { + printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 0); + } + /* Hexdump the affected line */ + i = (i/16)*16; + limit = 16; + if (i+limit > size) + limit = size-i; + dump_line(realobj, i, limit); + i += 16; + lines++; + /* Limit to 5 lines */ + if (lines > 5) + break; + } + } + if (lines != 0) { + /* Print some data about the neighboring objects, if they + * exist: + */ + struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); + int objnr; + + objnr = (objp-slabp->s_mem)/cachep->objsize; + if (objnr) { + objp = slabp->s_mem+(objnr-1)*cachep->objsize; + realobj = (char*)objp+obj_dbghead(cachep); + printk(KERN_ERR "Prev obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + if (objnr+1 < cachep->num) { + objp = slabp->s_mem+(objnr+1)*cachep->objsize; + realobj = (char*)objp+obj_dbghead(cachep); + printk(KERN_ERR "Next obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + } +} +#endif + +/* Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. + * The cache-lock is not held/needed. + */ +static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) +{ + void *addr = slabp->s_mem - slabp->colouroff; + +#if DEBUG + int i; + for (i = 0; i < cachep->num; i++) { + void *objp = slabp->s_mem + cachep->objsize * i; + + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); + else + check_poison_obj(cachep, objp); +#else + check_poison_obj(cachep, objp); +#endif + } + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "start of a freed object " + "was overwritten"); + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "end of a freed object " + "was overwritten"); + } + if (cachep->dtor && !(cachep->flags & SLAB_POISON)) + (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); + } +#else + if (cachep->dtor) { + int i; + for (i = 0; i < cachep->num; i++) { + void* objp = slabp->s_mem+cachep->objsize*i; + (cachep->dtor)(objp, cachep, 0); + } + } +#endif + + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { + struct slab_rcu *slab_rcu; + + slab_rcu |