diff options
Diffstat (limited to 'mm/slub.c')
-rw-r--r-- | mm/slub.c | 3144 |
1 files changed, 3144 insertions, 0 deletions
diff --git a/mm/slub.c b/mm/slub.c new file mode 100644 index 00000000000..0cd56bd74b6 --- /dev/null +++ b/mm/slub.c @@ -0,0 +1,3144 @@ +/* + * SLUB: A slab allocator that limits cache line use instead of queuing + * objects in per cpu and per node lists. + * + * The allocator synchronizes using per slab locks and only + * uses a centralized lock to manage a pool of partial slabs. + * + * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/bit_spinlock.h> +#include <linux/interrupt.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/mempolicy.h> +#include <linux/ctype.h> +#include <linux/kallsyms.h> + +/* + * Lock order: + * 1. slab_lock(page) + * 2. slab->list_lock + * + * The slab_lock protects operations on the object of a particular + * slab and its metadata in the page struct. If the slab lock + * has been taken then no allocations nor frees can be performed + * on the objects in the slab nor can the slab be added or removed + * from the partial or full lists since this would mean modifying + * the page_struct of the slab. + * + * The list_lock protects the partial and full list on each node and + * the partial slab counter. If taken then no new slabs may be added or + * removed from the lists nor make the number of partial slabs be modified. + * (Note that the total number of slabs is an atomic value that may be + * modified without taking the list lock). + * + * The list_lock is a centralized lock and thus we avoid taking it as + * much as possible. As long as SLUB does not have to handle partial + * slabs, operations can continue without any centralized lock. F.e. + * allocating a long series of objects that fill up slabs does not require + * the list lock. + * + * The lock order is sometimes inverted when we are trying to get a slab + * off a list. We take the list_lock and then look for a page on the list + * to use. While we do that objects in the slabs may be freed. We can + * only operate on the slab if we have also taken the slab_lock. So we use + * a slab_trylock() on the slab. If trylock was successful then no frees + * can occur anymore and we can use the slab for allocations etc. If the + * slab_trylock() does not succeed then frees are in progress in the slab and + * we must stay away from it for a while since we may cause a bouncing + * cacheline if we try to acquire the lock. So go onto the next slab. + * If all pages are busy then we may allocate a new slab instead of reusing + * a partial slab. A new slab has noone operating on it and thus there is + * no danger of cacheline contention. + * + * Interrupts are disabled during allocation and deallocation in order to + * make the slab allocator safe to use in the context of an irq. In addition + * interrupts are disabled to ensure that the processor does not change + * while handling per_cpu slabs, due to kernel preemption. + * + * SLUB assigns one slab for allocation to each processor. + * Allocations only occur from these slabs called cpu slabs. + * + * Slabs with free elements are kept on a partial list. + * There is no list for full slabs. If an object in a full slab is + * freed then the slab will show up again on the partial lists. + * Otherwise there is no need to track full slabs unless we have to + * track full slabs for debugging purposes. + * + * Slabs are freed when they become empty. Teardown and setup is + * minimal so we rely on the page allocators per cpu caches for + * fast frees and allocs. + * + * Overloading of page flags that are otherwise used for LRU management. + * + * PageActive The slab is used as a cpu cache. Allocations + * may be performed from the slab. The slab is not + * on any slab list and cannot be moved onto one. + * + * PageError Slab requires special handling due to debug + * options set. This moves slab handling out of + * the fast path. + */ + +/* + * Issues still to be resolved: + * + * - The per cpu array is updated for each new slab and and is a remote + * cacheline for most nodes. This could become a bouncing cacheline given + * enough frequent updates. There are 16 pointers in a cacheline.so at + * max 16 cpus could compete. Likely okay. + * + * - Support PAGE_ALLOC_DEBUG. Should be easy to do. + * + * - Support DEBUG_SLAB_LEAK. Trouble is we do not know where the full + * slabs are in SLUB. + * + * - SLAB_DEBUG_INITIAL is not supported but I have never seen a use of + * it. + * + * - Variable sizing of the per node arrays + */ + +/* Enable to test recovery from slab corruption on boot */ +#undef SLUB_RESILIENCY_TEST + +#if PAGE_SHIFT <= 12 + +/* + * Small page size. Make sure that we do not fragment memory + */ +#define DEFAULT_MAX_ORDER 1 +#define DEFAULT_MIN_OBJECTS 4 + +#else + +/* + * Large page machines are customarily able to handle larger + * page orders. + */ +#define DEFAULT_MAX_ORDER 2 +#define DEFAULT_MIN_OBJECTS 8 + +#endif + +/* + * Flags from the regular SLAB that SLUB does not support: + */ +#define SLUB_UNIMPLEMENTED (SLAB_DEBUG_INITIAL) + +#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_STORE_USER) +/* + * Set of flags that will prevent slab merging + */ +#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DESTROY_BY_RCU) + +#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA) + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN sizeof(void *) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN sizeof(void *) +#endif + +/* Internal SLUB flags */ +#define __OBJECT_POISON 0x80000000 /* Poison object */ + +static int kmem_size = sizeof(struct kmem_cache); + +#ifdef CONFIG_SMP +static struct notifier_block slab_notifier; +#endif + +static enum { + DOWN, /* No slab functionality available */ + PARTIAL, /* kmem_cache_open() works but kmalloc does not */ + UP, /* Everything works */ + SYSFS /* Sysfs up */ +} slab_state = DOWN; + +/* A list of all slab caches on the system */ +static DECLARE_RWSEM(slub_lock); +LIST_HEAD(slab_caches); + +#ifdef CONFIG_SYSFS +static int sysfs_slab_add(struct kmem_cache *); +static int sysfs_slab_alias(struct kmem_cache *, const char *); +static void sysfs_slab_remove(struct kmem_cache *); +#else +static int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } +static void sysfs_slab_remove(struct kmem_cache *s) {} +#endif + +/******************************************************************** + * Core slab cache functions + *******************************************************************/ + +int slab_is_available(void) +{ + return slab_state >= UP; +} + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ +#ifdef CONFIG_NUMA + return s->node[node]; +#else + return &s->local_node; +#endif +} + +/* + * Object debugging + */ +static void print_section(char *text, u8 *addr, unsigned int length) +{ + int i, offset; + int newline = 1; + char ascii[17]; + + ascii[16] = 0; + + for (i = 0; i < length; i++) { + if (newline) { + printk(KERN_ERR "%10s 0x%p: ", text, addr + i); + newline = 0; + } + printk(" %02x", addr[i]); + offset = i % 16; + ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; + if (offset == 15) { + printk(" %s\n",ascii); + newline = 1; + } + } + if (!newline) { + i %= 16; + while (i < 16) { + printk(" "); + ascii[i] = ' '; + i++; + } + printk(" %s\n", ascii); + } +} + +/* + * Slow version of get and set free pointer. + * + * This requires touching the cache lines of kmem_cache. + * The offset can also be obtained from the page. In that + * case it is in the cacheline that we already need to touch. + */ +static void *get_freepointer(struct kmem_cache *s, void *object) +{ + return *(void **)(object + s->offset); +} + +static void set_freepointer(struct kmem_cache *s, void *object, void *fp) +{ + *(void **)(object + s->offset) = fp; +} + +/* + * Tracking user of a slab. + */ +struct track { + void *addr; /* Called from address */ + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ +}; + +enum track_item { TRACK_ALLOC, TRACK_FREE }; + +static struct track *get_track(struct kmem_cache *s, void *object, + enum track_item alloc) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + return p + alloc; +} + +static void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, void *addr) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + p += alloc; + if (addr) { + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current ? current->pid : -1; + p->when = jiffies; + } else + memset(p, 0, sizeof(struct track)); +} + +#define set_tracking(__s, __o, __a) set_track(__s, __o, __a, \ + __builtin_return_address(0)) + +static void init_tracking(struct kmem_cache *s, void *object) +{ + if (s->flags & SLAB_STORE_USER) { + set_track(s, object, TRACK_FREE, NULL); + set_track(s, object, TRACK_ALLOC, NULL); + } +} + +static void print_track(const char *s, struct track *t) +{ + if (!t->addr) + return; + + printk(KERN_ERR "%s: ", s); + __print_symbol("%s", (unsigned long)t->addr); + printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); +} + +static void print_trailer(struct kmem_cache *s, u8 *p) +{ + unsigned int off; /* Offset of last byte */ + + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone", p + s->objsize, + s->inuse - s->objsize); + + printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", + p + s->offset, + get_freepointer(s, p)); + + if (s->offset) + off = s->offset + sizeof(void *); + else + off = s->inuse; + + if (s->flags & SLAB_STORE_USER) { + print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); + print_track("Last free ", get_track(s, p, TRACK_FREE)); + off += 2 * sizeof(struct track); + } + + if (off != s->size) + /* Beginning of the filler is the free pointer */ + print_section("Filler", p + off, s->size - off); +} + +static void object_err(struct kmem_cache *s, struct page *page, + u8 *object, char *reason) +{ + u8 *addr = page_address(page); + + printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", + s->name, reason, object, page); + printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", + object - addr, page->flags, page->inuse, page->freelist); + if (object > addr + 16) + print_section("Bytes b4", object - 16, 16); + print_section("Object", object, min(s->objsize, 128)); + print_trailer(s, object); + dump_stack(); +} + +static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +{ + va_list args; + char buf[100]; + + va_start(args, reason); + vsnprintf(buf, sizeof(buf), reason, args); + va_end(args); + printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, + page); + dump_stack(); +} + +static void init_object(struct kmem_cache *s, void *object, int active) +{ + u8 *p = object; + + if (s->flags & __OBJECT_POISON) { + memset(p, POISON_FREE, s->objsize - 1); + p[s->objsize -1] = POISON_END; + } + + if (s->flags & SLAB_RED_ZONE) + memset(p + s->objsize, + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, + s->inuse - s->objsize); +} + +static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +{ + while (bytes) { + if (*start != (u8)value) + return 0; + start++; + bytes--; + } + return 1; +} + + +static int check_valid_pointer(struct kmem_cache *s, struct page *page, + void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + if (object < base || object >= base + s->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +/* + * Object layout: + * + * object address + * Bytes of the object to be managed. + * If the freepointer may overlay the object then the free + * pointer is the first word of the object. + * Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) + * + * object + s->objsize + * Padding to reach word boundary. This is also used for Redzoning. + * Padding is extended to word size if Redzoning is enabled + * and objsize == inuse. + * We fill with 0xbb (RED_INACTIVE) for inactive objects and with + * 0xcc (RED_ACTIVE) for objects in use. + * + * object + s->inuse + * A. Free pointer (if we cannot overwrite object on free) + * B. Tracking data for SLAB_STORE_USER + * C. Padding to reach required alignment boundary + * Padding is done using 0x5a (POISON_INUSE) + * + * object + s->size + * + * If slabcaches are merged then the objsize and inuse boundaries are to + * be ignored. And therefore no slab options that rely on these boundaries + * may be used with merged slabcaches. + */ + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + printk(KERN_ERR "@@@ SLUB: %s Restoring %s (0x%x) from 0x%p-0x%p\n", + s->name, message, data, from, to - 1); + memset(from, data, to - from); +} + +static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned long off = s->inuse; /* The end of info */ + + if (s->offset) + /* Freepointer is placed after the object. */ + off += sizeof(void *); + + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); + + if (s->size == off) + return 1; + + if (check_bytes(p + off, POISON_INUSE, s->size - off)) + return 1; + + object_err(s, page, p, "Object padding check fails"); + + /* + * Restore padding + */ + restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); + return 0; +} + +static int slab_pad_check(struct kmem_cache *s, struct page *page) +{ + u8 *p; + int length, remainder; + + if (!(s->flags & SLAB_POISON)) + return 1; + + p = page_address(page); + length = s->objects * s->size; + remainder = (PAGE_SIZE << s->order) - length; + if (!remainder) + return 1; + + if (!check_bytes(p + length, POISON_INUSE, remainder)) { + printk(KERN_ERR "SLUB: %s slab 0x%p: Padding fails check\n", + s->name, p); + dump_stack(); + restore_bytes(s, "slab padding", POISON_INUSE, p + length, + p + length + remainder); + return 0; + } + return 1; +} + +static int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) +{ + u8 *p = object; + u8 *endobject = object + s->objsize; + + if (s->flags & SLAB_RED_ZONE) { + unsigned int red = + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; + + if (!check_bytes(endobject, red, s->inuse - s->objsize)) { + object_err(s, page, object, + active ? "Redzone Active" : "Redzone Inactive"); + restore_bytes(s, "redzone", red, + endobject, object + s->inuse); + return 0; + } + } else { + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && + !check_bytes(endobject, POISON_INUSE, + s->inuse - s->objsize)) { + object_err(s, page, p, "Alignment padding check fails"); + /* + * Fix it so that there will not be another report. + * + * Hmmm... We may be corrupting an object that now expects + * to be longer than allowed. + */ + restore_bytes(s, "alignment padding", POISON_INUSE, + endobject, object + s->inuse); + } + } + + if (s->flags & SLAB_POISON) { + if (!active && (s->flags & __OBJECT_POISON) && + (!check_bytes(p, POISON_FREE, s->objsize - 1) || + p[s->objsize - 1] != POISON_END)) { + + object_err(s, page, p, "Poison check failed"); + restore_bytes(s, "Poison", POISON_FREE, + p, p + s->objsize -1); + restore_bytes(s, "Poison", POISON_END, + p + s->objsize - 1, p + s->objsize); + return 0; + } + /* + * check_pad_bytes cleans up on its own. + */ + check_pad_bytes(s, page, p); + } + + if (!s->offset && active) + /* + * Object and freepointer overlap. Cannot check + * freepointer while object is allocated. + */ + return 1; + + /* Check free pointer validity */ + if (!check_valid_pointer(s, page, get_freepointer(s, p))) { + object_err(s, page, p, "Freepointer corrupt"); + /* + * No choice but to zap it and thus loose the remainder + * of the free objects in this slab. May cause + * another error because the object count maybe + * wrong now. + */ + set_freepointer(s, p, NULL); + return 0; + } + return 1; +} + +static int check_slab(struct kmem_cache *s, struct page *page) +{ + VM_BUG_ON(!irqs_disabled()); + + if (!PageSlab(page)) { + printk(KERN_ERR "SLUB: %s Not a valid slab page @0x%p " + "flags=%lx mapping=0x%p count=%d \n", + s->name, page, page->flags, page->mapping, + page_count(page)); + return 0; + } + if (page->offset * sizeof(void *) != s->offset) { + printk(KERN_ERR "SLUB: %s Corrupted offset %lu in slab @0x%p" + " flags=0x%lx mapping=0x%p count=%d\n", + s->name, + (unsigned long)(page->offset * sizeof(void *)), + page, + page->flags, + page->mapping, + page_count(page)); + dump_stack(); + return 0; + } + if (page->inuse > s->objects) { + printk(KERN_ERR "SLUB: %s Inuse %u > max %u in slab " + "page @0x%p flags=%lx mapping=0x%p count=%d\n", + s->name, page->inuse, s->objects, page, page->flags, + page->mapping, page_count(page)); + dump_stack(); + return 0; + } + /* Slab_pad_check fixes things up after itself */ + slab_pad_check(s, page); + return 1; +} + +/* + * Determine if a certain object on a page is on the freelist and + * therefore free. Must hold the slab lock for cpu slabs to + * guarantee that the chains are consistent. + */ +static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +{ + int nr = 0; + void *fp = page->freelist; + void *object = NULL; + + while (fp && nr <= s->objects) { + if (fp == search) + return 1; + if (!check_valid_pointer(s, page, fp)) { + if (object) { + object_err(s, page, object, + "Freechain corrupt"); + set_freepointer(s, object, NULL); + break; + } else { + printk(KERN_ERR "SLUB: %s slab 0x%p " + "freepointer 0x%p corrupted.\n", + s->name, page, fp); + dump_stack(); + page->freelist = NULL; + page->inuse = s->objects; + return 0; + } + break; + } + object = fp; + fp = get_freepointer(s, object); + nr++; + } + + if (page->inuse != s->objects - nr) { + printk(KERN_ERR "slab %s: page 0x%p wrong object count." + " counter is %d but counted were %d\n", + s->name, page, page->inuse, + s->objects - nr); + page->inuse = s->objects - nr; + } + return search == NULL; +} + +static int alloc_object_checks(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!check_slab(s, page)) + goto bad; + + if (object && !on_freelist(s, page, object)) { + printk(KERN_ERR "SLUB: %s Object 0x%p@0x%p " + "already allocated.\n", + s->name, object, page); + goto dump; + } + + if (!check_valid_pointer(s, page, object)) { + object_err(s, page, object, "Freelist Pointer check fails"); + goto dump; + } + + if (!object) + return 1; + + if (!check_object(s, page, object, 0)) + goto bad; + init_object(s, object, 1); + + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", + s->name, object, page->inuse, + page->freelist); + dump_stack(); + } + return 1; +dump: + dump_stack(); +bad: + if (PageSlab(page)) { + /* + * If this is a slab page then lets do the best we can + * to avoid issues in the future. Marking all objects + * as used avoids touching the remainder. + */ + printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", + s->name, page); + page->inuse = s->objects; + page->freelist = NULL; + /* Fix up fields that may be corrupted */ + page->offset = s->offset / sizeof(void *); + } + return 0; +} + +static int free_object_checks(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!check_slab(s, page)) + goto fail; + + if (!check_valid_pointer(s, page, object)) { + printk(KERN_ERR "SLUB: %s slab 0x%p invalid " + "object pointer 0x%p\n", + s->name, page, object); + goto fail; + } + + if (on_freelist(s, page, object)) { + printk(KERN_ERR "SLUB: %s slab 0x%p object " + "0x%p already free.\n", s->name, page, object); + goto fail; + } + + if (!check_object(s, page, object, 1)) + return 0; + + if (unlikely(s != page->slab)) { + if (!PageSlab(page)) + printk(KERN_ERR "slab_free %s size %d: attempt to" + "free object(0x%p) outside of slab.\n", + s->name, s->size, object); + else + if (!page->slab) + printk(KERN_ERR + "slab_free : no slab(NULL) for object 0x%p.\n", + object); + else + printk(KERN_ERR "slab_free %s(%d): object at 0x%p" + " belongs to slab %s(%d)\n", + s->name, s->size, object, + page->slab->name, page->slab->size); + goto fail; + } + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", + s->name, object, page->inuse, + page->freelist); + print_section("Object", object, s->objsize); + dump_stack(); + } + init_object(s, object, 0); + return 1; +fail: + dump_stack(); + printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", + s->name, page, object); + return 0; +} + +/* + * Slab allocation and freeing + */ +static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page * page; + int pages = 1 << s->order; + + if (s->order) + flags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + flags |= SLUB_DMA; + + if (node == -1) + page = alloc_pages(flags, s->order); + else + page = alloc_pages_node(node, flags, s->order); + + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + pages); + + return page; +} + +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + if (PageError(page)) { + init_object(s, object, 0); + init_tracking(s, object); + } + + if (unlikely(s->ctor)) { + int mode = SLAB_CTOR_CONSTRUCTOR; + + if (!(s->flags & __GFP_WAIT)) + mode |= SLAB_CTOR_ATOMIC; + + s->ctor(object, s, mode); + } +} + +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + struct kmem_cache_node *n; + void *start; + void *end; + void *last; + void *p; + + if (flags & __GFP_NO_GROW) + return NULL; + + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); + + if (flags & __GFP_WAIT) + local_irq_enable(); + + page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); + if (!page) + goto out; + + n = get_node(s, page_to_nid(page)); + if (n) + atomic_long_inc(&n->nr_slabs); + page->offset = s->offset / sizeof(void *); + page->slab = s; + page->flags |= 1 << PG_slab; + if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_TRACE)) + page->flags |= 1 << PG_error; + + start = page_address(page); + end = start + s->objects * s->size; + + if (unlikely(s->flags & SLAB_POISON)) + memset(start, POISON_INUSE, PAGE_SIZE << s->order); + + last = start; + for (p = start + s->size; p < end; p += s->size) { + setup_object(s, page, last); + set_freepointer(s, last, p); + last = p; + } + setup_object(s, page, last); + set_freepointer(s, last, NULL); + + page->freelist = start; + page->inuse = 0; +out: + if (flags & __GFP_WAIT) + local_irq_disable(); + return page; +} + +static void __free_slab(struct kmem_cache *s, struct page *page) +{ + int pages = 1 << s->order; + + if (unlikely(PageError(page) || s->dtor)) { + void *start = page_address(page); + void *end = start + (pages << PAGE_SHIFT); + void *p; + + slab_pad_check(s, page); + for (p = start; p <= end - s->size; p += s->size) { + if (s->dtor) + s->dtor(p, s, 0); + check_object(s, page, p, 0); + } + } + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + - pages); + + page->mapping = NULL; + __free_pages(page, s->order); +} + +static void rcu_free_slab(struct rcu_head *h) +{ + struct page *page; + + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); +} + +static void free_slab(struct kmem_cache *s, struct page *page) +{ + if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + /* + * RCU free overloads the RCU head over the LRU + */ + struct rcu_head *head = (void *)&page->lru; + + call_rcu(head, rcu_free_slab); + } else + __free_slab(s, page); +} + +static void discard_slab(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + atomic_long_dec(&n->nr_slabs); + reset_page_mapcount(page); + page->flags &= ~(1 << PG_slab | 1 << PG_error); + free_slab(s, page); +} + +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + bit_spin_unlock(PG_locked, &page->flags); +} + +static __always_inline int slab_trylock(struct page *page) +{ + int rc = 1; + + rc = bit_spin_trylock(PG_locked, &page->flags); + return rc; +} + +/* + * Management of partially allocated slabs + */ +static void add_partial(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + n->nr_partial++; + list_add(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} + +static void remove_partial(struct kmem_cache *s, + struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + n->nr_partial--; + spin_unlock(&n->list_lock); +} + +/* + * Lock page and remove it from the partial list + * + * Must hold list_lock + */ +static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) +{ + if (slab_trylock(page)) { + list_del(&page->lru); + n->nr_partial--; + return 1; + } + return 0; +} + +/* + * Try to get a partial slab from a specific node + */ +static struct page *get_partial_node(struct kmem_cache_node *n) +{ + struct page *page; + + /* + * Racy check. If we mistakenly see no partial slabs then we + * just allocate an empty slab. If we mistakenly try to get a + * partial slab then get_partials() will return NULL. + */ + if (!n || !n->nr_partial) + return NULL; + + spin_lock(&n->list_lock); + list_for_each_entry(page, &n->partial, lru) + if (lock_and_del_slab(n, page)) + goto out; + page = NULL; +out: + spin_unlock(&n->list_lock); + return page; +} + +/* + * Get a page from somewhere. Search in increasing NUMA + * distances. + */ +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +{ +#ifdef CONFIG_NUMA + struct zonelist *zonelist; + struct zone **z; + struct page *page; + + /* + * The defrag ratio allows to configure the tradeoffs between + * inter node defragmentation and node local allocations. + * A lower defrag_ratio increases the tendency to do local + * allocations instead of scanning throught the partial + * lists on other nodes. + * + * If defrag_ratio is set to 0 then kmalloc() always + * returns node local objects. If its higher then kmalloc() + * may return off node objects in order to avoid fragmentation. + * + * A higher ratio means slabs may be taken from other nodes + * thus reducing the number of partial slabs on those nodes. + * + * If /sys/slab/xx/defrag_ratio is set to 100 (which makes + * defrag_ratio = 1000) then every (well almost) allocation + * will first attempt to defrag slab caches on other nodes. This + * means scanning over all nodes to look for partial slabs which + * may be a bit expensive to do on every slab allocation. + */ + if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) + return NULL; + + zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + for (z = zonelist->zones; *z; z++) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(*z)); + + if (n && cpuset_zone_allowed_hardwall(*z, flags) && + n->nr_partial > 2) { + page = get_partial_node(n); + if (page) + return page; + } + } +#endif + return NULL; +} + +/* + * Get a partial page, lock it and return it. + */ +static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + int searchnode = (node == -1) ? numa_node_id() : node; + + page = get_partial_node(get_node(s, searchnode)); + if (page || (flags & __GFP_THISNODE)) + return page; + + return get_any_partial(s, flags); +} + +/* + * Move a page back to the lists. + * + * Must be called with the slab lock held. + * + * On exit the slab lock will have been dropped. + */ +static void putback_slab(struct kmem_cache *s, struct page *page) +{ + if (page->inuse) { + if (page->freelist) + add_partial(s, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } +} + +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) +{ + s->cpu_slab[cpu] = NULL; + ClearPageActive(page); + + putback_slab(s, page); +} + +static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +{ + slab_lock(page); + deactivate_slab(s, page, cpu); +} + +/* + * Flush cpu slab. + * Called from IPI handler with interrupts disabled. + */ +static void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct page *page = s->cpu_slab[cpu]; + + if (likely(page)) + flush_slab(s, page, cpu); +} + +static void flush_cpu_slab(void *d) +{ + struct kmem_cache *s = d; + int cpu = smp_processor_id(); + + __flush_cpu_slab(s, cpu); +} + +static void flush_all(struct kmem_cache *s) +{ +#ifdef CONFIG_SMP + on_each_cpu(flush_cpu_slab, s, 1, 1); +#else + unsigned long flags; + + local_irq_save(flags); + flush_cpu_slab(s); + local_irq_restore(flags); +#endif +} + +/* + * slab_alloc is optimized to only modify two cachelines on the fast path + * (aside from the stack): + * + * 1. The page struct + * 2. The first cacheline of the object to be allocated. + * + * The only cache lines that are read (apart from code) is the + * per cpu array in the kmem_cache struct. + * + * Fastpath is not possible if we need to get a new slab or have + * debugging enabled (which means all slabs are marked with PageError) + */ +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node) +{ + struct page *page; + void **object; + unsigned long flags; + int cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + page = s->cpu_slab[cpu]; + if (!page) + goto new_slab; + + slab_lock(page); + if (unlikely(node != -1 && page_to_nid(page) != node)) + goto another_slab; +redo: + object = page->freelist; + if (unlikely(!object)) + goto another_slab; + if (unlikely(PageError(page))) + goto debug; + +have_object: + page->inuse++; + page->freelist = object[page->offset]; + slab_unlock(page); + local_irq_restore(flags); + return object; + +another_slab: + deactivate_slab(s, page, cpu); + +new_slab: + page = get_partial(s, gfpflags, node); + if (likely(page)) { +have_slab: + s->cpu_slab[cpu] = page; + SetPageActive(page); + goto redo; + } + + page = new_slab(s, gfpflags, node); + if (page) { + cpu = smp_processor_id(); + if (s->cpu_slab[cpu]) { + /* + * Someone else populated the cpu_slab while we enabled + * interrupts, or we have got scheduled on another cpu. + * The page may not be on the requested node. + */ + if (node == -1 || + page_to_nid(s->cpu_slab[cpu]) == node) { + /* + * Current cpuslab is acceptable and we + * want the current one since its cache hot + */ + discard_slab(s, page); + page = s->cpu_slab[cpu]; + slab_lock(page); + goto redo; + } + /* Dump the current slab */ + flush_slab(s, s->cpu_slab[cpu], cpu); + } + slab_lock(page); + goto have_slab; + } + local_irq_restore(flags); + return NULL; +debug: |