Diffstat (limited to 'mm/slub.c')
-rw-r--r--  mm/slub.c | 4383
1 files changed, 2685 insertions, 1698 deletions
diff --git a/mm/slub.c b/mm/slub.c index acc975fcc8c..73004808537 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2,37 +2,65 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks and only - * uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using per slab locks or atomic operatios + * and only uses a centralized lock to manage a pool of partial slabs. * - * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> + * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter */ #include <linux/mm.h> +#include <linux/swap.h> /* struct reclaim_state */ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/slab.h> +#include "slab.h" +#include <linux/proc_fs.h> +#include <linux/notifier.h> #include <linux/seq_file.h> +#include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> +#include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/memory.h> +#include <linux/math64.h> +#include <linux/fault-inject.h> +#include <linux/stacktrace.h> +#include <linux/prefetch.h> +#include <linux/memcontrol.h> + +#include <trace/events/kmem.h> + +#include "internal.h" /* * Lock order: - * 1. slab_lock(page) - * 2. slab->list_lock + * 1. slab_mutex (Global Mutex) + * 2. node->list_lock + * 3. slab_lock(page) (Only on some arches and for debugging) + * + * slab_mutex + * + * The role of the slab_mutex is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. * - * The slab_lock protects operations on the object of a particular - * slab and its metadata in the page struct. If the slab lock - * has been taken then no allocations nor frees can be performed - * on the objects in the slab nor can the slab be added or removed - * from the partial or full lists since this would mean modifying - * the page_struct of the slab. + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct. Meaning + * A. page->freelist -> List of object free in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -45,20 +73,6 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * - * The lock order is sometimes inverted when we are trying to get a slab - * off a list. We take the list_lock and then look for a page on the list - * to use. While we do that objects in the slabs may be freed. We can - * only operate on the slab if we have also taken the slab_lock. So we use - * a slab_trylock() on the slab. If trylock was successful then no frees - * can occur anymore and we can use the slab for allocations etc. 
If the - * slab_trylock() does not succeed then frees are in progress in the slab and - * we must stay away from it for a while since we may cause a bouncing - * cacheline if we try to acquire the lock. So go onto the next slab. - * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has noone operating on it and thus there is - * no danger of cacheline contention. - * * Interrupts are disabled during allocation and deallocation in order to * make the slab allocator safe to use in the context of an irq. In addition * interrupts are disabled to ensure that the processor does not change @@ -100,42 +114,22 @@ * the fast path and disables lockless freelists. */ -#define FROZEN (1 << PG_active) - +static inline int kmem_cache_debug(struct kmem_cache *s) +{ #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG (1 << PG_error) + return unlikely(s->flags & SLAB_DEBUG_FLAGS); #else -#define SLABDEBUG 0 + return 0; #endif - -static inline int SlabFrozen(struct page *page) -{ - return page->flags & FROZEN; } -static inline void SetSlabFrozen(struct page *page) +static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { - page->flags |= FROZEN; -} - -static inline void ClearSlabFrozen(struct page *page) -{ - page->flags &= ~FROZEN; -} - -static inline int SlabDebug(struct page *page) -{ - return page->flags & SLABDEBUG; -} - -static inline void SetSlabDebug(struct page *page) -{ - page->flags |= SLABDEBUG; -} - -static inline void ClearSlabDebug(struct page *page) -{ - page->flags &= ~SLABDEBUG; +#ifdef CONFIG_SLUB_CPU_PARTIAL + return !kmem_cache_debug(s); +#else + return false; +#endif } /* @@ -149,24 +143,8 @@ static inline void ClearSlabDebug(struct page *page) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -#if PAGE_SHIFT <= 12 - -/* - * Small page size. Make sure that we do not fragment memory - */ -#define DEFAULT_MAX_ORDER 1 -#define DEFAULT_MIN_OBJECTS 4 - -#else - -/* - * Large page machines are customarily able to handle larger - * page orders. - */ -#define DEFAULT_MAX_ORDER 2 -#define DEFAULT_MIN_OBJECTS 8 - -#endif +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG /* * Mininum number of partial slabs. These will be left on the partial @@ -177,7 +155,7 @@ static inline void ClearSlabDebug(struct page *page) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 @@ -185,55 +163,43 @@ static inline void ClearSlabDebug(struct page *page) SLAB_POISON | SLAB_STORE_USER) /* + * Debugging flags that require metadata to be stored in the slab. These get + * disabled when slub_debug=O is used and a cache's min order increases with + * metadata. 
+ */ +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +/* * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA) - -#ifndef ARCH_KMALLOC_MINALIGN -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif + SLAB_CACHE_DMA | SLAB_NOTRACK) -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) -#endif +#define OO_SHIFT 16 +#define OO_MASK ((1 << OO_SHIFT) - 1) +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000 /* Poison object */ -#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ -#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */ -#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */ - -/* Not all arches define cache_line_size */ -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - -static int kmem_size = sizeof(struct kmem_cache); +#define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ #ifdef CONFIG_SMP static struct notifier_block slab_notifier; #endif -static enum { - DOWN, /* No slab functionality available */ - PARTIAL, /* kmem_cache_open() works but kmalloc does not */ - UP, /* Everything works but does not show up in sysfs */ - SYSFS /* Sysfs up */ -} slab_state = DOWN; - -/* A list of all slab caches on the system */ -static DECLARE_RWSEM(slub_lock); -static LIST_HEAD(slab_caches); - /* * Tracking user of a slab. */ +#define TRACK_ADDRS_COUNT 16 struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -241,26 +207,25 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) -{ - kfree(s); -} - +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif -static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) +static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - c->stat[si]++; + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. 
+ */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -268,27 +233,9 @@ static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) * Core slab cache functions *******************************************************************/ -int slab_is_available(void) -{ - return slab_state >= UP; -} - static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { -#ifdef CONFIG_NUMA return s->node[node]; -#else - return &s->local_node; -#endif -} - -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) -{ -#ifdef CONFIG_SMP - return s->cpu_slab[cpu]; -#else - return &s->cpu_slab; -#endif } /* Verify that a pointer has an address that is valid within a slab page */ @@ -301,7 +248,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); - if (object < base || object >= base + s->objects * s->size || + if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { return 0; } @@ -309,40 +256,217 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -/* - * Slow version of get and set free pointer. - * - * This version requires touching the cache lines of kmem_cache which - * we avoid to do in the fast alloc free paths. There we obtain the offset - * from the page struct. - */ static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); } +static void prefetch_freepointer(const struct kmem_cache *s, void *object) +{ + prefetch(object + s->offset); +} + +static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) +{ + void *p; + +#ifdef CONFIG_DEBUG_PAGEALLOC + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); +#else + p = get_freepointer(s, object); +#endif + return p; +} + static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { *(void **)(object + s->offset) = fp; } /* Loop over all objects in a slab */ -#define for_each_object(__p, __s, __addr) \ - for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) -/* Scan freelist */ -#define for_each_free_object(__p, __s, __free) \ - for (__p = (__free); __p; __p = get_freepointer((__s), __p)) - /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { return (p - addr) / s->size; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size, int reserved) +{ + struct kmem_cache_order_objects x = { + (order << OO_SHIFT) + order_objects(order, size, reserved) + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> OO_SHIFT; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & OO_MASK; +} + +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ + struct page tmp; + tmp.counters = counters_new; + /* + * page->counters can cover frozen/inuse/objects as well + * as page->_count. If we assign to ->counters directly + * we run the risk of losing updates to page->_count, so + * be careful and only assign to the fields we need. + */ + page->frozen = tmp.frozen; + page->inuse = tmp.inuse; + page->objects = tmp.objects; +} + +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, &page->counters, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + set_page_slub_counters(page, counters_new); + slab_unlock(page); + return 1; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + pr_info("%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, &page->counters, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + set_page_slub_counters(page, counters_new); + slab_unlock(page); + local_irq_restore(flags); + return 1; + } + slab_unlock(page); + local_irq_restore(flags); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + pr_info("%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + #ifdef CONFIG_SLUB_DEBUG /* + * Determine a map of object in use on a page. 
+ * + * Node listlock must be held to guarantee that the page does + * not vanish from under us. + */ +static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) +{ + void *p; + void *addr = page_address(page); + + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit(slab_index(p, s, addr), map); +} + +/* * Debug settings: */ #ifdef CONFIG_SLUB_DEBUG_ON @@ -352,40 +476,15 @@ static int slub_debug; #endif static char *slub_debug_slabs; +static int disable_higher_order_debug; /* * Object debugging */ static void print_section(char *text, u8 *addr, unsigned int length) { - int i, offset; - int newline = 1; - char ascii[17]; - - ascii[16] = 0; - - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); - newline = 0; - } - printk(KERN_CONT " %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; - if (offset == 15) { - printk(KERN_CONT " %s\n", ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(KERN_CONT " "); - ascii[i] = ' '; - i++; - } - printk(KERN_CONT " %s\n", ascii); - } + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -402,20 +501,32 @@ static struct track *get_track(struct kmem_cache *s, void *object, } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { - struct track *p; - - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + struct track *p = get_track(s, object, alloc); - p += alloc; if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + save_stack_trace(&trace); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif p->addr = addr; p->cpu = smp_processor_id(); - p->pid = current ? 
current->pid : -1; + p->pid = current->pid; p->when = jiffies; } else memset(p, 0, sizeof(struct track)); @@ -426,8 +537,8 @@ static void init_tracking(struct kmem_cache *s, void *object) if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@ -435,9 +546,18 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in ", s); - __print_symbol("%s", (unsigned long)t->addr); - printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + pr_err("\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif } static void print_tracking(struct kmem_cache *s, void *object) @@ -451,35 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n", - page, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } static void slab_bug(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("-----------------------------------------------------------------------------\n\n"); + + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s: %s\n", s->name, buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
{ + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -491,17 +613,17 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); if (p > addr + 16) - print_section("Bytes b4", p - 16, 16); - - print_section("Object", p, min(s->objsize, 128)); + print_section("Bytes b4 ", p - 16, 16); + print_section("Object ", p, min_t(unsigned long, s->object_size, + PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, - s->inuse - s->objsize); + print_section("Redzone ", p + s->object_size, + s->inuse - s->object_size); if (s->offset) off = s->offset + sizeof(void *); @@ -513,7 +635,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Padding", p + off, s->size - off); + print_section("Padding ", p + off, s->size - off); dump_stack(); } @@ -521,11 +643,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) static void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { - slab_bug(s, reason); + slab_bug(s, "%s", reason); print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) +static void slab_err(struct kmem_cache *s, struct page *page, + const char *fmt, ...) { va_list args; char buf[100]; @@ -533,35 +656,22 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - slab_bug(s, fmt); + slab_bug(s, "%s", buf); print_page_info(page); dump_stack(); } -static void init_object(struct kmem_cache *s, void *object, int active) +static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->objsize - 1); - p[s->objsize - 1] = POISON_END; + memset(p, POISON_FREE, s->object_size - 1); + p[s->object_size - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, - s->inuse - s->objsize); -} - -static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) -{ - while (bytes) { - if (*start != (u8)value) - return start; - start++; - bytes--; - } - return NULL; + memset(p + s->object_size, val, s->inuse - s->object_size); } static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@ -578,7 +688,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; - fault = check_bytes(start, value, bytes); + fault = memchr_inv(start, value, bytes); if (!fault) return 1; @@ -587,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + pr_err("INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault[0], value); print_trailer(s, page, object); @@ -606,10 +716,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) * - * object + s->objsize + * object + s->object_size * Padding to reach word boundary. This is also used for Redzoning. * Padding is extended by another word if Redzoning is enabled and - * objsize == inuse. + * object_size == inuse. * * We fill with 0xbb (RED_INACTIVE) for inactive objects and with * 0xcc (RED_ACTIVE) for objects in use. @@ -628,7 +738,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object + s->size * Nothing is used beyond s->size. * - * If slabcaches are merged then the objsize and inuse boundaries are mostly + * If slabcaches are merged then the object_size and inuse boundaries are mostly * ignored. And therefore no slab options that rely on these boundaries * may be used with merged slabcaches. */ @@ -652,6 +762,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) p + off, POISON_INUSE, s->size - off); } +/* Check the pad bytes at the end of a slab page */ static int slab_pad_check(struct kmem_cache *s, struct page *page) { u8 *start; @@ -664,51 +775,49 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - end = start + (PAGE_SIZE << s->order); - length = s->objects * s->size; - remainder = end - (start + length); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; + end = start + length; + remainder = length % s->size; if (!remainder) return 1; - fault = check_bytes(start + length, POISON_INUSE, remainder); + fault = memchr_inv(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", start, length); + print_section("Padding ", end - remainder, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, start, end); + restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; } static int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) + void *object, u8 val) { u8 *p = object; - u8 *endobject = object + s->objsize; + u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { - unsigned int red = - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, red, s->inuse - s->objsize)) + endobject, val, s->inuse - s->object_size)) return 0; } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { + if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, page, p, "Alignment padding", - endobject, POISON_INUSE, s->inuse - s->objsize); + endobject, POISON_INUSE, + s->inuse - s->object_size); } } if (s->flags & SLAB_POISON) { - if (!active && (s->flags & __OBJECT_POISON) && + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, - POISON_FREE, s->objsize - 1) || + POISON_FREE, s->object_size - 1) || !check_bytes_and_report(s, page, p, "Poison", - p + s->objsize - 1, POISON_END, 1))) + p + s->object_size - 1, POISON_END, 1))) return 0; /* * check_pad_bytes cleans up on its own. 
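The hunks above describe SLUB's debug object layout (the payload is filled with 0x6b/POISON_FREE ending in 0xa5/POISON_END, a red zone of 0xbb/RED_INACTIVE sits between object_size and inuse, and padding runs up to s->size) and switch the byte scanning over to memchr_inv(). The fragment below is a minimal user-space sketch of that layout check for orientation only; the byte values and the object_size/inuse/size boundaries come from the comments above, while the struct, helper names, and main() are illustrative assumptions, not kernel code.

/*
 * Minimal model of the poison/red-zone layout checked by the debug hunks
 * above.  Only the byte values and the offsets mirror the kernel comments;
 * everything else is a simplified assumption.
 */
#include <stdio.h>
#include <string.h>

#define POISON_FREE  0x6b
#define POISON_END   0xa5
#define RED_INACTIVE 0xbb

struct layout {
	size_t object_size;	/* payload bytes */
	size_t inuse;		/* payload + red zone */
	size_t size;		/* full slot, including metadata/padding */
};

/* Fill a free object the way init_object() is described above. */
static void poison_free_object(unsigned char *p, const struct layout *l)
{
	memset(p, POISON_FREE, l->object_size - 1);
	p[l->object_size - 1] = POISON_END;
	memset(p + l->object_size, RED_INACTIVE, l->inuse - l->object_size);
}

/* Return the first unexpected byte, like the memchr_inv()-based check. */
static const unsigned char *first_mismatch(const unsigned char *s,
					   int value, size_t n)
{
	while (n--) {
		if (*s != (unsigned char)value)
			return s;
		s++;
	}
	return NULL;
}

int main(void)
{
	struct layout l = { .object_size = 32, .inuse = 40, .size = 48 };
	unsigned char slot[48];
	const unsigned char *fault;

	poison_free_object(slot, &l);
	slot[5] = 0;		/* simulate a use-after-free scribble */

	fault = first_mismatch(slot, POISON_FREE, l.object_size - 1);
	if (fault)
		printf("poison overwritten at offset %zu\n",
		       (size_t)(fault - slot));
	return 0;
}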
@@ -716,7 +825,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && active) + if (!s->offset && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -727,7 +836,7 @@ static int check_object(struct kmem_cache *s, struct page *page, if (!check_valid_pointer(s, page, get_freepointer(s, p))) { object_err(s, page, p, "Freepointer corrupt"); /* - * No choice but to zap it and thus loose the remainder + * No choice but to zap it and thus lose the remainder * of the free objects in this slab. May cause * another error because the object count is now wrong. */ @@ -739,15 +848,24 @@ static int check_object(struct kmem_cache *s, struct page *page, static int check_slab(struct kmem_cache *s, struct page *page) { + int maxobj; + VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->inuse > s->objects) { + + maxobj = order_objects(compound_order(page), s->size, s->reserved); + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + s->name, page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, s->objects); + s->name, page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -762,10 +880,12 @@ static int check_slab(struct kmem_cache *s, struct page *page) static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { int nr = 0; - void *fp = page->freelist; + void *fp; void *object = NULL; + unsigned long max_objects; - while (fp && nr <= s->objects) { + fp = page->freelist; + while (fp && nr <= page->objects) { if (fp == search) return 1; if (!check_valid_pointer(s, page, fp)) { @@ -773,11 +893,10 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) object_err(s, page, object, "Freechain corrupt"); set_freepointer(s, object, NULL); - break; } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; - page->inuse = s->objects; + page->inuse = page->objects; slab_fix(s, "Freelist cleared"); return 0; } @@ -788,89 +907,186 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - if (page->inuse != s->objects - nr) { + max_objects = order_objects(compound_order(page), s->size, s->reserved); + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } + if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, s->objects - nr); - page->inuse = s->objects - nr; + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } return search == NULL; } -static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) +static void trace(struct kmem_cache *s, struct page *page, void *object, + int alloc) { if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? 
"alloc" : "free", object, page->inuse, page->freelist); if (!alloc) - print_section("Object", (void *)object, s->objsize); + print_section("Object ", (void *)object, + s->object_size); dump_stack(); } } /* - * Tracking of fully allocated slabs for debugging purposes. + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { - spin_lock(&n->list_lock); - list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); + kmemleak_alloc(ptr, size, 1, flags); } -static void remove_full(struct kmem_cache *s, struct page *page) +static inline void kfree_hook(const void *x) { - struct kmem_cache_node *n; + kmemleak_free(x); +} +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->object_size, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) +{ + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); +} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); + + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); +} + +/* + * Tracking of fully allocated slabs for debugging purposes. + */ +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) +{ if (!(s->flags & SLAB_STORE_USER)) return; - n = get_node(s, page_to_nid(page)); + lockdep_assert_held(&n->list_lock); + list_add(&page->lru, &n->full); +} - spin_lock(&n->list_lock); +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + lockdep_assert_held(&n->list_lock); list_del(&page->lru); - spin_unlock(&n->list_lock); } +/* Tracking of the number of slabs for debugging purposes */ +static inline unsigned long slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->nr_slabs); +} + +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). 
+ */ + if (likely(n)) { + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } +} +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); +} + +/* Object debug checks for alloc/free paths */ static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) { if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) return; - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; - if (!on_freelist(s, page, object)) { - object_err(s, page, object, "Object already allocated"); - goto bad; - } - if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); goto bad; } - if (!check_object(s, page, object, 0)) + if (!check_object(s, page, object, SLUB_RED_INACTIVE)) goto bad; /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_ALLOC, addr); trace(s, page, object, 1); - init_object(s, object, 1); + init_object(s, object, SLUB_RED_ACTIVE); return 1; bad: @@ -881,15 +1097,21 @@ bad: * as used avoids touching the remaining objects. */ slab_fix(s, "Marking all objects used"); - page->inuse = s->objects; + page->inuse = page->objects; page->freelist = NULL; } return 0; } -static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) +static noinline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock_irqsave(&n->list_lock, *flags); + slab_lock(page); + if (!check_slab(s, page)) goto fail; @@ -903,17 +1125,16 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, goto fail; } - if (!check_object(s, page, object, 1)) - return 0; + if (!check_object(s, page, object, SLUB_RED_ACTIVE)) + goto out; - if (unlikely(s != page->slab)) { + if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - } else if (!page->slab) { - printk(KERN_ERR - "SLUB <none>: no slab for object 0x%p.\n", - object); + } else if (!page->slab_cache) { + pr_err("SLUB <none>: no slab for object 0x%p.\n", + object); dump_stack(); } else object_err(s, page, object, @@ -921,18 +1142,23 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, goto fail; } - /* Special debug activities for freeing objects */ - if (!SlabFrozen(page) && !page->freelist) - remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); - init_object(s, object, 0); - return 1; + init_object(s, object, SLUB_RED_INACTIVE); +out: + slab_unlock(page); + /* + * Keep node_lock to preserve integrity + * until the object is actually freed + */ + return n; fail: + slab_unlock(page); + spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); - return 0; + return NULL; } static int __init setup_slub_debug(char *str) @@ 
-951,6 +1177,15 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; + if (tolower(*str) == 'o') { + /* + * Avoid enabling debugging on caches if its minimum order + * would increase as a result. + */ + disable_higher_order_debug = 1; + goto out; + } + slub_debug = 0; if (*str == '-') /* @@ -978,9 +1213,12 @@ static int __init setup_slub_debug(char *str) case 't': slub_debug |= SLAB_TRACE; break; + case 'a': + slub_debug |= SLAB_FAILSLAB; + break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + pr_err("slub_debug option '%c' unknown. skipped\n", + *str); } } @@ -993,16 +1231,16 @@ out: __setup("slub_debug", setup_slub_debug); -static unsigned long kmem_cache_flags(unsigned long objsize, +static unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. */ - if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) - flags |= slub_debug; + if (slub_debug && (!slub_debug_slabs || (name && + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) + flags |= slub_debug; return flags; } @@ -1011,46 +1249,150 @@ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } -static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } +static inline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { return NULL; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long objsize, + void *object, u8 val) { return 1; } +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } #define slub_debug 0 -#endif + +#define disable_higher_order_debug 0 + +static inline unsigned long slabs_node(struct kmem_cache *s, int node) + { return 0; } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) + { return 0; } +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} + +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) + { return 0; } + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + void *object) +{ + kmemleak_alloc_recursive(object, s->object_size, 1, 
s->flags, + flags & gfp_allowed_mask); +} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} + +#endif /* CONFIG_SLUB_DEBUG */ + /* * Slab allocation and freeing */ +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) +{ + struct page *page; + int order = oo_order(oo); + + flags |= __GFP_NOTRACK; + + if (memcg_charge_slab(s, flags, order)) + return NULL; + + if (node == NUMA_NO_NODE) + page = alloc_pages(flags, order); + else + page = alloc_pages_exact_node(node, flags, order); + + if (!page) + memcg_uncharge_slab(s, order); + + return page; +} + static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int pages = 1 << s->order; + struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; + + flags &= gfp_allowed_mask; + + if (flags & __GFP_WAIT) + local_irq_enable(); flags |= s->allocflags; - if (node == -1) - page = alloc_pages(flags, s->order); - else - page = alloc_pages_node(node, flags, s->order); + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. + */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + + page = alloc_slab_page(s, alloc_gfp, node, oo); + if (unlikely(!page)) { + oo = s->min; + alloc_gfp = flags; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + page = alloc_slab_page(s, alloc_gfp, node, oo); + + if (page) + stat(s, ORDER_FALLBACK); + } + + if (kmemcheck_enabled && page + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + int pages = 1 << oo_order(oo); + + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); + + /* + * Objects from caches that have a constructor don't get + * cleared when they're allocated, so we need to do it here. + */ + if (s->ctor) + kmemcheck_mark_uninitialized_pages(page, pages); + else + kmemcheck_mark_unallocated_pages(page, pages); + } + if (flags & __GFP_WAIT) + local_irq_disable(); if (!page) return NULL; + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); + 1 << oo_order(oo)); return page; } @@ -1060,16 +1402,16 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - struct kmem_cache_node *n; void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1078,22 +1420,20 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; - n = get_node(s, page_to_nid(page)); - if (n) - atomic_long_inc(&n->nr_slabs); - page->slab = s; - page->flags |= 1 << PG_slab; - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_TRACE)) - SetSlabDebug(page); + order = compound_order(page); + inc_slabs_node(s, page_to_nid(page), page->objects); + page->slab_cache = s; + __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << s->order); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; - for_each_object(p, s, start) { + for_each_object(p, s, start, page->objects) { setup_object(s, page, last); set_freepointer(s, last, p); last = p; @@ -1102,47 +1442,75 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->inuse = 0; + page->inuse = page->objects; + page->frozen = 1; out: return page; } static void __free_slab(struct kmem_cache *s, struct page *page) { - int pages = 1 << s->order; + int order = compound_order(page); + int pages = 1 << order; - if (unlikely(SlabDebug(page))) { + if (kmem_cache_debug(s)) { void *p; slab_pad_check(s, page); - for_each_object(p, s, page_address(page)) - check_object(s, page, p, 0); - ClearSlabDebug(page); + for_each_object(p, s, page_address(page), + page->objects) + check_object(s, page, p, SLUB_RED_INACTIVE); } + kmemcheck_free_shadow(page, compound_order(page)); + mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); - __free_pages(page, s->order); + __ClearPageSlabPfmemalloc(page); + __ClearPageSlab(page); + + page_mapcount_reset(page); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; + __free_pages(page, order); + memcg_uncharge_slab(s, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + + __free_slab(page->slab_cache, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -1151,83 +1519,103 @@ static void free_slab(struct kmem_cache *s, struct page *page) static void discard_slab(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - atomic_long_dec(&n->nr_slabs); - reset_page_mapcount(page); - __ClearPageSlab(page); + dec_slabs_node(s, page_to_nid(page), page->objects); free_slab(s, page); } /* - * Per slab locking using the pagelock - */ -static __always_inline void slab_lock(struct page *page) -{ - bit_spin_lock(PG_locked, &page->flags); -} - -static __always_inline void slab_unlock(struct page *page) -{ - __bit_spin_unlock(PG_locked, &page->flags); -} - -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; - - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} - -/* - * Management of partially allocated slabs + * Management of partially allocated slabs. */ -static void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) +static inline void +__add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - spin_lock(&n->list_lock); n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); } -static void remove_partial(struct kmem_cache *s, - struct page *page) +static inline void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + lockdep_assert_held(&n->list_lock); + __add_partial(n, page, tail); +} - spin_lock(&n->list_lock); +static inline void +__remove_partial(struct kmem_cache_node *n, struct page *page) +{ list_del(&page->lru); n->nr_partial--; - spin_unlock(&n->list_lock); +} + +static inline void remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + lockdep_assert_held(&n->list_lock); + __remove_partial(n, page); } /* - * Lock slab and remove from the partial list. + * Remove slab from the partial list, freeze it and + * return the pointer to the freelist. * - * Must hold list_lock. + * Returns a list of objects or NULL if it fails. 
*/ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) +static inline void *acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page, + int mode, int *objects) { - if (slab_trylock(page)) { - list_del(&page->lru); - n->nr_partial--; - SetSlabFrozen(page); - return 1; + void *freelist; + unsigned long counters; + struct page new; + + lockdep_assert_held(&n->list_lock); + + /* + * Zap the freelist and set the frozen bit. + * The old freelist is the list of objects for the + * per cpu allocation list. + */ + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + *objects = new.objects - new.inuse; + if (mode) { + new.inuse = page->objects; + new.freelist = NULL; + } else { + new.freelist = freelist; } - return 0; + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + if (!__cmpxchg_double_slab(s, page, + freelist, counters, + new.freelist, new.counters, + "acquire_slab")) + return NULL; + + remove_partial(n, page); + WARN_ON(!freelist); + return freelist; } +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); + /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache_node *n) +static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + struct kmem_cache_cpu *c, gfp_t flags) { - struct page *page; + struct page *page, *page2; + void *object = NULL; + int available = 0; + int objects; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1239,24 +1627,47 @@ static struct page *get_partial_node(struct kmem_cache_node *n) return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) - goto out; - page = NULL; -out: + list_for_each_entry_safe(page, page2, &n->partial, lru) { + void *t; + + if (!pfmemalloc_match(page, flags)) + continue; + + t = acquire_slab(s, n, page, object == NULL, &objects); + if (!t) + break; + + available += objects; + if (!object) { + c->page = page; + stat(s, ALLOC_FROM_PARTIAL); + object = t; + } else { + put_cpu_partial(s, page, 0); + stat(s, CPU_PARTIAL_NODE); + } + if (!kmem_cache_has_cpu_partial(s) + || available > s->cpu_partial / 2) + break; + + } spin_unlock(&n->list_lock); - return page; + return object; } /* * Get a page from somewhere. Search in increasing NUMA distances. 
*/ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static void *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA struct zonelist *zonelist; - struct zone **z; - struct page *page; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); + void *object; + unsigned int cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1280,20 +1691,30 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - zonelist = &NODE_DATA( - slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)]; - for (z = zonelist->zones; *z; z++) { - struct kmem_cache_node *n; - - n = get_node(s, zone_to_nid(*z)); - - if (n && cpuset_zone_allowed_hardwall(*z, flags) && - n->nr_partial > MIN_PARTIAL) { - page = get_partial_node(n); - if (page) - return page; + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { + object = get_partial_node(s, n, c, flags); + if (object) { + /* + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update + */ + return object; + } + } } - } + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } @@ -1301,103 +1722,359 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) /* * Get a partial page, lock it and return it. */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, + struct kmem_cache_cpu *c) { - struct page *page; - int searchnode = (node == -1) ? numa_node_id() : node; + void *object; + int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; - page = get_partial_node(get_node(s, searchnode)); - if (page || (flags & __GFP_THISNODE)) - return page; + object = get_partial_node(s, get_node(s, searchnode), c, flags); + if (object || node != NUMA_NO_NODE) + return object; - return get_any_partial(s, flags); + return get_any_partial(s, flags, c); } +#ifdef CONFIG_PREEMPT /* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. + * Calculate the next globally unique transaction for disambiguiation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus. 
*/ -static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) +#define TID_STEP 1 +#endif + +static inline unsigned long next_tid(unsigned long tid) +{ + return tid + TID_STEP; +} + +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} + +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; +} + +static inline unsigned int init_tid(int cpu) +{ + return cpu; +} + +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) { +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + pr_info("%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPT + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + pr_warn("due to cpu change %d -> %d\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + pr_warn("due to cpu running other code. Event %ld->%ld\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif + stat(s, CMPXCHG_DOUBLE_CPU_FAIL); +} + +static void init_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); +} + +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct page *page, + void *freelist) +{ + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); + int lock = 0; + enum slab_modes l = M_NONE, m = M_NONE; + void *nextfree; + int tail = DEACTIVATE_TO_HEAD; + struct page new; + struct page old; + + if (page->freelist) { + stat(s, DEACTIVATE_REMOTE_FREES); + tail = DEACTIVATE_TO_TAIL; + } - ClearSlabFrozen(page); - if (page->inuse) { + /* + * Stage one: Free all available per cpu objects back + * to the page freelist while it is still frozen. Leave the + * last one. + * + * There is no need to take the list->lock because the page + * is still frozen. + */ + while (freelist && (nextfree = get_freepointer(s, freelist))) { + void *prior; + unsigned long counters; + + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, freelist, prior); + new.counters = counters; + new.inuse--; + VM_BUG_ON(!new.frozen); + + } while (!__cmpxchg_double_slab(s, page, + prior, counters, + freelist, new.counters, + "drain percpu freelist")); + + freelist = nextfree; + } - if (page->freelist) { - add_partial(n, page, tail); - stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); - } else { - stat(c, DEACTIVATE_FULL); - if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); + /* + * Stage two: Ensure that the page is unfrozen while the + * list presence reflects the actual number of objects + * during unfreeze. + * + * We setup the list membership and then perform a cmpxchg + * with the count. If there is a mismatch then the page + * is not unfrozen but the page is on the wrong list. + * + * Then we restart the process which may have to remove + * the page from the list that we just put it on again + * because the number of objects in the slab may have + * changed. 
+ */ +redo: + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist) { + new.inuse--; + set_freepointer(s, freelist, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && n->nr_partial >= s->min_partial) + m = M_FREE; + else if (new.freelist) { + m = M_PARTIAL; + if (!lock) { + lock = 1; + /* + * Taking the spinlock removes the possiblity + * that acquire_slab() will see a slab page that + * is frozen + */ + spin_lock(&n->list_lock); } - slab_unlock(page); } else { - stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < MIN_PARTIAL) { + m = M_FULL; + if (kmem_cache_debug(s) && !lock) { + lock = 1; /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. This slab needs - * to come after the other slabs with objects in - * so that the others get filled first. That way the - * size of the partial list stays small. - * - * kmem_cache_shrink can reclaim any empty slabs from the - * partial list. + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. */ - add_partial(n, page, 1); - slab_unlock(page); - } else { - slab_unlock(page); - stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); - discard_slab(s, page); + spin_lock(&n->list_lock); } } + + if (l != m) { + + if (l == M_PARTIAL) + + remove_partial(n, page); + + else if (l == M_FULL) + + remove_full(s, n, page); + + if (m == M_PARTIAL) { + + add_partial(n, page, tail); + stat(s, tail); + + } else if (m == M_FULL) { + + stat(s, DEACTIVATE_FULL); + add_full(s, n, page); + + } + } + + l = m; + if (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + if (lock) + spin_unlock(&n->list_lock); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } } /* - * Remove the cpu slab + * Unfreeze all the cpu partial slabs. + * + * This function must be called with interrupts disabled + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). */ -static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { - struct page *page = c->page; - int tail = 1; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct kmem_cache_node *n = NULL, *n2 = NULL; + struct page *page, *discard_page = NULL; - if (page->freelist) - stat(c, DEACTIVATE_REMOTE_FREES); - /* - * Merge cpu freelist into slab freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. - */ - while (unlikely(c->freelist)) { - void **object; + while ((page = c->partial)) { + struct page new; + struct page old; + + c->partial = page->next; - tail = 0; /* Hot objects. 
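The unfreeze step above chooses the slab's target list from three inputs only: whether any objects remain in use, whether the merged freelist is non-empty, and whether the node already holds enough partial slabs. A simplified decision helper capturing that choice (target_state() is illustrative; the M_NONE start state and the debug-only full-list locking are omitted):

    enum slab_mode { M_FREE, M_PARTIAL, M_FULL };

    /*
     * Illustrative only: mirrors the "determine target state" step of
     * deactivate_slab(). inuse/has_free describe the slab after the per
     * cpu freelist has been merged back; nr_partial/min_partial come
     * from the node.
     */
    static enum slab_mode target_state(int inuse, int has_free,
                                       unsigned long nr_partial,
                                       unsigned long min_partial)
    {
        if (!inuse && nr_partial >= min_partial)
            return M_FREE;      /* empty and the node has enough partials: discard */
        if (has_free)
            return M_PARTIAL;   /* free objects left: keep on the partial list */
        return M_FULL;          /* fully allocated */
    }

The real function recomputes this after every failed cmpxchg, since frees from other cpus may have changed the counts in the meantime.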
Put the slab first */ + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); - /* Retrieve object from cpu_freelist */ - object = c->freelist; - c->freelist = c->freelist[c->offset]; + n = n2; + spin_lock(&n->list_lock); + } - /* And put onto the regular freelist */ - object[c->offset] = page->freelist; - page->freelist = object; - page->inuse--; + do { + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + new.counters = old.counters; + new.freelist = old.freelist; + + new.frozen = 0; + + } while (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { + page->next = discard_page; + discard_page = page; + } else { + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } } - c->page = NULL; - unfreeze_slab(s, page, tail); + + if (n) + spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; + discard_page = discard_page->next; + + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } +#endif +} + +/* + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. + */ +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct page *oldpage; + int pages; + int pobjects; + + do { + pages = 0; + pobjects = 0; + oldpage = this_cpu_read(s->cpu_slab->partial); + + if (oldpage) { + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > s->cpu_partial) { + unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. 
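put_cpu_partial() links the new page in front of the old per cpu partial list, folds the running pages/pobjects totals into the new head, and only then publishes it with a cmpxchg, retrying if another free raced in. The same push pattern in portable C11 atomics (fake_page, partial_head and push_partial are illustrative; the per cpu placement and the drain-to-node path are left out):

    #include <stdatomic.h>

    struct fake_page {
        struct fake_page *next;
        int pages;          /* running totals carried in the list head, as in SLUB */
        int pobjects;
    };

    static _Atomic(struct fake_page *) partial_head;    /* stand-in for one cpu's ->partial */

    static void push_partial(struct fake_page *page, int free_objects)
    {
        struct fake_page *oldpage;

        do {
            oldpage = atomic_load(&partial_head);
            page->next = oldpage;
            page->pages = 1 + (oldpage ? oldpage->pages : 0);
            page->pobjects = free_objects + (oldpage ? oldpage->pobjects : 0);
        } while (!atomic_compare_exchange_weak(&partial_head, &oldpage, page));
    }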
+ */ + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + oldpage = NULL; + pobjects = 0; + pages = 0; + stat(s, CPU_PARTIAL_DRAIN); + } + } + + pages++; + pobjects += page->objects - page->inuse; + + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; + + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) + != oldpage); +#endif } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(c, CPUSLAB_FLUSH); - slab_lock(c->page); - deactivate_slab(s, c); + stat(s, CPUSLAB_FLUSH); + deactivate_slab(s, c->page, c->freelist); + + c->tid = next_tid(c->tid); + c->page = NULL; + c->freelist = NULL; } /* @@ -1407,10 +2084,14 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c && c->page)) - flush_slab(s, c); + if (likely(c)) { + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s, c); + } } static void flush_cpu_slab(void *d) @@ -1420,38 +2101,180 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } -static void flush_all(struct kmem_cache *s) +static bool has_cpu_slab(int cpu, void *info) { -#ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); -#else - unsigned long flags; + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - local_irq_save(flags); - flush_cpu_slab(s); - local_irq_restore(flags); -#endif + return c->page || c->partial; +} + +static void flush_all(struct kmem_cache *s) +{ + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } /* * Check if the objects in a per cpu structure fit numa * locality expectations. 
*/ -static inline int node_match(struct kmem_cache_cpu *c, int node) +static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (node != -1 && c->node != node) + if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) return 0; #endif return 1; } +#ifdef CONFIG_SLUB_DEBUG +static int count_free(struct page *page) +{ + return page->objects - page->inuse; +} + +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct page *)) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += get_count(page); + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ + +static noinline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) +{ +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + int node; + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; + + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + nid, gfpflags); + pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); + + if (oo_order(s->min) > get_order(s->object_size)) + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long nr_slabs; + unsigned long nr_objs; + unsigned long nr_free; + + if (!n) + continue; + + nr_free = count_partial(n, count_free); + nr_slabs = node_nr_slabs(n); + nr_objs = node_nr_objs(n); + + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", + node, nr_slabs, nr_objs, nr_free); + } +#endif +} + +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) +{ + void *freelist; + struct kmem_cache_cpu *c = *pc; + struct page *page; + + freelist = get_partial(s, flags, node, c); + + if (freelist) + return freelist; + + page = new_slab(s, flags, node); + if (page) { + c = raw_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + freelist = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->page = page; + *pc = c; + } else + freelist = NULL; + + return freelist; +} + +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) +{ + if (unlikely(PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; +} + +/* + * Check the page->freelist of a page and either transfer the freelist to the + * per cpu freelist or deactivate the page. + * + * The page is still frozen if the return value is not NULL. + * + * If this function returns NULL then the page has been unfrozen. + * + * This function must be called with interrupt disabled. 
+ */ +static inline void *get_freelist(struct kmem_cache *s, struct page *page) +{ + struct page new; + unsigned long counters; + void *freelist; + + do { + freelist = page->freelist; + counters = page->counters; + + new.counters = counters; + VM_BUG_ON(!new.frozen); + + new.inuse = page->objects; + new.frozen = freelist != NULL; + + } while (!__cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "get_freelist")); + + return freelist; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * - * Interrupts are disabled. - * * Processing is still very fast if new objects have been freed to the * regular freelist. In that case we simply take over the regular freelist * as the lockless freelist and zap the regular freelist. @@ -1464,99 +2287,107 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { - void **object; - struct page *new; + void *freelist; + struct page *page; + unsigned long flags; - /* We handle __GFP_ZERO in the caller */ - gfpflags &= ~__GFP_ZERO; + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif - if (!c->page) + page = c->page; + if (!page) goto new_slab; +redo: - slab_lock(c->page); - if (unlikely(!node_match(c, node))) - goto another_slab; + if (unlikely(!node_match(page, node))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } - stat(c, ALLOC_REFILL); + /* + * By rights, we should be searching for a slab page that was + * PFMEMALLOC but right now, we are losing the pfmemalloc + * information when the page leaves the per-cpu allocator + */ + if (unlikely(!pfmemalloc_match(page, gfpflags))) { + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } -load_freelist: - object = c->page->freelist; - if (unlikely(!object)) - goto another_slab; - if (unlikely(SlabDebug(c->page))) - goto debug; - - c->freelist = object[c->offset]; - c->page->inuse = s->objects; - c->page->freelist = NULL; - c->node = page_to_nid(c->page); -unlock_out: - slab_unlock(c->page); - stat(c, ALLOC_SLOWPATH); - return object; + /* must check again c->freelist in case of cpu migration or IRQ */ + freelist = c->freelist; + if (freelist) + goto load_freelist; -another_slab: - deactivate_slab(s, c); + freelist = get_freelist(s, page); -new_slab: - new = get_partial(s, gfpflags, node); - if (new) { - c->page = new; - stat(c, ALLOC_FROM_PARTIAL); - goto load_freelist; + if (!freelist) { + c->page = NULL; + stat(s, DEACTIVATE_BYPASS); + goto new_slab; } - if (gfpflags & __GFP_WAIT) - local_irq_enable(); + stat(s, ALLOC_REFILL); - new = new_slab(s, gfpflags, node); +load_freelist: + /* + * freelist is pointing to the list of objects to be used. + * page is pointing to the page from which the objects are obtained. + * That page must be frozen for per cpu allocations to work. 
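Throughout these paths the free list is intrusive: a free object stores the pointer to the next free object inside itself, at s->offset (get_freepointer()/set_freepointer()). A self-contained sketch of that threading, assuming offset 0 and 64-byte objects:

    #include <stdio.h>

    /* Each free object holds the pointer to the next free object inside itself
     * (at s->offset in SLUB; offset 0 assumed here for simplicity). */
    static void *get_freepointer(void *object)             { return *(void **)object; }
    static void  set_freepointer(void *object, void *next) { *(void **)object = next; }

    int main(void)
    {
        void *slab[4][8];       /* four 64-byte "objects" on a 64-bit build */
        void *freelist = NULL;
        int n = 0;

        /* thread all objects onto the freelist, last one first */
        for (int i = 3; i >= 0; i--) {
            set_freepointer(slab[i], freelist);
            freelist = slab[i];
        }

        /* pop until empty */
        while (freelist) {
            freelist = get_freepointer(freelist);
            n++;
        }
        printf("popped %d objects\n", n);
        return 0;
    }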
+ */ + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); + c->tid = next_tid(c->tid); + local_irq_restore(flags); + return freelist; - if (gfpflags & __GFP_WAIT) - local_irq_disable(); +new_slab: - if (new) { - c = get_cpu_slab(s, smp_processor_id()); - stat(c, ALLOC_SLAB); - if (c->page) - flush_slab(s, c); - slab_lock(new); - SetSlabFrozen(new); - c->page = new; - goto load_freelist; + if (c->partial) { + page = c->page = c->partial; + c->partial = page->next; + stat(s, CPU_PARTIAL_ALLOC); + c->freelist = NULL; + goto redo; } - /* - * No memory available. - * - * If the slab uses higher order allocs but the object is - * smaller than a page size then we can fallback in emergencies - * to the page allocator via kmalloc_large. The page allocator may - * have failed to obtain a higher order page and we can try to - * allocate a single page if the object fits into a single page. - * That is only possible if certain conditions are met that are being - * checked when a slab is created. - */ - if (!(gfpflags & __GFP_NORETRY) && - (s->flags & __PAGE_ALLOC_FALLBACK)) { - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - object = kmalloc_large(s->objsize, gfpflags); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); - return object; + freelist = new_slab_objects(s, gfpflags, node, &c); + + if (unlikely(!freelist)) { + slab_out_of_memory(s, gfpflags, node); + local_irq_restore(flags); + return NULL; } - return NULL; -debug: - if (!alloc_debug_processing(s, c->page, object, addr)) - goto another_slab; - c->page->inuse++; - c->page->freelist = object[c->offset]; - c->node = -1; - goto unlock_out; + page = c->page; + if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) + goto load_freelist; + + /* Only entered in the debug case */ + if (kmem_cache_debug(s) && + !alloc_debug_processing(s, page, freelist, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ + + deactivate_slab(s, page, get_freepointer(s, freelist)); + c->page = NULL; + c->freelist = NULL; + local_irq_restore(flags); + return freelist; } /* @@ -1569,44 +2400,136 @@ debug: * * Otherwise we can simply pick the next object from the lockless free list. */ -static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) +static __always_inline void *slab_alloc_node(struct kmem_cache *s, + gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; - unsigned long flags; + struct page *page; + unsigned long tid; - local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - if (unlikely(!c->freelist || !node_match(c, node))) + if (slab_pre_alloc_hook(s, gfpflags)) + return NULL; + + s = memcg_kmem_get_cache(s, gfpflags); +redo: + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + * + * Preemption is disabled for the retrieval of the tid because that + * must occur from the current processor. We cannot allow rescheduling + * on a different processor between the determination of the pointer + * and the retrieval of the tid. + */ + preempt_disable(); + c = this_cpu_ptr(s->cpu_slab); + + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. 
Thus they can be guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + tid = c->tid; + preempt_enable(); + object = c->freelist; + page = c->page; + if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); + stat(s, ALLOC_SLOWPATH); + } else { + void *next_object = get_freepointer_safe(s, object); - else { - object = c->freelist; - c->freelist = object[c->offset]; - stat(c, ALLOC_FASTPATH); + /* + * The cmpxchg will only match if there was no additional + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock + * semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only + * against code executing on this cpu *not* from access by + * other cpus. + */ + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + next_object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } + prefetch_freepointer(s, next_object); + stat(s, ALLOC_FASTPATH); } - local_irq_restore(flags); - if (unlikely((gfpflags & __GFP_ZERO) && object)) - memset(object, 0, c->objsize); + if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, s->object_size); + + slab_post_alloc_hook(s, gfpflags, object); return object; } +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, unsigned long addr) +{ + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); +} + void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, + s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc(s, gfpflags, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_trace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + + trace_kmem_cache_alloc_node(_RET_IP_, ret, + s->object_size, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); + +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, + int node, size_t size) +{ + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); +#endif #endif /* @@ -1618,62 +2541,116 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. 
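The fastpath above relies on a double-width cmpxchg of (freelist, tid): the pop only commits if neither the head of the per cpu freelist nor the transaction id moved, which rules out both a cpu migration and an interleaved operation. A rough userspace model of that check, packing both words into one atomic struct (pcpu_slot, try_pop and the TID_STEP of 4 are illustrative; the kernel uses this_cpu_cmpxchg_double() on percpu data and get_freepointer_safe() for the speculative next pointer):

    #include <stdatomic.h>
    #include <stddef.h>

    struct pcpu_slot {
        void *freelist;
        unsigned long tid;
    };

    static _Atomic struct pcpu_slot slot;   /* stand-in for one cpu's kmem_cache_cpu */

    /* Pop one object iff neither the freelist nor the tid moved under us */
    static void *try_pop(unsigned long tid_step)
    {
        struct pcpu_slot old = atomic_load(&slot);

        while (old.freelist) {
            struct pcpu_slot new = {
                .freelist = *(void **)old.freelist,     /* next free object */
                .tid = old.tid + tid_step,
            };

            if (atomic_compare_exchange_weak(&slot, &old, new))
                return old.freelist;
            /* old was refreshed by the failed cmpxchg: retry */
        }
        return NULL;
    }

    int main(void)
    {
        void *objs[3] = { &objs[1], &objs[2], NULL };   /* pre-threaded freelist */
        struct pcpu_slot init = { .freelist = &objs[0], .tid = 0 };

        atomic_store(&slot, init);
        while (try_pop(4))      /* TID_STEP of 4 assumed, as in a 4-CPU build */
            ;
        return 0;
    }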
*/ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr) { void *prior; void **object = (void *)x; - struct kmem_cache_cpu *c; + int was_frozen; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long uninitialized_var(flags); - c = get_cpu_slab(s, raw_smp_processor_id()); - stat(c, FREE_SLOWPATH); - slab_lock(page); + stat(s, FREE_SLOWPATH); + + if (kmem_cache_debug(s) && + !(n = free_debug_processing(s, page, x, addr, &flags))) + return; - if (unlikely(SlabDebug(page))) - goto debug; + do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen) { -checks_ok: - prior = object[offset] = page->freelist; - page->freelist = object; - page->inuse--; + if (kmem_cache_has_cpu_partial(s) && !prior) { - if (unlikely(SlabFrozen(page))) { - stat(c, FREE_FROZEN); - goto out_unlock; - } + /* + * Slab was on no list before and will be + * partially empty + * We can defer the list move and instead + * freeze it. + */ + new.frozen = 1; + + } else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } + } + + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); + + if (likely(!n)) { + + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) { + put_cpu_partial(s, page, 1); + stat(s, CPU_PARTIAL_FREE); + } + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } - if (unlikely(!page->inuse)) + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. If it was not on the partial list before * then add it. */ - if (unlikely(!prior)) { - add_partial(get_node(s, page_to_nid(page)), page, 1); - stat(c, FREE_ADD_PARTIAL); + if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (kmem_cache_debug(s)) + remove_full(s, n, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } - -out_unlock: - slab_unlock(page); + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: if (prior) { /* - * Slab still on the partial list. + * Slab on the partial list. */ - remove_partial(s, page); - stat(c, FREE_REMOVE_PARTIAL); + remove_partial(n, page); + stat(s, FREE_REMOVE_PARTIAL); + } else { + /* Slab must be on the full list */ + remove_full(s, n, page); } - slab_unlock(page); - stat(c, FREE_SLAB); - discard_slab(s, page); - return; -debug: - if (!free_debug_processing(s, page, x, addr)) - goto out_unlock; - goto checks_ok; + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, FREE_SLAB); + discard_slab(s, page); } /* @@ -1688,45 +2665,53 @@ debug: * with all sorts of special processing. 
*/ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; struct kmem_cache_cpu *c; - unsigned long flags; + unsigned long tid; - local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - debug_check_no_locks_freed(object, c->objsize); - if (likely(page == c->page && c->node >= 0)) { - object[c->offset] = c->freelist; - c->freelist = object; - stat(c, FREE_FASTPATH); - } else - __slab_free(s, page, x, addr, c->offset); + slab_free_hook(s, x); - local_irq_restore(flags); -} +redo: + /* + * Determine the currently cpus per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succedd. + */ + preempt_disable(); + c = this_cpu_ptr(s->cpu_slab); -void kmem_cache_free(struct kmem_cache *s, void *x) -{ - struct page *page; + tid = c->tid; + preempt_enable(); - page = virt_to_head_page(x); + if (likely(page == c->page)) { + set_freepointer(s, object, c->freelist); - slab_free(s, page, x, __builtin_return_address(0)); -} -EXPORT_SYMBOL(kmem_cache_free); + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { -/* Figure out on which slab object the object resides */ -static struct page *get_object_page(const void *x) -{ - struct page *page = virt_to_head_page(x); + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } + stat(s, FREE_FASTPATH); + } else + __slab_free(s, page, x, addr); - if (!PageSlab(page)) - return NULL; +} - return page; +void kmem_cache_free(struct kmem_cache *s, void *x) +{ + s = cache_from_obj(s, x); + if (!s) + return; + slab_free(s, virt_to_head_page(x), x, _RET_IP_); + trace_kmem_cache_free(_RET_IP_, x); } +EXPORT_SYMBOL(kmem_cache_free); /* * Object placement in a slab is made very easy because we always start at @@ -1748,8 +2733,8 @@ static struct page *get_object_page(const void *x) * take the list_lock. */ static int slub_min_order; -static int slub_max_order = DEFAULT_MAX_ORDER; -static int slub_min_objects = DEFAULT_MIN_OBJECTS; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects; /* * Merge control. If this is set then no merging of slab caches will occur. @@ -1764,7 +2749,7 @@ static int slub_nomerge; * system components. Generally order 0 allocations should be preferred since * order 0 does not cause fragmentation in the page allocator. Larger objects * be problematic to put into order 0 slabs because there may be too much - * unused space left. We go to a higher order if more than 1/8th of the slab + * unused space left. We go to a higher order if more than 1/16th of the slab * would be wasted. * * In order to reach satisfactory performance we must ensure that a minimum @@ -1783,22 +2768,25 @@ static int slub_nomerge; * the smallest order which will fit the object. 
*/ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; + for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -1808,11 +2796,12 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; int fraction; + int max_objects; /* * Attempt to find best configuration for a slab. This @@ -1823,257 +2812,117 @@ static inline int calculate_order(int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = order_objects(slub_max_order, size, reserved); + min_objects = min(min_objects, max_objects); + while (min_objects > 1) { - fraction = 8; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; } - min_objects /= 2; + min_objects--; } /* * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) + order = slab_order(size, 1, MAX_ORDER, 1, reserved); + if (order < MAX_ORDER) return order; return -ENOSYS; } -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - -static void init_kmem_cache_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - c->page = NULL; - c->freelist = NULL; - c->node = 0; - c->offset = s->offset / sizeof(void *); - c->objsize = s->objsize; -} - -static void init_kmem_cache_node(struct kmem_cache_node *n) +static void +init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; - atomic_long_set(&n->nr_slabs, 0); spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif } -#ifdef CONFIG_SMP -/* - * Per cpu array for per cpu structures. 
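slab_order() accepts the smallest order whose leftover space is at most 1/fract_leftover of the slab, and calculate_order() walks that test from 1/16 down to 1/4 while relaxing min_objects. A standalone sketch of the waste test with 4 KiB pages and no reserved bytes (order_for() is an illustrative simplification of slab_order()):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* Illustrative reimplementation of the waste check in slab_order() */
    static int order_for(unsigned long size, unsigned long min_objects,
                         int max_order, unsigned long fract_leftover)
    {
        int order;

        for (order = 0; order <= max_order; order++) {
            unsigned long slab_size = PAGE_SIZE << order;
            unsigned long rem = slab_size % size;

            if (slab_size < min_objects * size)
                continue;
            if (rem <= slab_size / fract_leftover)
                return order;
        }
        return -1;
    }

    int main(void)
    {
        printf("order=%d\n", order_for(700, 4, 3, 16));
        return 0;
    }

For a 700 byte object an order-0 slab wastes 596 of 4096 bytes, more than 1/16, while an order-1 slab wastes only 492 of 8192 bytes, so order 1 is chosen.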
- * - * The per cpu array places all kmem_cache_cpu structures from one processor - * close together meaning that it becomes possible that multiple per cpu - * structures are contained in one cacheline. This may be particularly - * beneficial for the kmalloc caches. - * - * A desktop system typically has around 60-80 slabs. With 100 here we are - * likely able to get per cpu structures for all caches from the array defined - * here. We must be able to cover all kmalloc caches during bootstrap. - * - * If the per cpu array is exhausted then fall back to kmalloc - * of individual cachelines. No sharing is possible then. - */ -#define NR_KMEM_CACHE_CPU 100 - -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; - -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; - -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, - int cpu, gfp_t flags) -{ - struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); - - if (c) - per_cpu(kmem_cache_cpu_free, cpu) = - (void *)c->freelist; - else { - /* Table overflow: So allocate ourselves */ - c = kmalloc_node( - ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), - flags, cpu_to_node(cpu)); - if (!c) - return NULL; - } - - init_kmem_cache_cpu(s, c); - return c; -} - -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) -{ - if (c < per_cpu(kmem_cache_cpu, cpu) || - c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { - kfree(c); - return; - } - c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); - per_cpu(kmem_cache_cpu_free, cpu) = c; -} - -static void free_kmem_cache_cpus(struct kmem_cache *s) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { - int cpu; + BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < + KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) { - s->cpu_slab[cpu] = NULL; - free_kmem_cache_cpu(c, cpu); - } - } -} - -static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - int cpu; + /* + * Must align to double word boundary for the double cmpxchg + * instructions to work; see __pcpu_double_call_return_bool(). + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), + 2 * sizeof(void *)); - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (!s->cpu_slab) + return 0; - if (c) - continue; + init_kmem_cache_cpus(s); - c = alloc_kmem_cache_cpu(s, cpu, flags); - if (!c) { - free_kmem_cache_cpus(s); - return 0; - } - s->cpu_slab[cpu] = c; - } return 1; } -/* - * Initialize the per cpu array. - */ -static void init_alloc_cpu_cpu(int cpu) -{ - int i; - - if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) - return; +static struct kmem_cache *kmem_cache_node; - for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) - free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - - cpu_set(cpu, kmem_cach_cpu_free_init_once); -} - -static void __init init_alloc_cpu(void) -{ - int cpu; - - for_each_online_cpu(cpu) - init_alloc_cpu_cpu(cpu); - } - -#else -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} -static inline void init_alloc_cpu(void) {} - -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - init_kmem_cache_cpu(s, &s->cpu_slab); - return 1; -} -#endif - -#ifdef CONFIG_NUMA /* * No kmalloc_node yet so do it by hand. We know that this is the first * slab on the node for this slabcache. 
There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ -static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; - unsigned long flags; - BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags, node); + page = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!page); if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } n = page->freelist; BUG_ON(!n); - page->freelist = get_freepointer(kmalloc_caches, n); - page->inuse++; - kmalloc_caches->node[node] = n; + page->freelist = get_freepointer(kmem_cache_node, n); + page->inuse = 1; + page->frozen = 0; + kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG - init_object(kmalloc_caches, n, 1); - init_tracking(kmalloc_caches, n); + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); #endif init_kmem_cache_node(n); - atomic_long_inc(&n->nr_slabs); + inc_slabs_node(kmem_cache_node, node, page->objects); /* - * lockdep requires consistent irq usage for each lock - * so even though there cannot be a race this early in - * the boot sequence, we still disable irqs. + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. 
*/ - local_irq_save(flags); - add_partial(n, page, 0); - local_irq_restore(flags); - return n; + __add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2082,68 +2931,57 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; - if (n && n != &s->local_node) - kmem_cache_free(kmalloc_caches, n); + + if (n) + kmem_cache_free(kmem_cache_node, n); + s->node[node] = NULL; } } -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; - int local_node; - - if (slab_state >= UP) - local_node = page_to_nid(virt_to_page(s)); - else - local_node = 0; for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n; - if (local_node == node) - n = &s->local_node; - else { - if (slab_state == DOWN) { - n = early_kmem_cache_node_alloc(gfpflags, - node); - continue; - } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); - - if (!n) { - free_kmem_cache_nodes(s); - return 0; - } + if (slab_state == DOWN) { + early_kmem_cache_node_alloc(node); + continue; + } + n = kmem_cache_alloc_node(kmem_cache_node, + GFP_KERNEL, node); + if (!n) { + free_kmem_cache_nodes(s); + return 0; } + s->node[node] = n; init_kmem_cache_node(n); } return 1; } -#else -static void free_kmem_cache_nodes(struct kmem_cache *s) -{ -} -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +static void set_min_partial(struct kmem_cache *s, unsigned long min) { - init_kmem_cache_node(&s->local_node); - return 1; + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; } -#endif /* * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->objsize; - unsigned long align = s->align; + unsigned long size = s->object_size; + int order; /* * Round up object size to the next word boundary. We can only @@ -2170,7 +3008,7 @@ static int calculate_sizes(struct kmem_cache *s) * end of the object and the free pointer. If not then add an * additional word to have some bytes to store Redzone information. */ - if ((flags & SLAB_RED_ZONE) && size == s->objsize) + if ((flags & SLAB_RED_ZONE) && size == s->object_size) size += sizeof(void *); #endif @@ -2207,50 +3045,33 @@ static int calculate_sizes(struct kmem_cache *s) * Add some empty padding so that we can catch * overwrites from earlier objects rather than let * tracking information or the free pointer be - * corrupted if an user writes before the start + * corrupted if a user writes before the start * of the object. */ size += sizeof(void *); #endif /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. - */ - align = calculate_alignment(flags, align, s->objsize); - - /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. 
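The ALIGN() step that follows simply rounds the object size up to the next multiple of the (power-of-two) alignment, so that objects can be laid end to end starting from offset 0. The usual rounding idiom, for reference (ALIGN_UP is an illustrative local macro):

    #include <stdio.h>

    /* Round x up to the next multiple of the power-of-two a, as ALIGN() does */
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    int main(void)
    {
        /* a 70 byte object with 8 byte alignment occupies 72 bytes per slot */
        printf("%lu\n", ALIGN_UP(70UL, 8UL));
        return 0;
    }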
*/ - size = ALIGN(size, align); + size = ALIGN(size, s->align); s->size = size; + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size, s->reserved); - if ((flags & __KMALLOC_CACHE) && - PAGE_SIZE / size < slub_min_objects) { - /* - * Kmalloc cache that would not have enough objects in - * an order 0 page. Kmalloc slabs can fallback to - * page allocator order 0 allocs so take a reasonably large - * order that will allows us a good number of objects. - */ - s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER); - s->flags |= __PAGE_ALLOC_FALLBACK; - s->allocflags |= __GFP_NOWARN; - } else - s->order = calculate_order(size); - - if (s->order < 0) + if (order < 0) return 0; s->allocflags = 0; - if (s->order) + if (order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) - s->allocflags |= SLUB_DMA; + s->allocflags |= GFP_DMA; if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; @@ -2258,107 +3079,141 @@ static int calculate_sizes(struct kmem_cache *s) /* * Determine the number of objects per slab */ - s->objects = (PAGE_SIZE << s->order) / size; - - return !!s->objects; + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; + return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, - const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) +static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { - memset(s, 0, kmem_size); - s->name = name; - s->ctor = ctor; - s->objsize = size; - s->align = align; - s->flags = kmem_cache_flags(size, flags, name, ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); + s->reserved = 0; + + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); - if (!calculate_sizes(s)) + if (!calculate_sizes(s, -1)) goto error; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s, -1)) + goto error; + } + } + +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. 
+ */ + if (!kmem_cache_has_cpu_partial(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; - s->refcount = 1; #ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 100; + s->remote_node_defrag_ratio = 1000; #endif - if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (!init_kmem_cache_nodes(s)) goto error; - if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) - return 1; + if (alloc_kmem_cache_cpus(s)) + return 0; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, s->order, - s->offset, flags); - return 0; + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); + return -EINVAL; } -/* - * Check if a given pointer is valid - */ -int kmem_ptr_validate(struct kmem_cache *s, const void *object) +static void list_slab_objects(struct kmem_cache *s, struct page *page, + const char *text) { - struct page *page; - - page = get_object_page(object); - - if (!page || s != page->slab) - /* No slab or wrong slab */ - return 0; - - if (!check_valid_pointer(s, page, object)) - return 0; - - /* - * We could also check if the object is on the slabs freelist. - * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid() is to check if the object belongs - * to a certain slab. - */ - return 1; -} -EXPORT_SYMBOL(kmem_ptr_validate); +#ifdef CONFIG_SLUB_DEBUG + void *addr = page_address(page); + void *p; + unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * + sizeof(long), GFP_ATOMIC); + if (!map) + return; + slab_err(s, page, text, s->name); + slab_lock(page); -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->objsize; -} -EXPORT_SYMBOL(kmem_cache_size); + get_map(s, page, map); + for_each_object(p, s, addr, page->objects) { -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; + if (!test_bit(slab_index(p, s, addr), map)) { + pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + print_tracking(s, p); + } + } + slab_unlock(page); + kfree(map); +#endif } -EXPORT_SYMBOL(kmem_cache_name); /* - * Attempt to free all slabs on a node. Return the number of slabs we - * were unable to free. + * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore. 
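Both tunables above are pure functions of the object size: min_partial is ilog2(size)/2 clamped to the [MIN_PARTIAL, MAX_PARTIAL] range, and cpu_partial steps down from 30 to 2 as objects get larger. A standalone sketch of those defaults (the clamp bounds of 5 and 10 are assumed from SLUB's usual MIN_PARTIAL/MAX_PARTIAL definitions, and ilog2_() is a local helper):

    #include <stdio.h>

    #define PAGE_SIZE   4096UL
    #define MIN_PARTIAL 5   /* assumed values; see the clamp in set_min_partial() */
    #define MAX_PARTIAL 10

    static unsigned long ilog2_(unsigned long x)
    {
        unsigned long log = 0;

        while (x >>= 1)
            log++;
        return log;
    }

    static unsigned long min_partial_for(unsigned long size)
    {
        unsigned long min = ilog2_(size) / 2;

        if (min < MIN_PARTIAL)
            min = MIN_PARTIAL;
        else if (min > MAX_PARTIAL)
            min = MAX_PARTIAL;
        return min;
    }

    static unsigned int cpu_partial_for(unsigned long size)
    {
        if (size >= PAGE_SIZE)
            return 2;
        if (size >= 1024)
            return 6;
        if (size >= 256)
            return 13;
        return 30;
    }

    int main(void)
    {
        unsigned long sizes[] = { 32, 192, 1024, 8192 };
        int n = (int)(sizeof(sizes) / sizeof(sizes[0]));

        for (int i = 0; i < n; i++)
            printf("size=%-5lu min_partial=%lu cpu_partial=%u\n",
                   sizes[i], min_partial_for(sizes[i]), cpu_partial_for(sizes[i]));
        return 0;
    }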
*/ -static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, - struct list_head *list) +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - int slabs_inuse = 0; - unsigned long flags; struct page *page, *h; - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, list, lru) + list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - list_del(&page->lru); + __remove_partial(n, page); discard_slab(s, page); - } else - slabs_inuse++; - spin_unlock_irqrestore(&n->list_lock, flags); - return slabs_inuse; + } else { + list_slab_objects(s, page, + "Objects remaining in %s on kmem_cache_close()"); + } + } } /* @@ -2369,50 +3224,28 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - /* Attempt to free all objects */ - free_kmem_cache_cpus(s); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - n->nr_partial -= free_list(s, n, &n->partial); - if (atomic_long_read(&n->nr_slabs)) + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) return 1; } + free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); return 0; } -/* - * Close a cache and release the kmem_cache structure - * (must be used for caches created using kmem_cache_create) - */ -void kmem_cache_destroy(struct kmem_cache *s) -{ - down_write(&slub_lock); - s->refcount--; - if (!s->refcount) { - list_del(&s->list); - up_write(&slub_lock); - if (kmem_cache_close(s)) - WARN_ON(1); - sysfs_slab_remove(s); - } else - up_write(&slub_lock); +int __kmem_cache_shutdown(struct kmem_cache *s) +{ + return kmem_cache_close(s); } -EXPORT_SYMBOL(kmem_cache_destroy); /******************************************************************** * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; -EXPORT_SYMBOL(kmalloc_caches); - -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2425,6 +3258,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); return 1; } @@ -2448,187 +3282,67 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, - const char *name, int size, gfp_t gfp_flags) -{ - unsigned int flags = 0; - - if (gfp_flags & SLUB_DMA) - flags = SLAB_CACHE_DMA; - - down_write(&slub_lock); - if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags | __KMALLOC_CACHE, NULL)) - goto panic; - - list_add(&s->list, &slab_caches); - up_write(&slub_lock); - if (sysfs_slab_add(s)) - goto panic; - return s; - -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); -} - -#ifdef CONFIG_ZONE_DMA - -static void sysfs_add_func(struct work_struct *w) -{ - struct kmem_cache *s; - - down_write(&slub_lock); - list_for_each_entry(s, &slab_caches, list) { - if (s->flags & __SYSFS_ADD_DEFERRED) { - s->flags &= ~__SYSFS_ADD_DEFERRED; - sysfs_slab_add(s); - } - } - up_write(&slub_lock); -} - -static DECLARE_WORK(sysfs_add_work, sysfs_add_func); - -static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) -{ - struct kmem_cache *s; - char *text; - size_t realsize; - - s = 
kmalloc_caches_dma[index]; - if (s) - return s; - - /* Dynamically create dma cache */ - if (flags & __GFP_WAIT) - down_write(&slub_lock); - else { - if (!down_write_trylock(&slub_lock)) - goto out; - } - - if (kmalloc_caches_dma[index]) - goto unlock_out; - - realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = kmalloc(kmem_size, flags & ~SLUB_DMA); - - if (!s || !text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { - kfree(s); - kfree(text); - goto unlock_out; - } - - list_add(&s->list, &slab_caches); - kmalloc_caches_dma[index] = s; - - schedule_work(&sysfs_add_work); - -unlock_out: - up_write(&slub_lock); -out: - return kmalloc_caches_dma[index]; -} -#endif - -/* - * Conversion table for small slabs sizes / 8 to the index in the - * kmalloc array. This is necessary for slabs < 192 since we have non power - * of two cache sizes there. The size of larger slabs can be determined using - * fls. - */ -static s8 size_index[24] = { - 3, /* 8 */ - 4, /* 16 */ - 5, /* 24 */ - 5, /* 32 */ - 6, /* 40 */ - 6, /* 48 */ - 6, /* 56 */ - 6, /* 64 */ - 1, /* 72 */ - 1, /* 80 */ - 1, /* 88 */ - 1, /* 96 */ - 7, /* 104 */ - 7, /* 112 */ - 7, /* 120 */ - 7, /* 128 */ - 2, /* 136 */ - 2, /* 144 */ - 2, /* 152 */ - 2, /* 160 */ - 2, /* 168 */ - 2, /* 176 */ - 2, /* 184 */ - 2 /* 192 */ -}; - -static struct kmem_cache *get_slab(size_t size, gfp_t flags) -{ - int index; - - if (size <= 192) { - if (!size) - return ZERO_SIZE_PTR; - - index = size_index[(size - 1) / 8]; - } else - index = fls(size - 1); - -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) - return dma_kmalloc_cache(index, flags); - -#endif - return &kmalloc_caches[index]; -} - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, flags); - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + ret = slab_alloc(s, flags, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); +#ifdef CONFIG_NUMA static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - struct page *page = alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); + struct page *page; + void *ptr = NULL; + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmalloc_large_node_hook(ptr, size, flags); + return ptr; } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + return ret; + } - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + ret = slab_alloc_node(s, flags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2636,38 +3350,18 
@@ EXPORT_SYMBOL(__kmalloc_node); size_t ksize(const void *object) { struct page *page; - struct kmem_cache *s; if (unlikely(object == ZERO_SIZE_PTR)) return 0; page = virt_to_head_page(object); - if (unlikely(!PageSlab(page))) + if (unlikely(!PageSlab(page))) { + WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); + } - s = page->slab; - -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; + return slab_ksize(page->slab_cache); } EXPORT_SYMBOL(ksize); @@ -2676,33 +3370,22 @@ void kfree(const void *x) struct page *page; void *object = (void *)x; + trace_kfree(_RET_IP_, x); + if (unlikely(ZERO_OR_NULL_PTR(x))) return; page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { - put_page(page); + BUG_ON(!PageCompound(page)); + kfree_hook(x); + __free_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab, page, object, __builtin_return_address(0)); + slab_free(page->slab_cache, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); -#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLABINFO) -static unsigned long count_partial(struct kmem_cache_node *n) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += page->inuse; - spin_unlock_irqrestore(&n->list_lock, flags); - return x; -} -#endif - /* * kmem_cache_shrink removes empty slabs from the partial lists and sorts * the remaining slabs by the number of items in use. The slabs with the @@ -2713,15 +3396,16 @@ static unsigned long count_partial(struct kmem_cache_node *n) * being allocated from last increasing the chance that the last objects * are freed in them. */ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; struct kmem_cache_node *n; struct page *page; struct page *t; + int objects = oo_objects(s->max); struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); unsigned long flags; if (!slabs_by_inuse) @@ -2734,7 +3418,7 @@ int kmem_cache_shrink(struct kmem_cache *s) if (!n->nr_partial) continue; - for (i = 0; i < s->objects; i++) + for (i = 0; i < objects; i++) INIT_LIST_HEAD(slabs_by_inuse + i); spin_lock_irqsave(&n->list_lock, flags); @@ -2746,46 +3430,37 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ - list_del(&page->lru); + list_move(&page->lru, slabs_by_inuse + page->inuse); + if (!page->inuse) n->nr_partial--; - slab_unlock(page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } } /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. 
*/ - for (i = s->objects - 1; i >= 0; i--) + for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + discard_slab(s, page); } kfree(slabs_by_inuse); return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); - up_read(&slub_lock); + __kmem_cache_shrink(s); + mutex_unlock(&slab_mutex); return 0; } @@ -2797,7 +3472,7 @@ static void slab_mem_offline_callback(void *arg) struct memory_notify *marg = arg; int offline_node; - offline_node = marg->status_change_nid; + offline_node = marg->status_change_nid_normal; /* * If the node still has available memory. we need kmem_cache_node @@ -2806,23 +3481,23 @@ static void slab_mem_offline_callback(void *arg) if (offline_node < 0) return; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { n = get_node(s, offline_node); if (n) { /* * if n->nr_slabs > 0, slabs still exist on the node * that is going down. We were unable to free them, - * and offline_pages() function shoudn't call this + * and offline_pages() function shouldn't call this * callback. So, we must fail. */ - BUG_ON(atomic_long_read(&n->nr_slabs)); + BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); } } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); } static int slab_mem_going_online_callback(void *arg) @@ -2830,7 +3505,7 @@ static int slab_mem_going_online_callback(void *arg) struct kmem_cache_node *n; struct kmem_cache *s; struct memory_notify *marg = arg; - int nid = marg->status_change_nid; + int nid = marg->status_change_nid_normal; int ret = 0; /* @@ -2841,18 +3516,18 @@ static int slab_mem_going_online_callback(void *arg) return 0; /* - * We are bringing a node online. No memory is availabe yet. We must + * We are bringing a node online. No memory is available yet. We must * allocate a kmem_cache_node structure in order to bring the node * online. */ - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that * is brought up. 
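__kmem_cache_shrink() above buckets every partial slab by its inuse count, splices the buckets back with the fullest slabs first, and frees whatever landed in the empty bucket. The ordering effect in a tiny self-contained sketch (plain arrays stand in for the per-bucket lists; OBJECTS and the inuse values are made up):

    #include <stdio.h>

    #define OBJECTS 4   /* objects per slab, illustrative */

    /*
     * Slabs are grouped by "inuse" and reassembled fullest first; the
     * empty bucket is then discarded, as in __kmem_cache_shrink().
     */
    int main(void)
    {
        int inuse[] = { 1, 4, 0, 3, 1, 2 };     /* per-slab object counts */
        int nslabs = (int)(sizeof(inuse) / sizeof(inuse[0]));

        for (int bucket = OBJECTS; bucket > 0; bucket--)
            for (int i = 0; i < nslabs; i++)
                if (inuse[i] == bucket)
                    printf("keep slab %d (inuse=%d)\n", i, bucket);

        for (int i = 0; i < nslabs; i++)
            if (inuse[i] == 0)
                printf("discard slab %d (empty)\n", i);

        return 0;
    }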
*/ - n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); + n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { ret = -ENOMEM; goto out; @@ -2861,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg) s->node[nid] = n; } out: - up_read(&slub_lock); + mutex_unlock(&slab_mutex); return ret; } @@ -2885,100 +3560,109 @@ static int slab_memory_callback(struct notifier_block *self, case MEM_CANCEL_OFFLINE: break; } - - ret = notifier_from_errno(ret); + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; return ret; } -#endif /* CONFIG_MEMORY_HOTPLUG */ +static struct notifier_block slab_memory_callback_nb = { + .notifier_call = slab_memory_callback, + .priority = SLAB_CALLBACK_PRI, +}; /******************************************************************** * Basic setup of slabs *******************************************************************/ -void __init kmem_cache_init(void) +/* + * Used for early kmem_cache structures that were allocated using + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. + */ + +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { - int i; - int caches = 0; + int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - init_alloc_cpu(); + memcpy(s, static_cache, kmem_cache->object_size); -#ifdef CONFIG_NUMA /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. + * This runs very early, and only the boot processor is supposed to be + * up. Even if it weren't true, IRQs are not up so we couldn't fire + * IPIs around. */ - create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_KERNEL); - kmalloc_caches[0].refcount = -1; - caches++; + __flush_cpu_slab(s, smp_processor_id()); + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + struct page *p; + + if (n) { + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; - hotplug_memory_notifier(slab_memory_callback, 1); +#ifdef CONFIG_SLUB_DEBUG + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif + } + } + list_add(&s->list, &slab_caches); + return s; +} + +void __init kmem_cache_init(void) +{ + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; + + if (debug_guardpage_minorder()) + slub_max_order = 0; + + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; + + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); + + register_hotmemory_notifier(&slab_memory_callback_nb); /* Able to allocate the per node structures */ slab_state = PARTIAL; - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 64) { - create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_KERNEL); - caches++; - } - if (KMALLOC_MIN_SIZE <= 128) { - create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_KERNEL); - caches++; - } - - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { - create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_KERNEL); - caches++; - } + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); + kmem_cache = bootstrap(&boot_kmem_cache); /* - * Patch up the size_index table if we have 
strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. - * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. */ - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || - (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) - size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; - - slab_state = UP; - - /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) - kmalloc_caches[i]. name = - kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + /* Now we can use the kmem_cache to allocate kmalloc slabs */ + create_kmalloc_caches(0); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct kmem_cache_cpu *); -#else - kmem_size = sizeof(struct kmem_cache); #endif - printk(KERN_INFO - "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", - caches, cache_line_size(), + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } +void __init kmem_cache_init_late(void) +{ +} + /* * Find a mergeable slab cache */ @@ -2987,7 +3671,7 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if ((s->flags & __PAGE_ALLOC_FALLBACK)) + if (!is_root_cache(s)) return 1; if (s->ctor) @@ -3002,9 +3686,8 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3027,7 +3710,7 @@ static struct kmem_cache *find_mergeable(size_t size, continue; if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; + continue; /* * Check if alignment is compatible. * Courtesy of Adrian Drzewiecki @@ -3043,68 +3726,70 @@ static struct kmem_cache *find_mergeable(size_t size, return NULL; } -struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { - int cpu; + int i; + struct kmem_cache *c; s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. 
*/ - s->objsize = max(s->objsize, (int)size); - - /* - * And then we need to update the object size in the - * per cpu structures - */ - for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->objsize = s->objsize; - + s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - up_write(&slub_lock); - if (sysfs_slab_alias(s, name)) - goto err; - return s; - } + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } - s = kmalloc(kmem_size, GFP_KERNEL); - if (s) { - if (kmem_cache_open(s, GFP_KERNEL, name, - size, align, flags, ctor)) { - list_add(&s->list, &slab_caches); - up_write(&slub_lock); - if (sysfs_slab_add(s)) - goto err; - return s; + if (sysfs_slab_alias(s, name)) { + s->refcount--; + s = NULL; } - kfree(s); } - up_write(&slub_lock); -err: - if (flags & SLAB_PANIC) - panic("Cannot create slabcache %s\n", name); - else - s = NULL; return s; } -EXPORT_SYMBOL(kmem_cache_create); + +int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +{ + int err; + + err = kmem_cache_open(s, flags); + if (err) + return err; + + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + memcg_propagate_slab_attrs(s); + err = sysfs_slab_add(s); + if (err) + kmem_cache_close(s); + + return err; +} #ifdef CONFIG_SMP /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. */ -static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, +static int slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -3112,31 +3797,17 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_alloc_cpu_cpu(cpu); - down_read(&slub_lock); - list_for_each_entry(s, &slab_caches, list) - s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, - GFP_KERNEL); - up_read(&slub_lock); - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); - free_kmem_cache_cpu(c, cpu); - s->cpu_slab[cpu] = NULL; } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); break; default: break; @@ -3144,44 +3815,77 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata slab_notifier = { +static struct notifier_block slab_notifier = { .notifier_call = slab_cpuup_callback }; #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, gfpflags); - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, caller); + + /* Honor the call site pointer we received. 
*/ + trace_kmalloc(caller, ret, size, s->size, gfpflags); + + return ret; } +#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = kmalloc_large_node(size, gfpflags, node); - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, gfpflags, node); + trace_kmalloc_node(caller, ret, + size, PAGE_SIZE << get_order(size), + gfpflags, node); - s = get_slab(size, gfpflags); + return ret; + } + + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller); + + /* Honor the call site pointer we received. */ + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + + return ret; +} +#endif + +#ifdef CONFIG_SYSFS +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; } +#endif -#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) +#ifdef CONFIG_SLUB_DEBUG static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3193,17 +3897,18 @@ static int validate_slab(struct kmem_cache *s, struct page *page, return 0; /* Now we know that a valid freelist exists */ - bitmap_zero(map, s->objects); + bitmap_zero(map, page->objects); - for_each_free_object(p, s, page->freelist) { - set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) - return 0; + get_map(s, page, map); + for_each_object(p, s, addr, page->objects) { + if (test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) + return 0; } - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } @@ -3211,22 +3916,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, static void validate_slab_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { - if (slab_trylock(page)) { - validate_slab(s, page, map); - slab_unlock(page); - } else - printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", - s->name, page); - - if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug not set " - "on slab 0x%p\n", s->name, page); - } else { - if (SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug set on " - "slab 0x%p\n", s->name, page); - } + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, @@ -3243,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -3254,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); @@ -3267,7 +3958,7 
@@ static long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) * + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); if (!map) @@ -3282,65 +3973,6 @@ static long validate_slab_cache(struct kmem_cache *s) kfree(map); return count; } - -#ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) -{ - u8 *p; - - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); - - p = kzalloc(16, GFP_KERNEL); - p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); - - validate_slab_cache(kmalloc_caches + 4); - - /* Hmmm... The next two are dangerous */ - p = kzalloc(32, GFP_KERNEL); - p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - - validate_slab_cache(kmalloc_caches + 5); - p = kzalloc(64, GFP_KERNEL); - p += 64 + (get_cycles() & 0xff) * sizeof(void *); - *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches + 6); - - printk(KERN_ERR "\nB. Corruption after free\n"); - p = kzalloc(128, GFP_KERNEL); - kfree(p); - *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 7); - - p = kzalloc(256, GFP_KERNEL); - kfree(p); - p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); - validate_slab_cache(kmalloc_caches + 8); - - p = kzalloc(512, GFP_KERNEL); - kfree(p); - p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 9); -} -#else -static void resiliency_test(void) {}; -#endif - /* * Generate lists of code addresses where slabcache objects are allocated * and freed. 
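/*
 * Editor's illustration, not part of the original diff: the validate_slab()
 * hunk above first marks every object reachable from the slab's freelist in a
 * bitmap (get_map()), then walks all object slots and checks each one against
 * the state the bitmap says it should be in. A minimal user-space sketch of
 * that two-pass walk; all types and values here (fake_slab, NOBJ, the poison
 * check) are simplified stand-ins, not kernel definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define NOBJ 8
#define POISON_FREE 0x6b	/* same idea as SLUB's poison byte for free objects */

struct fake_slab {
	unsigned char obj[NOBJ];	/* first byte of each object slot */
	int freelist[NOBJ + 1];		/* indices of free slots, -1 terminated */
};

static int validate_fake_slab(const struct fake_slab *s)
{
	bool is_free[NOBJ] = { false };

	/* Pass 1 (the get_map() step): mark every slot reachable from the freelist. */
	for (int i = 0; s->freelist[i] >= 0; i++)
		is_free[s->freelist[i]] = true;

	/* Pass 2 (the for_each_object() walk): free slots must still hold poison. */
	for (int i = 0; i < NOBJ; i++)
		if (is_free[i] && s->obj[i] != POISON_FREE) {
			printf("free slot %d was overwritten: 0x%02x\n", i, s->obj[i]);
			return 0;
		}
	return 1;
}

int main(void)
{
	struct fake_slab s = {
		.obj = { 0x11, 0x22, POISON_FREE, 0x44, 0x55, 0x99, 0x77, 0x88 },
		.freelist = { 2, 5, -1 },
	};

	printf("slab valid: %d\n", validate_fake_slab(&s));	/* slot 5 trips the check */
	return 0;
}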
@@ -3348,13 +3980,13 @@ static void resiliency_test(void) {}; struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; long min_pid; long max_pid; - cpumask_t cpus; + DECLARE_BITMAP(cpus, NR_CPUS); nodemask_t nodes; }; @@ -3396,7 +4028,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; @@ -3429,7 +4061,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (track->pid > l->max_pid) l->max_pid = track->pid; - cpu_set(track->cpu, l->cpus); + cpumask_set_cpu(track->cpu, + to_cpumask(l->cpus)); } node_set(page_to_nid(virt_to_page(track)), l->nodes); return 1; @@ -3459,25 +4092,24 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->max_time = age; l->min_pid = track->pid; l->max_pid = track->pid; - cpus_clear(l->cpus); - cpu_set(track->cpu, l->cpus); + cpumask_clear(to_cpumask(l->cpus)); + cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); node_set(page_to_nid(virt_to_page(track)), l->nodes); return 1; } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc) + struct page *page, enum track_item alloc, + unsigned long *map) { void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); void *p; - bitmap_zero(map, s->objects); - for_each_free_object(p, s, page->freelist) - set_bit(slab_index(p, s, addr), map); + bitmap_zero(map, page->objects); + get_map(s, page, map); - for_each_object(p, s, addr) + for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) add_location(t, s, get_track(s, p, alloc)); } @@ -3489,11 +4121,14 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * + sizeof(unsigned long), GFP_KERNEL); - if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_TEMPORARY)) + if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_TEMPORARY)) { + kfree(map); return sprintf(buf, "Out of memory\n"); - + } /* Push back cpu slabs */ flush_all(s); @@ -3507,31 +4142,29 @@ static int list_locations(struct kmem_cache *s, char *buf, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; - if (len > PAGE_SIZE - 100) + if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) break; len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, "<not-available>"); if (l->sum_time != l->min_time) { - unsigned long remainder; - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - div_long_long_rem(l->sum_time, l->count, &remainder), - l->max_time); + l->min_time, + (long)div_u64(l->sum_time, l->count), + l->max_time); } else len += sprintf(buf + len, " age=%ld", l->min_time); @@ -3543,106 +4176,192 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, " 
pid=%ld", l->min_pid); - if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && + if (num_online_cpus() > 1 && + !cpumask_empty(to_cpumask(l->cpus)) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); - len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->cpus); + len += cpulist_scnprintf(buf + len, + PAGE_SIZE - len - 50, + to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); - len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->nodes); + len += nodelist_scnprintf(buf + len, + PAGE_SIZE - len - 50, + l->nodes); } len += sprintf(buf + len, "\n"); } free_loc_track(&t); + kfree(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; } +#endif + +#ifdef SLUB_RESILIENCY_TEST +static void resiliency_test(void) +{ + u8 *p; + + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); + + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); + + validate_slab_cache(kmalloc_caches[4]); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches[5]); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches[6]); + + pr_err("\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[7]); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[8]); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + pr_err("\n3. 
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[9]); +} +#else +#ifdef CONFIG_SYSFS +static void resiliency_test(void) {}; +#endif +#endif +#ifdef CONFIG_SYSFS enum slab_stat_type { - SL_FULL, - SL_PARTIAL, - SL_CPU, - SL_OBJECTS + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ }; -#define SO_FULL (1 << SL_FULL) +#define SO_ALL (1 << SL_ALL) #define SO_PARTIAL (1 << SL_PARTIAL) #define SO_CPU (1 << SL_CPU) #define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { unsigned long total = 0; - int cpu; int node; int x; unsigned long *nodes; - unsigned long *per_cpu; - nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); if (!nodes) return -ENOMEM; - per_cpu = nodes + nr_node_ids; - for_each_possible_cpu(cpu) { - struct page *page; - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (flags & SO_CPU) { + int cpu; - if (!c) - continue; + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, + cpu); + int node; + struct page *page; - page = c->page; - node = c->node; - if (node < 0) - continue; - if (page) { - if (flags & SO_CPU) { - if (flags & SO_OBJECTS) - x = page->inuse; + page = ACCESS_ONCE(c->page); + if (!page) + continue; + + node = page_to_nid(page); + if (flags & SO_TOTAL) + x = page->objects; + else if (flags & SO_OBJECTS) + x = page->inuse; + else + x = 1; + + total += x; + nodes[node] += x; + + page = ACCESS_ONCE(c->partial); + if (page) { + node = page_to_nid(page); + if (flags & SO_TOTAL) + WARN_ON_ONCE(1); + else if (flags & SO_OBJECTS) + WARN_ON_ONCE(1); else - x = 1; + x = page->pages; total += x; nodes[node] += x; } - per_cpu[node]++; } } - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - - if (flags & SO_PARTIAL) { - if (flags & SO_OBJECTS) - x = count_partial(n); + get_online_mems(); +#ifdef CONFIG_SLUB_DEBUG + if (flags & SO_ALL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); else - x = n->nr_partial; + x = atomic_long_read(&n->nr_slabs); total += x; nodes[node] += x; } - if (flags & SO_FULL) { - int full_slabs = atomic_long_read(&n->nr_slabs) - - per_cpu[node] - - n->nr_partial; - - if (flags & SO_OBJECTS) - x = full_slabs * s->objects; + } else +#endif + if (flags & SO_PARTIAL) { + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); else - x = full_slabs; + x = n->nr_partial; total += x; nodes[node] += x; } } - x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA for_each_node_state(node, N_NORMAL_MEMORY) @@ -3650,21 +4369,15 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif + put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } +#ifdef CONFIG_SLUB_DEBUG static int any_slab_objects(struct kmem_cache *s) { int node; 
- int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c && c->page) - return 1; - } for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -3672,14 +4385,15 @@ static int any_slab_objects(struct kmem_cache *s) if (!n) continue; - if (n->nr_partial || atomic_long_read(&n->nr_slabs)) + if (atomic_long_read(&n->total_objects)) return 1; } return 0; } +#endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) -#define to_slab(n) container_of(n, struct kmem_cache, kobj); +#define to_slab(n) container_of(n, struct kmem_cache, kobj) struct slab_attribute { struct attribute attr; @@ -3688,11 +4402,12 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO(_name) + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) #define SLAB_ATTR(_name) \ static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + __ATTR(_name, 0600, _name##_show, _name##_store) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -3708,30 +4423,87 @@ SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objsize); + return sprintf(buf, "%d\n", s->object_size); } SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objects); + return sprintf(buf, "%d\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long order; + int err; + + err = kstrtoul(buf, 10, &order); + if (err) + return err; + + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->order); + return sprintf(buf, "%d\n", oo_order(s->oo)); } -SLAB_ATTR_RO(order); +SLAB_ATTR(order); -static ssize_t ctor_show(struct kmem_cache *s, char *buf) +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); + return sprintf(buf, "%lu\n", s->min_partial); +} - return n + sprintf(buf + n, "\n"); - } - return 0; +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = kstrtoul(buf, 10, &min); + if (err) + return err; + + set_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); + +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%u\n", s->cpu_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long objects; + int err; + + err = kstrtoul(buf, 10, &objects); + if (err) + return err; + if (objects && !kmem_cache_has_cpu_partial(s)) + return -EINVAL; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + +static ssize_t ctor_show(struct kmem_cache *s, char *buf) +{ + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); @@ -3741,12 +4513,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(aliases); -static ssize_t slabs_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); -} -SLAB_ATTR_RO(slabs); - static ssize_t 
partial_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_PARTIAL); @@ -3761,39 +4527,46 @@ SLAB_ATTR_RO(cpu_slabs); static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); } SLAB_ATTR_RO(objects); -static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); } +SLAB_ATTR_RO(objects_partial); -static ssize_t sanity_checks_store(struct kmem_cache *s, - const char *buf, size_t length) +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') - s->flags |= SLAB_DEBUG_FREE; - return length; -} -SLAB_ATTR(sanity_checks); + int objects = 0; + int pages = 0; + int cpu; + int len; -static ssize_t trace_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); -} + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; -static ssize_t trace_store(struct kmem_cache *s, const char *buf, - size_t length) -{ - s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') - s->flags |= SLAB_TRACE; - return length; + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); } -SLAB_ATTR(trace); +SLAB_ATTR_RO(slabs_cpu_partial); static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { @@ -3830,6 +4603,59 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + +#ifdef CONFIG_SLUB_DEBUG +static ssize_t slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL); +} +SLAB_ATTR_RO(slabs); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); +} + +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_DEBUG_FREE; + } + return length; +} +SLAB_ATTR(sanity_checks); + +static ssize_t trace_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); +} + +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_TRACE; + } + return length; +} +SLAB_ATTR(trace); + static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); @@ -3842,9 +4668,11 @@ static ssize_t red_zone_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == 
'1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; - calculate_sizes(s); + } + calculate_sizes(s, -1); return length; } SLAB_ATTR(red_zone); @@ -3861,9 +4689,11 @@ static ssize_t poison_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_POISON; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; - calculate_sizes(s); + } + calculate_sizes(s, -1); return length; } SLAB_ATTR(poison); @@ -3880,9 +4710,11 @@ static ssize_t store_user_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_STORE_USER; - calculate_sizes(s); + } + calculate_sizes(s, -1); return length; } SLAB_ATTR(store_user); @@ -3906,6 +4738,40 @@ static ssize_t validate_store(struct kmem_cache *s, } SLAB_ATTR(validate); +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t shrink_show(struct kmem_cache *s, char *buf) { return 0; @@ -3925,22 +4791,6 @@ static ssize_t shrink_store(struct kmem_cache *s, } SLAB_ATTR(shrink); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); - #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -3950,10 +4800,16 @@ static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { - int n = simple_strtoul(buf, NULL, 10); + unsigned long ratio; + int err; + + err = kstrtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio <= 100) + s->remote_node_defrag_ratio = ratio * 10; - if (n < 100) - s->remote_node_defrag_ratio = n * 10; return length; } SLAB_ATTR(remote_node_defrag_ratio); @@ -3971,7 +4827,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = get_cpu_slab(s, cpu)->stat[si]; + unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -3979,20 +4835,38 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) len = sprintf(buf, "%lu", sum); +#ifdef CONFIG_SMP for_each_online_cpu(cpu) { if (data[cpu] && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]); + len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); } +#endif kfree(data); return len + sprintf(buf + len, "\n"); } 
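/*
 * Editor's illustration, not part of the original diff: show_stat() above sums
 * one statistics slot across every online CPU's kmem_cache_cpu and then
 * appends a " C<n>=<count>" entry for each CPU that contributed. A user-space
 * sketch of that aggregation; the per-CPU array and its contents are made up
 * in place of per_cpu_ptr().
 */
#include <stdio.h>

#define NR_FAKE_CPUS 4

int main(void)
{
	unsigned int stat[NR_FAKE_CPUS] = { 17, 0, 5, 3 };	/* one stat slot per CPU */
	char buf[256];
	unsigned long sum = 0;
	int len;

	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		sum += stat[cpu];

	/* Total first, then a per-CPU breakdown for CPUs with a non-zero count. */
	len = snprintf(buf, sizeof(buf), "%lu", sum);
	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		if (stat[cpu] && len < (int)sizeof(buf) - 20)
			len += snprintf(buf + len, sizeof(buf) - len,
					" C%d=%u", cpu, stat[cpu]);

	printf("%s\n", buf);	/* prints "25 C0=17 C2=5 C3=3" */
	return 0;
}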
+static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; +} + #define STAT_ATTR(si, text) \ static ssize_t text##_show(struct kmem_cache *s, char *buf) \ { \ return show_stat(s, buf, si); \ } \ -SLAB_ATTR_RO(text); \ +static ssize_t text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); @@ -4004,6 +4878,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); @@ -4011,7 +4886,14 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); - +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); +STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); +STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); +STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); #endif static struct attribute *slab_attrs[] = { @@ -4019,25 +4901,33 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &min_partial_attr.attr, + &cpu_partial_attr.attr, &objects_attr.attr, - &slabs_attr.attr, + &objects_partial_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, &ctor_attr.attr, &aliases_attr.attr, &align_attr.attr, - &sanity_checks_attr.attr, - &trace_attr.attr, &hwcache_align_attr.attr, &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, + &shrink_attr.attr, + &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, +#ifdef CONFIG_SLUB_DEBUG + &total_objects_attr.attr, + &slabs_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, &red_zone_attr.attr, &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, +#endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4055,6 +4945,7 @@ static struct attribute *slab_attrs[] = { &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, @@ -4062,7 +4953,19 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, + &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, + &cpu_partial_node_attr.attr, + &cpu_partial_drain_attr.attr, #endif +#ifdef CONFIG_FAILSLAB + &failslab_attr.attr, +#endif + NULL }; @@ -4104,25 +5007,111 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + + 
mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } -static void kmem_cache_release(struct kobject *kobj) +static void memcg_propagate_slab_attrs(struct kmem_cache *s) { - struct kmem_cache *s = to_slab(kobj); +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + struct kmem_cache *root_cache; + + if (is_root_cache(s)) + return; + + root_cache = s->memcg_params->root_cache; + + /* + * This mean this cache had no attribute written. Therefore, no point + * in copying default values around + */ + if (!root_cache->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; - kfree(s); + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. 
+ */ + if (buffer) + buf = buffer; + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); } -static struct sysfs_ops slab_sysfs_ops = { +static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, }; static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, - .release = kmem_cache_release + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -4134,12 +5123,21 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static struct kset_uevent_ops slab_uevent_ops = { +static const struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params->root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -4167,9 +5165,18 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_DEBUG_FREE) *p++ = 'F'; + if (!(s->flags & SLAB_NOTRACK)) + *p++ = 't'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", + memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -4178,13 +5185,8 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; - int unmergeable; - - if (slab_state < SYSFS) - /* Defer until later */ - return 0; + int unmergeable = slab_unmergeable(s); - unmergeable = slab_unmergeable(s); if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -4201,27 +5203,53 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = slab_kset; - err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); - if (err) { - kobject_put(&s->kobj); - return err; - } + s->kobj.kset = cache_kset(s); + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); + if (err) + goto out_put_kobj; err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) - return err; + goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + err = -ENOMEM; + goto out_del_kobj; + } + } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { + if (slab_state < FULL) + /* + * Sysfs has not been setup yet so no need to remove the + * cache from sysfs. 
+ */ + return; + +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -4229,7 +5257,7 @@ static void sysfs_slab_remove(struct kmem_cache *s) /* * Need to buffer aliases during bootup until sysfs becomes - * available lest we loose that information. + * available lest we lose that information. */ struct saved_alias { struct kmem_cache *s; @@ -4243,7 +5271,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) { struct saved_alias *al; - if (slab_state == SYSFS) { + if (slab_state == FULL) { /* * If we have a leftover link then remove it. */ @@ -4267,19 +5295,22 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; + mutex_lock(&slab_mutex); + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { - printk(KERN_ERR "Cannot register slab subsystem.\n"); + mutex_unlock(&slab_mutex); + pr_err("Cannot register slab subsystem.\n"); return -ENOSYS; } - slab_state = SYSFS; + slab_state = FULL; list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -4288,100 +5319,56 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); kfree(al); } + mutex_unlock(&slab_mutex); resiliency_test(); return 0; } __initcall(slab_sysfs_init); -#endif +#endif /* CONFIG_SYSFS */ /* * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO - -ssize_t slabinfo_write(struct file *file, const char __user * buffer, - size_t count, loff_t *ppos) +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { - return -EINVAL; -} - - -static void print_slabinfo_header(struct seq_file *m) -{ - seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); - seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - down_read(&slub_lock); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - up_read(&slub_lock); -} - -static int s_show(struct seq_file *m, void *p) -{ - unsigned long nr_partials = 0; unsigned long nr_slabs = 0; - unsigned long nr_inuse = 0; - unsigned long nr_objs; - struct kmem_cache *s; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; int node; - s = list_entry(p, struct kmem_cache, list); - for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); if (!n) continue; - nr_partials += n->nr_partial; - nr_slabs += atomic_long_read(&n->nr_slabs); - nr_inuse += count_partial(n); + nr_slabs += node_nr_slabs(n); + nr_objs += node_nr_objs(n); + nr_free += count_partial(n, count_free); } - nr_objs = nr_slabs * s->objects; - nr_inuse += (nr_slabs - nr_partials) * s->objects; - - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, 
s->size, s->objects, (1 << s->order)); - seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); - seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, - 0UL); - seq_putc(m, '\n'); - return 0; + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); } -const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) +{ +} +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return -EIO; +} #endif /* CONFIG_SLABINFO */ |
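/*
 * Editor's illustration, not part of the original diff: get_slabinfo() above
 * derives the /proc/slabinfo figures per node as
 *	active_objs = total objects - free objects on partial slabs
 * (full slabs have no free objects, so only the partial lists contribute to
 * nr_free). A tiny arithmetic sketch with made-up per-node numbers.
 */
#include <stddef.h>
#include <stdio.h>

struct fake_node {
	unsigned long nr_slabs;
	unsigned long nr_objs;
	unsigned long nr_free;	/* free objects on this node's partial slabs */
};

int main(void)
{
	struct fake_node nodes[] = {
		{ .nr_slabs = 10, .nr_objs = 320, .nr_free = 12 },
		{ .nr_slabs =  4, .nr_objs = 128, .nr_free = 40 },
	};
	unsigned long slabs = 0, objs = 0, free_objs = 0;

	for (size_t i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++) {
		slabs += nodes[i].nr_slabs;
		objs  += nodes[i].nr_objs;
		free_objs += nodes[i].nr_free;
	}

	printf("active_objs=%lu num_objs=%lu num_slabs=%lu\n",
	       objs - free_objs, objs, slabs);	/* active_objs=396 num_objs=448 num_slabs=14 */
	return 0;
}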
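/*
 * Editor's illustration, not part of the original diff: the __kmem_cache_shrink()
 * hunk earlier in this diff does a single-pass bucket sort of the partial list,
 * dropping each slab into an array of lists indexed by page->inuse, splicing
 * the buckets back most-full first, and discarding whatever landed in bucket 0
 * (completely empty slabs). A user-space sketch of that bucket sort over
 * made-up slab records; MAX_OBJECTS and the inuse values are invented.
 */
#include <stdio.h>

#define MAX_OBJECTS 4	/* objects per slab in this toy cache */

int main(void)
{
	int inuse[] = { 2, 0, 3, 1, 0, 3, 2 };	/* slabs currently on the partial list */
	int n = sizeof(inuse) / sizeof(inuse[0]);
	int bucket[MAX_OBJECTS][8];
	int count[MAX_OBJECTS] = { 0 };

	/* One pass: drop each slab into the bucket for its inuse count. */
	for (int i = 0; i < n; i++)
		bucket[inuse[i]][count[inuse[i]]++] = i;

	/* Rebuild the "partial list": fullest slabs first, bucket 0 is released. */
	printf("new partial order:");
	for (int b = MAX_OBJECTS - 1; b > 0; b--)
		for (int j = 0; j < count[b]; j++)
			printf(" slab%d(inuse=%d)", bucket[b][j], b);
	printf("\ndiscard %d empty slab(s)\n", count[0]);
	return 0;
}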
