diff options
Diffstat (limited to 'mm/slub.c')
| -rw-r--r-- | mm/slub.c | 3060 |
1 files changed, 1855 insertions, 1205 deletions
diff --git a/mm/slub.c b/mm/slub.c index 8fd5401bb07..73004808537 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2,10 +2,11 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks and only - * uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using per slab locks or atomic operatios + * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter */ #include <linux/mm.h> @@ -15,7 +16,9 @@ #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/slab.h> +#include "slab.h" #include <linux/proc_fs.h> +#include <linux/notifier.h> #include <linux/seq_file.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> @@ -27,18 +30,37 @@ #include <linux/memory.h> #include <linux/math64.h> #include <linux/fault-inject.h> +#include <linux/stacktrace.h> +#include <linux/prefetch.h> +#include <linux/memcontrol.h> + +#include <trace/events/kmem.h> + +#include "internal.h" /* * Lock order: - * 1. slab_lock(page) - * 2. slab->list_lock + * 1. slab_mutex (Global Mutex) + * 2. node->list_lock + * 3. slab_lock(page) (Only on some arches and for debugging) + * + * slab_mutex + * + * The role of the slab_mutex is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. * - * The slab_lock protects operations on the object of a particular - * slab and its metadata in the page struct. If the slab lock - * has been taken then no allocations nor frees can be performed - * on the objects in the slab nor can the slab be added or removed - * from the partial or full lists since this would mean modifying - * the page_struct of the slab. + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct. Meaning + * A. page->freelist -> List of object free in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -51,20 +73,6 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * - * The lock order is sometimes inverted when we are trying to get a slab - * off a list. We take the list_lock and then look for a page on the list - * to use. While we do that objects in the slabs may be freed. We can - * only operate on the slab if we have also taken the slab_lock. So we use - * a slab_trylock() on the slab. If trylock was successful then no frees - * can occur anymore and we can use the slab for allocations etc. If the - * slab_trylock() does not succeed then frees are in progress in the slab and - * we must stay away from it for a while since we may cause a bouncing - * cacheline if we try to acquire the lock. So go onto the next slab. - * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has noone operating on it and thus there is - * no danger of cacheline contention. - * * Interrupts are disabled during allocation and deallocation in order to * make the slab allocator safe to use in the context of an irq. In addition * interrupts are disabled to ensure that the processor does not change @@ -106,9 +114,6 @@ * the fast path and disables lockless freelists. */ -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) - static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG @@ -118,6 +123,15 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + return !kmem_cache_debug(s); +#else + return false; +#endif +} + /* * Issues still to be resolved: * @@ -129,6 +143,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -138,7 +155,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 @@ -164,33 +181,25 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ - -static int kmem_size = sizeof(struct kmem_cache); +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ #ifdef CONFIG_SMP static struct notifier_block slab_notifier; #endif -static enum { - DOWN, /* No slab functionality available */ - PARTIAL, /* Kmem_cache_node works */ - UP, /* Everything works but does not show up in sysfs */ - SYSFS /* Sysfs up */ -} slab_state = DOWN; - -/* A list of all slab caches on the system */ -static DECLARE_RWSEM(slub_lock); -static LIST_HEAD(slab_caches); - /* * Tracking user of a slab. */ +#define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -201,24 +210,22 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) -{ - kfree(s->name); - kfree(s); -} - +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif -static inline void stat(struct kmem_cache *s, enum stat_item si) +static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. + */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -226,11 +233,6 @@ static inline void stat(struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -int slab_is_available(void) -{ - return slab_state >= UP; -} - static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { return s->node[node]; @@ -259,6 +261,23 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return *(void **)(object + s->offset); } +static void prefetch_freepointer(const struct kmem_cache *s, void *object) +{ + prefetch(object + s->offset); +} + +static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) +{ + void *p; + +#ifdef CONFIG_DEBUG_PAGEALLOC + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); +#else + p = get_freepointer(s, object); +#endif + return p; +} + static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { *(void **)(object + s->offset) = fp; @@ -269,21 +288,46 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) -/* Scan freelist */ -#define for_each_free_object(__p, __s, __free) \ - for (__p = (__free); __p; __p = get_freepointer((__s), __p)) - /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { return (p - addr) / s->size; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -299,8 +343,130 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ + struct page tmp; + tmp.counters = counters_new; + /* + * page->counters can cover frozen/inuse/objects as well + * as page->_count. If we assign to ->counters directly + * we run the risk of losing updates to page->_count, so + * be careful and only assign to the fields we need. + */ + page->frozen = tmp.frozen; + page->inuse = tmp.inuse; + page->objects = tmp.objects; +} + +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, &page->counters, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + set_page_slub_counters(page, counters_new); + slab_unlock(page); + return 1; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + pr_info("%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, &page->counters, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + set_page_slub_counters(page, counters_new); + slab_unlock(page); + local_irq_restore(flags); + return 1; + } + slab_unlock(page); + local_irq_restore(flags); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + pr_info("%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + #ifdef CONFIG_SLUB_DEBUG /* + * Determine a map of object in use on a page. + * + * Node listlock must be held to guarantee that the page does + * not vanish from under us. + */ +static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) +{ + void *p; + void *addr = page_address(page); + + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit(slab_index(p, s, addr), map); +} + +/* * Debug settings: */ #ifdef CONFIG_SLUB_DEBUG_ON @@ -317,34 +483,8 @@ static int disable_higher_order_debug; */ static void print_section(char *text, u8 *addr, unsigned int length) { - int i, offset; - int newline = 1; - char ascii[17]; - - ascii[16] = 0; - - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); - newline = 0; - } - printk(KERN_CONT " %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; - if (offset == 15) { - printk(KERN_CONT " %s\n", ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(KERN_CONT " "); - ascii[i] = ' '; - i++; - } - printk(KERN_CONT " %s\n", ascii); - } + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -366,6 +506,24 @@ static void set_track(struct kmem_cache *s, void *object, struct track *p = get_track(s, object, alloc); if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + save_stack_trace(&trace); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; @@ -388,8 +546,18 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + pr_err("\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif } static void print_tracking(struct kmem_cache *s, void *object) @@ -403,35 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } static void slab_bug(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("-----------------------------------------------------------------------------\n\n"); + + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s: %s\n", s->name, buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -443,17 +613,17 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); if (p > addr + 16) - print_section("Bytes b4", p - 16, 16); - - print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); + print_section("Bytes b4 ", p - 16, 16); + print_section("Object ", p, min_t(unsigned long, s->object_size, + PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, - s->inuse - s->objsize); + print_section("Redzone ", p + s->object_size, + s->inuse - s->object_size); if (s->offset) off = s->offset + sizeof(void *); @@ -465,7 +635,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Padding", p + off, s->size - off); + print_section("Padding ", p + off, s->size - off); dump_stack(); } @@ -477,7 +647,8 @@ static void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) +static void slab_err(struct kmem_cache *s, struct page *page, + const char *fmt, ...) { va_list args; char buf[100]; @@ -495,23 +666,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) u8 *p = object; if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->objsize - 1); - p[s->objsize - 1] = POISON_END; + memset(p, POISON_FREE, s->object_size - 1); + p[s->object_size - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, val, s->inuse - s->objsize); -} - -static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) -{ - while (bytes) { - if (*start != (u8)value) - return start; - start++; - bytes--; - } - return NULL; + memset(p + s->object_size, val, s->inuse - s->object_size); } static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@ -528,7 +688,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; - fault = check_bytes(start, value, bytes); + fault = memchr_inv(start, value, bytes); if (!fault) return 1; @@ -537,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault[0], value); print_trailer(s, page, object); @@ -556,10 +716,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) * - * object + s->objsize + * object + s->object_size * Padding to reach word boundary. This is also used for Redzoning. * Padding is extended by another word if Redzoning is enabled and - * objsize == inuse. + * object_size == inuse. * * We fill with 0xbb (RED_INACTIVE) for inactive objects and with * 0xcc (RED_ACTIVE) for objects in use. @@ -578,7 +738,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object + s->size * Nothing is used beyond s->size. * - * If slabcaches are merged then the objsize and inuse boundaries are mostly + * If slabcaches are merged then the object_size and inuse boundaries are mostly * ignored. And therefore no slab options that rely on these boundaries * may be used with merged slabcaches. */ @@ -615,20 +775,20 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) return 1; - fault = check_bytes(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", end - remainder, remainder); + print_section("Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -638,25 +798,26 @@ static int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { u8 *p = object; - u8 *endobject = object + s->objsize; + u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, val, s->inuse - s->objsize)) + endobject, val, s->inuse - s->object_size)) return 0; } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { + if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, page, p, "Alignment padding", - endobject, POISON_INUSE, s->inuse - s->objsize); + endobject, POISON_INUSE, + s->inuse - s->object_size); } } if (s->flags & SLAB_POISON) { if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, - POISON_FREE, s->objsize - 1) || + POISON_FREE, s->object_size - 1) || !check_bytes_and_report(s, page, p, "Poison", - p + s->objsize - 1, POISON_END, 1))) + p + s->object_size - 1, POISON_END, 1))) return 0; /* * check_pad_bytes cleans up on its own. @@ -696,7 +857,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -719,10 +880,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { int nr = 0; - void *fp = page->freelist; + void *fp; void *object = NULL; unsigned long max_objects; + fp = page->freelist; while (fp && nr <= page->objects) { if (fp == search) return 1; @@ -731,7 +893,6 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) object_err(s, page, object, "Freechain corrupt"); set_freepointer(s, object, NULL); - break; } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; @@ -746,7 +907,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -769,14 +930,15 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", object, page->inuse, page->freelist); if (!alloc) - print_section("Object", (void *)object, s->objsize); + print_section("Object ", (void *)object, + s->object_size); dump_stack(); } @@ -786,57 +948,76 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); might_sleep_if(flags & __GFP_WAIT); - return should_failslab(s->objsize, flags, s->flags); + return should_failslab(s->object_size, flags, s->flags); } -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, s->objsize); - kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) -{ - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(x, s->object_size); } /* * Tracking of fully allocated slabs for debugging purposes. */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - spin_lock(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) + return; + + lockdep_assert_held(&n->list_lock); list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); } -static void remove_full(struct kmem_cache *s, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { - struct kmem_cache_node *n; - if (!(s->flags & SLAB_STORE_USER)) return; - n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); + lockdep_assert_held(&n->list_lock); list_del(&page->lru); - spin_unlock(&n->list_lock); } /* Tracking of the number of slabs for debugging purposes */ @@ -862,7 +1043,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (n) { + if (likely(n)) { atomic_long_inc(&n->nr_slabs); atomic_long_add(objects, &n->total_objects); } @@ -886,17 +1067,13 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; - if (!on_freelist(s, page, object)) { - object_err(s, page, object, "Object already allocated"); - goto bad; - } - if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); goto bad; @@ -926,9 +1103,15 @@ bad: return 0; } -static noinline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) +static noinline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock_irqsave(&n->list_lock, *flags); + slab_lock(page); + if (!check_slab(s, page)) goto fail; @@ -943,16 +1126,15 @@ static noinline int free_debug_processing(struct kmem_cache *s, } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - return 0; + goto out; - if (unlikely(s != page->slab)) { + if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - } else if (!page->slab) { - printk(KERN_ERR - "SLUB <none>: no slab for object 0x%p.\n", - object); + } else if (!page->slab_cache) { + pr_err("SLUB <none>: no slab for object 0x%p.\n", + object); dump_stack(); } else object_err(s, page, object, @@ -960,18 +1142,23 @@ static noinline int free_debug_processing(struct kmem_cache *s, goto fail; } - /* Special debug activities for freeing objects */ - if (!PageSlubFrozen(page) && !page->freelist) - remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); - return 1; +out: + slab_unlock(page); + /* + * Keep node_lock to preserve integrity + * until the object is actually freed + */ + return n; fail: + slab_unlock(page); + spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); - return 0; + return NULL; } static int __init setup_slub_debug(char *str) @@ -1030,8 +1217,8 @@ static int __init setup_slub_debug(char *str) slub_debug |= SLAB_FAILSLAB; break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + pr_err("slub_debug option '%c' unknown. skipped\n", + *str); } } @@ -1044,15 +1231,15 @@ out: __setup("slub_debug", setup_slub_debug); -static unsigned long kmem_cache_flags(unsigned long objsize, +static unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. */ - if (slub_debug && (!slub_debug_slabs || - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + if (slub_debug && (!slub_debug_slabs || (name && + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) flags |= slub_debug; return flags; @@ -1064,15 +1251,19 @@ static inline void setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) { return 0; } +static inline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { return NULL; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long objsize, +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1091,33 +1282,56 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { return 0; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) {} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} + void *object) +{ + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, + flags & gfp_allowed_mask); +} -static inline void slab_free_hook_irq(struct kmem_cache *s, - void *object) {} +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} #endif /* CONFIG_SLUB_DEBUG */ /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) { + struct page *page; int order = oo_order(oo); flags |= __GFP_NOTRACK; + if (memcg_charge_slab(s, flags, order)) + return NULL; + if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); + page = alloc_pages(flags, order); else - return alloc_pages_exact_node(node, flags, order); + page = alloc_pages_exact_node(node, flags, order); + + if (!page) + memcg_uncharge_slab(s, order); + + return page; } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1126,6 +1340,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + flags &= gfp_allowed_mask; + + if (flags & __GFP_WAIT) + local_irq_enable(); + flags |= s->allocflags; /* @@ -1134,25 +1353,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; + alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(flags, node, oo); - if (!page) - return NULL; + page = alloc_slab_page(s, alloc_gfp, node, oo); - stat(s, ORDER_FALLBACK); + if (page) + stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled + if (kmemcheck_enabled && page && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); /* * Objects from caches that have a constructor don't get @@ -1164,6 +1383,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } + if (flags & __GFP_WAIT) + local_irq_disable(); + if (!page) + return NULL; + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? @@ -1187,6 +1411,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1195,14 +1420,17 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - page->slab = s; - page->flags |= 1 << PG_slab; + page->slab_cache = s; + __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1214,7 +1442,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->inuse = 0; + page->inuse = page->objects; + page->frozen = 1; out: return page; } @@ -1240,28 +1469,48 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); + __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - reset_page_mapcount(page); + + page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); + memcg_uncharge_slab(s, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + + __free_slab(page->slab_cache, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -1275,79 +1524,98 @@ static void discard_slab(struct kmem_cache *s, struct page *page) } /* - * Per slab locking using the pagelock + * Management of partially allocated slabs. */ -static __always_inline void slab_lock(struct page *page) +static inline void +__add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - bit_spin_lock(PG_locked, &page->flags); -} - -static __always_inline void slab_unlock(struct page *page) -{ - __bit_spin_unlock(PG_locked, &page->flags); -} - -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; - - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} - -/* - * Management of partially allocated slabs - */ -static void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) -{ - spin_lock(&n->list_lock); n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); } -static inline void __remove_partial(struct kmem_cache_node *n, - struct page *page) +static inline void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) +{ + lockdep_assert_held(&n->list_lock); + __add_partial(n, page, tail); +} + +static inline void +__remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } -static void remove_partial(struct kmem_cache *s, struct page *page) +static inline void remove_partial(struct kmem_cache_node *n, + struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); + lockdep_assert_held(&n->list_lock); __remove_partial(n, page); - spin_unlock(&n->list_lock); } /* - * Lock slab and remove from the partial list. + * Remove slab from the partial list, freeze it and + * return the pointer to the freelist. * - * Must hold list_lock. + * Returns a list of objects or NULL if it fails. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, - struct page *page) +static inline void *acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page, + int mode, int *objects) { - if (slab_trylock(page)) { - __remove_partial(n, page); - __SetPageSlubFrozen(page); - return 1; + void *freelist; + unsigned long counters; + struct page new; + + lockdep_assert_held(&n->list_lock); + + /* + * Zap the freelist and set the frozen bit. + * The old freelist is the list of objects for the + * per cpu allocation list. + */ + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + *objects = new.objects - new.inuse; + if (mode) { + new.inuse = page->objects; + new.freelist = NULL; + } else { + new.freelist = freelist; } - return 0; + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + if (!__cmpxchg_double_slab(s, page, + freelist, counters, + new.freelist, new.counters, + "acquire_slab")) + return NULL; + + remove_partial(n, page); + WARN_ON(!freelist); + return freelist; } +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); + /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache_node *n) +static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + struct kmem_cache_cpu *c, gfp_t flags) { - struct page *page; + struct page *page, *page2; + void *object = NULL; + int available = 0; + int objects; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1359,26 +1627,47 @@ static struct page *get_partial_node(struct kmem_cache_node *n) return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) - goto out; - page = NULL; -out: + list_for_each_entry_safe(page, page2, &n->partial, lru) { + void *t; + + if (!pfmemalloc_match(page, flags)) + continue; + + t = acquire_slab(s, n, page, object == NULL, &objects); + if (!t) + break; + + available += objects; + if (!object) { + c->page = page; + stat(s, ALLOC_FROM_PARTIAL); + object = t; + } else { + put_cpu_partial(s, page, 0); + stat(s, CPU_PARTIAL_NODE); + } + if (!kmem_cache_has_cpu_partial(s) + || available > s->cpu_partial / 2) + break; + + } spin_unlock(&n->list_lock); - return page; + return object; } /* * Get a page from somewhere. Search in increasing NUMA distances. */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static void *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); - struct page *page; + void *object; + unsigned int cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1402,23 +1691,30 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - struct kmem_cache_node *n; - - n = get_node(s, zone_to_nid(zone)); - - if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > s->min_partial) { - page = get_partial_node(n); - if (page) { - put_mems_allowed(); - return page; + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { + object = get_partial_node(s, n, c, flags); + if (object) { + /* + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update + */ + return object; + } } } - } - put_mems_allowed(); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } @@ -1426,104 +1722,359 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) /* * Get a partial page, lock it and return it. */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, + struct kmem_cache_cpu *c) { - struct page *page; - int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; + void *object; + int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; - page = get_partial_node(get_node(s, searchnode)); - if (page || node != -1) - return page; + object = get_partial_node(s, get_node(s, searchnode), c, flags); + if (object || node != NUMA_NO_NODE) + return object; - return get_any_partial(s, flags); + return get_any_partial(s, flags, c); } +#ifdef CONFIG_PREEMPT /* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. + * Calculate the next globally unique transaction for disambiguiation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus. */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) - __releases(bitlock) +#define TID_STEP 1 +#endif + +static inline unsigned long next_tid(unsigned long tid) +{ + return tid + TID_STEP; +} + +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} + +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; +} + +static inline unsigned int init_tid(int cpu) { + return cpu; +} + +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) +{ +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + pr_info("%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPT + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + pr_warn("due to cpu change %d -> %d\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + pr_warn("due to cpu running other code. Event %ld->%ld\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif + stat(s, CMPXCHG_DOUBLE_CPU_FAIL); +} + +static void init_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); +} + +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct page *page, + void *freelist) +{ + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + int lock = 0; + enum slab_modes l = M_NONE, m = M_NONE; + void *nextfree; + int tail = DEACTIVATE_TO_HEAD; + struct page new; + struct page old; + + if (page->freelist) { + stat(s, DEACTIVATE_REMOTE_FREES); + tail = DEACTIVATE_TO_TAIL; + } - __ClearPageSlubFrozen(page); - if (page->inuse) { + /* + * Stage one: Free all available per cpu objects back + * to the page freelist while it is still frozen. Leave the + * last one. + * + * There is no need to take the list->lock because the page + * is still frozen. + */ + while (freelist && (nextfree = get_freepointer(s, freelist))) { + void *prior; + unsigned long counters; + + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, freelist, prior); + new.counters = counters; + new.inuse--; + VM_BUG_ON(!new.frozen); + + } while (!__cmpxchg_double_slab(s, page, + prior, counters, + freelist, new.counters, + "drain percpu freelist")); + + freelist = nextfree; + } - if (page->freelist) { - add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); - } else { - stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); + /* + * Stage two: Ensure that the page is unfrozen while the + * list presence reflects the actual number of objects + * during unfreeze. + * + * We setup the list membership and then perform a cmpxchg + * with the count. If there is a mismatch then the page + * is not unfrozen but the page is on the wrong list. + * + * Then we restart the process which may have to remove + * the page from the list that we just put it on again + * because the number of objects in the slab may have + * changed. + */ +redo: + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist) { + new.inuse--; + set_freepointer(s, freelist, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && n->nr_partial >= s->min_partial) + m = M_FREE; + else if (new.freelist) { + m = M_PARTIAL; + if (!lock) { + lock = 1; + /* + * Taking the spinlock removes the possiblity + * that acquire_slab() will see a slab page that + * is frozen + */ + spin_lock(&n->list_lock); } - slab_unlock(page); } else { - stat(s, DEACTIVATE_EMPTY); - if (n->nr_partial < s->min_partial) { + m = M_FULL; + if (kmem_cache_debug(s) && !lock) { + lock = 1; /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. This slab needs - * to come after the other slabs with objects in - * so that the others get filled first. That way the - * size of the partial list stays small. - * - * kmem_cache_shrink can reclaim any empty slabs from - * the partial list. + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. */ - add_partial(n, page, 1); - slab_unlock(page); - } else { - slab_unlock(page); - stat(s, FREE_SLAB); - discard_slab(s, page); + spin_lock(&n->list_lock); + } + } + + if (l != m) { + + if (l == M_PARTIAL) + + remove_partial(n, page); + + else if (l == M_FULL) + + remove_full(s, n, page); + + if (m == M_PARTIAL) { + + add_partial(n, page, tail); + stat(s, tail); + + } else if (m == M_FULL) { + + stat(s, DEACTIVATE_FULL); + add_full(s, n, page); + } } + + l = m; + if (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + if (lock) + spin_unlock(&n->list_lock); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } } /* - * Remove the cpu slab + * Unfreeze all the cpu partial slabs. + * + * This function must be called with interrupts disabled + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). */ -static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) - __releases(bitlock) +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { - struct page *page = c->page; - int tail = 1; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct kmem_cache_node *n = NULL, *n2 = NULL; + struct page *page, *discard_page = NULL; - if (page->freelist) - stat(s, DEACTIVATE_REMOTE_FREES); - /* - * Merge cpu freelist into slab freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. - */ - while (unlikely(c->freelist)) { - void **object; + while ((page = c->partial)) { + struct page new; + struct page old; + + c->partial = page->next; + + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } + + do { - tail = 0; /* Hot objects. Put the slab first */ + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); - /* Retrieve object from cpu_freelist */ - object = c->freelist; - c->freelist = get_freepointer(s, c->freelist); + new.counters = old.counters; + new.freelist = old.freelist; - /* And put onto the regular freelist */ - set_freepointer(s, object, page->freelist); - page->freelist = object; - page->inuse--; + new.frozen = 0; + + } while (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { + page->next = discard_page; + discard_page = page; + } else { + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } } - c->page = NULL; - unfreeze_slab(s, page, tail); + + if (n) + spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; + discard_page = discard_page->next; + + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } +#endif +} + +/* + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. + */ +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct page *oldpage; + int pages; + int pobjects; + + do { + pages = 0; + pobjects = 0; + oldpage = this_cpu_read(s->cpu_slab->partial); + + if (oldpage) { + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > s->cpu_partial) { + unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + oldpage = NULL; + pobjects = 0; + pages = 0; + stat(s, CPU_PARTIAL_DRAIN); + } + } + + pages++; + pobjects += page->objects - page->inuse; + + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; + + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) + != oldpage); +#endif } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - slab_lock(c->page); - deactivate_slab(s, c); + deactivate_slab(s, c->page, c->freelist); + + c->tid = next_tid(c->tid); + c->page = NULL; + c->freelist = NULL; } /* @@ -1535,8 +2086,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c && c->page)) - flush_slab(s, c); + if (likely(c)) { + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s, c); + } } static void flush_cpu_slab(void *d) @@ -1546,29 +2101,45 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } +static bool has_cpu_slab(int cpu, void *info) +{ + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->page || c->partial; +} + static void flush_all(struct kmem_cache *s) { - on_each_cpu(flush_cpu_slab, s, 1); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } /* * Check if the objects in a per cpu structure fit numa * locality expectations. */ -static inline int node_match(struct kmem_cache_cpu *c, int node) +static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && c->node != node) + if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) return 0; #endif return 1; } +#ifdef CONFIG_SLUB_DEBUG static int count_free(struct page *page) { return page->objects - page->inuse; } +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct page *)) { @@ -1582,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } - -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ -#ifdef CONFIG_SLUB_DEBUG - return atomic_long_read(&n->total_objects); -#else - return 0; -#endif -} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int node; - printk(KERN_WARNING - "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; + + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->objsize, - s->size, oo_order(s->oo), oo_order(s->min)); + pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); - if (oo_order(s->min) > get_order(s->objsize)) - printk(KERN_WARNING " %s debugging increased min order, use " - "slub_debug=O to disable.\n", s->name); + if (oo_order(s->min) > get_order(s->object_size)) + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -1621,18 +2189,92 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); - printk(KERN_WARNING - " node %d: slabs: %ld, objs: %ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } +#endif +} + +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) +{ + void *freelist; + struct kmem_cache_cpu *c = *pc; + struct page *page; + + freelist = get_partial(s, flags, node, c); + + if (freelist) + return freelist; + + page = new_slab(s, flags, node); + if (page) { + c = raw_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + freelist = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->page = page; + *pc = c; + } else + freelist = NULL; + + return freelist; +} + +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) +{ + if (unlikely(PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; +} + +/* + * Check the page->freelist of a page and either transfer the freelist to the + * per cpu freelist or deactivate the page. + * + * The page is still frozen if the return value is not NULL. + * + * If this function returns NULL then the page has been unfrozen. + * + * This function must be called with interrupt disabled. + */ +static inline void *get_freelist(struct kmem_cache *s, struct page *page) +{ + struct page new; + unsigned long counters; + void *freelist; + + do { + freelist = page->freelist; + counters = page->counters; + + new.counters = counters; + VM_BUG_ON(!new.frozen); + + new.inuse = page->objects; + new.frozen = freelist != NULL; + + } while (!__cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "get_freelist")); + + return freelist; } /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * - * Interrupts are disabled. - * * Processing is still very fast if new objects have been freed to the * regular freelist. In that case we simply take over the regular freelist * as the lockless freelist and zap the regular freelist. @@ -1648,78 +2290,104 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { - void **object; - struct page *new; + void *freelist; + struct page *page; + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif - /* We handle __GFP_ZERO in the caller */ - gfpflags &= ~__GFP_ZERO; + page = c->page; + if (!page) + goto new_slab; +redo: + + if (unlikely(!node_match(page, node))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } - if (!c->page) + /* + * By rights, we should be searching for a slab page that was + * PFMEMALLOC but right now, we are losing the pfmemalloc + * information when the page leaves the per-cpu allocator + */ + if (unlikely(!pfmemalloc_match(page, gfpflags))) { + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; goto new_slab; + } - slab_lock(c->page); - if (unlikely(!node_match(c, node))) - goto another_slab; + /* must check again c->freelist in case of cpu migration or IRQ */ + freelist = c->freelist; + if (freelist) + goto load_freelist; + + freelist = get_freelist(s, page); + + if (!freelist) { + c->page = NULL; + stat(s, DEACTIVATE_BYPASS); + goto new_slab; + } stat(s, ALLOC_REFILL); load_freelist: - object = c->page->freelist; - if (unlikely(!object)) - goto another_slab; - if (kmem_cache_debug(s)) - goto debug; - - c->freelist = get_freepointer(s, object); - c->page->inuse = c->page->objects; - c->page->freelist = NULL; - c->node = page_to_nid(c->page); -unlock_out: - slab_unlock(c->page); - stat(s, ALLOC_SLOWPATH); - return object; - -another_slab: - deactivate_slab(s, c); + /* + * freelist is pointing to the list of objects to be used. + * page is pointing to the page from which the objects are obtained. + * That page must be frozen for per cpu allocations to work. + */ + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); + c->tid = next_tid(c->tid); + local_irq_restore(flags); + return freelist; new_slab: - new = get_partial(s, gfpflags, node); - if (new) { - c->page = new; - stat(s, ALLOC_FROM_PARTIAL); - goto load_freelist; - } - gfpflags &= gfp_allowed_mask; - if (gfpflags & __GFP_WAIT) - local_irq_enable(); + if (c->partial) { + page = c->page = c->partial; + c->partial = page->next; + stat(s, CPU_PARTIAL_ALLOC); + c->freelist = NULL; + goto redo; + } - new = new_slab(s, gfpflags, node); + freelist = new_slab_objects(s, gfpflags, node, &c); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); + if (unlikely(!freelist)) { + slab_out_of_memory(s, gfpflags, node); + local_irq_restore(flags); + return NULL; + } - if (new) { - c = __this_cpu_ptr(s->cpu_slab); - stat(s, ALLOC_SLAB); - if (c->page) - flush_slab(s, c); - slab_lock(new); - __SetPageSlubFrozen(new); - c->page = new; + page = c->page; + if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) goto load_freelist; - } - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - return NULL; -debug: - if (!alloc_debug_processing(s, c->page, object, addr)) - goto another_slab; - c->page->inuse++; - c->page->freelist = get_freepointer(s, object); - c->node = NUMA_NO_NODE; - goto unlock_out; + /* Only entered in the debug case */ + if (kmem_cache_debug(s) && + !alloc_debug_processing(s, page, freelist, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ + + deactivate_slab(s, page, get_freepointer(s, freelist)); + c->page = NULL; + c->freelist = NULL; + local_irq_restore(flags); + return freelist; } /* @@ -1732,75 +2400,135 @@ debug: * * Otherwise we can simply pick the next object from the lockless free list. */ -static __always_inline void *slab_alloc(struct kmem_cache *s, +static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; - unsigned long flags; + struct page *page; + unsigned long tid; if (slab_pre_alloc_hook(s, gfpflags)) return NULL; - local_irq_save(flags); - c = __this_cpu_ptr(s->cpu_slab); - object = c->freelist; - if (unlikely(!object || !node_match(c, node))) + s = memcg_kmem_get_cache(s, gfpflags); +redo: + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + * + * Preemption is disabled for the retrieval of the tid because that + * must occur from the current processor. We cannot allow rescheduling + * on a different processor between the determination of the pointer + * and the retrieval of the tid. + */ + preempt_disable(); + c = this_cpu_ptr(s->cpu_slab); + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus they can be guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + tid = c->tid; + preempt_enable(); + + object = c->freelist; + page = c->page; + if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); + stat(s, ALLOC_SLOWPATH); + } else { + void *next_object = get_freepointer_safe(s, object); + + /* + * The cmpxchg will only match if there was no additional + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock + * semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only + * against code executing on this cpu *not* from access by + * other cpus. + */ + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + next_object, next_tid(tid)))) { - else { - c->freelist = get_freepointer(s, object); + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } + prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } - local_irq_restore(flags); if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, s->objsize); + memset(object, 0, s->object_size); slab_post_alloc_hook(s, gfpflags, object); return object; } +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, unsigned long addr) +{ + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); +} + void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, + s->size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_trace); #endif #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, - s->objsize, s->size, gfpflags, node); + s->object_size, s->size, gfpflags, node); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node) + int node, size_t size) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif #endif @@ -1817,57 +2545,112 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; + int was_frozen; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long uninitialized_var(flags); stat(s, FREE_SLOWPATH); - slab_lock(page); - if (kmem_cache_debug(s)) - goto debug; + if (kmem_cache_debug(s) && + !(n = free_debug_processing(s, page, x, addr, &flags))) + return; -checks_ok: - prior = page->freelist; - set_freepointer(s, object, prior); - page->freelist = object; - page->inuse--; + do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen) { - if (unlikely(PageSlubFrozen(page))) { - stat(s, FREE_FROZEN); - goto out_unlock; - } + if (kmem_cache_has_cpu_partial(s) && !prior) { + + /* + * Slab was on no list before and will be + * partially empty + * We can defer the list move and instead + * freeze it. + */ + new.frozen = 1; + + } else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } + } + + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); + + if (likely(!n)) { - if (unlikely(!page->inuse)) + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) { + put_cpu_partial(s, page, 1); + stat(s, CPU_PARTIAL_FREE); + } + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } + + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. If it was not on the partial list before * then add it. */ - if (unlikely(!prior)) { - add_partial(get_node(s, page_to_nid(page)), page, 1); + if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (kmem_cache_debug(s)) + remove_full(s, n, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } - -out_unlock: - slab_unlock(page); + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: if (prior) { /* - * Slab still on the partial list. + * Slab on the partial list. */ - remove_partial(s, page); + remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); + } else { + /* Slab must be on the full list */ + remove_full(s, n, page); } - slab_unlock(page); + + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); - return; - -debug: - if (!free_debug_processing(s, page, x, addr)) - goto out_unlock; - goto checks_ok; } /* @@ -1886,48 +2669,50 @@ static __always_inline void slab_free(struct kmem_cache *s, { void **object = (void *)x; struct kmem_cache_cpu *c; - unsigned long flags; + unsigned long tid; slab_free_hook(s, x); - local_irq_save(flags); - c = __this_cpu_ptr(s->cpu_slab); +redo: + /* + * Determine the currently cpus per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succedd. + */ + preempt_disable(); + c = this_cpu_ptr(s->cpu_slab); - slab_free_hook_irq(s, x); + tid = c->tid; + preempt_enable(); - if (likely(page == c->page && c->node != NUMA_NO_NODE)) { + if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); - c->freelist = object; + + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } stat(s, FREE_FASTPATH); } else __slab_free(s, page, x, addr); - local_irq_restore(flags); } void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - slab_free(s, page, x, _RET_IP_); - + s = cache_from_obj(s, x); + if (!s) + return; + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab page the object resides */ -static struct page *get_object_page(const void *x) -{ - struct page *page = virt_to_head_page(x); - - if (!PageSlab(page)) - return NULL; - - return page; -} - /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -1983,13 +2768,13 @@ static int slub_nomerge; * the smallest order which will fit the object. */ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -1998,10 +2783,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2011,7 +2796,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2029,14 +2814,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2048,47 +2833,21 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; } -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static void -init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) +init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -2103,11 +2862,21 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); + KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + + /* + * Must align to double word boundary for the double cmpxchg + * instructions to work; see __pcpu_double_call_return_bool(). + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), + 2 * sizeof(void *)); + + if (!s->cpu_slab) + return 0; - s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); + init_kmem_cache_cpus(s); - return s->cpu_slab != NULL; + return 1; } static struct kmem_cache *kmem_cache_node; @@ -2117,15 +2886,14 @@ static struct kmem_cache *kmem_cache_node; * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; - unsigned long flags; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); @@ -2133,32 +2901,28 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!page); if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse++; + page->inuse = 1; + page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmem_cache_node); + init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); /* - * lockdep requires consistent irq usage for each lock - * so even though there cannot be a race this early in - * the boot sequence, we still disable irqs. + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. */ - local_irq_save(flags); - add_partial(n, page, 0); - local_irq_restore(flags); + __add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2195,7 +2959,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) } s->node[node] = n; - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); } return 1; } @@ -2216,8 +2980,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->objsize; - unsigned long align = s->align; + unsigned long size = s->object_size; int order; /* @@ -2245,7 +3008,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * end of the object and the free pointer. If not then add an * additional word to have some bytes to store Redzone information. */ - if ((flags & SLAB_RED_ZONE) && size == s->objsize) + if ((flags & SLAB_RED_ZONE) && size == s->object_size) size += sizeof(void *); #endif @@ -2289,24 +3052,16 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) #endif /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. - */ - align = calculate_alignment(flags, align, s->objsize); - s->align = align; - - /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. */ - size = ALIGN(size, align); + size = ALIGN(size, s->align); s->size = size; if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2316,7 +3071,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) - s->allocflags |= SLUB_DMA; + s->allocflags |= GFP_DMA; if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; @@ -2324,26 +3079,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; return !!oo_objects(s->oo); - } -static int kmem_cache_open(struct kmem_cache *s, - const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *)) +static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { - memset(s, 0, kmem_size); - s->name = name; - s->ctor = ctor; - s->objsize = size; - s->align = align; - s->flags = kmem_cache_flags(size, flags, name, ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); + s->reserved = 0; + + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) goto error; @@ -2352,7 +3102,7 @@ static int kmem_cache_open(struct kmem_cache *s, * Disable debugging flags that store metadata if the min slab * order increased. */ - if (get_order(s->size) > get_order(s->objsize)) { + if (get_order(s->size) > get_order(s->object_size)) { s->flags &= ~DEBUG_METADATA_FLAGS; s->offset = 0; if (!calculate_sizes(s, -1)) @@ -2360,12 +3110,47 @@ static int kmem_cache_open(struct kmem_cache *s, } } +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + /* * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - set_min_partial(s, ilog2(s->size)); - s->refcount = 1; + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. + */ + if (!kmem_cache_has_cpu_partial(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; + #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -2373,62 +3158,18 @@ static int kmem_cache_open(struct kmem_cache *s, goto error; if (alloc_kmem_cache_cpus(s)) - return 1; + return 0; free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, oo_order(s->oo), - s->offset, flags); - return 0; + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); + return -EINVAL; } -/* - * Check if a given pointer is valid - */ -int kmem_ptr_validate(struct kmem_cache *s, const void *object) -{ - struct page *page; - - if (!kern_ptr_validate(object, s->size)) - return 0; - - page = get_object_page(object); - - if (!page || s != page->slab) - /* No slab or wrong slab */ - return 0; - - if (!check_valid_pointer(s, page, object)) - return 0; - - /* - * We could also check if the object is on the slabs freelist. - * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid() is to check if the object belongs - * to a certain slab. - */ - return 1; -} -EXPORT_SYMBOL(kmem_ptr_validate); - -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->objsize; -} -EXPORT_SYMBOL(kmem_cache_size); - -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; -} -EXPORT_SYMBOL(kmem_cache_name); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -2439,16 +3180,14 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, sizeof(long), GFP_ATOMIC); if (!map) return; - slab_err(s, page, "%s", text); + slab_err(s, page, text, s->name); slab_lock(page); - for_each_free_object(p, s, page->freelist) - set_bit(slab_index(p, s, addr), map); + get_map(s, page, map); for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", - p, p - addr); + pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -2459,23 +3198,22 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - unsigned long flags; struct page *page, *h; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { __remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, - "Objects remaining on kmem_cache_close()"); + "Objects remaining in %s on kmem_cache_close()"); } } - spin_unlock_irqrestore(&n->list_lock, flags); } /* @@ -2486,7 +3224,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - free_percpu(s->cpu_slab); /* Attempt to free all objects */ for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -2495,46 +3232,20 @@ static inline int kmem_cache_close(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); return 0; } -/* - * Close a cache and release the kmem_cache structure - * (must be used for caches created using kmem_cache_create) - */ -void kmem_cache_destroy(struct kmem_cache *s) -{ - down_write(&slub_lock); - s->refcount--; - if (!s->refcount) { - list_del(&s->list); - if (kmem_cache_close(s)) { - printk(KERN_ERR "SLUB %s: %s called for cache that " - "still has objects.\n", s->name, __func__); - dump_stack(); - } - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - sysfs_slab_remove(s); - } - up_write(&slub_lock); +int __kmem_cache_shutdown(struct kmem_cache *s) +{ + return kmem_cache_close(s); } -EXPORT_SYMBOL(kmem_cache_destroy); /******************************************************************** * Kmalloc subsystem *******************************************************************/ -struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; -EXPORT_SYMBOL(kmalloc_caches); - -static struct kmem_cache *kmem_cache; - -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2571,101 +3282,20 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *__init create_kmalloc_cache(const char *name, - int size, unsigned int flags) -{ - struct kmem_cache *s; - - s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - - /* - * This function is called with IRQs disabled during early-boot on - * single CPU so there's no need to take slub_lock here. - */ - if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL)) - goto panic; - - list_add(&s->list, &slab_caches); - return s; - -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); - return NULL; -} - -/* - * Conversion table for small slabs sizes / 8 to the index in the - * kmalloc array. This is necessary for slabs < 192 since we have non power - * of two cache sizes there. The size of larger slabs can be determined using - * fls. - */ -static s8 size_index[24] = { - 3, /* 8 */ - 4, /* 16 */ - 5, /* 24 */ - 5, /* 32 */ - 6, /* 40 */ - 6, /* 48 */ - 6, /* 56 */ - 6, /* 64 */ - 1, /* 72 */ - 1, /* 80 */ - 1, /* 88 */ - 1, /* 96 */ - 7, /* 104 */ - 7, /* 112 */ - 7, /* 120 */ - 7, /* 128 */ - 2, /* 136 */ - 2, /* 144 */ - 2, /* 152 */ - 2, /* 160 */ - 2, /* 168 */ - 2, /* 176 */ - 2, /* 184 */ - 2 /* 192 */ -}; - -static inline int size_index_elem(size_t bytes) -{ - return (bytes - 1) / 8; -} - -static struct kmem_cache *get_slab(size_t size, gfp_t flags) -{ - int index; - - if (size <= 192) { - if (!size) - return ZERO_SIZE_PTR; - - index = size_index[size_index_elem(size)]; - } else - index = fls(size - 1); - -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; -} - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, flags); - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -2680,11 +3310,11 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; - page = alloc_pages_node(node, flags, get_order(size)); + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); - kmemleak_alloc(ptr, size, 1, flags); + kmalloc_large_node_hook(ptr, size, flags); return ptr; } @@ -2693,7 +3323,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, flags, node); trace_kmalloc_node(_RET_IP_, ret, @@ -2703,12 +3333,12 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -2720,7 +3350,6 @@ EXPORT_SYMBOL(__kmalloc_node); size_t ksize(const void *object) { struct page *page; - struct kmem_cache *s; if (unlikely(object == ZERO_SIZE_PTR)) return 0; @@ -2731,28 +3360,8 @@ size_t ksize(const void *object) WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); } - s = page->slab; - -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; + return slab_ksize(page->slab_cache); } EXPORT_SYMBOL(ksize); @@ -2769,11 +3378,11 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kmemleak_free(x); - put_page(page); + kfree_hook(x); + __free_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -2787,7 +3396,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -2821,45 +3430,37 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ - __remove_partial(n, page); - slab_unlock(page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } + list_move(&page->lru, slabs_by_inuse + page->inuse); + if (!page->inuse) + n->nr_partial--; } /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = objects - 1; i >= 0; i--) + for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + discard_slab(s, page); } kfree(slabs_by_inuse); return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); - up_read(&slub_lock); + __kmem_cache_shrink(s); + mutex_unlock(&slab_mutex); return 0; } @@ -2871,7 +3472,7 @@ static void slab_mem_offline_callback(void *arg) struct memory_notify *marg = arg; int offline_node; - offline_node = marg->status_change_nid; + offline_node = marg->status_change_nid_normal; /* * If the node still has available memory. we need kmem_cache_node @@ -2880,7 +3481,7 @@ static void slab_mem_offline_callback(void *arg) if (offline_node < 0) return; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { n = get_node(s, offline_node); if (n) { @@ -2896,7 +3497,7 @@ static void slab_mem_offline_callback(void *arg) kmem_cache_free(kmem_cache_node, n); } } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); } static int slab_mem_going_online_callback(void *arg) @@ -2904,7 +3505,7 @@ static int slab_mem_going_online_callback(void *arg) struct kmem_cache_node *n; struct kmem_cache *s; struct memory_notify *marg = arg; - int nid = marg->status_change_nid; + int nid = marg->status_change_nid_normal; int ret = 0; /* @@ -2919,7 +3520,7 @@ static int slab_mem_going_online_callback(void *arg) * allocate a kmem_cache_node structure in order to bring the node * online. */ - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { /* * XXX: kmem_cache_alloc_node will fallback to other nodes @@ -2931,11 +3532,11 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); s->node[nid] = n; } out: - up_read(&slub_lock); + mutex_unlock(&slab_mutex); return ret; } @@ -2966,7 +3567,10 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -#endif /* CONFIG_MEMORY_HOTPLUG */ +static struct notifier_block slab_memory_callback_nb = { + .notifier_call = slab_memory_callback, + .priority = SLAB_CALLBACK_PRI, +}; /******************************************************************** * Basic setup of slabs @@ -2974,187 +3578,83 @@ static int slab_memory_callback(struct notifier_block *self, /* * Used for early kmem_cache structures that were allocated using - * the page allocator + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. */ -static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - list_add(&s->list, &slab_caches); - s->refcount = -1; + memcpy(s, static_cache, kmem_cache->object_size); + /* + * This runs very early, and only the boot processor is supposed to be + * up. Even if it weren't true, IRQs are not up so we couldn't fire + * IPIs around. + */ + __flush_cpu_slab(s, smp_processor_id()); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); struct page *p; if (n) { list_for_each_entry(p, &n->partial, lru) - p->slab = s; + p->slab_cache = s; -#ifdef CONFIG_SLAB_DEBUG +#ifdef CONFIG_SLUB_DEBUG list_for_each_entry(p, &n->full, lru) - p->slab = s; + p->slab_cache = s; #endif } } + list_add(&s->list, &slab_caches); + return s; } void __init kmem_cache_init(void) { - int i; - int caches = 0; - struct kmem_cache *temp_kmem_cache; - int order; - struct kmem_cache *temp_kmem_cache_node; - unsigned long kmalloc_size; - - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; - /* Allocate two kmem_caches from the page allocator */ - kmalloc_size = ALIGN(kmem_size, cache_line_size()); - order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); + if (debug_guardpage_minorder()) + slub_max_order = 0; - /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. - */ - kmem_cache_node = (void *)kmem_cache + kmalloc_size; + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; - kmem_cache_open(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); + register_hotmemory_notifier(&slab_memory_callback_nb); /* Able to allocate the per node structures */ slab_state = PARTIAL; - temp_kmem_cache = kmem_cache; - kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache, temp_kmem_cache, kmem_size); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); + + kmem_cache = bootstrap(&boot_kmem_cache); /* * Allocate kmem_cache_node properly from the kmem_cache slab. * kmem_cache_node is separately allocated so no need to * update any list pointers. */ - temp_kmem_cache_node = kmem_cache_node; - - kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); - - kmem_cache_bootstrap_fixup(kmem_cache_node); - - caches++; - kmem_cache_bootstrap_fixup(kmem_cache); - caches++; - /* Free temporary boot structure */ - free_pages((unsigned long)temp_kmem_cache, order); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ - - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. - * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || - (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { - int elem = size_index_elem(i); - if (elem >= ARRAY_SIZE(size_index)) - break; - size_index[elem] = KMALLOC_SHIFT_LOW; - } - - if (KMALLOC_MIN_SIZE == 64) { - /* - * The 96 byte size cache is not used if the alignment - * is 64 byte. - */ - for (i = 64 + 8; i <= 96; i += 8) - size_index[size_index_elem(i)] = 7; - } else if (KMALLOC_MIN_SIZE == 128) { - /* - * The 192 byte sized cache is not used if the alignment - * is 128 byte. Redirect kmalloc to use the 256 byte cache - * instead. - */ - for (i = 128 + 8; i <= 192; i += 8) - size_index[size_index_elem(i)] = 8; - } - - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 32) { - kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); - caches++; - } - - if (KMALLOC_MIN_SIZE <= 64) { - kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); - caches++; - } - - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); - caches++; - } - - slab_state = UP; - - /* Provide the correct kmalloc names now that the caches are up */ - if (KMALLOC_MIN_SIZE <= 32) { - kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); - BUG_ON(!kmalloc_caches[1]->name); - } - - if (KMALLOC_MIN_SIZE <= 64) { - kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); - BUG_ON(!kmalloc_caches[2]->name); - } - - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); - - BUG_ON(!s); - kmalloc_caches[i]->name = s; - } + create_kmalloc_caches(0); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif -#ifdef CONFIG_ZONE_DMA - for (i = 0; i < SLUB_PAGE_SHIFT; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - - if (s && s->size) { - char *name = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", s->objsize); - - BUG_ON(!name); - kmalloc_dma_caches[i] = create_kmalloc_cache(name, - s->objsize, SLAB_CACHE_DMA); - } - } -#endif - printk(KERN_INFO - "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", - caches, cache_line_size(), + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } @@ -3171,6 +3671,9 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; + if (!is_root_cache(s)) + return 1; + if (s->ctor) return 1; @@ -3183,9 +3686,8 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3208,7 +3710,7 @@ static struct kmem_cache *find_mergeable(size_t size, continue; if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; + continue; /* * Check if alignment is compatible. * Courtesy of Adrian Drzewiecki @@ -3224,72 +3726,70 @@ static struct kmem_cache *find_mergeable(size_t size, return NULL; } -struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - char *n; - - if (WARN_ON(!name)) - return NULL; - down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { + int i; + struct kmem_cache *c; + s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. */ - s->objsize = max(s->objsize, (int)size); + s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + if (sysfs_slab_alias(s, name)) { s->refcount--; - goto err; + s = NULL; } - up_write(&slub_lock); - return s; } - n = kstrdup(name, GFP_KERNEL); - if (!n) - goto err; + return s; +} - s = kmalloc(kmem_size, GFP_KERNEL); - if (s) { - if (kmem_cache_open(s, n, - size, align, flags, ctor)) { - list_add(&s->list, &slab_caches); - if (sysfs_slab_add(s)) { - list_del(&s->list); - kfree(n); - kfree(s); - goto err; - } - up_write(&slub_lock); - return s; - } - kfree(n); - kfree(s); - } - up_write(&slub_lock); +int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +{ + int err; -err: - if (flags & SLAB_PANIC) - panic("Cannot create slabcache %s\n", name); - else - s = NULL; - return s; + err = kmem_cache_open(s, flags); + if (err) + return err; + + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + memcg_propagate_slab_attrs(s); + err = sysfs_slab_add(s); + if (err) + kmem_cache_close(s); + + return err; } -EXPORT_SYMBOL(kmem_cache_create); #ifdef CONFIG_SMP /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. */ -static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, +static int slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -3301,13 +3801,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); break; default: break; @@ -3315,7 +3815,7 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata slab_notifier = { +static struct notifier_block slab_notifier = { .notifier_call = slab_cpuup_callback }; @@ -3326,17 +3826,17 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, gfpflags); - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); + ret = slab_alloc(s, gfpflags, caller); - /* Honor the call site pointer we recieved. */ + /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); return ret; @@ -3349,7 +3849,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, gfpflags, node); trace_kmalloc_node(caller, ret, @@ -3359,14 +3859,14 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller); - /* Honor the call site pointer we recieved. */ + /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); return ret; @@ -3399,15 +3899,16 @@ static int validate_slab(struct kmem_cache *s, struct page *page, /* Now we know that a valid freelist exists */ bitmap_zero(map, page->objects); - for_each_free_object(p, s, page->freelist) { - set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) - return 0; + get_map(s, page, map); + for_each_object(p, s, addr, page->objects) { + if (test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) + return 0; } for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } @@ -3415,12 +3916,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, static void validate_slab_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { - if (slab_trylock(page)) { - validate_slab(s, page, map); - slab_unlock(page); - } else - printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", - s->name, page); + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, @@ -3437,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -3448,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); @@ -3610,8 +4107,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, void *p; bitmap_zero(map, page->objects); - for_each_free_object(p, s, page->freelist) - set_bit(slab_index(p, s, addr), map); + get_map(s, page, map); for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) @@ -3660,7 +4156,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, "<not-available>"); @@ -3684,15 +4180,17 @@ static int list_locations(struct kmem_cache *s, char *buf, !cpumask_empty(to_cpumask(l->cpus)) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); - len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, + len += cpulist_scnprintf(buf + len, + PAGE_SIZE - len - 50, to_cpumask(l->cpus)); } if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); - len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->nodes); + len += nodelist_scnprintf(buf + len, + PAGE_SIZE - len - 50, + l->nodes); } len += sprintf(buf + len, "\n"); @@ -3711,55 +4209,52 @@ static void resiliency_test(void) { u8 *p; - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. Corruption after allocation\n"); p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); validate_slab_cache(kmalloc_caches[4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[6]); - printk(KERN_ERR "\nB. Corruption after free\n"); + pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[9]); } #else @@ -3790,49 +4285,61 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int node; int x; unsigned long *nodes; - unsigned long *per_cpu; - nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); if (!nodes) return -ENOMEM; - per_cpu = nodes + nr_node_ids; if (flags & SO_CPU) { int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, + cpu); + int node; + struct page *page; - if (!c || c->node < 0) + page = ACCESS_ONCE(c->page); + if (!page) continue; - if (c->page) { - if (flags & SO_TOTAL) - x = c->page->objects; + node = page_to_nid(page); + if (flags & SO_TOTAL) + x = page->objects; + else if (flags & SO_OBJECTS) + x = page->inuse; + else + x = 1; + + total += x; + nodes[node] += x; + + page = ACCESS_ONCE(c->partial); + if (page) { + node = page_to_nid(page); + if (flags & SO_TOTAL) + WARN_ON_ONCE(1); else if (flags & SO_OBJECTS) - x = c->page->inuse; + WARN_ON_ONCE(1); else - x = 1; - + x = page->pages; total += x; - nodes[c->node] += x; + nodes[node] += x; } - per_cpu[c->node]++; } } - down_read(&slub_lock); + get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - if (flags & SO_TOTAL) - x = atomic_long_read(&n->total_objects); - else if (flags & SO_OBJECTS) - x = atomic_long_read(&n->total_objects) - - count_partial(n, count_free); - + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); else x = atomic_long_read(&n->nr_slabs); total += x; @@ -3862,6 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif + put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } @@ -3885,7 +4393,7 @@ static int any_slab_objects(struct kmem_cache *s) #endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) -#define to_slab(n) container_of(n, struct kmem_cache, kobj); +#define to_slab(n) container_of(n, struct kmem_cache, kobj) struct slab_attribute { struct attribute attr; @@ -3894,11 +4402,12 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO(_name) + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) #define SLAB_ATTR(_name) \ static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + __ATTR(_name, 0600, _name##_show, _name##_store) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -3914,7 +4423,7 @@ SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objsize); + return sprintf(buf, "%d\n", s->object_size); } SLAB_ATTR_RO(object_size); @@ -3930,7 +4439,7 @@ static ssize_t order_store(struct kmem_cache *s, unsigned long order; int err; - err = strict_strtoul(buf, 10, &order); + err = kstrtoul(buf, 10, &order); if (err) return err; @@ -3958,7 +4467,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, unsigned long min; int err; - err = strict_strtoul(buf, 10, &min); + err = kstrtoul(buf, 10, &min); if (err) return err; @@ -3967,14 +4476,34 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(min_partial); -static ssize_t ctor_show(struct kmem_cache *s, char *buf) +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); + return sprintf(buf, "%u\n", s->cpu_partial); +} - return n + sprintf(buf + n, "\n"); - } - return 0; +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long objects; + int err; + + err = kstrtoul(buf, 10, &objects); + if (err) + return err; + if (objects && !kmem_cache_has_cpu_partial(s)) + return -EINVAL; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + +static ssize_t ctor_show(struct kmem_cache *s, char *buf) +{ + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); @@ -4008,6 +4537,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int pages = 0; + int cpu; + int len; + + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4043,6 +4603,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4065,8 +4631,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_DEBUG_FREE; + } return length; } SLAB_ATTR(sanity_checks); @@ -4080,8 +4648,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_TRACE; + } return length; } SLAB_ATTR(trace); @@ -4098,8 +4668,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; + } calculate_sizes(s, -1); return length; } @@ -4117,8 +4689,10 @@ static ssize_t poison_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_POISON; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; + } calculate_sizes(s, -1); return length; } @@ -4136,8 +4710,10 @@ static ssize_t store_user_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_STORE_USER; + } calculate_sizes(s, -1); return length; } @@ -4227,7 +4803,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, unsigned long ratio; int err; - err = strict_strtoul(buf, 10, &ratio); + err = kstrtoul(buf, 10, &ratio); if (err) return err; @@ -4302,6 +4878,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); @@ -4309,7 +4886,14 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); +STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); +STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); #endif static struct attribute *slab_attrs[] = { @@ -4318,6 +4902,7 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &min_partial_attr.attr, + &cpu_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, @@ -4329,6 +4914,8 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, @@ -4358,6 +4945,7 @@ static struct attribute *slab_attrs[] = { &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, @@ -4365,7 +4953,14 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, + &cpu_partial_node_attr.attr, + &cpu_partial_drain_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, @@ -4412,16 +5007,101 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } -static void kmem_cache_release(struct kobject *kobj) +static void memcg_propagate_slab_attrs(struct kmem_cache *s) { - struct kmem_cache *s = to_slab(kobj); +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + struct kmem_cache *root_cache; + + if (is_root_cache(s)) + return; + + root_cache = s->memcg_params->root_cache; + + /* + * This mean this cache had no attribute written. Therefore, no point + * in copying default values around + */ + if (!root_cache->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. + */ + if (buffer) + buf = buffer; + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} - kfree(s->name); - kfree(s); +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); } static const struct sysfs_ops slab_sysfs_ops = { @@ -4431,7 +5111,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, - .release = kmem_cache_release + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -4449,6 +5129,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params->root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -4481,6 +5170,13 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", + memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -4489,13 +5185,8 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; - int unmergeable; + int unmergeable = slab_unmergeable(s); - if (slab_state < SYSFS) - /* Defer until later */ - return 0; - - unmergeable = slab_unmergeable(s); if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -4512,37 +5203,53 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = slab_kset; - err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); - if (err) { - kobject_put(&s->kobj); - return err; - } + s->kobj.kset = cache_kset(s); + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); + if (err) + goto out_put_kobj; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; + if (err) + goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + err = -ENOMEM; + goto out_del_kobj; + } } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { - if (slab_state < SYSFS) + if (slab_state < FULL) /* * Sysfs has not been setup yet so no need to remove the * cache from sysfs. */ return; +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -4564,7 +5271,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) { struct saved_alias *al; - if (slab_state == SYSFS) { + if (slab_state == FULL) { /* * If we have a leftover link then remove it. */ @@ -4588,22 +5295,22 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; - down_write(&slub_lock); + mutex_lock(&slab_mutex); slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { - up_write(&slub_lock); - printk(KERN_ERR "Cannot register slab subsystem.\n"); + mutex_unlock(&slab_mutex); + pr_err("Cannot register slab subsystem.\n"); return -ENOSYS; } - slab_state = SYSFS; + slab_state = FULL; list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -4612,12 +5319,12 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); kfree(al); } - up_write(&slub_lock); + mutex_unlock(&slab_mutex); resiliency_test(); return 0; } @@ -4629,96 +5336,39 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) -{ - seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); - seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - down_read(&slub_lock); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - up_read(&slub_lock); -} - -static int s_show(struct seq_file *m, void *p) -{ - unsigned long nr_partials = 0; unsigned long nr_slabs = 0; - unsigned long nr_inuse = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; - struct kmem_cache *s; int node; - s = list_entry(p, struct kmem_cache, list); - for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); if (!n) continue; - nr_partials += n->nr_partial; - nr_slabs += atomic_long_read(&n->nr_slabs); - nr_objs += atomic_long_read(&n->total_objects); + nr_slabs += node_nr_slabs(n); + nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); } - nr_inuse = nr_objs - nr_free; - - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, oo_objects(s->oo), - (1 << oo_order(s->oo))); - seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); - seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, - 0UL); - seq_putc(m, '\n'); - return 0; + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); } -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static int slabinfo_open(struct inode *inode, struct file *file) +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) { - return seq_open(file, &slabinfo_op); } -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init slab_proc_init(void) +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { - proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); - return 0; + return -EIO; } -module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ |
