Diffstat (limited to 'mm/slub.c')
-rw-r--r--	mm/slub.c	524
1 file changed, 304 insertions, 220 deletions
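A large share of the hunks below replace printk(KERN_*) calls with the pr_*() macros, and slab_bug()/slab_fix() additionally drop their fixed 100-byte vsnprintf() buffer in favour of struct va_format and the %pV format specifier, which prints the caller's format string and arguments without truncation. A minimal sketch of that %pV pattern, assuming a made-up reporting function (only pr_err(), struct va_format and %pV are kernel facilities; example_report() itself is illustrative, not part of the patch):

#include <linux/printk.h>	/* pr_err(), struct va_format, %pV */
#include <stdarg.h>

/* Illustrative helper, not part of the patch. */
static void example_report(const char *name, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* %pV expands fmt/args inside printk itself, so no local buffer is needed. */
	pr_err("BUG %s: %pV\n", name, &vaf);
	va_end(args);	/* args must stay live until after the %pV print */
}

Because %pV consumes the va_list during the print, va_end() has to come after pr_err(), which is why the patch moves va_end() to the end of slab_bug().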
diff --git a/mm/slub.c b/mm/slub.c index c3eb3d3ca83..73004808537 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)  /*   * Maximum number of desirable partial slabs.   * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use.   */  #define MAX_PARTIAL 10 @@ -210,21 +210,22 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };  #ifdef CONFIG_SYSFS  static int sysfs_slab_add(struct kmem_cache *);  static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *);  static void memcg_propagate_slab_attrs(struct kmem_cache *s);  #else  static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }  static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)  							{ return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } -  static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }  #endif  static inline void stat(const struct kmem_cache *s, enum stat_item si)  {  #ifdef CONFIG_SLUB_STATS -	__this_cpu_inc(s->cpu_slab->stat[si]); +	/* +	 * The rmw is racy on a preemptible kernel but this is acceptable, so +	 * avoid this_cpu_add()'s irq-disable overhead. +	 */ +	raw_cpu_inc(s->cpu_slab->stat[si]);  #endif  } @@ -355,6 +356,21 @@ static __always_inline void slab_unlock(struct page *page)  	__bit_spin_unlock(PG_locked, &page->flags);  } +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ +	struct page tmp; +	tmp.counters = counters_new; +	/* +	 * page->counters can cover frozen/inuse/objects as well +	 * as page->_count.  If we assign to ->counters directly +	 * we run the risk of losing updates to page->_count, so +	 * be careful and only assign to the fields we need. 
+	 */ +	page->frozen  = tmp.frozen; +	page->inuse   = tmp.inuse; +	page->objects = tmp.objects; +} +  /* Interrupts must be disabled (for the fallback code to work right) */  static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,  		void *freelist_old, unsigned long counters_old, @@ -376,7 +392,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page  		if (page->freelist == freelist_old &&  					page->counters == counters_old) {  			page->freelist = freelist_new; -			page->counters = counters_new; +			set_page_slub_counters(page, counters_new);  			slab_unlock(page);  			return 1;  		} @@ -387,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page  	stat(s, CMPXCHG_DOUBLE_FAIL);  #ifdef SLUB_DEBUG_CMPXCHG -	printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +	pr_info("%s %s: cmpxchg double redo ", n, s->name);  #endif  	return 0; @@ -415,7 +431,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,  		if (page->freelist == freelist_old &&  					page->counters == counters_old) {  			page->freelist = freelist_new; -			page->counters = counters_new; +			set_page_slub_counters(page, counters_new);  			slab_unlock(page);  			local_irq_restore(flags);  			return 1; @@ -428,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,  	stat(s, CMPXCHG_DOUBLE_FAIL);  #ifdef SLUB_DEBUG_CMPXCHG -	printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +	pr_info("%s %s: cmpxchg double redo ", n, s->name);  #endif  	return 0; @@ -530,14 +546,14 @@ static void print_track(const char *s, struct track *t)  	if (!t->addr)  		return; -	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", -		s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +	pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", +	       s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);  #ifdef CONFIG_STACKTRACE  	{  		int i;  		for (i = 0; i < TRACK_ADDRS_COUNT; i++)  			if (t->addrs[i]) -				printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); +				pr_err("\t%pS\n", (void *)t->addrs[i]);  			else  				break;  	} @@ -555,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object)  static void print_page_info(struct page *page)  { -	printk(KERN_ERR -	       "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", +	pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",  	       page, page->objects, page->inuse, page->freelist, page->flags);  }  static void slab_bug(struct kmem_cache *s, char *fmt, ...)  { +	struct va_format vaf;  	va_list args; -	char buf[100];  	va_start(args, fmt); -	vsnprintf(buf, sizeof(buf), fmt, args); -	va_end(args); -	printk(KERN_ERR "========================================" -			"=====================================\n"); -	printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); -	printk(KERN_ERR "----------------------------------------" -			"-------------------------------------\n\n"); +	vaf.fmt = fmt; +	vaf.va = &args; +	pr_err("=============================================================================\n"); +	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); +	pr_err("-----------------------------------------------------------------------------\n\n");  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +	va_end(args);  }  static void slab_fix(struct kmem_cache *s, char *fmt, ...)  
{ +	struct va_format vaf;  	va_list args; -	char buf[100];  	va_start(args, fmt); -	vsnprintf(buf, sizeof(buf), fmt, args); +	vaf.fmt = fmt; +	vaf.va = &args; +	pr_err("FIX %s: %pV\n", s->name, &vaf);  	va_end(args); -	printk(KERN_ERR "FIX %s: %s\n", s->name, buf);  }  static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -598,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)  	print_page_info(page); -	printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", -			p, p - addr, get_freepointer(s, p)); +	pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", +	       p, p - addr, get_freepointer(s, p));  	if (p > addr + 16)  		print_section("Bytes b4 ", p - 16, 16); @@ -682,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,  		end--;  	slab_bug(s, "%s overwritten", what); -	printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", +	pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",  					fault, end - 1, fault[0], value);  	print_trailer(s, page, object); @@ -915,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,  								int alloc)  {  	if (s->flags & SLAB_TRACE) { -		printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", +		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",  			s->name,  			alloc ? "alloc" : "free",  			object, page->inuse, @@ -933,6 +948,16 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,   * Hooks for other subsystems that check memory allocations. In a typical   * production configuration these hooks all should produce no code at all.   */ +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ +	kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ +	kmemleak_free(x); +} +  static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)  {  	flags &= gfp_allowed_mask; @@ -955,7 +980,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)  	kmemleak_free_recursive(x, s->flags);  	/* -	 * Trouble is that we may no longer disable interupts in the fast path +	 * Trouble is that we may no longer disable interrupts in the fast path  	 * So in order to make the debug calls that expect irqs to be  	 * disabled we need to disable interrupts temporarily.  	 */ @@ -975,8 +1000,6 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)  /*   * Tracking of fully allocated slabs for debugging purposes. - * - * list_lock must be held.   */  static void add_full(struct kmem_cache *s,  	struct kmem_cache_node *n, struct page *page) @@ -984,17 +1007,16 @@ static void add_full(struct kmem_cache *s,  	if (!(s->flags & SLAB_STORE_USER))  		return; +	lockdep_assert_held(&n->list_lock);  	list_add(&page->lru, &n->full);  } -/* - * list_lock must be held. 
- */ -static void remove_full(struct kmem_cache *s, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)  {  	if (!(s->flags & SLAB_STORE_USER))  		return; +	lockdep_assert_held(&n->list_lock);  	list_del(&page->lru);  } @@ -1111,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing(  			slab_err(s, page, "Attempt to free object(0x%p) "  				"outside of slab", object);  		} else if (!page->slab_cache) { -			printk(KERN_ERR -				"SLUB <none>: no slab for object 0x%p.\n", -						object); +			pr_err("SLUB <none>: no slab for object 0x%p.\n", +			       object);  			dump_stack();  		} else  			object_err(s, page, object, @@ -1196,8 +1217,8 @@ static int __init setup_slub_debug(char *str)  			slub_debug |= SLAB_FAILSLAB;  			break;  		default: -			printk(KERN_ERR "slub_debug option '%c' " -				"unknown. skipped\n", *str); +			pr_err("slub_debug option '%c' unknown. skipped\n", +			       *str);  		}  	} @@ -1217,8 +1238,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size,  	/*  	 * Enable debugging if selected on the kernel commandline.  	 */ -	if (slub_debug && (!slub_debug_slabs || -		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) +	if (slub_debug && (!slub_debug_slabs || (name && +		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))  		flags |= slub_debug;  	return flags; @@ -1240,7 +1261,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page,  			void *object, u8 val) { return 1; }  static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,  					struct page *page) {} -static inline void remove_full(struct kmem_cache *s, struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, +					struct page *page) {}  static inline unsigned long kmem_cache_flags(unsigned long object_size,  	unsigned long flags, const char *name,  	void (*ctor)(void *)) @@ -1260,30 +1282,56 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,  static inline void dec_slabs_node(struct kmem_cache *s, int node,  							int objects) {} +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ +	kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ +	kmemleak_free(x); +} +  static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)  							{ return 0; }  static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, -		void *object) {} +		void *object) +{ +	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, +		flags & gfp_allowed_mask); +} -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ +	kmemleak_free_recursive(x, s->flags); +}  #endif /* CONFIG_SLUB_DEBUG */  /*   * Slab allocation and freeing   */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, -					struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, +		gfp_t flags, int node, struct kmem_cache_order_objects oo)  { +	struct page *page;  	int order = oo_order(oo);  	flags |= __GFP_NOTRACK; +	if (memcg_charge_slab(s, flags, order)) +		return NULL; +  	if (node == NUMA_NO_NODE) -		return alloc_pages(flags, order); +		page = alloc_pages(flags, order);  	else -		return alloc_pages_exact_node(node, flags, order); +		page = alloc_pages_exact_node(node, flags, order); + +	if (!page) +		memcg_uncharge_slab(s, 
order); + +	return page;  }  static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1305,14 +1353,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)  	 */  	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; -	page = alloc_slab_page(alloc_gfp, node, oo); +	page = alloc_slab_page(s, alloc_gfp, node, oo);  	if (unlikely(!page)) {  		oo = s->min; +		alloc_gfp = flags;  		/*  		 * Allocation may have failed due to fragmentation.  		 * Try a lower order alloc if possible  		 */ -		page = alloc_slab_page(flags, node, oo); +		page = alloc_slab_page(s, alloc_gfp, node, oo);  		if (page)  			stat(s, ORDER_FALLBACK); @@ -1322,7 +1371,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)  		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {  		int pages = 1 << oo_order(oo); -		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); +		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);  		/*  		 * Objects from caches that have a constructor don't get @@ -1373,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)  	order = compound_order(page);  	inc_slabs_node(s, page_to_nid(page), page->objects); -	memcg_bind_pages(s, order);  	page->slab_cache = s;  	__SetPageSlab(page);  	if (page->pfmemalloc) @@ -1424,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)  	__ClearPageSlabPfmemalloc(page);  	__ClearPageSlab(page); -	memcg_release_pages(s, order);  	page_mapcount_reset(page);  	if (current->reclaim_state)  		current->reclaim_state->reclaimed_slab += pages; -	__free_memcg_kmem_pages(page, order); +	__free_pages(page, order); +	memcg_uncharge_slab(s, order);  }  #define need_reserve_slab_rcu						\ @@ -1477,11 +1525,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page)  /*   * Management of partially allocated slabs. - * - * list_lock must be held.   */ -static inline void add_partial(struct kmem_cache_node *n, -				struct page *page, int tail) +static inline void +__add_partial(struct kmem_cache_node *n, struct page *page, int tail)  {  	n->nr_partial++;  	if (tail == DEACTIVATE_TO_TAIL) @@ -1490,23 +1536,32 @@ static inline void add_partial(struct kmem_cache_node *n,  		list_add(&page->lru, &n->partial);  } -/* - * list_lock must be held. - */ -static inline void remove_partial(struct kmem_cache_node *n, -					struct page *page) +static inline void add_partial(struct kmem_cache_node *n, +				struct page *page, int tail) +{ +	lockdep_assert_held(&n->list_lock); +	__add_partial(n, page, tail); +} + +static inline void +__remove_partial(struct kmem_cache_node *n, struct page *page)  {  	list_del(&page->lru);  	n->nr_partial--;  } +static inline void remove_partial(struct kmem_cache_node *n, +					struct page *page) +{ +	lockdep_assert_held(&n->list_lock); +	__remove_partial(n, page); +} +  /*   * Remove slab from the partial list, freeze it and   * return the pointer to the freelist.   *   * Returns a list of objects or NULL if it fails. - * - * Must hold list_lock since we modify the partial list.   */  static inline void *acquire_slab(struct kmem_cache *s,  		struct kmem_cache_node *n, struct page *page, @@ -1516,6 +1571,8 @@ static inline void *acquire_slab(struct kmem_cache *s,  	unsigned long counters;  	struct page new; +	lockdep_assert_held(&n->list_lock); +  	/*  	 * Zap the freelist and set the frozen bit.  	 
* The old freelist is the list of objects for the @@ -1635,8 +1692,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,  		return NULL;  	do { -		cpuset_mems_cookie = get_mems_allowed(); -		zonelist = node_zonelist(slab_node(), flags); +		cpuset_mems_cookie = read_mems_allowed_begin(); +		zonelist = node_zonelist(mempolicy_slab_node(), flags);  		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {  			struct kmem_cache_node *n; @@ -1647,19 +1704,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,  				object = get_partial_node(s, n, c, flags);  				if (object) {  					/* -					 * Return the object even if -					 * put_mems_allowed indicated that -					 * the cpuset mems_allowed was -					 * updated in parallel. It's a -					 * harmless race between the alloc -					 * and the cpuset update. +					 * Don't check read_mems_allowed_retry() +					 * here - if mems_allowed was updated in +					 * parallel, that was a harmless race +					 * between allocation and the cpuset +					 * update  					 */ -					put_mems_allowed(cpuset_mems_cookie);  					return object;  				}  			}  		} -	} while (!put_mems_allowed(cpuset_mems_cookie)); +	} while (read_mems_allowed_retry(cpuset_mems_cookie));  #endif  	return NULL;  } @@ -1671,7 +1726,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,  		struct kmem_cache_cpu *c)  {  	void *object; -	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; +	int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;  	object = get_partial_node(s, get_node(s, searchnode), c, flags);  	if (object || node != NUMA_NO_NODE) @@ -1721,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n,  #ifdef SLUB_DEBUG_CMPXCHG  	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); -	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); +	pr_info("%s %s: cmpxchg redo ", n, s->name);  #ifdef CONFIG_PREEMPT  	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) -		printk("due to cpu change %d -> %d\n", +		pr_warn("due to cpu change %d -> %d\n",  			tid_to_cpu(tid), tid_to_cpu(actual_tid));  	else  #endif  	if (tid_to_event(tid) != tid_to_event(actual_tid)) -		printk("due to cpu running other code. Event %ld->%ld\n", +		pr_warn("due to cpu running other code. 
Event %ld->%ld\n",  			tid_to_event(tid), tid_to_event(actual_tid));  	else -		printk("for unknown reason: actual=%lx was=%lx target=%lx\n", +		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",  			actual_tid, tid, next_tid(tid));  #endif  	stat(s, CMPXCHG_DOUBLE_CPU_FAIL); @@ -1826,7 +1881,7 @@ redo:  	new.frozen = 0; -	if (!new.inuse && n->nr_partial > s->min_partial) +	if (!new.inuse && n->nr_partial >= s->min_partial)  		m = M_FREE;  	else if (new.freelist) {  		m = M_PARTIAL; @@ -1860,7 +1915,7 @@ redo:  		else if (l == M_FULL) -			remove_full(s, page); +			remove_full(s, n, page);  		if (m == M_PARTIAL) { @@ -1937,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s,  				new.freelist, new.counters,  				"unfreezing slab")); -		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { +		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {  			page->next = discard_page;  			discard_page = page;  		} else { @@ -2072,11 +2127,19 @@ static inline int node_match(struct page *page, int node)  	return 1;  } +#ifdef CONFIG_SLUB_DEBUG  static int count_free(struct page *page)  {  	return page->objects - page->inuse;  } +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ +	return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)  static unsigned long count_partial(struct kmem_cache_node *n,  					int (*get_count)(struct page *))  { @@ -2090,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n,  	spin_unlock_irqrestore(&n->list_lock, flags);  	return x;  } - -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ -#ifdef CONFIG_SLUB_DEBUG -	return atomic_long_read(&n->total_objects); -#else -	return 0; -#endif -} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */  static noinline void  slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)  { +#ifdef CONFIG_SLUB_DEBUG +	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, +				      DEFAULT_RATELIMIT_BURST);  	int node; -	printk(KERN_WARNING -		"SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", +	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) +		return; + +	pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",  		nid, gfpflags); -	printk(KERN_WARNING "  cache: %s, object size: %d, buffer size: %d, " -		"default order: %d, min order: %d\n", s->name, s->object_size, -		s->size, oo_order(s->oo), oo_order(s->min)); +	pr_warn("  cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", +		s->name, s->object_size, s->size, oo_order(s->oo), +		oo_order(s->min));  	if (oo_order(s->min) > get_order(s->object_size)) -		printk(KERN_WARNING "  %s debugging increased min order, use " -		       "slub_debug=O to disable.\n", s->name); +		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n", +			s->name);  	for_each_online_node(node) {  		struct kmem_cache_node *n = get_node(s, node); @@ -2129,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)  		nr_slabs = node_nr_slabs(n);  		nr_objs  = node_nr_objs(n); -		printk(KERN_WARNING -			"  node %d: slabs: %ld, objs: %ld, free: %ld\n", +		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",  			node, nr_slabs, nr_objs, nr_free);  	} +#endif  }  static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, @@ -2149,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t 
flags,  	page = new_slab(s, flags, node);  	if (page) { -		c = __this_cpu_ptr(s->cpu_slab); +		c = raw_cpu_ptr(s->cpu_slab);  		if (c->page)  			flush_slab(s, c); @@ -2274,8 +2334,6 @@ redo:  	if (freelist)  		goto load_freelist; -	stat(s, ALLOC_SLOWPATH); -  	freelist = get_freelist(s, page);  	if (!freelist) { @@ -2311,9 +2369,7 @@ new_slab:  	freelist = new_slab_objects(s, gfpflags, node, &c);  	if (unlikely(!freelist)) { -		if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) -			slab_out_of_memory(s, gfpflags, node); - +		slab_out_of_memory(s, gfpflags, node);  		local_irq_restore(flags);  		return NULL;  	} @@ -2369,7 +2425,7 @@ redo:  	 * and the retrieval of the tid.  	 */  	preempt_disable(); -	c = __this_cpu_ptr(s->cpu_slab); +	c = this_cpu_ptr(s->cpu_slab);  	/*  	 * The transaction ids are globally unique per cpu and per operation on @@ -2382,10 +2438,10 @@ redo:  	object = c->freelist;  	page = c->page; -	if (unlikely(!object || !node_match(page, node))) +	if (unlikely(!object || !node_match(page, node))) {  		object = __slab_alloc(s, gfpflags, node, addr, c); - -	else { +		stat(s, ALLOC_SLOWPATH); +	} else {  		void *next_object = get_freepointer_safe(s, object);  		/* @@ -2514,7 +2570,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,  		new.inuse--;  		if ((!new.inuse || !prior) && !was_frozen) { -			if (kmem_cache_has_cpu_partial(s) && !prior) +			if (kmem_cache_has_cpu_partial(s) && !prior) {  				/*  				 * Slab was on no list before and will be @@ -2524,7 +2580,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,  				 */  				new.frozen = 1; -			else { /* Needs to be taken off a list */ +			} else { /* Needs to be taken off a list */  	                        n = get_node(s, page_to_nid(page));  				/* @@ -2564,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,                  return;          } -	if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) +	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))  		goto slab_empty;  	/* @@ -2573,7 +2629,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,  	 */  	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {  		if (kmem_cache_debug(s)) -			remove_full(s, page); +			remove_full(s, n, page);  		add_partial(n, page, DEACTIVATE_TO_TAIL);  		stat(s, FREE_ADD_PARTIAL);  	} @@ -2587,9 +2643,10 @@ slab_empty:  		 */  		remove_partial(n, page);  		stat(s, FREE_REMOVE_PARTIAL); -	} else +	} else {  		/* Slab must be on the full list */ -		remove_full(s, page); +		remove_full(s, n, page); +	}  	spin_unlock_irqrestore(&n->list_lock, flags);  	stat(s, FREE_SLAB); @@ -2624,7 +2681,7 @@ redo:  	 * during the cmpxchg then the free will succedd.  	 */  	preempt_disable(); -	c = __this_cpu_ptr(s->cpu_slab); +	c = this_cpu_ptr(s->cpu_slab);  	tid = c->tid;  	preempt_enable(); @@ -2829,8 +2886,8 @@ static struct kmem_cache *kmem_cache_node;   * slab on the node for this slabcache. There are no concurrent accesses   * possible.   * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping   * memory on a fresh node that has no slab structures yet.   
*/  static void early_kmem_cache_node_alloc(int node) @@ -2844,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node)  	BUG_ON(!page);  	if (page_to_nid(page) != node) { -		printk(KERN_ERR "SLUB: Unable to allocate memory from " -				"node %d\n", node); -		printk(KERN_ERR "SLUB: Allocating a useless per node structure " -				"in order to be able to continue\n"); +		pr_err("SLUB: Unable to allocate memory from node %d\n", node); +		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");  	}  	n = page->freelist; @@ -2863,7 +2918,11 @@ static void early_kmem_cache_node_alloc(int node)  	init_kmem_cache_node(n);  	inc_slabs_node(kmem_cache_node, node, page->objects); -	add_partial(n, page, DEACTIVATE_TO_HEAD); +	/* +	 * No locks need to be taken here as it has just been +	 * initialized and there is no concurrent access. +	 */ +	__add_partial(n, page, DEACTIVATE_TO_HEAD);  }  static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -3128,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,  	for_each_object(p, s, addr, page->objects) {  		if (!test_bit(slab_index(p, s, addr), map)) { -			printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", -							p, p - addr); +			pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);  			print_tracking(s, p);  		}  	} @@ -3149,7 +3207,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)  	list_for_each_entry_safe(page, h, &n->partial, lru) {  		if (!page->inuse) { -			remove_partial(n, page); +			__remove_partial(n, page);  			discard_slab(s, page);  		} else {  			list_slab_objects(s, page, @@ -3181,23 +3239,7 @@ static inline int kmem_cache_close(struct kmem_cache *s)  int __kmem_cache_shutdown(struct kmem_cache *s)  { -	int rc = kmem_cache_close(s); - -	if (!rc) { -		/* -		 * We do the same lock strategy around sysfs_slab_add, see -		 * __kmem_cache_create. Because this is pretty much the last -		 * operation we do and the lock will be released shortly after -		 * that in slab_common.c, we could just move sysfs_slab_remove -		 * to a later point in common code. We should do that when we -		 * have a common sysfs framework for all allocators. -		 */ -		mutex_unlock(&slab_mutex); -		sysfs_slab_remove(s); -		mutex_lock(&slab_mutex); -	} - -	return rc; +	return kmem_cache_close(s);  }  /******************************************************************** @@ -3267,12 +3309,12 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)  	struct page *page;  	void *ptr = NULL; -	flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; -	page = alloc_pages_node(node, flags, get_order(size)); +	flags |= __GFP_COMP | __GFP_NOTRACK; +	page = alloc_kmem_pages_node(node, flags, get_order(size));  	if (page)  		ptr = page_address(page); -	kmemleak_alloc(ptr, size, 1, flags); +	kmalloc_large_node_hook(ptr, size, flags);  	return ptr;  } @@ -3336,8 +3378,8 @@ void kfree(const void *x)  	page = virt_to_head_page(x);  	if (unlikely(!PageSlab(page))) {  		BUG_ON(!PageCompound(page)); -		kmemleak_free(x); -		__free_memcg_kmem_pages(page, compound_order(page)); +		kfree_hook(x); +		__free_kmem_pages(page, compound_order(page));  		return;  	}  	slab_free(page->slab_cache, page, object, _RET_IP_); @@ -3354,7 +3396,7 @@ EXPORT_SYMBOL(kfree);   * being allocated from last increasing the chance that the last objects   * are freed in them.   
*/ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s)  {  	int node;  	int i; @@ -3410,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s)  	kfree(slabs_by_inuse);  	return 0;  } -EXPORT_SYMBOL(kmem_cache_shrink);  static int slab_mem_going_offline_callback(void *arg)  { @@ -3418,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg)  	mutex_lock(&slab_mutex);  	list_for_each_entry(s, &slab_caches, list) -		kmem_cache_shrink(s); +		__kmem_cache_shrink(s);  	mutex_unlock(&slab_mutex);  	return 0; @@ -3612,9 +3653,7 @@ void __init kmem_cache_init(void)  	register_cpu_notifier(&slab_notifier);  #endif -	printk(KERN_INFO -		"SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," -		" CPUs=%d, Nodes=%d\n", +	pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",  		cache_line_size(),  		slub_min_order, slub_max_order, slub_min_objects,  		nr_cpu_ids, nr_node_ids); @@ -3632,6 +3671,9 @@ static int slab_unmergeable(struct kmem_cache *s)  	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))  		return 1; +	if (!is_root_cache(s)) +		return 1; +  	if (s->ctor)  		return 1; @@ -3644,9 +3686,8 @@ static int slab_unmergeable(struct kmem_cache *s)  	return 0;  } -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, -		size_t align, unsigned long flags, const char *name, -		void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, +		unsigned long flags, const char *name, void (*ctor)(void *))  {  	struct kmem_cache *s; @@ -3669,7 +3710,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,  			continue;  		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) -				continue; +			continue;  		/*  		 * Check if alignment is compatible.  		 * Courtesy of Adrian Drzewiecki @@ -3680,23 +3721,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,  		if (s->size - size >= sizeof(void *))  			continue; -		if (!cache_match_memcg(s, memcg)) -			continue; -  		return s;  	}  	return NULL;  }  struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, -		   size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, +		   unsigned long flags, void (*ctor)(void *))  {  	struct kmem_cache *s; -	s = find_mergeable(memcg, size, align, flags, name, ctor); +	s = find_mergeable(size, align, flags, name, ctor);  	if (s) { +		int i; +		struct kmem_cache *c; +  		s->refcount++; +  		/*  		 * Adjust the object sizes so that we clear  		 * the complete object on kzalloc. 
@@ -3704,6 +3746,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,  		s->object_size = max(s->object_size, (int)size);  		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); +		for_each_memcg_cache_index(i) { +			c = cache_from_memcg_idx(s, i); +			if (!c) +				continue; +			c->object_size = s->object_size; +			c->inuse = max_t(int, c->inuse, +					 ALIGN(size, sizeof(void *))); +		} +  		if (sysfs_slab_alias(s, name)) {  			s->refcount--;  			s = NULL; @@ -3726,10 +3777,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)  		return 0;  	memcg_propagate_slab_attrs(s); -	mutex_unlock(&slab_mutex);  	err = sysfs_slab_add(s); -	mutex_lock(&slab_mutex); -  	if (err)  		kmem_cache_close(s); @@ -3887,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s,  		count++;  	}  	if (count != n->nr_partial) -		printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " -			"counter=%ld\n", s->name, count, n->nr_partial); +		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", +		       s->name, count, n->nr_partial);  	if (!(s->flags & SLAB_STORE_USER))  		goto out; @@ -3898,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s,  		count++;  	}  	if (count != atomic_long_read(&n->nr_slabs)) -		printk(KERN_ERR "SLUB: %s %ld slabs counted but " -			"counter=%ld\n", s->name, count, -			atomic_long_read(&n->nr_slabs)); +		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", +		       s->name, count, atomic_long_read(&n->nr_slabs));  out:  	spin_unlock_irqrestore(&n->list_lock, flags); @@ -4164,53 +4211,50 @@ static void resiliency_test(void)  	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); -	printk(KERN_ERR "SLUB resiliency testing\n"); -	printk(KERN_ERR "-----------------------\n"); -	printk(KERN_ERR "A. Corruption after allocation\n"); +	pr_err("SLUB resiliency testing\n"); +	pr_err("-----------------------\n"); +	pr_err("A. Corruption after allocation\n");  	p = kzalloc(16, GFP_KERNEL);  	p[16] = 0x12; -	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" -			" 0x12->0x%p\n\n", p + 16); +	pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", +	       p + 16);  	validate_slab_cache(kmalloc_caches[4]);  	/* Hmmm... The next two are dangerous */  	p = kzalloc(32, GFP_KERNEL);  	p[32 + sizeof(void *)] = 0x34; -	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" -			" 0x34 -> -0x%p\n", p); -	printk(KERN_ERR -		"If allocated object is overwritten then not detectable\n\n"); +	pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", +	       p); +	pr_err("If allocated object is overwritten then not detectable\n\n");  	validate_slab_cache(kmalloc_caches[5]);  	p = kzalloc(64, GFP_KERNEL);  	p += 64 + (get_cycles() & 0xff) * sizeof(void *);  	*p = 0x56; -	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", -									p); -	printk(KERN_ERR -		"If allocated object is overwritten then not detectable\n\n"); +	pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", +	       p); +	pr_err("If allocated object is overwritten then not detectable\n\n");  	validate_slab_cache(kmalloc_caches[6]); -	printk(KERN_ERR "\nB. Corruption after free\n"); +	pr_err("\nB. Corruption after free\n");  	p = kzalloc(128, GFP_KERNEL);  	kfree(p);  	*p = 0x78; -	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); +	pr_err("1. 
kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);  	validate_slab_cache(kmalloc_caches[7]);  	p = kzalloc(256, GFP_KERNEL);  	kfree(p);  	p[50] = 0x9a; -	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", -			p); +	pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);  	validate_slab_cache(kmalloc_caches[8]);  	p = kzalloc(512, GFP_KERNEL);  	kfree(p);  	p[512] = 0xab; -	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); +	pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);  	validate_slab_cache(kmalloc_caches[9]);  }  #else @@ -4272,14 +4316,20 @@ static ssize_t show_slab_objects(struct kmem_cache *s,  			page = ACCESS_ONCE(c->partial);  			if (page) { -				x = page->pobjects; +				node = page_to_nid(page); +				if (flags & SO_TOTAL) +					WARN_ON_ONCE(1); +				else if (flags & SO_OBJECTS) +					WARN_ON_ONCE(1); +				else +					x = page->pages;  				total += x;  				nodes[node] += x;  			}  		}  	} -	lock_memory_hotplug(); +	get_online_mems();  #ifdef CONFIG_SLUB_DEBUG  	if (flags & SO_ALL) {  		for_each_node_state(node, N_NORMAL_MEMORY) { @@ -4319,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,  			x += sprintf(buf + x, " N%d=%lu",  					node, nodes[node]);  #endif -	unlock_memory_hotplug(); +	put_online_mems();  	kfree(nodes);  	return x + sprintf(buf + x, "\n");  } @@ -4983,7 +5033,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,  		 * through the descendants with best-effort propagation.  		 */  		for_each_memcg_cache_index(i) { -			struct kmem_cache *c = cache_from_memcg(s, i); +			struct kmem_cache *c = cache_from_memcg_idx(s, i);  			if (c)  				attribute->store(c, buf, len);  		} @@ -4998,15 +5048,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)  #ifdef CONFIG_MEMCG_KMEM  	int i;  	char *buffer = NULL; +	struct kmem_cache *root_cache; -	if (!is_root_cache(s)) +	if (is_root_cache(s))  		return; +	root_cache = s->memcg_params->root_cache; +  	/*  	 * This mean this cache had no attribute written. 
Therefore, no point  	 * in copying default values around  	 */ -	if (!s->max_attr_size) +	if (!root_cache->max_attr_size)  		return;  	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { @@ -5028,7 +5081,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)  		 */  		if (buffer)  			buf = buffer; -		else if (s->max_attr_size < ARRAY_SIZE(mbuf)) +		else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))  			buf = mbuf;  		else {  			buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5037,7 +5090,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)  			buf = buffer;  		} -		attr->show(s->memcg_params->root_cache, buf); +		attr->show(root_cache, buf);  		attr->store(s, buf, strlen(buf));  	} @@ -5046,6 +5099,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)  #endif  } +static void kmem_cache_release(struct kobject *k) +{ +	slab_kmem_cache_release(to_slab(k)); +} +  static const struct sysfs_ops slab_sysfs_ops = {  	.show = slab_attr_show,  	.store = slab_attr_store, @@ -5053,6 +5111,7 @@ static const struct sysfs_ops slab_sysfs_ops = {  static struct kobj_type slab_ktype = {  	.sysfs_ops = &slab_sysfs_ops, +	.release = kmem_cache_release,  };  static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -5070,6 +5129,15 @@ static const struct kset_uevent_ops slab_uevent_ops = {  static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM +	if (!is_root_cache(s)) +		return s->memcg_params->root_cache->memcg_kset; +#endif +	return slab_kset; +} +  #define ID_STR_LENGTH 64  /* Create a unique string id for a slab cache: @@ -5135,29 +5203,42 @@ static int sysfs_slab_add(struct kmem_cache *s)  		name = create_unique_id(s);  	} -	s->kobj.kset = slab_kset; -	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); -	if (err) { -		kobject_put(&s->kobj); -		return err; -	} +	s->kobj.kset = cache_kset(s); +	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); +	if (err) +		goto out_put_kobj;  	err = sysfs_create_group(&s->kobj, &slab_attr_group); -	if (err) { -		kobject_del(&s->kobj); -		kobject_put(&s->kobj); -		return err; +	if (err) +		goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM +	if (is_root_cache(s)) { +		s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); +		if (!s->memcg_kset) { +			err = -ENOMEM; +			goto out_del_kobj; +		}  	} +#endif +  	kobject_uevent(&s->kobj, KOBJ_ADD);  	if (!unmergeable) {  		/* Setup first alias */  		sysfs_slab_alias(s, s->name); -		kfree(name);  	} -	return 0; +out: +	if (!unmergeable) +		kfree(name); +	return err; +out_del_kobj: +	kobject_del(&s->kobj); +out_put_kobj: +	kobject_put(&s->kobj); +	goto out;  } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s)  {  	if (slab_state < FULL)  		/* @@ -5166,6 +5247,9 @@ static void sysfs_slab_remove(struct kmem_cache *s)  		 */  		return; +#ifdef CONFIG_MEMCG_KMEM +	kset_unregister(s->memcg_kset); +#endif  	kobject_uevent(&s->kobj, KOBJ_REMOVE);  	kobject_del(&s->kobj);  	kobject_put(&s->kobj); @@ -5216,7 +5300,7 @@ static int __init slab_sysfs_init(void)  	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);  	if (!slab_kset) {  		mutex_unlock(&slab_mutex); -		printk(KERN_ERR "Cannot register slab subsystem.\n"); +		pr_err("Cannot register slab subsystem.\n");  		return -ENOSYS;  	} @@ -5225,8 +5309,8 @@ static int __init slab_sysfs_init(void)  	list_for_each_entry(s, &slab_caches, list) {  		err = sysfs_slab_add(s);  	
	if (err) -			printk(KERN_ERR "SLUB: Unable to add boot slab %s" -						" to sysfs\n", s->name); +			pr_err("SLUB: Unable to add boot slab %s to sysfs\n", +			       s->name);  	}  	while (alias_list) { @@ -5235,8 +5319,8 @@ static int __init slab_sysfs_init(void)  		struct saved_alias *al = alias_list;  		alias_list = alias_list->next;  		err = sysfs_slab_alias(al->s, al->name);  		if (err) -			printk(KERN_ERR "SLUB: Unable to add boot slab alias" -					" %s to sysfs\n", al->name); +			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", +			       al->name);  		kfree(al);  	}
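Another recurring change above replaces the informal "list_lock must be held" comments on add_partial()/remove_partial()/add_full()/remove_full() with lockdep_assert_held(&n->list_lock), and adds unlocked __add_partial()/__remove_partial() variants for call sites that provably cannot race (early_kmem_cache_node_alloc(), free_partial()). A condensed sketch of that locked/unlocked split, using a hypothetical node structure so it stands alone (only lockdep_assert_held() and the list helpers are kernel APIs; the demo_* names are invented):

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

/* Hypothetical stand-in for struct kmem_cache_node, for illustration only. */
struct demo_node {
	spinlock_t list_lock;
	unsigned long nr_partial;
	struct list_head partial;
};

/* No locking check: reserved for init paths with provably no concurrency. */
static inline void __demo_add_partial(struct demo_node *n, struct list_head *entry)
{
	n->nr_partial++;
	list_add(entry, &n->partial);
}

/* Normal path: the locking rule is enforced at runtime when lockdep is enabled. */
static inline void demo_add_partial(struct demo_node *n, struct list_head *entry)
{
	lockdep_assert_held(&n->list_lock);
	__demo_add_partial(n, entry);
}

Compared with a comment, the assertion documents the rule at the point where it can actually fire, which is what lets the patch delete the "list_lock must be held" text.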

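slab_out_of_memory() also gains __GFP_NOWARN handling and a static ratelimit so repeated allocation failures no longer flood the log, replacing the printk_ratelimit() check at the call site. A generic sketch of that throttling pattern, with an invented function name and message text (DEFINE_RATELIMIT_STATE(), __ratelimit() and __GFP_NOWARN are the real kernel pieces):

#include <linux/gfp.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>

/* Illustrative only; mirrors the structure of the patched slab_out_of_memory(). */
static void demo_alloc_failure_warn(int nid, gfp_t gfpflags)
{
	static DEFINE_RATELIMIT_STATE(demo_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	/* Respect __GFP_NOWARN and drop messages beyond the configured burst. */
	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&demo_rs))
		return;

	pr_warn("demo: allocation failed on node %d (gfp=0x%x)\n", nid, gfpflags);
}

Keeping the ratelimit state static and local to the reporting function means every caller gets the throttling for free, without threading extra state through the allocation paths.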