Diffstat (limited to 'mm/slab.c')
-rw-r--r--	mm/slab.c	2817
1 file changed, 1331 insertions, 1486 deletions
diff --git a/mm/slab.c b/mm/slab.c index b1e40dafbab..3070b929a1b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -68,7 +68,7 @@   * Further notes from the original documentation:   *   * 11 April '97.  Started multi-threading - markhe - *	The global cache-chain is protected by the mutex 'cache_chain_mutex'. + *	The global cache-chain is protected by the mutex 'slab_mutex'.   *	The sem is only needed when accessing/extending the cache-chain, which   *	can never happen inside an interrupt (kmem_cache_create(),   *	kmem_cache_shrink() and kmem_cache_reap()). @@ -115,11 +115,20 @@  #include	<linux/debugobjects.h>  #include	<linux/kmemcheck.h>  #include	<linux/memory.h> +#include	<linux/prefetch.h> + +#include	<net/sock.h>  #include	<asm/cacheflush.h>  #include	<asm/tlbflush.h>  #include	<asm/page.h> +#include <trace/events/kmem.h> + +#include	"internal.h" + +#include	"slab.h" +  /*   * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.   *		  0 for faster, smaller code (especially in the critical paths). @@ -148,85 +157,22 @@  #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN  #endif -/* Legal flag mask for kmem_cache_create(). */ -#if DEBUG -# define CREATE_MASK	(SLAB_RED_ZONE | \ -			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ -			 SLAB_CACHE_DMA | \ -			 SLAB_STORE_USER | \ -			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ -			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ -			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) +#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ +				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0) + +#if FREELIST_BYTE_INDEX +typedef unsigned char freelist_idx_t;  #else -# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \ -			 SLAB_CACHE_DMA | \ -			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ -			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ -			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) +typedef unsigned short freelist_idx_t;  #endif -/* - * kmem_bufctl_t: - * - * Bufctl's are used for linking objs within a slab - * linked offsets. - * - * This implementation relies on "struct page" for locating the cache & - * slab an object belongs to. - * This allows the bufctl structure to be small (one int), but limits - * the number of objects a slab (not a cache) can contain when off-slab - * bufctls are used. The limit is the size of the largest general cache - * that does not use off-slab slabs. - * For 32bit archs with 4 kB pages, is this 56. - * This is not serious, as it is only for large objects, when it is unwise - * to have too many per slab. - * Note: This limit can be raised by introducing a general cache whose size - * is less than 512 (PAGE_SIZE<<3), but greater than 256. - */ - -typedef unsigned int kmem_bufctl_t; -#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0) -#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1) -#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2) -#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3) +#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)  /* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. 
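For context on the hunk above: it replaces the old kmem_bufctl_t (an unsigned int) with freelist_idx_t, sized to the smallest integer type that can index every object in a slab. A minimal standalone sketch of that sizing rule follows; PAGE_SIZE and SLAB_OBJ_MIN_SIZE are illustrative assumptions and main() exists only for demonstration, this is not the kernel code.

#include <stdio.h>

#define BITS_PER_BYTE		8
#define PAGE_SIZE		4096	/* assumed page size */
#define SLAB_OBJ_MIN_SIZE	16	/* assumed minimum object size */

/* one-byte indices suffice when a page can never hold more than 2^8 objects */
#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)

#if FREELIST_BYTE_INDEX
typedef unsigned char freelist_idx_t;
#else
typedef unsigned short freelist_idx_t;
#endif

#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)

int main(void)
{
	printf("index width: %zu byte(s), max objects per slab: %d\n",
	       sizeof(freelist_idx_t), SLAB_OBJ_MAX_NUM);
	return 0;
}

With a 4096-byte page and 16-byte minimum object size the byte-wide index is chosen and SLAB_OBJ_MAX_NUM works out to 255.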
+ * true if a page was allocated from pfmemalloc reserves for network-based + * swap   */ -struct slab { -	struct list_head list; -	unsigned long colouroff; -	void *s_mem;		/* including colour offset */ -	unsigned int inuse;	/* num of objs active in slab */ -	kmem_bufctl_t free; -	unsigned short nodeid; -}; - -/* - * struct slab_rcu - * - * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to - * arrange for kmem_freepages to be called via RCU.  This is useful if - * we need to approach a kernel structure obliquely, from its address - * obtained without the usual locking.  We can lock the structure to - * stabilize it and check it's still at the given address, only if we - * can be sure that the memory has not been meanwhile reused for some - * other kind of object (which our subsystem's lock might corrupt). - * - * rcu_read_lock before reading the address, then rcu_read_unlock after - * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. - */ -struct slab_rcu { -	struct rcu_head head; -	struct kmem_cache *cachep; -	void *addr; -}; +static bool pfmemalloc_active __read_mostly;  /*   * struct array_cache @@ -250,9 +196,30 @@ struct array_cache {  			 * Must have this definition in here for the proper  			 * alignment of array_cache. Also simplifies accessing  			 * the entries. +			 * +			 * Entries should not be directly dereferenced as +			 * entries belonging to slabs marked pfmemalloc will +			 * have the lower bits set SLAB_OBJ_PFMEMALLOC  			 */  }; +#define SLAB_OBJ_PFMEMALLOC	1 +static inline bool is_obj_pfmemalloc(void *objp) +{ +	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; +} + +static inline void set_obj_pfmemalloc(void **objp) +{ +	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); +	return; +} + +static inline void clear_obj_pfmemalloc(void **objp) +{ +	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); +} +  /*   * bootstrap: The caches do not work without cpuarrays anymore, but the   * cpuarrays are allocated from the generic caches... @@ -264,68 +231,27 @@ struct arraycache_init {  };  /* - * The slab lists for all objects. - */ -struct kmem_list3 { -	struct list_head slabs_partial;	/* partial list first, better asm code */ -	struct list_head slabs_full; -	struct list_head slabs_free; -	unsigned long free_objects; -	unsigned int free_limit; -	unsigned int colour_next;	/* Per-node cache coloring */ -	spinlock_t list_lock; -	struct array_cache *shared;	/* shared per node */ -	struct array_cache **alien;	/* on other nodes */ -	unsigned long next_reap;	/* updated without locking */ -	int free_touched;		/* updated without locking */ -}; - -/*   * Need this for bootstrapping a per node allocator.   */  #define NUM_INIT_LISTS (3 * MAX_NUMNODES) -struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];  #define	CACHE_CACHE 0  #define	SIZE_AC MAX_NUMNODES -#define	SIZE_L3 (2 * MAX_NUMNODES) +#define	SIZE_NODE (2 * MAX_NUMNODES)  static int drain_freelist(struct kmem_cache *cache, -			struct kmem_list3 *l3, int tofree); +			struct kmem_cache_node *n, int tofree);  static void free_block(struct kmem_cache *cachep, void **objpp, int len,  			int node);  static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);  static void cache_reap(struct work_struct *unused); -/* - * This function must be completely optimized away if a constant is passed to - * it.  
Mostly the same as what is in linux/slab.h except it returns an index. - */ -static __always_inline int index_of(const size_t size) -{ -	extern void __bad_size(void); - -	if (__builtin_constant_p(size)) { -		int i = 0; - -#define CACHE(x) \ -	if (size <=x) \ -		return i; \ -	else \ -		i++; -#include <linux/kmalloc_sizes.h> -#undef CACHE -		__bad_size(); -	} else -		__bad_size(); -	return 0; -} -  static int slab_early_init = 1; -#define INDEX_AC index_of(sizeof(struct arraycache_init)) -#define INDEX_L3 index_of(sizeof(struct kmem_list3)) +#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) +#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) -static void kmem_list3_init(struct kmem_list3 *parent) +static void kmem_cache_node_init(struct kmem_cache_node *parent)  {  	INIT_LIST_HEAD(&parent->slabs_full);  	INIT_LIST_HEAD(&parent->slabs_partial); @@ -341,7 +267,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)  #define MAKE_LIST(cachep, listp, slab, nodeid)				\  	do {								\  		INIT_LIST_HEAD(listp);					\ -		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\ +		list_splice(&(cachep->node[nodeid]->slab), listp);	\  	} while (0)  #define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\ @@ -362,8 +288,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)   * OTOH the cpuarrays can contain lots of objects,   * which could lock up otherwise freeable slabs.   */ -#define REAPTIMEOUT_CPUC	(2*HZ) -#define REAPTIMEOUT_LIST3	(4*HZ) +#define REAPTIMEOUT_AC		(2*HZ) +#define REAPTIMEOUT_NODE	(4*HZ)  #if STATS  #define	STATS_INC_ACTIVE(x)	((x)->num_active++) @@ -418,8 +344,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)   * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:   * 		redzone word.   * cachep->obj_offset: The real object. 
- * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->size - 1* BYTES_PER_WORD: last caller address   *					[BYTES_PER_WORD long]   */  static int obj_offset(struct kmem_cache *cachep) @@ -427,11 +353,6 @@ static int obj_offset(struct kmem_cache *cachep)  	return cachep->obj_offset;  } -static int obj_size(struct kmem_cache *cachep) -{ -	return cachep->obj_size; -} -  static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)  {  	BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); @@ -443,163 +364,109 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)  {  	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));  	if (cachep->flags & SLAB_STORE_USER) -		return (unsigned long long *)(objp + cachep->buffer_size - +		return (unsigned long long *)(objp + cachep->size -  					      sizeof(unsigned long long) -  					      REDZONE_ALIGN); -	return (unsigned long long *) (objp + cachep->buffer_size - +	return (unsigned long long *) (objp + cachep->size -  				       sizeof(unsigned long long));  }  static void **dbg_userword(struct kmem_cache *cachep, void *objp)  {  	BUG_ON(!(cachep->flags & SLAB_STORE_USER)); -	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); +	return (void **)(objp + cachep->size - BYTES_PER_WORD);  }  #else  #define obj_offset(x)			0 -#define obj_size(cachep)		(cachep->buffer_size)  #define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})  #define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})  #define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})  #endif -#ifdef CONFIG_TRACING -size_t slab_buffer_size(struct kmem_cache *cachep) -{ -	return cachep->buffer_size; -} -EXPORT_SYMBOL(slab_buffer_size); -#endif +#define OBJECT_FREE (0) +#define OBJECT_ACTIVE (1) -/* - * Do not go above this order unless 0 objects fit into the slab. - */ -#define	BREAK_GFP_ORDER_HI	1 -#define	BREAK_GFP_ORDER_LO	0 -static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; +#ifdef CONFIG_DEBUG_SLAB_LEAK -/* - * Functions for storing/retrieving the cachep and or slab from the page - * allocator.  These are used to find the slab an obj belongs to.  With kfree(), - * these are used to find the cache which an obj belongs to. 
- */ -static inline void page_set_cache(struct page *page, struct kmem_cache *cache) +static void set_obj_status(struct page *page, int idx, int val)  { -	page->lru.next = (struct list_head *)cache; -} +	int freelist_size; +	char *status; +	struct kmem_cache *cachep = page->slab_cache; -static inline struct kmem_cache *page_get_cache(struct page *page) -{ -	page = compound_head(page); -	BUG_ON(!PageSlab(page)); -	return (struct kmem_cache *)page->lru.next; +	freelist_size = cachep->num * sizeof(freelist_idx_t); +	status = (char *)page->freelist + freelist_size; +	status[idx] = val;  } -static inline void page_set_slab(struct page *page, struct slab *slab) +static inline unsigned int get_obj_status(struct page *page, int idx)  { -	page->lru.prev = (struct list_head *)slab; -} +	int freelist_size; +	char *status; +	struct kmem_cache *cachep = page->slab_cache; -static inline struct slab *page_get_slab(struct page *page) -{ -	BUG_ON(!PageSlab(page)); -	return (struct slab *)page->lru.prev; -} +	freelist_size = cachep->num * sizeof(freelist_idx_t); +	status = (char *)page->freelist + freelist_size; -static inline struct kmem_cache *virt_to_cache(const void *obj) -{ -	struct page *page = virt_to_head_page(obj); -	return page_get_cache(page); +	return status[idx];  } -static inline struct slab *virt_to_slab(const void *obj) +#else +static inline void set_obj_status(struct page *page, int idx, int val) {} + +#endif + +/* + * Do not go above this order unless 0 objects fit into the slab or + * overridden on the command line. + */ +#define	SLAB_MAX_ORDER_HI	1 +#define	SLAB_MAX_ORDER_LO	0 +static int slab_max_order = SLAB_MAX_ORDER_LO; +static bool slab_max_order_set __initdata; + +static inline struct kmem_cache *virt_to_cache(const void *obj)  {  	struct page *page = virt_to_head_page(obj); -	return page_get_slab(page); +	return page->slab_cache;  } -static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, +static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,  				 unsigned int idx)  { -	return slab->s_mem + cache->buffer_size * idx; +	return page->s_mem + cache->size * idx;  }  /* - * We want to avoid an expensive divide : (offset / cache->buffer_size) - *   Using the fact that buffer_size is a constant for a particular cache, - *   we can replace (offset / cache->buffer_size) by + * We want to avoid an expensive divide : (offset / cache->size) + *   Using the fact that size is a constant for a particular cache, + *   we can replace (offset / cache->size) by   *   reciprocal_divide(offset, cache->reciprocal_buffer_size)   */  static inline unsigned int obj_to_index(const struct kmem_cache *cache, -					const struct slab *slab, void *obj) +					const struct page *page, void *obj)  { -	u32 offset = (obj - slab->s_mem); +	u32 offset = (obj - page->s_mem);  	return reciprocal_divide(offset, cache->reciprocal_buffer_size);  } -/* - * These are the default caches for kmalloc. Custom caches can have other sizes. - */ -struct cache_sizes malloc_sizes[] = { -#define CACHE(x) { .cs_size = (x) }, -#include <linux/kmalloc_sizes.h> -	CACHE(ULONG_MAX) -#undef CACHE -}; -EXPORT_SYMBOL(malloc_sizes); - -/* Must match cache_sizes above. Out of line to keep cache footprint low. 
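The obj_to_index() comment above notes that (offset / cache->size) is replaced by reciprocal_divide(offset, cache->reciprocal_buffer_size), i.e. a division by a per-cache constant becomes a multiply and a shift. A self-contained sketch of that idea, modelled on the older reciprocal_value() formula; the real helpers live in lib/reciprocal_div.c and may differ in detail, and the sizes in main() are illustrative only:

#include <stdio.h>
#include <stdint.h>

/* precompute ceil(2^32 / size) once per cache */
static uint32_t reciprocal_value(uint32_t size)
{
	return (uint32_t)(((1ULL << 32) + size - 1) / size);
}

/* offset / size without a divide instruction: multiply, then shift */
static uint32_t reciprocal_divide(uint32_t offset, uint32_t reciprocal)
{
	return (uint32_t)(((uint64_t)offset * reciprocal) >> 32);
}

int main(void)
{
	uint32_t size = 120;
	uint32_t r = reciprocal_value(size);
	unsigned int off;

	/* check every offset that can occur inside one 4 KiB slab */
	for (off = 0; off < 4096; off++)
		if (reciprocal_divide(off, r) != off / size)
			printf("mismatch at %u\n", off);

	printf("obj_to_index(offset=360) = %u\n",
	       (unsigned int)reciprocal_divide(360, r));
	return 0;
}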
*/ -struct cache_names { -	char *name; -	char *name_dma; -}; - -static struct cache_names __initdata cache_names[] = { -#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, -#include <linux/kmalloc_sizes.h> -	{NULL,} -#undef CACHE -}; - -static struct arraycache_init initarray_cache __initdata = -    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };  static struct arraycache_init initarray_generic =      { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };  /* internal cache of cache description objs */ -static struct kmem_cache cache_cache = { +static struct kmem_cache kmem_cache_boot = {  	.batchcount = 1,  	.limit = BOOT_CPUCACHE_ENTRIES,  	.shared = 1, -	.buffer_size = sizeof(struct kmem_cache), +	.size = sizeof(struct kmem_cache),  	.name = "kmem_cache",  };  #define BAD_ALIEN_MAGIC 0x01020304ul -/* - * chicken and egg problem: delay the per-cpu array allocation - * until the general caches are up. - */ -static enum { -	NONE, -	PARTIAL_AC, -	PARTIAL_L3, -	EARLY, -	FULL -} g_cpucache_up; - -/* - * used by boot code to determine if it can use slab based allocator - */ -int slab_is_available(void) -{ -	return g_cpucache_up >= EARLY; -} -  #ifdef CONFIG_LOCKDEP  /* @@ -616,40 +483,92 @@ int slab_is_available(void)  static struct lock_class_key on_slab_l3_key;  static struct lock_class_key on_slab_alc_key; +static struct lock_class_key debugobj_l3_key; +static struct lock_class_key debugobj_alc_key; + +static void slab_set_lock_classes(struct kmem_cache *cachep, +		struct lock_class_key *l3_key, struct lock_class_key *alc_key, +		int q) +{ +	struct array_cache **alc; +	struct kmem_cache_node *n; +	int r; + +	n = cachep->node[q]; +	if (!n) +		return; + +	lockdep_set_class(&n->list_lock, l3_key); +	alc = n->alien; +	/* +	 * FIXME: This check for BAD_ALIEN_MAGIC +	 * should go away when common slab code is taught to +	 * work even without alien caches. +	 * Currently, non NUMA code returns BAD_ALIEN_MAGIC +	 * for alloc_alien_cache, +	 */ +	if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) +		return; +	for_each_node(r) { +		if (alc[r]) +			lockdep_set_class(&alc[r]->lock, alc_key); +	} +} + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ +	slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ +	int node; + +	for_each_online_node(node) +		slab_set_debugobj_lock_classes_node(cachep, node); +} +  static void init_node_lock_keys(int q)  { -	struct cache_sizes *s = malloc_sizes; +	int i; -	if (g_cpucache_up != FULL) +	if (slab_state < UP)  		return; -	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { -		struct array_cache **alc; -		struct kmem_list3 *l3; -		int r; +	for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { +		struct kmem_cache_node *n; +		struct kmem_cache *cache = kmalloc_caches[i]; -		l3 = s->cs_cachep->nodelists[q]; -		if (!l3 || OFF_SLAB(s->cs_cachep)) +		if (!cache)  			continue; -		lockdep_set_class(&l3->list_lock, &on_slab_l3_key); -		alc = l3->alien; -		/* -		 * FIXME: This check for BAD_ALIEN_MAGIC -		 * should go away when common slab code is taught to -		 * work even without alien caches. 
-		 * Currently, non NUMA code returns BAD_ALIEN_MAGIC -		 * for alloc_alien_cache, -		 */ -		if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + +		n = cache->node[q]; +		if (!n || OFF_SLAB(cache))  			continue; -		for_each_node(r) { -			if (alc[r]) -				lockdep_set_class(&alc[r]->lock, -					&on_slab_alc_key); -		} + +		slab_set_lock_classes(cache, &on_slab_l3_key, +				&on_slab_alc_key, q);  	}  } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ +	if (!cachep->node[q]) +		return; + +	slab_set_lock_classes(cachep, &on_slab_l3_key, +			&on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +	int node; + +	VM_BUG_ON(OFF_SLAB(cachep)); +	for_each_node(node) +		on_slab_lock_classes_node(cachep, node); +} +  static inline void init_lock_keys(void)  {  	int node; @@ -665,13 +584,23 @@ static void init_node_lock_keys(int q)  static inline void init_lock_keys(void)  {  } -#endif -/* - * Guard access to the cache-chain. - */ -static DEFINE_MUTEX(cache_chain_mutex); -static struct list_head cache_chain; +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ +} +#endif  static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); @@ -680,44 +609,50 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)  	return cachep->array[smp_processor_id()];  } -static inline struct kmem_cache *__find_general_cachep(size_t size, -							gfp_t gfpflags) +static size_t calculate_freelist_size(int nr_objs, size_t align)  { -	struct cache_sizes *csizep = malloc_sizes; +	size_t freelist_size; -#if DEBUG -	/* This happens if someone tries to call -	 * kmem_cache_create(), or __kmalloc(), before -	 * the generic caches are initialized. -	 */ -	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); -#endif -	if (!size) -		return ZERO_SIZE_PTR; +	freelist_size = nr_objs * sizeof(freelist_idx_t); +	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) +		freelist_size += nr_objs * sizeof(char); -	while (size > csizep->cs_size) -		csizep++; +	if (align) +		freelist_size = ALIGN(freelist_size, align); -	/* -	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX -	 * has cs_{dma,}cachep==NULL. Thus no special case -	 * for large kmalloc calls required. -	 */ -#ifdef CONFIG_ZONE_DMA -	if (unlikely(gfpflags & GFP_DMA)) -		return csizep->cs_dmacachep; -#endif -	return csizep->cs_cachep; +	return freelist_size;  } -static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) +static int calculate_nr_objs(size_t slab_size, size_t buffer_size, +				size_t idx_size, size_t align)  { -	return __find_general_cachep(size, gfpflags); -} +	int nr_objs; +	size_t remained_size; +	size_t freelist_size; +	int extra_space = 0; -static size_t slab_mgmt_size(size_t nr_objs, size_t align) -{ -	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); +	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) +		extra_space = sizeof(char); +	/* +	 * Ignore padding for the initial guess. The padding +	 * is at most @align-1 bytes, and @buffer_size is at +	 * least @align. In the worst case, this result will +	 * be one greater than the number of objects that fit +	 * into the memory allocation when taking the padding +	 * into account. 
+	 */ +	nr_objs = slab_size / (buffer_size + idx_size + extra_space); + +	/* +	 * This calculated number will be either the right +	 * amount, or one greater than what we want. +	 */ +	remained_size = slab_size - nr_objs * buffer_size; +	freelist_size = calculate_freelist_size(nr_objs, align); +	if (remained_size < freelist_size) +		nr_objs--; + +	return nr_objs;  }  /* @@ -736,8 +671,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,  	 * on it. For the latter case, the memory allocated for a  	 * slab is used for:  	 * -	 * - The struct slab -	 * - One kmem_bufctl_t for each object +	 * - One unsigned int for each object  	 * - Padding to respect alignment of @align  	 * - @buffer_size bytes for each object  	 * @@ -750,37 +684,16 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,  		mgmt_size = 0;  		nr_objs = slab_size / buffer_size; -		if (nr_objs > SLAB_LIMIT) -			nr_objs = SLAB_LIMIT;  	} else { -		/* -		 * Ignore padding for the initial guess. The padding -		 * is at most @align-1 bytes, and @buffer_size is at -		 * least @align. In the worst case, this result will -		 * be one greater than the number of objects that fit -		 * into the memory allocation when taking the padding -		 * into account. -		 */ -		nr_objs = (slab_size - sizeof(struct slab)) / -			  (buffer_size + sizeof(kmem_bufctl_t)); - -		/* -		 * This calculated number will be either the right -		 * amount, or one greater than what we want. -		 */ -		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size -		       > slab_size) -			nr_objs--; - -		if (nr_objs > SLAB_LIMIT) -			nr_objs = SLAB_LIMIT; - -		mgmt_size = slab_mgmt_size(nr_objs, align); +		nr_objs = calculate_nr_objs(slab_size, buffer_size, +					sizeof(freelist_idx_t), align); +		mgmt_size = calculate_freelist_size(nr_objs, align);  	}  	*num = nr_objs;  	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;  } +#if DEBUG  #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)  static void __slab_error(const char *function, struct kmem_cache *cachep, @@ -789,7 +702,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,  	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",  	       function, cachep->name, msg);  	dump_stack(); +	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);  } +#endif  /*   * By default on NUMA we use alien caches to stage the freeing of @@ -807,6 +722,17 @@ static int __init noaliencache_setup(char *s)  }  __setup("noaliencache", noaliencache_setup); +static int __init slab_max_order_setup(char *str) +{ +	get_option(&str, &slab_max_order); +	slab_max_order = slab_max_order < 0 ? 0 : +				min(slab_max_order, MAX_ORDER - 1); +	slab_max_order_set = true; + +	return 1; +} +__setup("slab_max_order=", slab_max_order_setup); +  #ifdef CONFIG_NUMA  /*   * Special reaping functions for NUMA systems called from cache_reap(). @@ -829,12 +755,12 @@ static void init_reap_node(int cpu)  static void next_reap_node(void)  { -	int node = __get_cpu_var(slab_reap_node); +	int node = __this_cpu_read(slab_reap_node);  	node = next_node(node, node_online_map);  	if (unlikely(node >= MAX_NUMNODES))  		node = first_node(node_online_map); -	__get_cpu_var(slab_reap_node) = node; +	__this_cpu_write(slab_reap_node, node);  }  #else @@ -849,7 +775,7 @@ static void next_reap_node(void)   * the CPUs getting into lockstep and contending for the global cache chain   * lock.   
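calculate_nr_objs()/calculate_freelist_size() above implement the guess-then-correct arithmetic described in the comment: guess the object count ignoring padding, then drop one object if the leftover space cannot hold the aligned freelist. A compact userspace rework with example numbers (4096-byte slab, 120-byte objects, 1-byte indices, 64-byte freelist alignment, all illustrative; the kernel version additionally handles CONFIG_DEBUG_SLAB_LEAK status bytes and a possibly-zero align):

#include <stdio.h>

/* round up to the next multiple of align (align is a power of two) */
static size_t align_up(size_t n, size_t align)
{
	return (n + align - 1) & ~(align - 1);
}

static size_t freelist_size(int nr_objs, size_t idx_size, size_t align)
{
	return align_up(nr_objs * idx_size, align);
}

static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
			     size_t idx_size, size_t align)
{
	/* initial guess ignores the alignment padding of the freelist */
	int nr_objs = slab_size / (buffer_size + idx_size);

	/* the guess is at most one too high; correct it if needed */
	if (slab_size - nr_objs * buffer_size <
			freelist_size(nr_objs, idx_size, align))
		nr_objs--;

	return nr_objs;
}

int main(void)
{
	printf("%d objects fit\n", calculate_nr_objs(4096, 120, 1, 64));
	return 0;
}

For these numbers the guess of 33 already leaves 136 bytes, enough for the 64-byte aligned freelist, so no correction is applied.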
*/ -static void __cpuinit start_cpu_timer(int cpu) +static void start_cpu_timer(int cpu)  {  	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); @@ -860,7 +786,7 @@ static void __cpuinit start_cpu_timer(int cpu)  	 */  	if (keventd_up() && reap_work->work.func == NULL) {  		init_reap_node(cpu); -		INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); +		INIT_DEFERRABLE_WORK(reap_work, cache_reap);  		schedule_delayed_work_on(cpu, reap_work,  					__round_jiffies_relative(HZ, cpu));  	} @@ -875,7 +801,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,  	nc = kmalloc_node(memsize, gfp, node);  	/*  	 * The array_cache structures contain pointers to free object. -	 * However, when such objects are allocated or transfered to another +	 * However, when such objects are allocated or transferred to another  	 * cache the pointers are not cleared and they could be counted as  	 * valid references during a kmemleak scan. Therefore, kmemleak must  	 * not scan such objects. @@ -891,6 +817,122 @@ static struct array_cache *alloc_arraycache(int node, int entries,  	return nc;  } +static inline bool is_slab_pfmemalloc(struct page *page) +{ +	return PageSlabPfmemalloc(page); +} + +/* Clears pfmemalloc_active if no slabs have pfmalloc set */ +static void recheck_pfmemalloc_active(struct kmem_cache *cachep, +						struct array_cache *ac) +{ +	struct kmem_cache_node *n = cachep->node[numa_mem_id()]; +	struct page *page; +	unsigned long flags; + +	if (!pfmemalloc_active) +		return; + +	spin_lock_irqsave(&n->list_lock, flags); +	list_for_each_entry(page, &n->slabs_full, lru) +		if (is_slab_pfmemalloc(page)) +			goto out; + +	list_for_each_entry(page, &n->slabs_partial, lru) +		if (is_slab_pfmemalloc(page)) +			goto out; + +	list_for_each_entry(page, &n->slabs_free, lru) +		if (is_slab_pfmemalloc(page)) +			goto out; + +	pfmemalloc_active = false; +out: +	spin_unlock_irqrestore(&n->list_lock, flags); +} + +static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, +						gfp_t flags, bool force_refill) +{ +	int i; +	void *objp = ac->entry[--ac->avail]; + +	/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ +	if (unlikely(is_obj_pfmemalloc(objp))) { +		struct kmem_cache_node *n; + +		if (gfp_pfmemalloc_allowed(flags)) { +			clear_obj_pfmemalloc(&objp); +			return objp; +		} + +		/* The caller cannot use PFMEMALLOC objects, find another one */ +		for (i = 0; i < ac->avail; i++) { +			/* If a !PFMEMALLOC object is found, swap them */ +			if (!is_obj_pfmemalloc(ac->entry[i])) { +				objp = ac->entry[i]; +				ac->entry[i] = ac->entry[ac->avail]; +				ac->entry[ac->avail] = objp; +				return objp; +			} +		} + +		/* +		 * If there are empty slabs on the slabs_free list and we are +		 * being forced to refill the cache, mark this one !pfmemalloc. 
+		 */ +		n = cachep->node[numa_mem_id()]; +		if (!list_empty(&n->slabs_free) && force_refill) { +			struct page *page = virt_to_head_page(objp); +			ClearPageSlabPfmemalloc(page); +			clear_obj_pfmemalloc(&objp); +			recheck_pfmemalloc_active(cachep, ac); +			return objp; +		} + +		/* No !PFMEMALLOC objects available */ +		ac->avail++; +		objp = NULL; +	} + +	return objp; +} + +static inline void *ac_get_obj(struct kmem_cache *cachep, +			struct array_cache *ac, gfp_t flags, bool force_refill) +{ +	void *objp; + +	if (unlikely(sk_memalloc_socks())) +		objp = __ac_get_obj(cachep, ac, flags, force_refill); +	else +		objp = ac->entry[--ac->avail]; + +	return objp; +} + +static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, +								void *objp) +{ +	if (unlikely(pfmemalloc_active)) { +		/* Some pfmemalloc slabs exist, check if this is one */ +		struct page *page = virt_to_head_page(objp); +		if (PageSlabPfmemalloc(page)) +			set_obj_pfmemalloc(&objp); +	} + +	return objp; +} + +static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, +								void *objp) +{ +	if (unlikely(sk_memalloc_socks())) +		objp = __ac_put_obj(cachep, ac, objp); + +	ac->entry[ac->avail++] = objp; +} +  /*   * Transfer objects in one arraycache to another.   * Locking must be handled by the caller. @@ -917,7 +959,7 @@ static int transfer_objects(struct array_cache *to,  #ifndef CONFIG_NUMA  #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, n) do { } while (0)  static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)  { @@ -989,33 +1031,33 @@ static void free_alien_cache(struct array_cache **ac_ptr)  static void __drain_alien_cache(struct kmem_cache *cachep,  				struct array_cache *ac, int node)  { -	struct kmem_list3 *rl3 = cachep->nodelists[node]; +	struct kmem_cache_node *n = cachep->node[node];  	if (ac->avail) { -		spin_lock(&rl3->list_lock); +		spin_lock(&n->list_lock);  		/*  		 * Stuff objects into the remote nodes shared array first.  		 * That way we could avoid the overhead of putting the objects  		 * into the free lists and getting them back later.  		 */ -		if (rl3->shared) -			transfer_objects(rl3->shared, ac, ac->limit); +		if (n->shared) +			transfer_objects(n->shared, ac, ac->limit);  		free_block(cachep, ac->entry, ac->avail, node);  		ac->avail = 0; -		spin_unlock(&rl3->list_lock); +		spin_unlock(&n->list_lock);  	}  }  /*   * Called from cache_reap() to regularly drain alien caches round robin.   
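ac_get_obj()/ac_put_obj() above rely on the fact that slab objects are always at least word aligned, so bit 0 of a cached object pointer is free to carry the pfmemalloc mark. A standalone illustration of that low-bit tagging, reusing the helper names from the patch but as plain userspace code with malloc standing in for a slab object:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define SLAB_OBJ_PFMEMALLOC	1UL	/* low bit marks objects from pfmemalloc slabs */

static inline bool is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

static inline void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
}

static inline void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

int main(void)
{
	void *obj = malloc(32);		/* malloc returns suitably aligned memory */
	void *entry = obj;

	set_obj_pfmemalloc(&entry);	/* stash the flag in the cached pointer */
	printf("tagged: %d\n", is_obj_pfmemalloc(entry));

	clear_obj_pfmemalloc(&entry);	/* strip the flag before dereferencing */
	printf("same pointer back: %d\n", entry == obj);

	free(obj);
	return 0;
}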
*/ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)  { -	int node = __get_cpu_var(slab_reap_node); +	int node = __this_cpu_read(slab_reap_node); -	if (l3->alien) { -		struct array_cache *ac = l3->alien[node]; +	if (n->alien) { +		struct array_cache *ac = n->alien[node];  		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {  			__drain_alien_cache(cachep, ac, node); @@ -1043,9 +1085,8 @@ static void drain_alien_cache(struct kmem_cache *cachep,  static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)  { -	struct slab *slabp = virt_to_slab(objp); -	int nodeid = slabp->nodeid; -	struct kmem_list3 *l3; +	int nodeid = page_to_nid(virt_to_page(objp)); +	struct kmem_cache_node *n;  	struct array_cache *alien = NULL;  	int node; @@ -1055,83 +1096,89 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)  	 * Make sure we are not freeing a object from another node to the array  	 * cache on this cpu.  	 */ -	if (likely(slabp->nodeid == node)) +	if (likely(nodeid == node))  		return 0; -	l3 = cachep->nodelists[node]; +	n = cachep->node[node];  	STATS_INC_NODEFREES(cachep); -	if (l3->alien && l3->alien[nodeid]) { -		alien = l3->alien[nodeid]; +	if (n->alien && n->alien[nodeid]) { +		alien = n->alien[nodeid];  		spin_lock(&alien->lock);  		if (unlikely(alien->avail == alien->limit)) {  			STATS_INC_ACOVERFLOW(cachep);  			__drain_alien_cache(cachep, alien, nodeid);  		} -		alien->entry[alien->avail++] = objp; +		ac_put_obj(cachep, alien, objp);  		spin_unlock(&alien->lock);  	} else { -		spin_lock(&(cachep->nodelists[nodeid])->list_lock); +		spin_lock(&(cachep->node[nodeid])->list_lock);  		free_block(cachep, &objp, 1, nodeid); -		spin_unlock(&(cachep->nodelists[nodeid])->list_lock); +		spin_unlock(&(cachep->node[nodeid])->list_lock);  	}  	return 1;  }  #endif  /* - * Allocates and initializes nodelists for a node on each slab cache, used for - * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3 + * Allocates and initializes node for a node on each slab cache, used for + * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node   * will be allocated off-node since memory is not yet online for the new node. - * When hotplugging memory or a cpu, existing nodelists are not replaced if + * When hotplugging memory or a cpu, existing node are not replaced if   * already in use.   * - * Must hold cache_chain_mutex. + * Must hold slab_mutex.   */ -static int init_cache_nodelists_node(int node) +static int init_cache_node_node(int node)  {  	struct kmem_cache *cachep; -	struct kmem_list3 *l3; -	const int memsize = sizeof(struct kmem_list3); +	struct kmem_cache_node *n; +	const int memsize = sizeof(struct kmem_cache_node); -	list_for_each_entry(cachep, &cache_chain, next) { +	list_for_each_entry(cachep, &slab_caches, list) {  		/* -		 * Set up the size64 kmemlist for cpu before we can +		 * Set up the kmem_cache_node for cpu before we can  		 * begin anything. 
Make sure some other cpu on this  		 * node has not already allocated this  		 */ -		if (!cachep->nodelists[node]) { -			l3 = kmalloc_node(memsize, GFP_KERNEL, node); -			if (!l3) +		if (!cachep->node[node]) { +			n = kmalloc_node(memsize, GFP_KERNEL, node); +			if (!n)  				return -ENOMEM; -			kmem_list3_init(l3); -			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + -			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3; +			kmem_cache_node_init(n); +			n->next_reap = jiffies + REAPTIMEOUT_NODE + +			    ((unsigned long)cachep) % REAPTIMEOUT_NODE;  			/* -			 * The l3s don't come and go as CPUs come and -			 * go.  cache_chain_mutex is sufficient +			 * The kmem_cache_nodes don't come and go as CPUs +			 * come and go.  slab_mutex is sufficient  			 * protection here.  			 */ -			cachep->nodelists[node] = l3; +			cachep->node[node] = n;  		} -		spin_lock_irq(&cachep->nodelists[node]->list_lock); -		cachep->nodelists[node]->free_limit = +		spin_lock_irq(&cachep->node[node]->list_lock); +		cachep->node[node]->free_limit =  			(1 + nr_cpus_node(node)) *  			cachep->batchcount + cachep->num; -		spin_unlock_irq(&cachep->nodelists[node]->list_lock); +		spin_unlock_irq(&cachep->node[node]->list_lock);  	}  	return 0;  } -static void __cpuinit cpuup_canceled(long cpu) +static inline int slabs_tofree(struct kmem_cache *cachep, +						struct kmem_cache_node *n) +{ +	return (n->free_objects + cachep->num - 1) / cachep->num; +} + +static void cpuup_canceled(long cpu)  {  	struct kmem_cache *cachep; -	struct kmem_list3 *l3 = NULL; +	struct kmem_cache_node *n = NULL;  	int node = cpu_to_mem(cpu);  	const struct cpumask *mask = cpumask_of_node(node); -	list_for_each_entry(cachep, &cache_chain, next) { +	list_for_each_entry(cachep, &slab_caches, list) {  		struct array_cache *nc;  		struct array_cache *shared;  		struct array_cache **alien; @@ -1139,34 +1186,34 @@ static void __cpuinit cpuup_canceled(long cpu)  		/* cpu is dead; no one can alloc from it. */  		nc = cachep->array[cpu];  		cachep->array[cpu] = NULL; -		l3 = cachep->nodelists[node]; +		n = cachep->node[node]; -		if (!l3) +		if (!n)  			goto free_array_cache; -		spin_lock_irq(&l3->list_lock); +		spin_lock_irq(&n->list_lock); -		/* Free limit for this kmem_list3 */ -		l3->free_limit -= cachep->batchcount; +		/* Free limit for this kmem_cache_node */ +		n->free_limit -= cachep->batchcount;  		if (nc)  			free_block(cachep, nc->entry, nc->avail, node);  		if (!cpumask_empty(mask)) { -			spin_unlock_irq(&l3->list_lock); +			spin_unlock_irq(&n->list_lock);  			goto free_array_cache;  		} -		shared = l3->shared; +		shared = n->shared;  		if (shared) {  			free_block(cachep, shared->entry,  				   shared->avail, node); -			l3->shared = NULL; +			n->shared = NULL;  		} -		alien = l3->alien; -		l3->alien = NULL; +		alien = n->alien; +		n->alien = NULL; -		spin_unlock_irq(&l3->list_lock); +		spin_unlock_irq(&n->list_lock);  		kfree(shared);  		if (alien) { @@ -1181,18 +1228,18 @@ free_array_cache:  	 * the respective cache's slabs,  now we can go ahead and  	 * shrink each nodelist to its limit.  	 
*/ -	list_for_each_entry(cachep, &cache_chain, next) { -		l3 = cachep->nodelists[node]; -		if (!l3) +	list_for_each_entry(cachep, &slab_caches, list) { +		n = cachep->node[node]; +		if (!n)  			continue; -		drain_freelist(cachep, l3, l3->free_objects); +		drain_freelist(cachep, n, slabs_tofree(cachep, n));  	}  } -static int __cpuinit cpuup_prepare(long cpu) +static int cpuup_prepare(long cpu)  {  	struct kmem_cache *cachep; -	struct kmem_list3 *l3 = NULL; +	struct kmem_cache_node *n = NULL;  	int node = cpu_to_mem(cpu);  	int err; @@ -1200,9 +1247,9 @@ static int __cpuinit cpuup_prepare(long cpu)  	 * We need to do this right in the beginning since  	 * alloc_arraycache's are going to use this list.  	 * kmalloc_node allows us to add the slab to the right -	 * kmem_list3 and not this cpu's kmem_list3 +	 * kmem_cache_node and not this cpu's kmem_cache_node  	 */ -	err = init_cache_nodelists_node(node); +	err = init_cache_node_node(node);  	if (err < 0)  		goto bad; @@ -1210,7 +1257,7 @@ static int __cpuinit cpuup_prepare(long cpu)  	 * Now we can go ahead with allocating the shared arrays and  	 * array caches  	 */ -	list_for_each_entry(cachep, &cache_chain, next) { +	list_for_each_entry(cachep, &slab_caches, list) {  		struct array_cache *nc;  		struct array_cache *shared = NULL;  		struct array_cache **alien = NULL; @@ -1237,27 +1284,32 @@ static int __cpuinit cpuup_prepare(long cpu)  			}  		}  		cachep->array[cpu] = nc; -		l3 = cachep->nodelists[node]; -		BUG_ON(!l3); +		n = cachep->node[node]; +		BUG_ON(!n); -		spin_lock_irq(&l3->list_lock); -		if (!l3->shared) { +		spin_lock_irq(&n->list_lock); +		if (!n->shared) {  			/*  			 * We are serialised from CPU_DEAD or  			 * CPU_UP_CANCELLED by the cpucontrol lock  			 */ -			l3->shared = shared; +			n->shared = shared;  			shared = NULL;  		}  #ifdef CONFIG_NUMA -		if (!l3->alien) { -			l3->alien = alien; +		if (!n->alien) { +			n->alien = alien;  			alien = NULL;  		}  #endif -		spin_unlock_irq(&l3->list_lock); +		spin_unlock_irq(&n->list_lock);  		kfree(shared);  		free_alien_cache(alien); +		if (cachep->flags & SLAB_DEBUG_OBJECTS) +			slab_set_debugobj_lock_classes_node(cachep, node); +		else if (!OFF_SLAB(cachep) && +			 !(cachep->flags & SLAB_DESTROY_BY_RCU)) +			on_slab_lock_classes_node(cachep, node);  	}  	init_node_lock_keys(node); @@ -1267,7 +1319,7 @@ bad:  	return -ENOMEM;  } -static int __cpuinit cpuup_callback(struct notifier_block *nfb, +static int cpuup_callback(struct notifier_block *nfb,  				    unsigned long action, void *hcpu)  {  	long cpu = (long)hcpu; @@ -1276,9 +1328,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: -		mutex_lock(&cache_chain_mutex); +		mutex_lock(&slab_mutex);  		err = cpuup_prepare(cpu); -		mutex_unlock(&cache_chain_mutex); +		mutex_unlock(&slab_mutex);  		break;  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: @@ -1288,12 +1340,12 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,    	case CPU_DOWN_PREPARE:    	case CPU_DOWN_PREPARE_FROZEN:  		/* -		 * Shutdown cache reaper. Note that the cache_chain_mutex is +		 * Shutdown cache reaper. Note that the slab_mutex is  		 * held so that if cache_reap() is invoked it cannot do  		 * anything expensive but will only modify reap_work  		 * and reschedule the timer.  		
*/ -		cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); +		cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));  		/* Now the cache_reaper is guaranteed to be not running. */  		per_cpu(slab_reap_work, cpu).work.func = NULL;    		break; @@ -1305,9 +1357,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,  	case CPU_DEAD_FROZEN:  		/*  		 * Even if all the cpus of a node are down, we don't free the -		 * kmem_list3 of any cache. This to avoid a race between +		 * kmem_cache_node of any cache. This to avoid a race between  		 * cpu_down, and a kmalloc allocation from another cpu for -		 * memory from the node of the cpu going down.  The list3 +		 * memory from the node of the cpu going down.  The node  		 * structure is usually allocated from kmem_cache_create() and  		 * gets destroyed at kmem_cache_destroy().  		 */ @@ -1315,15 +1367,15 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,  #endif  	case CPU_UP_CANCELED:  	case CPU_UP_CANCELED_FROZEN: -		mutex_lock(&cache_chain_mutex); +		mutex_lock(&slab_mutex);  		cpuup_canceled(cpu); -		mutex_unlock(&cache_chain_mutex); +		mutex_unlock(&slab_mutex);  		break;  	}  	return notifier_from_errno(err);  } -static struct notifier_block __cpuinitdata cpucache_notifier = { +static struct notifier_block cpucache_notifier = {  	&cpuup_callback, NULL, 0  }; @@ -1333,24 +1385,24 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {   * Returns -EBUSY if all objects cannot be drained so that the node is not   * removed.   * - * Must hold cache_chain_mutex. + * Must hold slab_mutex.   */ -static int __meminit drain_cache_nodelists_node(int node) +static int __meminit drain_cache_node_node(int node)  {  	struct kmem_cache *cachep;  	int ret = 0; -	list_for_each_entry(cachep, &cache_chain, next) { -		struct kmem_list3 *l3; +	list_for_each_entry(cachep, &slab_caches, list) { +		struct kmem_cache_node *n; -		l3 = cachep->nodelists[node]; -		if (!l3) +		n = cachep->node[node]; +		if (!n)  			continue; -		drain_freelist(cachep, l3, l3->free_objects); +		drain_freelist(cachep, n, slabs_tofree(cachep, n)); -		if (!list_empty(&l3->slabs_full) || -		    !list_empty(&l3->slabs_partial)) { +		if (!list_empty(&n->slabs_full) || +		    !list_empty(&n->slabs_partial)) {  			ret = -EBUSY;  			break;  		} @@ -1371,14 +1423,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self,  	switch (action) {  	case MEM_GOING_ONLINE: -		mutex_lock(&cache_chain_mutex); -		ret = init_cache_nodelists_node(nid); -		mutex_unlock(&cache_chain_mutex); +		mutex_lock(&slab_mutex); +		ret = init_cache_node_node(nid); +		mutex_unlock(&slab_mutex);  		break;  	case MEM_GOING_OFFLINE: -		mutex_lock(&cache_chain_mutex); -		ret = drain_cache_nodelists_node(nid); -		mutex_unlock(&cache_chain_mutex); +		mutex_lock(&slab_mutex); +		ret = drain_cache_node_node(nid); +		mutex_unlock(&slab_mutex);  		break;  	case MEM_ONLINE:  	case MEM_OFFLINE: @@ -1387,253 +1439,201 @@ static int __meminit slab_memory_callback(struct notifier_block *self,  		break;  	}  out: -	return ret ? 
notifier_from_errno(ret) : NOTIFY_OK; +	return notifier_from_errno(ret);  }  #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */  /* - * swap the static kmem_list3 with kmalloced memory + * swap the static kmem_cache_node with kmalloced memory   */ -static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, +static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,  				int nodeid)  { -	struct kmem_list3 *ptr; +	struct kmem_cache_node *ptr; -	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); +	ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);  	BUG_ON(!ptr); -	memcpy(ptr, list, sizeof(struct kmem_list3)); +	memcpy(ptr, list, sizeof(struct kmem_cache_node));  	/*  	 * Do not assume that spinlocks can be initialized via memcpy:  	 */  	spin_lock_init(&ptr->list_lock);  	MAKE_ALL_LISTS(cachep, ptr, nodeid); -	cachep->nodelists[nodeid] = ptr; +	cachep->node[nodeid] = ptr;  }  /* - * For setting up all the kmem_list3s for cache whose buffer_size is same as - * size of kmem_list3. + * For setting up all the kmem_cache_node for cache whose buffer_size is same as + * size of kmem_cache_node.   */ -static void __init set_up_list3s(struct kmem_cache *cachep, int index) +static void __init set_up_node(struct kmem_cache *cachep, int index)  {  	int node;  	for_each_online_node(node) { -		cachep->nodelists[node] = &initkmem_list3[index + node]; -		cachep->nodelists[node]->next_reap = jiffies + -		    REAPTIMEOUT_LIST3 + -		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3; +		cachep->node[node] = &init_kmem_cache_node[index + node]; +		cachep->node[node]->next_reap = jiffies + +		    REAPTIMEOUT_NODE + +		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;  	}  }  /* + * The memory after the last cpu cache pointer is used for the + * the node pointer. + */ +static void setup_node_pointer(struct kmem_cache *cachep) +{ +	cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; +} + +/*   * Initialisation.  Called after the page allocator have been initialised and   * before smp_init().   */  void __init kmem_cache_init(void)  { -	size_t left_over; -	struct cache_sizes *sizes; -	struct cache_names *names;  	int i; -	int order; -	int node; + +	BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < +					sizeof(struct rcu_head)); +	kmem_cache = &kmem_cache_boot; +	setup_node_pointer(kmem_cache);  	if (num_possible_nodes() == 1)  		use_alien_caches = 0; -	for (i = 0; i < NUM_INIT_LISTS; i++) { -		kmem_list3_init(&initkmem_list3[i]); -		if (i < MAX_NUMNODES) -			cache_cache.nodelists[i] = NULL; -	} -	set_up_list3s(&cache_cache, CACHE_CACHE); +	for (i = 0; i < NUM_INIT_LISTS; i++) +		kmem_cache_node_init(&init_kmem_cache_node[i]); + +	set_up_node(kmem_cache, CACHE_CACHE);  	/*  	 * Fragmentation resistance on low memory - only use bigger -	 * page orders on machines with more than 32MB of memory. +	 * page orders on machines with more than 32MB of memory if +	 * not overridden on the command line.  	 */ -	if (totalram_pages > (32 << 20) >> PAGE_SHIFT) -		slab_break_gfp_order = BREAK_GFP_ORDER_HI; +	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) +		slab_max_order = SLAB_MAX_ORDER_HI;  	/* Bootstrap is tricky, because several objects are allocated  	 * from caches that do not exist yet: -	 * 1) initialize the cache_cache cache: it contains the struct -	 *    kmem_cache structures of all caches, except cache_cache itself: -	 *    cache_cache is statically allocated. 
+	 * 1) initialize the kmem_cache cache: it contains the struct +	 *    kmem_cache structures of all caches, except kmem_cache itself: +	 *    kmem_cache is statically allocated.  	 *    Initially an __init data area is used for the head array and the -	 *    kmem_list3 structures, it's replaced with a kmalloc allocated +	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated  	 *    array at the end of the bootstrap.  	 * 2) Create the first kmalloc cache.  	 *    The struct kmem_cache for the new cache is allocated normally.  	 *    An __init data area is used for the head array.  	 * 3) Create the remaining kmalloc caches, with minimally sized  	 *    head arrays. -	 * 4) Replace the __init data head arrays for cache_cache and the first +	 * 4) Replace the __init data head arrays for kmem_cache and the first  	 *    kmalloc cache with kmalloc allocated arrays. -	 * 5) Replace the __init data for kmem_list3 for cache_cache and +	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and  	 *    the other cache's with kmalloc allocated memory.  	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.  	 */ -	node = numa_mem_id(); - -	/* 1) create the cache_cache */ -	INIT_LIST_HEAD(&cache_chain); -	list_add(&cache_cache.next, &cache_chain); -	cache_cache.colour_off = cache_line_size(); -	cache_cache.array[smp_processor_id()] = &initarray_cache.cache; -	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; +	/* 1) create the kmem_cache */  	/* -	 * struct kmem_cache size depends on nr_node_ids, which -	 * can be less than MAX_NUMNODES. +	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids  	 */ -	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + -				 nr_node_ids * sizeof(struct kmem_list3 *); -#if DEBUG -	cache_cache.obj_size = cache_cache.buffer_size; -#endif -	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, -					cache_line_size()); -	cache_cache.reciprocal_buffer_size = -		reciprocal_value(cache_cache.buffer_size); - -	for (order = 0; order < MAX_ORDER; order++) { -		cache_estimate(order, cache_cache.buffer_size, -			cache_line_size(), 0, &left_over, &cache_cache.num); -		if (cache_cache.num) -			break; -	} -	BUG_ON(!cache_cache.num); -	cache_cache.gfporder = order; -	cache_cache.colour = left_over / cache_cache.colour_off; -	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + -				      sizeof(struct slab), cache_line_size()); +	create_boot_cache(kmem_cache, "kmem_cache", +		offsetof(struct kmem_cache, array[nr_cpu_ids]) + +				  nr_node_ids * sizeof(struct kmem_cache_node *), +				  SLAB_HWCACHE_ALIGN); +	list_add(&kmem_cache->list, &slab_caches);  	/* 2+3) create the kmalloc caches */ -	sizes = malloc_sizes; -	names = cache_names;  	/*  	 * Initialize the caches that provide memory for the array cache and the -	 * kmem_list3 structures first.  Without this, further allocations will +	 * kmem_cache_node structures first.  Without this, further allocations will  	 * bug.  	 
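The create_boot_cache() call above sizes struct kmem_cache as offsetof(struct kmem_cache, array[nr_cpu_ids]) plus nr_node_ids node pointers, and setup_node_pointer() then aims cachep->node just past the per-cpu array, so one allocation carries both tables. A toy version of that single-allocation layout, using made-up demo_cache/array_cache/cache_node types rather than the kernel structures:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* trimmed-down stand-ins; not the kernel structures */
struct array_cache { int avail; };
struct cache_node { int free_objects; };

struct demo_cache {
	const char *name;
	struct array_cache *array[];	/* nr_cpus slots, then nr_nodes node pointers */
};

int main(void)
{
	int nr_cpus = 4, nr_nodes = 2;
	size_t size = offsetof(struct demo_cache, array) +
		      nr_cpus * sizeof(struct array_cache *) +
		      nr_nodes * sizeof(struct cache_node *);
	struct demo_cache *cache = calloc(1, size);
	struct cache_node **node;

	/* the node pointer table lives directly after the last per-cpu slot */
	node = (struct cache_node **)&cache->array[nr_cpus];
	node[0] = NULL;			/* per-node data would be filled in later */

	printf("allocated %zu bytes for %d cpus + %d nodes\n",
	       size, nr_cpus, nr_nodes);
	free(cache);
	return 0;
}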
*/ -	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, -					sizes[INDEX_AC].cs_size, -					ARCH_KMALLOC_MINALIGN, -					ARCH_KMALLOC_FLAGS|SLAB_PANIC, -					NULL); - -	if (INDEX_AC != INDEX_L3) { -		sizes[INDEX_L3].cs_cachep = -			kmem_cache_create(names[INDEX_L3].name, -				sizes[INDEX_L3].cs_size, -				ARCH_KMALLOC_MINALIGN, -				ARCH_KMALLOC_FLAGS|SLAB_PANIC, -				NULL); -	} +	kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", +					kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); + +	if (INDEX_AC != INDEX_NODE) +		kmalloc_caches[INDEX_NODE] = +			create_kmalloc_cache("kmalloc-node", +				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);  	slab_early_init = 0; -	while (sizes->cs_size != ULONG_MAX) { -		/* -		 * For performance, all the general caches are L1 aligned. -		 * This should be particularly beneficial on SMP boxes, as it -		 * eliminates "false sharing". -		 * Note for systems short on memory removing the alignment will -		 * allow tighter packing of the smaller caches. -		 */ -		if (!sizes->cs_cachep) { -			sizes->cs_cachep = kmem_cache_create(names->name, -					sizes->cs_size, -					ARCH_KMALLOC_MINALIGN, -					ARCH_KMALLOC_FLAGS|SLAB_PANIC, -					NULL); -		} -#ifdef CONFIG_ZONE_DMA -		sizes->cs_dmacachep = kmem_cache_create( -					names->name_dma, -					sizes->cs_size, -					ARCH_KMALLOC_MINALIGN, -					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| -						SLAB_PANIC, -					NULL); -#endif -		sizes++; -		names++; -	}  	/* 4) Replace the bootstrap head arrays */  	{  		struct array_cache *ptr;  		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); -		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); -		memcpy(ptr, cpu_cache_get(&cache_cache), +		memcpy(ptr, cpu_cache_get(kmem_cache),  		       sizeof(struct arraycache_init));  		/*  		 * Do not assume that spinlocks can be initialized via memcpy:  		 */  		spin_lock_init(&ptr->lock); -		cache_cache.array[smp_processor_id()] = ptr; +		kmem_cache->array[smp_processor_id()] = ptr;  		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); -		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) +		BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])  		       != &initarray_generic.cache); -		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), +		memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),  		       sizeof(struct arraycache_init));  		/*  		 * Do not assume that spinlocks can be initialized via memcpy:  		 */  		spin_lock_init(&ptr->lock); -		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = -		    ptr; +		kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;  	} -	/* 5) Replace the bootstrap kmem_list3's */ +	/* 5) Replace the bootstrap kmem_cache_node */  	{  		int nid;  		for_each_online_node(nid) { -			init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); +			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); -			init_list(malloc_sizes[INDEX_AC].cs_cachep, -				  &initkmem_list3[SIZE_AC + nid], nid); +			init_list(kmalloc_caches[INDEX_AC], +				  &init_kmem_cache_node[SIZE_AC + nid], nid); -			if (INDEX_AC != INDEX_L3) { -				init_list(malloc_sizes[INDEX_L3].cs_cachep, -					  &initkmem_list3[SIZE_L3 + nid], nid); +			if (INDEX_AC != INDEX_NODE) { +				init_list(kmalloc_caches[INDEX_NODE], +					  &init_kmem_cache_node[SIZE_NODE + nid], nid);  			}  		}  	} -	g_cpucache_up = EARLY; +	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);  }  void __init kmem_cache_init_late(void)  {  	struct kmem_cache *cachep; +	slab_state = UP; +  	/* 6) 
resize the head arrays to their final sizes */ -	mutex_lock(&cache_chain_mutex); -	list_for_each_entry(cachep, &cache_chain, next) +	mutex_lock(&slab_mutex); +	list_for_each_entry(cachep, &slab_caches, list)  		if (enable_cpucache(cachep, GFP_NOWAIT))  			BUG(); -	mutex_unlock(&cache_chain_mutex); - -	/* Done! */ -	g_cpucache_up = FULL; +	mutex_unlock(&slab_mutex);  	/* Annotate slab for lockdep -- annotate the malloc caches */  	init_lock_keys(); +	/* Done! */ +	slab_state = FULL; +  	/*  	 * Register a cpu startup notifier callback that initializes  	 * cpu_cache_get for all new cpus @@ -1643,7 +1643,7 @@ void __init kmem_cache_init_late(void)  #ifdef CONFIG_NUMA  	/*  	 * Register a memory hotplug callback that initializes and frees -	 * nodelists. +	 * node.  	 */  	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);  #endif @@ -1663,10 +1663,66 @@ static int __init cpucache_init(void)  	 */  	for_each_online_cpu(cpu)  		start_cpu_timer(cpu); + +	/* Done! */ +	slab_state = FULL;  	return 0;  }  __initcall(cpucache_init); +static noinline void +slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) +{ +#if DEBUG +	struct kmem_cache_node *n; +	struct page *page; +	unsigned long flags; +	int node; +	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, +				      DEFAULT_RATELIMIT_BURST); + +	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) +		return; + +	printk(KERN_WARNING +		"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", +		nodeid, gfpflags); +	printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n", +		cachep->name, cachep->size, cachep->gfporder); + +	for_each_online_node(node) { +		unsigned long active_objs = 0, num_objs = 0, free_objects = 0; +		unsigned long active_slabs = 0, num_slabs = 0; + +		n = cachep->node[node]; +		if (!n) +			continue; + +		spin_lock_irqsave(&n->list_lock, flags); +		list_for_each_entry(page, &n->slabs_full, lru) { +			active_objs += cachep->num; +			active_slabs++; +		} +		list_for_each_entry(page, &n->slabs_partial, lru) { +			active_objs += page->active; +			active_slabs++; +		} +		list_for_each_entry(page, &n->slabs_free, lru) +			num_slabs++; + +		free_objects += n->free_objects; +		spin_unlock_irqrestore(&n->list_lock, flags); + +		num_slabs += active_slabs; +		num_objs = num_slabs * cachep->num; +		printk(KERN_WARNING +			"  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", +			node, active_slabs, num_slabs, active_objs, num_objs, +			free_objects); +	} +#endif +} +  /*   * Interface to system's page allocator. No need to hold the cache-lock.   * @@ -1674,27 +1730,29 @@ __initcall(cpucache_init);   * did not request dmaable memory, we might get it, but that   * would be relatively rare and ignorable.   
*/ -static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, +								int nodeid)  {  	struct page *page;  	int nr_pages; -	int i; -#ifndef CONFIG_MMU -	/* -	 * Nommu uses slab's for process anonymous memory allocations, and thus -	 * requires __GFP_COMP to properly refcount higher order allocations -	 */ -	flags |= __GFP_COMP; -#endif - -	flags |= cachep->gfpflags; +	flags |= cachep->allocflags;  	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)  		flags |= __GFP_RECLAIMABLE; +	if (memcg_charge_slab(cachep, flags, cachep->gfporder)) +		return NULL; +  	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); -	if (!page) +	if (!page) { +		memcg_uncharge_slab(cachep, cachep->gfporder); +		slab_out_of_memory(cachep, flags, nodeid);  		return NULL; +	} + +	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ +	if (unlikely(page->pfmemalloc)) +		pfmemalloc_active = true;  	nr_pages = (1 << cachep->gfporder);  	if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1703,8 +1761,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)  	else  		add_zone_page_state(page_zone(page),  			NR_SLAB_UNRECLAIMABLE, nr_pages); -	for (i = 0; i < nr_pages; i++) -		__SetPageSlab(page + i); +	__SetPageSlab(page); +	if (page->pfmemalloc) +		SetPageSlabPfmemalloc(page);  	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {  		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1715,17 +1774,15 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)  			kmemcheck_mark_unallocated_pages(page, nr_pages);  	} -	return page_address(page); +	return page;  }  /*   * Interface to system's page release.   
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, struct page *page)  { -	unsigned long i = (1 << cachep->gfporder); -	struct page *page = virt_to_page(addr); -	const unsigned long nr_freed = i; +	const unsigned long nr_freed = (1 << cachep->gfporder);  	kmemcheck_free_shadow(page, cachep->gfporder); @@ -1735,24 +1792,28 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)  	else  		sub_zone_page_state(page_zone(page),  				NR_SLAB_UNRECLAIMABLE, nr_freed); -	while (i--) { -		BUG_ON(!PageSlab(page)); -		__ClearPageSlab(page); -		page++; -	} + +	BUG_ON(!PageSlab(page)); +	__ClearPageSlabPfmemalloc(page); +	__ClearPageSlab(page); +	page_mapcount_reset(page); +	page->mapping = NULL; +  	if (current->reclaim_state)  		current->reclaim_state->reclaimed_slab += nr_freed; -	free_pages((unsigned long)addr, cachep->gfporder); +	__free_pages(page, cachep->gfporder); +	memcg_uncharge_slab(cachep, cachep->gfporder);  }  static void kmem_rcu_free(struct rcu_head *head)  { -	struct slab_rcu *slab_rcu = (struct slab_rcu *)head; -	struct kmem_cache *cachep = slab_rcu->cachep; +	struct kmem_cache *cachep; +	struct page *page; -	kmem_freepages(cachep, slab_rcu->addr); -	if (OFF_SLAB(cachep)) -		kmem_cache_free(cachep->slabp_cache, slab_rcu); +	page = container_of(head, struct page, rcu_head); +	cachep = page->slab_cache; + +	kmem_freepages(cachep, page);  }  #if DEBUG @@ -1761,7 +1822,7 @@ static void kmem_rcu_free(struct rcu_head *head)  static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,  			    unsigned long caller)  { -	int size = obj_size(cachep); +	int size = cachep->object_size;  	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; @@ -1793,7 +1854,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,  static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)  { -	int size = obj_size(cachep); +	int size = cachep->object_size;  	addr = &((char *)addr)[obj_offset(cachep)];  	memset(addr, val, size); @@ -1806,15 +1867,15 @@ static void dump_line(char *data, int offset, int limit)  	unsigned char error = 0;  	int bad_count = 0; -	printk(KERN_ERR "%03x:", offset); +	printk(KERN_ERR "%03x: ", offset);  	for (i = 0; i < limit; i++) {  		if (data[offset + i] != POISON_FREE) {  			error = data[offset + i];  			bad_count++;  		} -		printk(" %02x", (unsigned char)data[offset + i]);  	} -	printk("\n"); +	print_hex_dump(KERN_CONT, "", 0, 16, 1, +			&data[offset], limit, 1);  	if (bad_count == 1) {  		error ^= POISON_FREE; @@ -1846,14 +1907,12 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)  	}  	if (cachep->flags & SLAB_STORE_USER) { -		printk(KERN_ERR "Last user: [<%p>]", -			*dbg_userword(cachep, objp)); -		print_symbol("(%s)", -				(unsigned long)*dbg_userword(cachep, objp)); -		printk("\n"); +		printk(KERN_ERR "Last user: [<%p>](%pSR)\n", +		       *dbg_userword(cachep, objp), +		       *dbg_userword(cachep, objp));  	}  	realobj = (char *)objp + obj_offset(cachep); -	size = obj_size(cachep); +	size = cachep->object_size;  	for (i = 0; i < size && lines; i += 16, lines--) {  		int limit;  		limit = 16; @@ -1870,7 +1929,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)  	int lines = 0;  	realobj = (char *)objp + obj_offset(cachep); -	size = obj_size(cachep); +	size = cachep->object_size;  	for (i = 0; i < size; i++) {  		char exp = POISON_FREE; @@ -1882,8 +1941,8 @@ static void 
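kmem_rcu_free() above no longer needs a separate struct slab_rcu; it recovers the page from the rcu_head embedded in struct page with container_of(). A self-contained sketch of that idiom, using a fake_page type and a direct function call where call_rcu() would defer the callback:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head { void (*func)(struct callback_head *); };

/* Simplified stand-in for struct page: the callback head is embedded in
 * the object that must survive until the callback runs. */
struct fake_page {
	int order;
	struct callback_head rcu_head;
};

static void rcu_free(struct callback_head *head)
{
	struct fake_page *page = container_of(head, struct fake_page, rcu_head);

	printf("freeing order-%d slab page\n", page->order);
}

int main(void)
{
	struct fake_page pg = { .order = 1 };

	/* call_rcu() would invoke this after a grace period. */
	rcu_free(&pg.rcu_head);
	return 0;
}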
check_poison_obj(struct kmem_cache *cachep, void *objp)  			/* Print header */  			if (lines == 0) {  				printk(KERN_ERR -					"Slab corruption: %s start=%p, len=%d\n", -					cachep->name, realobj, size); +					"Slab corruption (%s): %s start=%p, len=%d\n", +					print_tainted(), cachep->name, realobj, size);  				print_objinfo(cachep, objp, 0);  			}  			/* Hexdump the affected line */ @@ -1903,19 +1962,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)  		/* Print some data about the neighboring objects, if they  		 * exist:  		 */ -		struct slab *slabp = virt_to_slab(objp); +		struct page *page = virt_to_head_page(objp);  		unsigned int objnr; -		objnr = obj_to_index(cachep, slabp, objp); +		objnr = obj_to_index(cachep, page, objp);  		if (objnr) { -			objp = index_to_obj(cachep, slabp, objnr - 1); +			objp = index_to_obj(cachep, page, objnr - 1);  			realobj = (char *)objp + obj_offset(cachep);  			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",  			       realobj, size);  			print_objinfo(cachep, objp, 2);  		}  		if (objnr + 1 < cachep->num) { -			objp = index_to_obj(cachep, slabp, objnr + 1); +			objp = index_to_obj(cachep, page, objnr + 1);  			realobj = (char *)objp + obj_offset(cachep);  			printk(KERN_ERR "Next obj: start=%p, len=%d\n",  			       realobj, size); @@ -1926,18 +1985,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)  #endif  #if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, +						struct page *page)  {  	int i;  	for (i = 0; i < cachep->num; i++) { -		void *objp = index_to_obj(cachep, slabp, i); +		void *objp = index_to_obj(cachep, page, i);  		if (cachep->flags & SLAB_POISON) {  #ifdef CONFIG_DEBUG_PAGEALLOC -			if (cachep->buffer_size % PAGE_SIZE == 0 && +			if (cachep->size % PAGE_SIZE == 0 &&  					OFF_SLAB(cachep))  				kernel_map_pages(virt_to_page(objp), -					cachep->buffer_size / PAGE_SIZE, 1); +					cachep->size / PAGE_SIZE, 1);  			else  				check_poison_obj(cachep, objp);  #else @@ -1955,7 +2015,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab  	}  }  #else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, +						struct page *page)  {  }  #endif @@ -1963,52 +2024,42 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab  /**   * slab_destroy - destroy and release all objects in a slab   * @cachep: cache pointer being destroyed - * @slabp: slab pointer being destroyed + * @page: page pointer being destroyed   *   * Destroy all the objs in a slab, and release the mem back to the system.   * Before calling the slab must have been unlinked from the cache.  The   * cache-lock is not held/needed.   */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct page *page)  { -	void *addr = slabp->s_mem - slabp->colouroff; +	void *freelist; -	slab_destroy_debugcheck(cachep, slabp); +	freelist = page->freelist; +	slab_destroy_debugcheck(cachep, page);  	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { -		struct slab_rcu *slab_rcu; +		struct rcu_head *head; + +		/* +		 * RCU free overloads the RCU head over the LRU. +		 * slab_page has been overloeaded over the LRU, +		 * however it is not used from now on so that +		 * we can use it safely. 
+		 */ +		head = (void *)&page->rcu_head; +		call_rcu(head, kmem_rcu_free); -		slab_rcu = (struct slab_rcu *)slabp; -		slab_rcu->cachep = cachep; -		slab_rcu->addr = addr; -		call_rcu(&slab_rcu->head, kmem_rcu_free);  	} else { -		kmem_freepages(cachep, addr); -		if (OFF_SLAB(cachep)) -			kmem_cache_free(cachep->slabp_cache, slabp); +		kmem_freepages(cachep, page);  	} -} -static void __kmem_cache_destroy(struct kmem_cache *cachep) -{ -	int i; -	struct kmem_list3 *l3; - -	for_each_online_cpu(i) -	    kfree(cachep->array[i]); - -	/* NUMA: free the list3 structures */ -	for_each_online_node(i) { -		l3 = cachep->nodelists[i]; -		if (l3) { -			kfree(l3->shared); -			free_alien_cache(l3->alien); -			kfree(l3); -		} -	} -	kmem_cache_free(&cache_cache, cachep); +	/* +	 * From now on, we don't use freelist +	 * although actual page can be freed in rcu context +	 */ +	if (OFF_SLAB(cachep)) +		kmem_cache_free(cachep->freelist_cache, freelist);  } -  /**   * calculate_slab_order - calculate size (page order) of slabs   * @cachep: pointer to the cache that is being created @@ -2037,14 +2088,21 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,  		if (!num)  			continue; +		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ +		if (num > SLAB_OBJ_MAX_NUM) +			break; +  		if (flags & CFLGS_OFF_SLAB) { +			size_t freelist_size_per_obj = sizeof(freelist_idx_t);  			/*  			 * Max number of objs-per-slab for caches which  			 * use off-slab slabs. Needed to avoid a possible  			 * looping condition in cache_grow().  			 */ -			offslab_limit = size - sizeof(struct slab); -			offslab_limit /= sizeof(kmem_bufctl_t); +			if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) +				freelist_size_per_obj += sizeof(char); +			offslab_limit = size; +			offslab_limit /= freelist_size_per_obj;   			if (num > offslab_limit)  				break; @@ -2067,7 +2125,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,  		 * Large number of objects is good, but very large slabs are  		 * currently bad for the gfp()s.  		 */ -		if (gfporder >= slab_break_gfp_order) +		if (gfporder >= slab_max_order)  			break;  		/* @@ -2081,48 +2139,57 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,  static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)  { -	if (g_cpucache_up == FULL) +	if (slab_state >= FULL)  		return enable_cpucache(cachep, gfp); -	if (g_cpucache_up == NONE) { +	if (slab_state == DOWN) {  		/* -		 * Note: the first kmem_cache_create must create the cache +		 * Note: Creation of first cache (kmem_cache). +		 * The setup_node is taken care +		 * of by the caller of __kmem_cache_create +		 */ +		cachep->array[smp_processor_id()] = &initarray_generic.cache; +		slab_state = PARTIAL; +	} else if (slab_state == PARTIAL) { +		/* +		 * Note: the second kmem_cache_create must create the cache  		 * that's used by kmalloc(24), otherwise the creation of  		 * further caches will BUG().  		 */  		cachep->array[smp_processor_id()] = &initarray_generic.cache;  		/* -		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is -		 * the first cache, then we need to set up all its list3s, +		 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is +		 * the second cache, then we need to set up all its node/,  		 * otherwise the creation of further caches will BUG().  		 
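In calculate_slab_order() above, the object count is now capped at SLAB_OBJ_MAX_NUM, and for off-slab management it is further bounded by how many freelist_idx_t indexes the separately allocated freelist can carry. A rough, illustrative calculation of those two bounds (the numbers are arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Illustrative numbers only. */
	unsigned int size = 4096;                 /* aligned object size in bytes */
	unsigned int idx_size = sizeof(uint8_t);  /* freelist_idx_t: one index per object */

	/* SLAB_OBJ_MAX_NUM: the largest index the chosen type can hold. */
	unsigned int obj_max = (1u << (idx_size * 8)) - 1;

	/* Off-slab case: a freelist of at most `size` bytes bounds how many
	 * per-object indexes it can carry. */
	unsigned int offslab_limit = size / idx_size;

	printf("index type caps a slab at %u objects, off-slab freelist at %u\n",
	       obj_max, offslab_limit);
	return 0;
}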
*/ -		set_up_list3s(cachep, SIZE_AC); -		if (INDEX_AC == INDEX_L3) -			g_cpucache_up = PARTIAL_L3; +		set_up_node(cachep, SIZE_AC); +		if (INDEX_AC == INDEX_NODE) +			slab_state = PARTIAL_NODE;  		else -			g_cpucache_up = PARTIAL_AC; +			slab_state = PARTIAL_ARRAYCACHE;  	} else { +		/* Remaining boot caches */  		cachep->array[smp_processor_id()] =  			kmalloc(sizeof(struct arraycache_init), gfp); -		if (g_cpucache_up == PARTIAL_AC) { -			set_up_list3s(cachep, SIZE_L3); -			g_cpucache_up = PARTIAL_L3; +		if (slab_state == PARTIAL_ARRAYCACHE) { +			set_up_node(cachep, SIZE_NODE); +			slab_state = PARTIAL_NODE;  		} else {  			int node;  			for_each_online_node(node) { -				cachep->nodelists[node] = -				    kmalloc_node(sizeof(struct kmem_list3), +				cachep->node[node] = +				    kmalloc_node(sizeof(struct kmem_cache_node),  						gfp, node); -				BUG_ON(!cachep->nodelists[node]); -				kmem_list3_init(cachep->nodelists[node]); +				BUG_ON(!cachep->node[node]); +				kmem_cache_node_init(cachep->node[node]);  			}  		}  	} -	cachep->nodelists[numa_mem_id()]->next_reap = -			jiffies + REAPTIMEOUT_LIST3 + -			((unsigned long)cachep) % REAPTIMEOUT_LIST3; +	cachep->node[numa_mem_id()]->next_reap = +			jiffies + REAPTIMEOUT_NODE + +			((unsigned long)cachep) % REAPTIMEOUT_NODE;  	cpu_cache_get(cachep)->avail = 0;  	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; @@ -2134,22 +2201,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)  }  /** - * kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. + * __kmem_cache_create - Create a cache. + * @cachep: cache management descriptor   * @flags: SLAB flags - * @ctor: A constructor for the objects.   *   * Returns a ptr to the cache on success, NULL on failure.   * Cannot be called within a int, but can be interrupted.   * The @ctor is run when new pages are allocated by the cache.   * - * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting unloaded. - * Note that kmem_cache_name() is not guaranteed to return the same pointer, - * therefore applications must manage it themselves. - *   * The flags are   *   * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -2162,60 +2221,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)   * cacheline.  This can be beneficial if you're counting cycles as closely   * as davem.   */ -struct kmem_cache * -kmem_cache_create (const char *name, size_t size, size_t align, -	unsigned long flags, void (*ctor)(void *)) +int +__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)  { -	size_t left_over, slab_size, ralign; -	struct kmem_cache *cachep = NULL, *pc; +	size_t left_over, freelist_size, ralign;  	gfp_t gfp; - -	/* -	 * Sanity checks... these are all serious usage bugs. -	 */ -	if (!name || in_interrupt() || (size < BYTES_PER_WORD) || -	    size > KMALLOC_MAX_SIZE) { -		printk(KERN_ERR "%s: Early error in slab %s\n", __func__, -				name); -		BUG(); -	} - -	/* -	 * We use cache_chain_mutex to ensure a consistent view of -	 * cpu_online_mask as well.  
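setup_cpu_cache() above arms the node's next_reap a full REAPTIMEOUT_NODE in the future plus a remainder derived from the cache pointer, so periodic reaping of caches created at the same time is staggered rather than synchronized. A small sketch of that staggering, with REAP_TIMEOUT as an arbitrary stand-in for REAPTIMEOUT_NODE:

#include <stdint.h>
#include <stdio.h>

#define REAP_TIMEOUT 4000	/* illustrative, in "jiffies" */

static unsigned long next_reap(unsigned long now, const void *cache)
{
	/* Base interval plus a per-cache remainder derived from its address,
	 * so caches created together do not all expire on the same tick. */
	return now + REAP_TIMEOUT + ((uintptr_t)cache % REAP_TIMEOUT);
}

int main(void)
{
	int a, b;

	printf("%lu %lu\n", next_reap(0, &a), next_reap(0, &b));
	return 0;
}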
Please see cpuup_callback -	 */ -	if (slab_is_available()) { -		get_online_cpus(); -		mutex_lock(&cache_chain_mutex); -	} - -	list_for_each_entry(pc, &cache_chain, next) { -		char tmp; -		int res; - -		/* -		 * This happens when the module gets unloaded and doesn't -		 * destroy its slab cache and no-one else reuses the vmalloc -		 * area of the module.  Print a warning. -		 */ -		res = probe_kernel_address(pc->name, tmp); -		if (res) { -			printk(KERN_ERR -			       "SLAB: cache with size %d has lost its name\n", -			       pc->buffer_size); -			continue; -		} - -		if (!strcmp(pc->name, name)) { -			printk(KERN_ERR -			       "kmem_cache_create: duplicate cache %s\n", name); -			dump_stack(); -			goto oops; -		} -	} +	int err; +	size_t size = cachep->size;  #if DEBUG -	WARN_ON(strchr(name, ' '));	/* It confuses parsers */  #if FORCED_DEBUG  	/*  	 * Enable redzoning and last user accounting, except for caches with @@ -2232,11 +2246,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,  	if (flags & SLAB_DESTROY_BY_RCU)  		BUG_ON(flags & SLAB_POISON);  #endif -	/* -	 * Always checks flags, a caller might be expecting debug support which -	 * isn't available. -	 */ -	BUG_ON(flags & ~CREATE_MASK);  	/*  	 * Check that size is in terms of words.  This is needed to avoid @@ -2248,22 +2257,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,  		size &= ~(BYTES_PER_WORD - 1);  	} -	/* calculate the final buffer alignment: */ - -	/* 1) arch recommendation: can be overridden for debug */ -	if (flags & SLAB_HWCACHE_ALIGN) { -		/* -		 * Default alignment: as specified by the arch code.  Except if -		 * an object is really small, then squeeze multiple objects into -		 * one cacheline. -		 */ -		ralign = cache_line_size(); -		while (size <= ralign / 2) -			ralign /= 2; -	} else { -		ralign = BYTES_PER_WORD; -	} -  	/*  	 * Redzoning and user store require word alignment or possibly larger.  	 * Note this will be overridden by architecture or caller mandated @@ -2280,34 +2273,25 @@ kmem_cache_create (const char *name, size_t size, size_t align,  		size &= ~(REDZONE_ALIGN - 1);  	} -	/* 2) arch mandated alignment */ -	if (ralign < ARCH_SLAB_MINALIGN) { -		ralign = ARCH_SLAB_MINALIGN; -	}  	/* 3) caller mandated alignment */ -	if (ralign < align) { -		ralign = align; +	if (ralign < cachep->align) { +		ralign = cachep->align;  	} -	/* disable debug if not aligning with REDZONE_ALIGN */ -	if (ralign & (__alignof__(unsigned long long) - 1)) +	/* disable debug if necessary */ +	if (ralign > __alignof__(unsigned long long))  		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);  	/*  	 * 4) Store it.  	 */ -	align = ralign; +	cachep->align = ralign;  	if (slab_is_available())  		gfp = GFP_KERNEL;  	else  		gfp = GFP_NOWAIT; -	/* Get cache's description obj. 
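__kmem_cache_create() above still rounds the object size up to a multiple of BYTES_PER_WORD with the usual add-then-mask idiom, which only works because the alignment is a power of two. A tiny sketch of that rounding:

#include <stdio.h>

#define BYTES_PER_WORD sizeof(void *)

static size_t round_up_word(size_t size)
{
	size += BYTES_PER_WORD - 1;
	return size & ~(BYTES_PER_WORD - 1);	/* valid: alignment is a power of two */
}

int main(void)
{
	for (size_t s = 1; s <= 17; s += 4)
		printf("%zu -> %zu\n", s, round_up_word(s));
	return 0;
}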
*/ -	cachep = kmem_cache_zalloc(&cache_cache, gfp); -	if (!cachep) -		goto oops; - +	setup_node_pointer(cachep);  #if DEBUG -	cachep->obj_size = size;  	/*  	 * Both debugging options require word-alignment which is calculated @@ -2315,8 +2299,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,  	 */  	if (flags & SLAB_RED_ZONE) {  		/* add space for red zone words */ -		cachep->obj_offset += align; -		size += align + sizeof(unsigned long long); +		cachep->obj_offset += sizeof(unsigned long long); +		size += 2 * sizeof(unsigned long long);  	}  	if (flags & SLAB_STORE_USER) {  		/* user store requires one word storage behind the end of @@ -2329,9 +2313,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,  			size += BYTES_PER_WORD;  	}  #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) -	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size -	    && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { -		cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); +	if (size >= kmalloc_size(INDEX_NODE + 1) +	    && cachep->object_size > cache_line_size() +	    && ALIGN(size, cachep->align) < PAGE_SIZE) { +		cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);  		size = PAGE_SIZE;  	}  #endif @@ -2343,7 +2328,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,  	 * it too early on. Always use on-slab management when  	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)  	 */ -	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && +	if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&  	    !(flags & SLAB_NOLEAKTRACE))  		/*  		 * Size is large, assume best to place the slab management obj @@ -2351,33 +2336,33 @@ kmem_cache_create (const char *name, size_t size, size_t align,  		 */  		flags |= CFLGS_OFF_SLAB; -	size = ALIGN(size, align); +	size = ALIGN(size, cachep->align); +	/* +	 * We should restrict the number of objects in a slab to implement +	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. +	 */ +	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) +		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); -	left_over = calculate_slab_order(cachep, size, align, flags); +	left_over = calculate_slab_order(cachep, size, cachep->align, flags); -	if (!cachep->num) { -		printk(KERN_ERR -		       "kmem_cache_create: couldn't create cache %s.\n", name); -		kmem_cache_free(&cache_cache, cachep); -		cachep = NULL; -		goto oops; -	} -	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) -			  + sizeof(struct slab), align); +	if (!cachep->num) +		return -E2BIG; + +	freelist_size = calculate_freelist_size(cachep->num, cachep->align);  	/*  	 * If the slab has been placed off-slab, and we have enough space then  	 * move it on-slab. This is at the expense of any extra colouring.  	 */ -	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { +	if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {  		flags &= ~CFLGS_OFF_SLAB; -		left_over -= slab_size; +		left_over -= freelist_size;  	}  	if (flags & CFLGS_OFF_SLAB) {  		/* really off slab. No need for manual alignment */ -		slab_size = -		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); +		freelist_size = calculate_freelist_size(cachep->num, 0);  #ifdef CONFIG_PAGE_POISONING  		/* If we're going to use the generic kernel_map_pages() @@ -2391,50 +2376,48 @@ kmem_cache_create (const char *name, size_t size, size_t align,  	cachep->colour_off = cache_line_size();  	/* Offset must be a multiple of the alignment. 
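With SLAB_RED_ZONE the debug hunk above shifts the payload by one unsigned long long and grows the object by two of them, one guard word on each side, while SLAB_STORE_USER appends room for the last caller (a word, or REDZONE_ALIGN when red-zoning is also on). A sketch of the resulting size arithmetic with an arbitrary 100-byte payload:

#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t payload = 100;			/* requested object size */
	size_t rz = sizeof(unsigned long long);	/* one red-zone word */

	size_t obj_offset = rz;			/* payload starts after red zone 1 */
	size_t total = payload + 2 * rz;	/* red zone 1 | payload | red zone 2 */
	size_t user_word = sizeof(void *);	/* SLAB_STORE_USER: last caller */

	printf("offset %zu, size %zu (+%zu with caller tracking)\n",
	       obj_offset, total, user_word);
	return 0;
}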
*/ -	if (cachep->colour_off < align) -		cachep->colour_off = align; +	if (cachep->colour_off < cachep->align) +		cachep->colour_off = cachep->align;  	cachep->colour = left_over / cachep->colour_off; -	cachep->slab_size = slab_size; +	cachep->freelist_size = freelist_size;  	cachep->flags = flags; -	cachep->gfpflags = 0; +	cachep->allocflags = __GFP_COMP;  	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) -		cachep->gfpflags |= GFP_DMA; -	cachep->buffer_size = size; +		cachep->allocflags |= GFP_DMA; +	cachep->size = size;  	cachep->reciprocal_buffer_size = reciprocal_value(size);  	if (flags & CFLGS_OFF_SLAB) { -		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); +		cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);  		/* -		 * This is a possibility for one of the malloc_sizes caches. +		 * This is a possibility for one of the kmalloc_{dma,}_caches.  		 * But since we go off slab only for object size greater than -		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, -		 * this should not happen at all. +		 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created +		 * in ascending order,this should not happen at all.  		 * But leave a BUG_ON for some lucky dude.  		 */ -		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); +		BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));  	} -	cachep->ctor = ctor; -	cachep->name = name; -	if (setup_cpu_cache(cachep, gfp)) { -		__kmem_cache_destroy(cachep); -		cachep = NULL; -		goto oops; +	err = setup_cpu_cache(cachep, gfp); +	if (err) { +		__kmem_cache_shutdown(cachep); +		return err;  	} -	/* cache setup completed, link it into the list */ -	list_add(&cachep->next, &cache_chain); -oops: -	if (!cachep && (flags & SLAB_PANIC)) -		panic("kmem_cache_create(): failed to create slab `%s'\n", -		      name); -	if (slab_is_available()) { -		mutex_unlock(&cache_chain_mutex); -		put_online_cpus(); -	} -	return cachep; +	if (flags & SLAB_DEBUG_OBJECTS) { +		/* +		 * Would deadlock through slab_destroy()->call_rcu()-> +		 * debug_object_activate()->kmem_cache_alloc(). 
+		 */ +		WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); + +		slab_set_debugobj_lock_classes(cachep); +	} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) +		on_slab_lock_classes(cachep); + +	return 0;  } -EXPORT_SYMBOL(kmem_cache_create);  #if DEBUG  static void check_irq_off(void) @@ -2451,7 +2434,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)  {  #ifdef CONFIG_SMP  	check_irq_off(); -	assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); +	assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock);  #endif  } @@ -2459,7 +2442,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)  {  #ifdef CONFIG_SMP  	check_irq_off(); -	assert_spin_locked(&cachep->nodelists[node]->list_lock); +	assert_spin_locked(&cachep->node[node]->list_lock);  #endif  } @@ -2470,7 +2453,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)  #define check_spinlock_acquired_node(x, y) do { } while(0)  #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,  			struct array_cache *ac,  			int force, int node); @@ -2482,29 +2465,29 @@ static void do_drain(void *arg)  	check_irq_off();  	ac = cpu_cache_get(cachep); -	spin_lock(&cachep->nodelists[node]->list_lock); +	spin_lock(&cachep->node[node]->list_lock);  	free_block(cachep, ac->entry, ac->avail, node); -	spin_unlock(&cachep->nodelists[node]->list_lock); +	spin_unlock(&cachep->node[node]->list_lock);  	ac->avail = 0;  }  static void drain_cpu_caches(struct kmem_cache *cachep)  { -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	int node;  	on_each_cpu(do_drain, cachep, 1);  	check_irq_on();  	for_each_online_node(node) { -		l3 = cachep->nodelists[node]; -		if (l3 && l3->alien) -			drain_alien_cache(cachep, l3->alien); +		n = cachep->node[node]; +		if (n && n->alien) +			drain_alien_cache(cachep, n->alien);  	}  	for_each_online_node(node) { -		l3 = cachep->nodelists[node]; -		if (l3) -			drain_array(cachep, l3, l3->shared, 1, node); +		n = cachep->node[node]; +		if (n) +			drain_array(cachep, n, n->shared, 1, node);  	}  } @@ -2515,182 +2498,139 @@ static void drain_cpu_caches(struct kmem_cache *cachep)   * Returns the actual number of slabs released.   */  static int drain_freelist(struct kmem_cache *cache, -			struct kmem_list3 *l3, int tofree) +			struct kmem_cache_node *n, int tofree)  {  	struct list_head *p;  	int nr_freed; -	struct slab *slabp; +	struct page *page;  	nr_freed = 0; -	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { +	while (nr_freed < tofree && !list_empty(&n->slabs_free)) { -		spin_lock_irq(&l3->list_lock); -		p = l3->slabs_free.prev; -		if (p == &l3->slabs_free) { -			spin_unlock_irq(&l3->list_lock); +		spin_lock_irq(&n->list_lock); +		p = n->slabs_free.prev; +		if (p == &n->slabs_free) { +			spin_unlock_irq(&n->list_lock);  			goto out;  		} -		slabp = list_entry(p, struct slab, list); +		page = list_entry(p, struct page, lru);  #if DEBUG -		BUG_ON(slabp->inuse); +		BUG_ON(page->active);  #endif -		list_del(&slabp->list); +		list_del(&page->lru);  		/*  		 * Safe to drop the lock. The slab is no longer linked  		 * to the cache.  		 
*/ -		l3->free_objects -= cache->num; -		spin_unlock_irq(&l3->list_lock); -		slab_destroy(cache, slabp); +		n->free_objects -= cache->num; +		spin_unlock_irq(&n->list_lock); +		slab_destroy(cache, page);  		nr_freed++;  	}  out:  	return nr_freed;  } -/* Called with cache_chain_mutex held to protect against cpu hotplug */ -static int __cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep)  {  	int ret = 0, i = 0; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	drain_cpu_caches(cachep);  	check_irq_on();  	for_each_online_node(i) { -		l3 = cachep->nodelists[i]; -		if (!l3) +		n = cachep->node[i]; +		if (!n)  			continue; -		drain_freelist(cachep, l3, l3->free_objects); +		drain_freelist(cachep, n, slabs_tofree(cachep, n)); -		ret += !list_empty(&l3->slabs_full) || -			!list_empty(&l3->slabs_partial); +		ret += !list_empty(&n->slabs_full) || +			!list_empty(&n->slabs_partial);  	}  	return (ret ? 1 : 0);  } -/** - * kmem_cache_shrink - Shrink a cache. - * @cachep: The cache to shrink. - * - * Releases as many slabs as possible for a cache. - * To help debugging, a zero exit status indicates all slabs were released. - */ -int kmem_cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shutdown(struct kmem_cache *cachep)  { -	int ret; -	BUG_ON(!cachep || in_interrupt()); +	int i; +	struct kmem_cache_node *n; +	int rc = __kmem_cache_shrink(cachep); -	get_online_cpus(); -	mutex_lock(&cache_chain_mutex); -	ret = __cache_shrink(cachep); -	mutex_unlock(&cache_chain_mutex); -	put_online_cpus(); -	return ret; -} -EXPORT_SYMBOL(kmem_cache_shrink); +	if (rc) +		return rc; -/** - * kmem_cache_destroy - delete a cache - * @cachep: the cache to destroy - * - * Remove a &struct kmem_cache object from the slab cache. - * - * It is expected this function will be called by a module when it is - * unloaded.  This will remove the cache completely, and avoid a duplicate - * cache being allocated each time a module is loaded and unloaded, if the - * module doesn't have persistent in-kernel storage across loads and unloads. - * - * The cache must be empty before calling this function. - * - * The caller must guarantee that noone will allocate memory from the cache - * during the kmem_cache_destroy(). - */ -void kmem_cache_destroy(struct kmem_cache *cachep) -{ -	BUG_ON(!cachep || in_interrupt()); +	for_each_online_cpu(i) +	    kfree(cachep->array[i]); -	/* Find the cache in the chain of caches. */ -	get_online_cpus(); -	mutex_lock(&cache_chain_mutex); -	/* -	 * the chain is never empty, cache_cache is never destroyed -	 */ -	list_del(&cachep->next); -	if (__cache_shrink(cachep)) { -		slab_error(cachep, "Can't free all objects"); -		list_add(&cachep->next, &cache_chain); -		mutex_unlock(&cache_chain_mutex); -		put_online_cpus(); -		return; +	/* NUMA: free the node structures */ +	for_each_online_node(i) { +		n = cachep->node[i]; +		if (n) { +			kfree(n->shared); +			free_alien_cache(n->alien); +			kfree(n); +		}  	} - -	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) -		rcu_barrier(); - -	__kmem_cache_destroy(cachep); -	mutex_unlock(&cache_chain_mutex); -	put_online_cpus(); +	return 0;  } -EXPORT_SYMBOL(kmem_cache_destroy);  /*   * Get the memory for a slab management obj. - * For a slab cache when the slab descriptor is off-slab, slab descriptors - * always come from malloc_sizes caches.  
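drain_freelist() above holds the node's list_lock only while unlinking a free slab and drops it before slab_destroy(), keeping page freeing out of the critical section. A userspace sketch of that lock-scope pattern, with a pthread mutex and a singly linked list standing in for the spinlock and the slabs_free list:

#include <pthread.h>
#include <stdlib.h>

struct slab { struct slab *next; };

struct node {
	pthread_mutex_t lock;
	struct slab *free_list;
};

static void slab_destroy(struct slab *s) { free(s); }

/* Free up to `tofree` empty slabs, holding the list lock only while
 * unlinking; destruction (page freeing) happens outside the lock. */
static int drain_freelist(struct node *n, int tofree)
{
	int freed = 0;

	while (freed < tofree) {
		struct slab *s;

		pthread_mutex_lock(&n->lock);
		s = n->free_list;
		if (!s) {
			pthread_mutex_unlock(&n->lock);
			break;
		}
		n->free_list = s->next;		/* unlink while locked */
		pthread_mutex_unlock(&n->lock);

		slab_destroy(s);		/* safe: no longer reachable */
		freed++;
	}
	return freed;
}

int main(void)
{
	struct node n = { .lock = PTHREAD_MUTEX_INITIALIZER, .free_list = NULL };

	return drain_freelist(&n, 8) ? 1 : 0;
}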
The slab descriptor cannot - * come from the same cache which is getting created because, - * when we are searching for an appropriate cache for these - * descriptors in kmem_cache_create, we search through the malloc_sizes array. - * If we are creating a malloc_sizes cache here it would not be visible to - * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have slabp_cache same as the original cache. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * Because if it is the case, that means we defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init().   */ -static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, -				   int colour_off, gfp_t local_flags, -				   int nodeid) +static void *alloc_slabmgmt(struct kmem_cache *cachep, +				   struct page *page, int colour_off, +				   gfp_t local_flags, int nodeid)  { -	struct slab *slabp; +	void *freelist; +	void *addr = page_address(page);  	if (OFF_SLAB(cachep)) {  		/* Slab management obj is off-slab. */ -		slabp = kmem_cache_alloc_node(cachep->slabp_cache, +		freelist = kmem_cache_alloc_node(cachep->freelist_cache,  					      local_flags, nodeid); -		/* -		 * If the first object in the slab is leaked (it's allocated -		 * but no one has a reference to it), we want to make sure -		 * kmemleak does not treat the ->s_mem pointer as a reference -		 * to the object. Otherwise we will not report the leak. -		 */ -		kmemleak_scan_area(&slabp->list, sizeof(struct list_head), -				   local_flags); -		if (!slabp) +		if (!freelist)  			return NULL;  	} else { -		slabp = objp + colour_off; -		colour_off += cachep->slab_size; +		freelist = addr + colour_off; +		colour_off += cachep->freelist_size;  	} -	slabp->inuse = 0; -	slabp->colouroff = colour_off; -	slabp->s_mem = objp + colour_off; -	slabp->nodeid = nodeid; -	slabp->free = 0; -	return slabp; +	page->active = 0; +	page->s_mem = addr + colour_off; +	return freelist; +} + +static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) +{ +	return ((freelist_idx_t *)page->freelist)[idx];  } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +static inline void set_free_obj(struct page *page, +					unsigned int idx, freelist_idx_t val)  { -	return (kmem_bufctl_t *) (slabp + 1); +	((freelist_idx_t *)(page->freelist))[idx] = val;  }  static void cache_init_objs(struct kmem_cache *cachep, -			    struct slab *slabp) +			    struct page *page)  {  	int i;  	for (i = 0; i < cachep->num; i++) { -		void *objp = index_to_obj(cachep, slabp, i); +		void *objp = index_to_obj(cachep, page, i);  #if DEBUG  		/* need to poison the objs? 
*/  		if (cachep->flags & SLAB_POISON) @@ -2718,88 +2658,76 @@ static void cache_init_objs(struct kmem_cache *cachep,  				slab_error(cachep, "constructor overwrote the"  					   " start of an object");  		} -		if ((cachep->buffer_size % PAGE_SIZE) == 0 && +		if ((cachep->size % PAGE_SIZE) == 0 &&  			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)  			kernel_map_pages(virt_to_page(objp), -					 cachep->buffer_size / PAGE_SIZE, 0); +					 cachep->size / PAGE_SIZE, 0);  #else  		if (cachep->ctor)  			cachep->ctor(objp);  #endif -		slab_bufctl(slabp)[i] = i + 1; +		set_obj_status(page, i, OBJECT_FREE); +		set_free_obj(page, i, i);  	} -	slab_bufctl(slabp)[i - 1] = BUFCTL_END;  }  static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)  {  	if (CONFIG_ZONE_DMA_FLAG) {  		if (flags & GFP_DMA) -			BUG_ON(!(cachep->gfpflags & GFP_DMA)); +			BUG_ON(!(cachep->allocflags & GFP_DMA));  		else -			BUG_ON(cachep->gfpflags & GFP_DMA); +			BUG_ON(cachep->allocflags & GFP_DMA);  	}  } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,  				int nodeid)  { -	void *objp = index_to_obj(cachep, slabp, slabp->free); -	kmem_bufctl_t next; +	void *objp; -	slabp->inuse++; -	next = slab_bufctl(slabp)[slabp->free]; +	objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); +	page->active++;  #if DEBUG -	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; -	WARN_ON(slabp->nodeid != nodeid); +	WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);  #endif -	slabp->free = next;  	return objp;  } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, +static void slab_put_obj(struct kmem_cache *cachep, struct page *page,  				void *objp, int nodeid)  { -	unsigned int objnr = obj_to_index(cachep, slabp, objp); - +	unsigned int objnr = obj_to_index(cachep, page, objp);  #if DEBUG +	unsigned int i; +  	/* Verify that the slab belongs to the intended node */ -	WARN_ON(slabp->nodeid != nodeid); +	WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); -	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { -		printk(KERN_ERR "slab: double free detected in cache " -				"'%s', objp %p\n", cachep->name, objp); -		BUG(); +	/* Verify double free bug */ +	for (i = page->active; i < cachep->num; i++) { +		if (get_free_obj(page, i) == objnr) { +			printk(KERN_ERR "slab: double free detected in cache " +					"'%s', objp %p\n", cachep->name, objp); +			BUG(); +		}  	}  #endif -	slab_bufctl(slabp)[objnr] = slabp->free; -	slabp->free = objnr; -	slabp->inuse--; +	page->active--; +	set_free_obj(page, page->active, objnr);  }  /*   * Map pages beginning at addr to the given cache and slab. This is required   * for the slab allocator to be able to lookup the cache and slab of a - * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. + * virtual address for kfree, ksize, and slab debugging.   
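slab_get_obj() and slab_put_obj() above treat that index array as a stack: allocation consumes the entry at page->active, and free writes the object's index back one slot earlier, with the DEBUG build scanning the free region to catch a double free. A self-contained sketch of those two operations:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint8_t freelist_idx_t;

struct fake_slab {
	freelist_idx_t freelist[8];
	unsigned int active, num;
};

/* Pop the next free object index (cf. slab_get_obj). */
static unsigned int get_obj(struct fake_slab *s)
{
	return s->freelist[s->active++];
}

/* Push an index back (cf. slab_put_obj), scanning the free region first
 * to detect a double free, as the DEBUG build does. */
static void put_obj(struct fake_slab *s, unsigned int objnr)
{
	for (unsigned int i = s->active; i < s->num; i++)
		assert(s->freelist[i] != objnr && "double free");
	s->freelist[--s->active] = objnr;
}

int main(void)
{
	struct fake_slab s = { .freelist = {0, 1, 2, 3, 4, 5, 6, 7}, .num = 8 };
	unsigned int a = get_obj(&s), b = get_obj(&s);

	put_obj(&s, b);
	put_obj(&s, a);
	printf("active=%u\n", s.active);
	return 0;
}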
*/ -static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, -			   void *addr) +static void slab_map_pages(struct kmem_cache *cache, struct page *page, +			   void *freelist)  { -	int nr_pages; -	struct page *page; - -	page = virt_to_page(addr); - -	nr_pages = 1; -	if (likely(!PageCompound(page))) -		nr_pages <<= cache->gfporder; - -	do { -		page_set_cache(page, cache); -		page_set_slab(page, slab); -		page++; -	} while (--nr_pages); +	page->slab_cache = cache; +	page->freelist = freelist;  }  /* @@ -2807,12 +2735,12 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,   * kmem_cache_alloc() when there are no active objs left in a cache.   */  static int cache_grow(struct kmem_cache *cachep, -		gfp_t flags, int nodeid, void *objp) +		gfp_t flags, int nodeid, struct page *page)  { -	struct slab *slabp; +	void *freelist;  	size_t offset;  	gfp_t local_flags; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	/*  	 * Be lazy and only check for valid flags here,  keeping it out of the @@ -2821,17 +2749,17 @@ static int cache_grow(struct kmem_cache *cachep,  	BUG_ON(flags & GFP_SLAB_BUG_MASK);  	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); -	/* Take the l3 list lock to change the colour_next on this node */ +	/* Take the node list lock to change the colour_next on this node */  	check_irq_off(); -	l3 = cachep->nodelists[nodeid]; -	spin_lock(&l3->list_lock); +	n = cachep->node[nodeid]; +	spin_lock(&n->list_lock);  	/* Get colour for the slab, and cal the next value. */ -	offset = l3->colour_next; -	l3->colour_next++; -	if (l3->colour_next >= cachep->colour) -		l3->colour_next = 0; -	spin_unlock(&l3->list_lock); +	offset = n->colour_next; +	n->colour_next++; +	if (n->colour_next >= cachep->colour) +		n->colour_next = 0; +	spin_unlock(&n->list_lock);  	offset *= cachep->colour_off; @@ -2850,34 +2778,34 @@ static int cache_grow(struct kmem_cache *cachep,  	 * Get mem for the objs.  Attempt to allocate a physical page from  	 * 'nodeid'.  	 */ -	if (!objp) -		objp = kmem_getpages(cachep, local_flags, nodeid); -	if (!objp) +	if (!page) +		page = kmem_getpages(cachep, local_flags, nodeid); +	if (!page)  		goto failed;  	/* Get slab management. */ -	slabp = alloc_slabmgmt(cachep, objp, offset, +	freelist = alloc_slabmgmt(cachep, page, offset,  			local_flags & ~GFP_CONSTRAINT_MASK, nodeid); -	if (!slabp) +	if (!freelist)  		goto opps1; -	slab_map_pages(cachep, slabp, objp); +	slab_map_pages(cachep, page, freelist); -	cache_init_objs(cachep, slabp); +	cache_init_objs(cachep, page);  	if (local_flags & __GFP_WAIT)  		local_irq_disable();  	check_irq_off(); -	spin_lock(&l3->list_lock); +	spin_lock(&n->list_lock);  	/* Make slab active. 
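cache_grow() above takes the node lock only long enough to read and advance colour_next, wrapping at cachep->colour, and multiplies the result by colour_off so each new slab starts its objects at a different cache-line offset from its neighbours. A sketch of that cycling:

#include <stdio.h>

struct node { unsigned int colour_next; };

/* Pick the colour offset for the next slab on this node: cycle through
 * `colour` distinct offsets, `colour_off` bytes apart. */
static unsigned int next_colour_offset(struct node *n,
				       unsigned int colour,
				       unsigned int colour_off)
{
	unsigned int offset = n->colour_next;

	if (++n->colour_next >= colour)
		n->colour_next = 0;
	return offset * colour_off;
}

int main(void)
{
	struct node n = { 0 };

	for (int i = 0; i < 6; i++)
		printf("slab %d: colour offset %u\n", i,
		       next_colour_offset(&n, 4, 64));
	return 0;
}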
*/ -	list_add_tail(&slabp->list, &(l3->slabs_free)); +	list_add_tail(&page->lru, &(n->slabs_free));  	STATS_INC_GROWN(cachep); -	l3->free_objects += cachep->num; -	spin_unlock(&l3->list_lock); +	n->free_objects += cachep->num; +	spin_unlock(&n->list_lock);  	return 1;  opps1: -	kmem_freepages(cachep, objp); +	kmem_freepages(cachep, page);  failed:  	if (local_flags & __GFP_WAIT)  		local_irq_disable(); @@ -2923,11 +2851,10 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)  }  static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, -				   void *caller) +				   unsigned long caller)  { -	struct page *page;  	unsigned int objnr; -	struct slab *slabp; +	struct page *page;  	BUG_ON(virt_to_cache(objp) != cachep); @@ -2935,30 +2862,26 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,  	kfree_debugcheck(objp);  	page = virt_to_head_page(objp); -	slabp = page_get_slab(page); -  	if (cachep->flags & SLAB_RED_ZONE) {  		verify_redzone_free(cachep, objp);  		*dbg_redzone1(cachep, objp) = RED_INACTIVE;  		*dbg_redzone2(cachep, objp) = RED_INACTIVE;  	}  	if (cachep->flags & SLAB_STORE_USER) -		*dbg_userword(cachep, objp) = caller; +		*dbg_userword(cachep, objp) = (void *)caller; -	objnr = obj_to_index(cachep, slabp, objp); +	objnr = obj_to_index(cachep, page, objp);  	BUG_ON(objnr >= cachep->num); -	BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); +	BUG_ON(objp != index_to_obj(cachep, page, objnr)); -#ifdef CONFIG_DEBUG_SLAB_LEAK -	slab_bufctl(slabp)[objnr] = BUFCTL_FREE; -#endif +	set_obj_status(page, objnr, OBJECT_FREE);  	if (cachep->flags & SLAB_POISON) {  #ifdef CONFIG_DEBUG_PAGEALLOC -		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { -			store_stackinfo(cachep, objp, (unsigned long)caller); +		if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { +			store_stackinfo(cachep, objp, caller);  			kernel_map_pages(virt_to_page(objp), -					 cachep->buffer_size / PAGE_SIZE, 0); +					 cachep->size / PAGE_SIZE, 0);  		} else {  			poison_obj(cachep, objp, POISON_FREE);  		} @@ -2969,49 +2892,24 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,  	return objp;  } -static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) -{ -	kmem_bufctl_t i; -	int entries = 0; - -	/* Check slab's freelist to see if this obj is there. */ -	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { -		entries++; -		if (entries > cachep->num || i >= cachep->num) -			goto bad; -	} -	if (entries != cachep->num - slabp->inuse) { -bad: -		printk(KERN_ERR "slab: Internal list corruption detected in " -				"cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", -			cachep->name, cachep->num, slabp, slabp->inuse); -		for (i = 0; -		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); -		     i++) { -			if (i % 16 == 0) -				printk("\n%03x:", i); -			printk(" %02x", ((unsigned char *)slabp)[i]); -		} -		printk("\n"); -		BUG(); -	} -}  #else  #define kfree_debugcheck(x) do { } while(0)  #define cache_free_debugcheck(x,objp,z) (objp) -#define check_slabp(x,y) do { } while(0)  #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, +							bool force_refill)  {  	int batchcount; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	struct array_cache *ac;  	int node; -retry:  	check_irq_off();  	node = numa_mem_id(); +	if (unlikely(force_refill)) +		goto force_grow; +retry:  	ac = cpu_cache_get(cachep);  	batchcount = ac->batchcount;  	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -3022,31 +2920,30 @@ retry:  		 */  		batchcount = BATCHREFILL_LIMIT;  	} -	l3 = cachep->nodelists[node]; +	n = cachep->node[node]; -	BUG_ON(ac->avail > 0 || !l3); -	spin_lock(&l3->list_lock); +	BUG_ON(ac->avail > 0 || !n); +	spin_lock(&n->list_lock);  	/* See if we can refill from the shared array */ -	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { -		l3->shared->touched = 1; +	if (n->shared && transfer_objects(ac, n->shared, batchcount)) { +		n->shared->touched = 1;  		goto alloc_done;  	}  	while (batchcount > 0) {  		struct list_head *entry; -		struct slab *slabp; +		struct page *page;  		/* Get slab alloc is to come from. */ -		entry = l3->slabs_partial.next; -		if (entry == &l3->slabs_partial) { -			l3->free_touched = 1; -			entry = l3->slabs_free.next; -			if (entry == &l3->slabs_free) +		entry = n->slabs_partial.next; +		if (entry == &n->slabs_partial) { +			n->free_touched = 1; +			entry = n->slabs_free.next; +			if (entry == &n->slabs_free)  				goto must_grow;  		} -		slabp = list_entry(entry, struct slab, list); -		check_slabp(cachep, slabp); +		page = list_entry(entry, struct page, lru);  		check_spinlock_acquired(cachep);  		/* @@ -3054,45 +2951,49 @@ retry:  		 * there must be at least one object available for  		 * allocation.  		 */ -		BUG_ON(slabp->inuse >= cachep->num); +		BUG_ON(page->active >= cachep->num); -		while (slabp->inuse < cachep->num && batchcount--) { +		while (page->active < cachep->num && batchcount--) {  			STATS_INC_ALLOCED(cachep);  			STATS_INC_ACTIVE(cachep);  			STATS_SET_HIGH(cachep); -			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, -							    node); +			ac_put_obj(cachep, ac, slab_get_obj(cachep, page, +									node));  		} -		check_slabp(cachep, slabp);  		/* move slabp to correct slabp list: */ -		list_del(&slabp->list); -		if (slabp->free == BUFCTL_END) -			list_add(&slabp->list, &l3->slabs_full); +		list_del(&page->lru); +		if (page->active == cachep->num) +			list_add(&page->lru, &n->slabs_full);  		else -			list_add(&slabp->list, &l3->slabs_partial); +			list_add(&page->lru, &n->slabs_partial);  	}  must_grow: -	l3->free_objects -= ac->avail; +	n->free_objects -= ac->avail;  alloc_done: -	spin_unlock(&l3->list_lock); +	spin_unlock(&n->list_lock);  	if (unlikely(!ac->avail)) {  		int x; +force_grow:  		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);  		/* cache_grow can reenable interrupts, then ac could change. */  		ac = cpu_cache_get(cachep); -		if (!x && ac->avail == 0)	/* no objects in sight? abort */ +		node = numa_mem_id(); + +		/* no objects in sight? 
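cache_alloc_refill() above pulls objects in batches: it tries the node's shared array first and otherwise walks the partial (then free) slab lists, moving up to batchcount objects into the per-CPU array under a single node-lock acquisition. A much simplified sketch of the batched transfer; cpu_cache and node_pool are illustrative types, not the kernel's:

#include <stdio.h>

#define BATCH 16

struct cpu_cache { void *entry[64]; unsigned int avail; };
struct node_pool { void *obj[256]; unsigned int count; };

/* Refill the per-CPU array with up to BATCH objects in one locked pass
 * over the node pool, instead of taking the node lock per object. */
static unsigned int refill(struct cpu_cache *ac, struct node_pool *n)
{
	unsigned int moved = 0;

	/* the node's list lock would be taken here */
	while (moved < BATCH && n->count) {
		ac->entry[ac->avail++] = n->obj[--n->count];
		moved++;
	}
	/* ...and released here */
	return moved;
}

int main(void)
{
	static struct node_pool n = { .count = 40 };
	struct cpu_cache ac = { .avail = 0 };

	printf("refilled %u objects, %u left on node\n",
	       refill(&ac, &n), n.count);
	return 0;
}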
abort */ +		if (!x && (ac->avail == 0 || force_refill))  			return NULL;  		if (!ac->avail)		/* objects refilled by interrupt? */  			goto retry;  	}  	ac->touched = 1; -	return ac->entry[--ac->avail]; + +	return ac_get_obj(cachep, ac, flags, force_refill);  }  static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -3106,15 +3007,17 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,  #if DEBUG  static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, -				gfp_t flags, void *objp, void *caller) +				gfp_t flags, void *objp, unsigned long caller)  { +	struct page *page; +  	if (!objp)  		return objp;  	if (cachep->flags & SLAB_POISON) {  #ifdef CONFIG_DEBUG_PAGEALLOC -		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) +		if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))  			kernel_map_pages(virt_to_page(objp), -					 cachep->buffer_size / PAGE_SIZE, 1); +					 cachep->size / PAGE_SIZE, 1);  		else  			check_poison_obj(cachep, objp);  #else @@ -3123,7 +3026,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,  		poison_obj(cachep, objp, POISON_INUSE);  	}  	if (cachep->flags & SLAB_STORE_USER) -		*dbg_userword(cachep, objp) = caller; +		*dbg_userword(cachep, objp) = (void *)caller;  	if (cachep->flags & SLAB_RED_ZONE) {  		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || @@ -3138,25 +3041,17 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,  		*dbg_redzone1(cachep, objp) = RED_ACTIVE;  		*dbg_redzone2(cachep, objp) = RED_ACTIVE;  	} -#ifdef CONFIG_DEBUG_SLAB_LEAK -	{ -		struct slab *slabp; -		unsigned objnr; -		slabp = page_get_slab(virt_to_head_page(objp)); -		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; -		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; -	} -#endif +	page = virt_to_head_page(objp); +	set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);  	objp += obj_offset(cachep);  	if (cachep->ctor && cachep->flags & SLAB_POISON)  		cachep->ctor(objp); -#if ARCH_SLAB_MINALIGN -	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { +	if (ARCH_SLAB_MINALIGN && +	    ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {  		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", -		       objp, ARCH_SLAB_MINALIGN); +		       objp, (int)ARCH_SLAB_MINALIGN);  	} -#endif  	return objp;  }  #else @@ -3165,33 +3060,45 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,  static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)  { -	if (cachep == &cache_cache) +	if (cachep == kmem_cache)  		return false; -	return should_failslab(obj_size(cachep), flags, cachep->flags); +	return should_failslab(cachep->object_size, flags, cachep->flags);  }  static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)  {  	void *objp;  	struct array_cache *ac; +	bool force_refill = false;  	check_irq_off();  	ac = cpu_cache_get(cachep);  	if (likely(ac->avail)) { -		STATS_INC_ALLOCHIT(cachep);  		ac->touched = 1; -		objp = ac->entry[--ac->avail]; -	} else { -		STATS_INC_ALLOCMISS(cachep); -		objp = cache_alloc_refill(cachep, flags); +		objp = ac_get_obj(cachep, ac, flags, false); +  		/* -		 * the 'ac' may be updated by cache_alloc_refill(), -		 * and kmemleak_erase() requires its correct value. 
+		 * Allow for the possibility all avail objects are not allowed +		 * by the current flags  		 */ -		ac = cpu_cache_get(cachep); +		if (objp) { +			STATS_INC_ALLOCHIT(cachep); +			goto out; +		} +		force_refill = true;  	} + +	STATS_INC_ALLOCMISS(cachep); +	objp = cache_alloc_refill(cachep, flags, force_refill); +	/* +	 * the 'ac' may be updated by cache_alloc_refill(), +	 * and kmemleak_erase() requires its correct value. +	 */ +	ac = cpu_cache_get(cachep); + +out:  	/*  	 * To avoid a false negative, if an object that is in one of the  	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't @@ -3204,7 +3111,7 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)  #ifdef CONFIG_NUMA  /* - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set.   *   * If we are in_interrupt, then process context, including cpusets and   * mempolicy, may not apply and should not be used for allocation policy. @@ -3216,12 +3123,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)  	if (in_interrupt() || (flags & __GFP_THISNODE))  		return NULL;  	nid_alloc = nid_here = numa_mem_id(); -	get_mems_allowed();  	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))  		nid_alloc = cpuset_slab_spread_node();  	else if (current->mempolicy) -		nid_alloc = slab_node(current->mempolicy); -	put_mems_allowed(); +		nid_alloc = mempolicy_slab_node();  	if (nid_alloc != nid_here)  		return ____cache_alloc_node(cachep, flags, nid_alloc);  	return NULL; @@ -3230,7 +3135,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)  /*   * Fallback function if there was no memory available and no objects on a   * certain node and fall back is permitted. First we scan all the - * available nodelists for available objects. If that fails then we + * available node for available objects. If that fails then we   * perform an allocation without specifying a node. This allows the page   * allocator to do its reclaim / fallback magic. We then insert the   * slab into the proper nodelist and then allocate from it. @@ -3244,14 +3149,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)  	enum zone_type high_zoneidx = gfp_zone(flags);  	void *obj = NULL;  	int nid; +	unsigned int cpuset_mems_cookie;  	if (flags & __GFP_THISNODE)  		return NULL; -	get_mems_allowed(); -	zonelist = node_zonelist(slab_node(current->mempolicy), flags);  	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); +retry_cpuset: +	cpuset_mems_cookie = read_mems_allowed_begin(); +	zonelist = node_zonelist(mempolicy_slab_node(), flags); +  retry:  	/*  	 * Look through allowed nodes for objects available @@ -3261,8 +3169,8 @@ retry:  		nid = zone_to_nid(zone);  		if (cpuset_zone_allowed_hardwall(zone, flags) && -			cache->nodelists[nid] && -			cache->nodelists[nid]->free_objects) { +			cache->node[nid] && +			cache->node[nid]->free_objects) {  				obj = ____cache_alloc_node(cache,  					flags | GFP_THISNODE, nid);  				if (obj) @@ -3277,18 +3185,20 @@ retry:  		 * We may trigger various forms of reclaim on the allowed  		 * set and go into memory reserves if necessary.  		 
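fallback_alloc() above brackets its zonelist walk with read_mems_allowed_begin()/read_mems_allowed_retry(), so a concurrent cpuset update cannot turn a transient miss into a hard failure. A userspace sketch of that optimistic snapshot-and-retry pattern; mems_seq and the helpers here model, but are not, the kernel's seqcount:

#include <stdbool.h>
#include <stdio.h>

static unsigned int mems_seq;		/* bumped whenever the allowed set changes */

static unsigned int read_begin(void) { return mems_seq; }
static bool read_retry(unsigned int cookie) { return cookie != mems_seq; }

static void *try_alloc(void) { return NULL; }	/* stand-in for the zonelist walk */

int main(void)
{
	void *obj;
	unsigned int cookie;

	do {
		cookie = read_begin();
		obj = try_alloc();
		/* Only retry when we failed AND the allowed nodes changed
		 * underneath us; otherwise the failure is real. */
	} while (!obj && read_retry(cookie));

	printf("obj=%p\n", obj);
	return 0;
}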
*/ +		struct page *page; +  		if (local_flags & __GFP_WAIT)  			local_irq_enable();  		kmem_flagcheck(cache, flags); -		obj = kmem_getpages(cache, local_flags, numa_mem_id()); +		page = kmem_getpages(cache, local_flags, numa_mem_id());  		if (local_flags & __GFP_WAIT)  			local_irq_disable(); -		if (obj) { +		if (page) {  			/*  			 * Insert into the appropriate per node queues  			 */ -			nid = page_to_nid(virt_to_page(obj)); -			if (cache_grow(cache, flags, nid, obj)) { +			nid = page_to_nid(page); +			if (cache_grow(cache, flags, nid, page)) {  				obj = ____cache_alloc_node(cache,  					flags | GFP_THISNODE, nid);  				if (!obj) @@ -3304,7 +3214,9 @@ retry:  			}  		}  	} -	put_mems_allowed(); + +	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) +		goto retry_cpuset;  	return obj;  } @@ -3315,51 +3227,50 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,  				int nodeid)  {  	struct list_head *entry; -	struct slab *slabp; -	struct kmem_list3 *l3; +	struct page *page; +	struct kmem_cache_node *n;  	void *obj;  	int x; -	l3 = cachep->nodelists[nodeid]; -	BUG_ON(!l3); +	VM_BUG_ON(nodeid > num_online_nodes()); +	n = cachep->node[nodeid]; +	BUG_ON(!n);  retry:  	check_irq_off(); -	spin_lock(&l3->list_lock); -	entry = l3->slabs_partial.next; -	if (entry == &l3->slabs_partial) { -		l3->free_touched = 1; -		entry = l3->slabs_free.next; -		if (entry == &l3->slabs_free) +	spin_lock(&n->list_lock); +	entry = n->slabs_partial.next; +	if (entry == &n->slabs_partial) { +		n->free_touched = 1; +		entry = n->slabs_free.next; +		if (entry == &n->slabs_free)  			goto must_grow;  	} -	slabp = list_entry(entry, struct slab, list); +	page = list_entry(entry, struct page, lru);  	check_spinlock_acquired_node(cachep, nodeid); -	check_slabp(cachep, slabp);  	STATS_INC_NODEALLOCS(cachep);  	STATS_INC_ACTIVE(cachep);  	STATS_SET_HIGH(cachep); -	BUG_ON(slabp->inuse == cachep->num); +	BUG_ON(page->active == cachep->num); -	obj = slab_get_obj(cachep, slabp, nodeid); -	check_slabp(cachep, slabp); -	l3->free_objects--; +	obj = slab_get_obj(cachep, page, nodeid); +	n->free_objects--;  	/* move slabp to correct slabp list: */ -	list_del(&slabp->list); +	list_del(&page->lru); -	if (slabp->free == BUFCTL_END) -		list_add(&slabp->list, &l3->slabs_full); +	if (page->active == cachep->num) +		list_add(&page->lru, &n->slabs_full);  	else -		list_add(&slabp->list, &l3->slabs_partial); +		list_add(&page->lru, &n->slabs_partial); -	spin_unlock(&l3->list_lock); +	spin_unlock(&n->list_lock);  	goto done;  must_grow: -	spin_unlock(&l3->list_lock); +	spin_unlock(&n->list_lock);  	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);  	if (x)  		goto retry; @@ -3370,21 +3281,9 @@ done:  	return obj;  } -/** - * kmem_cache_alloc_node - Allocate an object on the specified node - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * @nodeid: node number of the target node. - * @caller: return address of caller, used for debug information - * - * Identical to kmem_cache_alloc but it will allocate memory on the given - * node, which can improve the performance for cpu bound structures. - * - * Fallback to other node is possible if __GFP_THISNODE is not set. 
- */  static __always_inline void * -__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, -		   void *caller) +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, +		   unsigned long caller)  {  	unsigned long save_flags;  	void *ptr; @@ -3397,13 +3296,15 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,  	if (slab_should_failslab(cachep, flags))  		return NULL; +	cachep = memcg_kmem_get_cache(cachep, flags); +  	cache_alloc_debugcheck_before(cachep, flags);  	local_irq_save(save_flags); -	if (nodeid == -1) +	if (nodeid == NUMA_NO_NODE)  		nodeid = slab_node; -	if (unlikely(!cachep->nodelists[nodeid])) { +	if (unlikely(!cachep->node[nodeid])) {  		/* Node not bootstrapped yet */  		ptr = fallback_alloc(cachep, flags);  		goto out; @@ -3425,14 +3326,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,    out:  	local_irq_restore(save_flags);  	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); -	kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, +	kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,  				 flags); -	if (likely(ptr)) -		kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); - -	if (unlikely((flags & __GFP_ZERO) && ptr)) -		memset(ptr, 0, obj_size(cachep)); +	if (likely(ptr)) { +		kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); +		if (unlikely(flags & __GFP_ZERO)) +			memset(ptr, 0, cachep->object_size); +	}  	return ptr;  } @@ -3442,7 +3343,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)  {  	void *objp; -	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { +	if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {  		objp = alternate_node_alloc(cache, flags);  		if (objp)  			goto out; @@ -3470,7 +3371,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)  #endif /* CONFIG_NUMA */  static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)  {  	unsigned long save_flags;  	void *objp; @@ -3482,67 +3383,70 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)  	if (slab_should_failslab(cachep, flags))  		return NULL; +	cachep = memcg_kmem_get_cache(cachep, flags); +  	cache_alloc_debugcheck_before(cachep, flags);  	local_irq_save(save_flags);  	objp = __do_cache_alloc(cachep, flags);  	local_irq_restore(save_flags);  	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); -	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, +	kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,  				 flags);  	prefetchw(objp); -	if (likely(objp)) -		kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); - -	if (unlikely((flags & __GFP_ZERO) && objp)) -		memset(objp, 0, obj_size(cachep)); +	if (likely(objp)) { +		kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); +		if (unlikely(flags & __GFP_ZERO)) +			memset(objp, 0, cachep->object_size); +	}  	return objp;  }  /* - * Caller needs to acquire correct kmem_list's list_lock + * Caller needs to acquire correct kmem_cache_node's list_lock   */  static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,  		       int node)  {  	int i; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	for (i = 0; i < nr_objects; i++) { -		void *objp = objpp[i]; -		struct slab *slabp; +		void *objp; +		struct page *page; + +		clear_obj_pfmemalloc(&objpp[i]); 
+		objp = objpp[i]; -		slabp = virt_to_slab(objp); -		l3 = cachep->nodelists[node]; -		list_del(&slabp->list); +		page = virt_to_head_page(objp); +		n = cachep->node[node]; +		list_del(&page->lru);  		check_spinlock_acquired_node(cachep, node); -		check_slabp(cachep, slabp); -		slab_put_obj(cachep, slabp, objp, node); +		slab_put_obj(cachep, page, objp, node);  		STATS_DEC_ACTIVE(cachep); -		l3->free_objects++; -		check_slabp(cachep, slabp); +		n->free_objects++;  		/* fixup slab chains */ -		if (slabp->inuse == 0) { -			if (l3->free_objects > l3->free_limit) { -				l3->free_objects -= cachep->num; +		if (page->active == 0) { +			if (n->free_objects > n->free_limit) { +				n->free_objects -= cachep->num;  				/* No need to drop any previously held  				 * lock here, even if we have a off-slab slab  				 * descriptor it is guaranteed to come from  				 * a different cache, refer to comments before  				 * alloc_slabmgmt.  				 */ -				slab_destroy(cachep, slabp); +				slab_destroy(cachep, page);  			} else { -				list_add(&slabp->list, &l3->slabs_free); +				list_add(&page->lru, &n->slabs_free);  			}  		} else {  			/* Unconditionally move a slab to the end of the  			 * partial list on free - maximum time for the  			 * other objects to be freed, too.  			 */ -			list_add_tail(&slabp->list, &l3->slabs_partial); +			list_add_tail(&page->lru, &n->slabs_partial);  		}  	}  } @@ -3550,7 +3454,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,  static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)  {  	int batchcount; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	int node = numa_mem_id();  	batchcount = ac->batchcount; @@ -3558,10 +3462,10 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)  	BUG_ON(!batchcount || batchcount > ac->avail);  #endif  	check_irq_off(); -	l3 = cachep->nodelists[node]; -	spin_lock(&l3->list_lock); -	if (l3->shared) { -		struct array_cache *shared_array = l3->shared; +	n = cachep->node[node]; +	spin_lock(&n->list_lock); +	if (n->shared) { +		struct array_cache *shared_array = n->shared;  		int max = shared_array->limit - shared_array->avail;  		if (max) {  			if (batchcount > max) @@ -3580,12 +3484,12 @@ free_done:  		int i = 0;  		struct list_head *p; -		p = l3->slabs_free.next; -		while (p != &(l3->slabs_free)) { -			struct slab *slabp; +		p = n->slabs_free.next; +		while (p != &(n->slabs_free)) { +			struct page *page; -			slabp = list_entry(p, struct slab, list); -			BUG_ON(slabp->inuse); +			page = list_entry(p, struct page, lru); +			BUG_ON(page->active);  			i++;  			p = p->next; @@ -3593,7 +3497,7 @@ free_done:  		STATS_SET_FREEABLE(cachep, i);  	}  #endif -	spin_unlock(&l3->list_lock); +	spin_unlock(&n->list_lock);  	ac->avail -= batchcount;  	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);  } @@ -3602,15 +3506,16 @@ free_done:   * Release an obj back to its cache. If the obj has a constructed state, it must   * be in this state _before_ it is released.  Called with disabled ints.   
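 *
 * Fast path: the object is simply pushed into the per-cpu array cache.
 * Only when that array has reached its limit does cache_flusharray()
 * first spill a batch of older entries back to the per-node lists,
 * preferring the shared array when one is configured.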
*/ -static inline void __cache_free(struct kmem_cache *cachep, void *objp) +static inline void __cache_free(struct kmem_cache *cachep, void *objp, +				unsigned long caller)  {  	struct array_cache *ac = cpu_cache_get(cachep);  	check_irq_off();  	kmemleak_free_recursive(objp, cachep->flags); -	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); +	objp = cache_free_debugcheck(cachep, objp, caller); -	kmemcheck_slab_free(cachep, objp, obj_size(cachep)); +	kmemcheck_slab_free(cachep, objp, cachep->object_size);  	/*  	 * Skip calling cache_free_alien() when the platform is not numa. @@ -3624,13 +3529,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)  	if (likely(ac->avail < ac->limit)) {  		STATS_INC_FREEHIT(cachep); -		ac->entry[ac->avail++] = objp; -		return;  	} else {  		STATS_INC_FREEMISS(cachep);  		cache_flusharray(cachep, ac); -		ac->entry[ac->avail++] = objp;  	} + +	ac_put_obj(cachep, ac, objp);  }  /** @@ -3643,61 +3547,48 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)   */  void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)  { -	void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); +	void *ret = slab_alloc(cachep, flags, _RET_IP_);  	trace_kmem_cache_alloc(_RET_IP_, ret, -			       obj_size(cachep), cachep->buffer_size, flags); +			       cachep->object_size, cachep->size, flags);  	return ret;  }  EXPORT_SYMBOL(kmem_cache_alloc);  #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +void * +kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)  { -	return __cache_alloc(cachep, flags, __builtin_return_address(0)); +	void *ret; + +	ret = slab_alloc(cachep, flags, _RET_IP_); + +	trace_kmalloc(_RET_IP_, ret, +		      size, cachep->size, flags); +	return ret;  } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_trace);  #endif +#ifdef CONFIG_NUMA  /** - * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. - * @cachep: the cache we're checking against - * @ptr: pointer to validate + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node.   * - * This verifies that the untrusted pointer looks sane; - * it is _not_ a guarantee that the pointer is actually - * part of the slab cache in question, but it at least - * validates that the pointer can be dereferenced and - * looks half-way sane. + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures.   * - * Currently only used for dentry validation. + * Fallback to other node is possible if __GFP_THISNODE is not set.   
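 *
 * If the requested node has no kmem_cache_node set up yet (e.g. during
 * early boot or after memory hotplug), the request is served through
 * fallback_alloc() instead of failing outright.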
*/ -int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) -{ -	unsigned long size = cachep->buffer_size; -	struct page *page; - -	if (unlikely(!kern_ptr_validate(ptr, size))) -		goto out; -	page = virt_to_page(ptr); -	if (unlikely(!PageSlab(page))) -		goto out; -	if (unlikely(page_get_cache(page) != cachep)) -		goto out; -	return 1; -out: -	return 0; -} - -#ifdef CONFIG_NUMA  void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)  { -	void *ret = __cache_alloc_node(cachep, flags, nodeid, -				       __builtin_return_address(0)); +	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);  	trace_kmem_cache_alloc_node(_RET_IP_, ret, -				    obj_size(cachep), cachep->buffer_size, +				    cachep->object_size, cachep->size,  				    flags, nodeid);  	return ret; @@ -3705,51 +3596,51 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)  EXPORT_SYMBOL(kmem_cache_alloc_node);  #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, -				    gfp_t flags, -				    int nodeid) +void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, +				  gfp_t flags, +				  int nodeid, +				  size_t size)  { -	return __cache_alloc_node(cachep, flags, nodeid, -				  __builtin_return_address(0)); +	void *ret; + +	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + +	trace_kmalloc_node(_RET_IP_, ret, +			   size, cachep->size, +			   flags, nodeid); +	return ret;  } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace);  #endif  static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) +__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)  {  	struct kmem_cache *cachep; -	void *ret; -	cachep = kmem_find_general_cachep(size, flags); +	cachep = kmalloc_slab(size, flags);  	if (unlikely(ZERO_OR_NULL_PTR(cachep)))  		return cachep; -	ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - -	trace_kmalloc_node((unsigned long) caller, ret, -			   size, cachep->buffer_size, flags, node); - -	return ret; +	return kmem_cache_alloc_node_trace(cachep, flags, node, size);  }  #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)  void *__kmalloc_node(size_t size, gfp_t flags, int node)  { -	return __do_kmalloc_node(size, flags, node, -			__builtin_return_address(0)); +	return __do_kmalloc_node(size, flags, node, _RET_IP_);  }  EXPORT_SYMBOL(__kmalloc_node);  void *__kmalloc_node_track_caller(size_t size, gfp_t flags,  		int node, unsigned long caller)  { -	return __do_kmalloc_node(size, flags, node, (void *)caller); +	return __do_kmalloc_node(size, flags, node, caller);  }  EXPORT_SYMBOL(__kmalloc_node_track_caller);  #else  void *__kmalloc_node(size_t size, gfp_t flags, int node)  { -	return __do_kmalloc_node(size, flags, node, NULL); +	return __do_kmalloc_node(size, flags, node, 0);  }  EXPORT_SYMBOL(__kmalloc_node);  #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ @@ -3762,23 +3653,18 @@ EXPORT_SYMBOL(__kmalloc_node);   * @caller: function caller for debug tracking of the caller   */  static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, -					  void *caller) +					  unsigned long caller)  {  	struct kmem_cache *cachep;  	void *ret; -	/* If you want to save a few bytes .text space: replace -	 * __ with kmem_. -	 * Then kmalloc uses the uninlined functions instead of the inline -	 * functions. 
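 *
 * (In the reworked path the size is mapped to a kmalloc cache with
 * kmalloc_slab() and then allocated through slab_alloc(), with the
 * caller address recorded for tracing.)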
-	 */ -	cachep = __find_general_cachep(size, flags); +	cachep = kmalloc_slab(size, flags);  	if (unlikely(ZERO_OR_NULL_PTR(cachep)))  		return cachep; -	ret = __cache_alloc(cachep, flags, caller); +	ret = slab_alloc(cachep, flags, caller); -	trace_kmalloc((unsigned long) caller, ret, -		      size, cachep->buffer_size, flags); +	trace_kmalloc(caller, ret, +		      size, cachep->size, flags);  	return ret;  } @@ -3787,20 +3673,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,  #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)  void *__kmalloc(size_t size, gfp_t flags)  { -	return __do_kmalloc(size, flags, __builtin_return_address(0)); +	return __do_kmalloc(size, flags, _RET_IP_);  }  EXPORT_SYMBOL(__kmalloc);  void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)  { -	return __do_kmalloc(size, flags, (void *)caller); +	return __do_kmalloc(size, flags, caller);  }  EXPORT_SYMBOL(__kmalloc_track_caller);  #else  void *__kmalloc(size_t size, gfp_t flags)  { -	return __do_kmalloc(size, flags, NULL); +	return __do_kmalloc(size, flags, 0);  }  EXPORT_SYMBOL(__kmalloc);  #endif @@ -3816,12 +3702,15 @@ EXPORT_SYMBOL(__kmalloc);  void kmem_cache_free(struct kmem_cache *cachep, void *objp)  {  	unsigned long flags; +	cachep = cache_from_obj(cachep, objp); +	if (!cachep) +		return;  	local_irq_save(flags); -	debug_check_no_locks_freed(objp, obj_size(cachep)); +	debug_check_no_locks_freed(objp, cachep->object_size);  	if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) -		debug_check_no_obj_freed(objp, obj_size(cachep)); -	__cache_free(cachep, objp); +		debug_check_no_obj_freed(objp, cachep->object_size); +	__cache_free(cachep, objp, _RET_IP_);  	local_irq_restore(flags);  	trace_kmem_cache_free(_RET_IP_, objp); @@ -3849,32 +3738,21 @@ void kfree(const void *objp)  	local_irq_save(flags);  	kfree_debugcheck(objp);  	c = virt_to_cache(objp); -	debug_check_no_locks_freed(objp, obj_size(c)); -	debug_check_no_obj_freed(objp, obj_size(c)); -	__cache_free(c, (void *)objp); +	debug_check_no_locks_freed(objp, c->object_size); + +	debug_check_no_obj_freed(objp, c->object_size); +	__cache_free(c, (void *)objp, _RET_IP_);  	local_irq_restore(flags);  }  EXPORT_SYMBOL(kfree); -unsigned int kmem_cache_size(struct kmem_cache *cachep) -{ -	return obj_size(cachep); -} -EXPORT_SYMBOL(kmem_cache_size); - -const char *kmem_cache_name(struct kmem_cache *cachep) -{ -	return cachep->name; -} -EXPORT_SYMBOL_GPL(kmem_cache_name); -  /* - * This initializes kmem_list3 or resizes various caches for all nodes. + * This initializes kmem_cache_node or resizes various caches for all nodes.   
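 *
 * For each online node this either resizes the node's shared/alien array
 * caches in place or, if the node has no kmem_cache_node yet, allocates
 * one with kmalloc_node() and initialises it.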
*/ -static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) +static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)  {  	int node; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	struct array_cache *new_shared;  	struct array_cache **new_alien = NULL; @@ -3897,58 +3775,58 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)  			}  		} -		l3 = cachep->nodelists[node]; -		if (l3) { -			struct array_cache *shared = l3->shared; +		n = cachep->node[node]; +		if (n) { +			struct array_cache *shared = n->shared; -			spin_lock_irq(&l3->list_lock); +			spin_lock_irq(&n->list_lock);  			if (shared)  				free_block(cachep, shared->entry,  						shared->avail, node); -			l3->shared = new_shared; -			if (!l3->alien) { -				l3->alien = new_alien; +			n->shared = new_shared; +			if (!n->alien) { +				n->alien = new_alien;  				new_alien = NULL;  			} -			l3->free_limit = (1 + nr_cpus_node(node)) * +			n->free_limit = (1 + nr_cpus_node(node)) *  					cachep->batchcount + cachep->num; -			spin_unlock_irq(&l3->list_lock); +			spin_unlock_irq(&n->list_lock);  			kfree(shared);  			free_alien_cache(new_alien);  			continue;  		} -		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); -		if (!l3) { +		n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); +		if (!n) {  			free_alien_cache(new_alien);  			kfree(new_shared);  			goto fail;  		} -		kmem_list3_init(l3); -		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + -				((unsigned long)cachep) % REAPTIMEOUT_LIST3; -		l3->shared = new_shared; -		l3->alien = new_alien; -		l3->free_limit = (1 + nr_cpus_node(node)) * +		kmem_cache_node_init(n); +		n->next_reap = jiffies + REAPTIMEOUT_NODE + +				((unsigned long)cachep) % REAPTIMEOUT_NODE; +		n->shared = new_shared; +		n->alien = new_alien; +		n->free_limit = (1 + nr_cpus_node(node)) *  					cachep->batchcount + cachep->num; -		cachep->nodelists[node] = l3; +		cachep->node[node] = n;  	}  	return 0;  fail: -	if (!cachep->next.next) { +	if (!cachep->list.next) {  		/* Cache is not active yet. 
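		   (cachep->list.next is only set once the cache has been
		   linked into the global slab_caches list.)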
Roll back what we did */  		node--;  		while (node >= 0) { -			if (cachep->nodelists[node]) { -				l3 = cachep->nodelists[node]; +			if (cachep->node[node]) { +				n = cachep->node[node]; -				kfree(l3->shared); -				free_alien_cache(l3->alien); -				kfree(l3); -				cachep->nodelists[node] = NULL; +				kfree(n->shared); +				free_alien_cache(n->alien); +				kfree(n); +				cachep->node[node] = NULL;  			}  			node--;  		} @@ -3958,7 +3836,7 @@ fail:  struct ccupdate_struct {  	struct kmem_cache *cachep; -	struct array_cache *new[NR_CPUS]; +	struct array_cache *new[0];  };  static void do_ccupdate_local(void *info) @@ -3973,14 +3851,15 @@ static void do_ccupdate_local(void *info)  	new->new[smp_processor_id()] = old;  } -/* Always called with the cache_chain_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +/* Always called with the slab_mutex held */ +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,  				int batchcount, int shared, gfp_t gfp)  {  	struct ccupdate_struct *new;  	int i; -	new = kzalloc(sizeof(*new), gfp); +	new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), +		      gfp);  	if (!new)  		return -ENOMEM; @@ -4007,21 +3886,58 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,  		struct array_cache *ccold = new->new[i];  		if (!ccold)  			continue; -		spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); +		spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);  		free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); -		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); +		spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);  		kfree(ccold);  	}  	kfree(new); -	return alloc_kmemlist(cachep, gfp); +	return alloc_kmem_cache_node(cachep, gfp); +} + +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +				int batchcount, int shared, gfp_t gfp) +{ +	int ret; +	struct kmem_cache *c = NULL; +	int i = 0; + +	ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + +	if (slab_state < FULL) +		return ret; + +	if ((ret < 0) || !is_root_cache(cachep)) +		return ret; + +	VM_BUG_ON(!mutex_is_locked(&slab_mutex)); +	for_each_memcg_cache_index(i) { +		c = cache_from_memcg_idx(cachep, i); +		if (c) +			/* return value determined by the parent cache only */ +			__do_tune_cpucache(c, limit, batchcount, shared, gfp); +	} + +	return ret;  } -/* Called with cache_chain_mutex held always */ +/* Called with slab_mutex held always */  static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)  {  	int err; -	int limit, shared; +	int limit = 0; +	int shared = 0; +	int batchcount = 0; + +	if (!is_root_cache(cachep)) { +		struct kmem_cache *root = memcg_root_cache(cachep); +		limit = root->limit; +		shared = root->shared; +		batchcount = root->batchcount; +	} +	if (limit && shared && batchcount) +		goto skip_setup;  	/*  	 * The head array serves three purposes:  	 * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -4031,13 +3947,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)  	 * The numbers are guessed, we should auto-tune as described by  	 * Bonwick.  	 
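	 *
	 * The limit chosen below scales down as the object size goes up:
	 * 120 entries for objects up to 256 bytes, 54 up to 1 kB, 24 up to
	 * a page, 8 up to 128 kB, and a single entry beyond that, with
	 * batchcount set to roughly half of limit.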
*/ -	if (cachep->buffer_size > 131072) +	if (cachep->size > 131072)  		limit = 1; -	else if (cachep->buffer_size > PAGE_SIZE) +	else if (cachep->size > PAGE_SIZE)  		limit = 8; -	else if (cachep->buffer_size > 1024) +	else if (cachep->size > 1024)  		limit = 24; -	else if (cachep->buffer_size > 256) +	else if (cachep->size > 256)  		limit = 54;  	else  		limit = 120; @@ -4052,7 +3968,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)  	 * to a larger limit. Thus disabled by default.  	 */  	shared = 0; -	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) +	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)  		shared = 8;  #if DEBUG @@ -4063,7 +3979,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)  	if (limit > 32)  		limit = 32;  #endif -	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); +	batchcount = (limit + 1) / 2; +skip_setup: +	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);  	if (err)  		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",  		       cachep->name, -err); @@ -4071,11 +3989,11 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)  }  /* - * Drain an array if it contains any elements taking the l3 lock only if - * necessary. Note that the l3 listlock also protects the array_cache + * Drain an array if it contains any elements taking the node lock only if + * necessary. Note that the node listlock also protects the array_cache   * if drain_array() is used on the shared array.   */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,  			 struct array_cache *ac, int force, int node)  {  	int tofree; @@ -4085,7 +4003,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,  	if (ac->touched && !force) {  		ac->touched = 0;  	} else { -		spin_lock_irq(&l3->list_lock); +		spin_lock_irq(&n->list_lock);  		if (ac->avail) {  			tofree = force ? ac->avail : (ac->limit + 4) / 5;  			if (tofree > ac->avail) @@ -4095,7 +4013,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,  			memmove(ac->entry, &(ac->entry[tofree]),  				sizeof(void *) * ac->avail);  		} -		spin_unlock_irq(&l3->list_lock); +		spin_unlock_irq(&n->list_lock);  	}  } @@ -4114,45 +4032,45 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,  static void cache_reap(struct work_struct *w)  {  	struct kmem_cache *searchp; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	int node = numa_mem_id();  	struct delayed_work *work = to_delayed_work(w); -	if (!mutex_trylock(&cache_chain_mutex)) +	if (!mutex_trylock(&slab_mutex))  		/* Give up. Setup the next iteration. */  		goto out; -	list_for_each_entry(searchp, &cache_chain, next) { +	list_for_each_entry(searchp, &slab_caches, list) {  		check_irq_on();  		/* -		 * We only take the l3 lock if absolutely necessary and we +		 * We only take the node lock if absolutely necessary and we  		 * have established with reasonable certainty that  		 * we can do some work if the lock was obtained.  		 */ -		l3 = searchp->nodelists[node]; +		n = searchp->node[node]; -		reap_alien(searchp, l3); +		reap_alien(searchp, n); -		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); +		drain_array(searchp, n, cpu_cache_get(searchp), 0, node);  		/*  		 * These are racy checks but it does not matter  		 * if we skip one check or scan twice.  		 
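		 *
		 * At worst a node is reaped one REAPTIMEOUT_NODE period
		 * early or late, which is harmless for a background
		 * shrinker.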
*/ -		if (time_after(l3->next_reap, jiffies)) +		if (time_after(n->next_reap, jiffies))  			goto next; -		l3->next_reap = jiffies + REAPTIMEOUT_LIST3; +		n->next_reap = jiffies + REAPTIMEOUT_NODE; -		drain_array(searchp, l3, l3->shared, 0, node); +		drain_array(searchp, n, n->shared, 0, node); -		if (l3->free_touched) -			l3->free_touched = 0; +		if (n->free_touched) +			n->free_touched = 0;  		else {  			int freed; -			freed = drain_freelist(searchp, l3, (l3->free_limit + +			freed = drain_freelist(searchp, n, (n->free_limit +  				5 * searchp->num - 1) / (5 * searchp->num));  			STATS_ADD_REAPED(searchp, freed);  		} @@ -4160,63 +4078,17 @@ next:  		cond_resched();  	}  	check_irq_on(); -	mutex_unlock(&cache_chain_mutex); +	mutex_unlock(&slab_mutex);  	next_reap_node();  out:  	/* Set up the next iteration */ -	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); +	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));  }  #ifdef CONFIG_SLABINFO - -static void print_slabinfo_header(struct seq_file *m) -{ -	/* -	 * Output format version, so at least we can change it -	 * without _too_ many complaints. -	 */ -#if STATS -	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else -	seq_puts(m, "slabinfo - version: 2.1\n"); -#endif -	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> " -		 "<objperslab> <pagesperslab>"); -	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); -	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); -#if STATS -	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " -		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); -	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); -#endif -	seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ -	loff_t n = *pos; - -	mutex_lock(&cache_chain_mutex); -	if (!n) -		print_slabinfo_header(m); - -	return seq_list_start(&cache_chain, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ -	return seq_list_next(p, &cache_chain, pos); -} - -static void s_stop(struct seq_file *m, void *p) +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)  { -	mutex_unlock(&cache_chain_mutex); -} - -static int s_show(struct seq_file *m, void *p) -{ -	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); -	struct slab *slabp; +	struct page *page;  	unsigned long active_objs;  	unsigned long num_objs;  	unsigned long active_slabs = 0; @@ -4224,42 +4096,42 @@ static int s_show(struct seq_file *m, void *p)  	const char *name;  	char *error = NULL;  	int node; -	struct kmem_list3 *l3; +	struct kmem_cache_node *n;  	active_objs = 0;  	num_slabs = 0;  	for_each_online_node(node) { -		l3 = cachep->nodelists[node]; -		if (!l3) +		n = cachep->node[node]; +		if (!n)  			continue;  		check_irq_on(); -		spin_lock_irq(&l3->list_lock); +		spin_lock_irq(&n->list_lock); -		list_for_each_entry(slabp, &l3->slabs_full, list) { -			if (slabp->inuse != cachep->num && !error) +		list_for_each_entry(page, &n->slabs_full, lru) { +			if (page->active != cachep->num && !error)  				error = "slabs_full accounting error";  			active_objs += cachep->num;  			active_slabs++;  		} -		list_for_each_entry(slabp, &l3->slabs_partial, list) { -			if (slabp->inuse == cachep->num && !error) -				error = "slabs_partial inuse accounting error"; -			if (!slabp->inuse && !error) -				error = "slabs_partial/inuse accounting error"; -			active_objs += slabp->inuse; +		
list_for_each_entry(page, &n->slabs_partial, lru) { +			if (page->active == cachep->num && !error) +				error = "slabs_partial accounting error"; +			if (!page->active && !error) +				error = "slabs_partial accounting error"; +			active_objs += page->active;  			active_slabs++;  		} -		list_for_each_entry(slabp, &l3->slabs_free, list) { -			if (slabp->inuse && !error) -				error = "slabs_free/inuse accounting error"; +		list_for_each_entry(page, &n->slabs_free, lru) { +			if (page->active && !error) +				error = "slabs_free accounting error";  			num_slabs++;  		} -		free_objects += l3->free_objects; -		if (l3->shared) -			shared_avail += l3->shared->avail; +		free_objects += n->free_objects; +		if (n->shared) +			shared_avail += n->shared->avail; -		spin_unlock_irq(&l3->list_lock); +		spin_unlock_irq(&n->list_lock);  	}  	num_slabs += active_slabs;  	num_objs = num_slabs * cachep->num; @@ -4270,15 +4142,22 @@ static int s_show(struct seq_file *m, void *p)  	if (error)  		printk(KERN_ERR "slab: cache %s error: %s\n", name, error); -	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", -		   name, active_objs, num_objs, cachep->buffer_size, -		   cachep->num, (1 << cachep->gfporder)); -	seq_printf(m, " : tunables %4u %4u %4u", -		   cachep->limit, cachep->batchcount, cachep->shared); -	seq_printf(m, " : slabdata %6lu %6lu %6lu", -		   active_slabs, num_slabs, shared_avail); +	sinfo->active_objs = active_objs; +	sinfo->num_objs = num_objs; +	sinfo->active_slabs = active_slabs; +	sinfo->num_slabs = num_slabs; +	sinfo->shared_avail = shared_avail; +	sinfo->limit = cachep->limit; +	sinfo->batchcount = cachep->batchcount; +	sinfo->shared = cachep->shared; +	sinfo->objects_per_slab = cachep->num; +	sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{  #if STATS -	{			/* list3 stats */ +	{			/* node stats */  		unsigned long high = cachep->high_mark;  		unsigned long allocs = cachep->num_allocations;  		unsigned long grown = cachep->grown; @@ -4306,31 +4185,8 @@ static int s_show(struct seq_file *m, void *p)  			   allochit, allocmiss, freehit, freemiss);  	}  #endif -	seq_putc(m, '\n'); -	return 0;  } -/* - * slabinfo_op - iterator that generates /proc/slabinfo - * - * Output layout: - * cache-name - * num-active-objs - * total-objs - * object size - * num-active-slabs - * total-slabs - * num-pages-per-slab - * + further values on SMP and with statistics enabled - */ - -static const struct seq_operations slabinfo_op = { -	.start = s_start, -	.next = s_next, -	.stop = s_stop, -	.show = s_show, -}; -  #define MAX_SLABINFO_WRITE 128  /**   * slabinfo_write - Tuning for the slab allocator @@ -4339,7 +4195,7 @@ static const struct seq_operations slabinfo_op = {   * @count: data length   * @ppos: unused   */ -ssize_t slabinfo_write(struct file *file, const char __user * buffer, +ssize_t slabinfo_write(struct file *file, const char __user *buffer,  		       size_t count, loff_t *ppos)  {  	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; @@ -4361,9 +4217,9 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,  		return -EINVAL;  	/* Find the cache in the chain of caches. 
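	   The lookup and the tuning both run under slab_mutex, so the cache
	   cannot go away while do_tune_cpucache() is updating it.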
*/ -	mutex_lock(&cache_chain_mutex); +	mutex_lock(&slab_mutex);  	res = -EINVAL; -	list_for_each_entry(cachep, &cache_chain, next) { +	list_for_each_entry(cachep, &slab_caches, list) {  		if (!strcmp(cachep->name, kbuf)) {  			if (limit < 1 || batchcount < 1 ||  					batchcount > limit || shared < 0) { @@ -4376,31 +4232,18 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,  			break;  		}  	} -	mutex_unlock(&cache_chain_mutex); +	mutex_unlock(&slab_mutex);  	if (res >= 0)  		res = count;  	return res;  } -static int slabinfo_open(struct inode *inode, struct file *file) -{ -	return seq_open(file, &slabinfo_op); -} - -static const struct file_operations proc_slabinfo_operations = { -	.open		= slabinfo_open, -	.read		= seq_read, -	.write		= slabinfo_write, -	.llseek		= seq_lseek, -	.release	= seq_release, -}; -  #ifdef CONFIG_DEBUG_SLAB_LEAK  static void *leaks_start(struct seq_file *m, loff_t *pos)  { -	mutex_lock(&cache_chain_mutex); -	return seq_list_start(&cache_chain, *pos); +	mutex_lock(&slab_mutex); +	return seq_list_start(&slab_caches, *pos);  }  static inline int add_caller(unsigned long *n, unsigned long v) @@ -4433,15 +4276,18 @@ static inline int add_caller(unsigned long *n, unsigned long v)  	return 1;  } -static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +static void handle_slab(unsigned long *n, struct kmem_cache *c, +						struct page *page)  {  	void *p;  	int i; +  	if (n[0] == n[1])  		return; -	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { -		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) +	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { +		if (get_obj_status(page, i) != OBJECT_ACTIVE)  			continue; +  		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))  			return;  	} @@ -4465,11 +4311,11 @@ static void show_symbol(struct seq_file *m, unsigned long address)  static int leaks_show(struct seq_file *m, void *p)  { -	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); -	struct slab *slabp; -	struct kmem_list3 *l3; +	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); +	struct page *page; +	struct kmem_cache_node *n;  	const char *name; -	unsigned long *n = m->private; +	unsigned long *x = m->private;  	int node;  	int i; @@ -4480,43 +4326,43 @@ static int leaks_show(struct seq_file *m, void *p)  	/* OK, we can do it */ -	n[1] = 0; +	x[1] = 0;  	for_each_online_node(node) { -		l3 = cachep->nodelists[node]; -		if (!l3) +		n = cachep->node[node]; +		if (!n)  			continue;  		check_irq_on(); -		spin_lock_irq(&l3->list_lock); +		spin_lock_irq(&n->list_lock); -		list_for_each_entry(slabp, &l3->slabs_full, list) -			handle_slab(n, cachep, slabp); -		list_for_each_entry(slabp, &l3->slabs_partial, list) -			handle_slab(n, cachep, slabp); -		spin_unlock_irq(&l3->list_lock); +		list_for_each_entry(page, &n->slabs_full, lru) +			handle_slab(x, cachep, page); +		list_for_each_entry(page, &n->slabs_partial, lru) +			handle_slab(x, cachep, page); +		spin_unlock_irq(&n->list_lock);  	}  	name = cachep->name; -	if (n[0] == n[1]) { +	if (x[0] == x[1]) {  		/* Increase the buffer size */ -		mutex_unlock(&cache_chain_mutex); -		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); +		mutex_unlock(&slab_mutex); +		m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);  		if (!m->private) {  			/* Too bad, we are really out */ -			m->private = n; -			mutex_lock(&cache_chain_mutex); +			m->private = x; +			mutex_lock(&slab_mutex);  			return -ENOMEM;  		} -		
*(unsigned long *)m->private = n[0] * 2; -		kfree(n); -		mutex_lock(&cache_chain_mutex); +		*(unsigned long *)m->private = x[0] * 2; +		kfree(x); +		mutex_lock(&slab_mutex);  		/* Now make sure this entry will be retried */  		m->count = m->size;  		return 0;  	} -	for (i = 0; i < n[1]; i++) { -		seq_printf(m, "%s: %lu ", name, n[2*i+3]); -		show_symbol(m, n[2*i+2]); +	for (i = 0; i < x[1]; i++) { +		seq_printf(m, "%s: %lu ", name, x[2*i+3]); +		show_symbol(m, x[2*i+2]);  		seq_putc(m, '\n');  	} @@ -4525,8 +4371,8 @@ static int leaks_show(struct seq_file *m, void *p)  static const struct seq_operations slabstats_op = {  	.start = leaks_start, -	.next = s_next, -	.stop = s_stop, +	.next = slab_next, +	.stop = slab_stop,  	.show = leaks_show,  }; @@ -4557,7 +4403,6 @@ static const struct file_operations proc_slabstats_operations = {  static int __init slab_proc_init(void)  { -	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);  #ifdef CONFIG_DEBUG_SLAB_LEAK  	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);  #endif @@ -4584,6 +4429,6 @@ size_t ksize(const void *objp)  	if (unlikely(objp == ZERO_SIZE_PTR))  		return 0; -	return obj_size(virt_to_cache(objp)); +	return virt_to_cache(objp)->object_size;  }  EXPORT_SYMBOL(ksize);  | 
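
Two of the changes above lend themselves to small stand-alone illustrations. First, ccupdate_struct no longer embeds a fixed NR_CPUS-sized array of array_cache pointers; it is allocated with exactly nr_cpu_ids slots instead. The sketch below shows the same flexible-array-member allocation pattern as ordinary userspace C. It is only an illustration of the idiom, not the kernel code: struct ccupdate, ccupdate_alloc() and the fixed nr_cpu_ids value are all invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct array_cache;			/* opaque here; only pointers are stored */

/* Modelled on the reworked ccupdate_struct: a header plus one slot per CPU. */
struct ccupdate {
	void *cachep;			/* stand-in for struct kmem_cache * */
	struct array_cache *new[];	/* flexible array member */
};

static const int nr_cpu_ids = 8;	/* hypothetical CPU count */

static struct ccupdate *ccupdate_alloc(void *cachep)
{
	/* One zero-filled allocation sized for exactly nr_cpu_ids slots. */
	struct ccupdate *new = calloc(1, sizeof(*new) +
				      nr_cpu_ids * sizeof(struct array_cache *));

	if (new)
		new->cachep = cachep;
	return new;
}

int main(void)
{
	struct ccupdate *u = ccupdate_alloc(NULL);

	if (!u)
		return 1;
	printf("allocated %zu bytes for %d cpu slots\n",
	       sizeof(*u) + nr_cpu_ids * sizeof(struct array_cache *),
	       nr_cpu_ids);
	free(u);
	return 0;
}

Second, the free path (__cache_free() followed, when the array is full, by cache_flusharray()) behaves like a bounded per-CPU stack with batched overflow. The toy model below keeps only that batching behaviour; toy_ac, toy_free() and flush_batch() are made-up names, this free_block() simply returns memory to libc, and the real code additionally handles NUMA alien caches, pfmemalloc tagging and statistics.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define AC_LIMIT	8	/* like ac->limit: capacity of the per-cpu array */
#define AC_BATCH	4	/* like ac->batchcount: how much to flush at once */

/* Toy stand-in for struct array_cache. */
struct toy_ac {
	unsigned int avail;
	void *entry[AC_LIMIT];
};

/* Stand-in for free_block(): here we just hand objects back to libc. */
static void free_block(void **objpp, unsigned int nr)
{
	for (unsigned int i = 0; i < nr; i++)
		free(objpp[i]);
}

/* Modelled on cache_flusharray(): drop the oldest AC_BATCH entries. */
static void flush_batch(struct toy_ac *ac)
{
	free_block(ac->entry, AC_BATCH);
	ac->avail -= AC_BATCH;
	/* Keep the remaining (newer, cache-warmer) entries at the front. */
	memmove(ac->entry, &ac->entry[AC_BATCH],
		sizeof(void *) * ac->avail);
}

/* Modelled on __cache_free(): flush first if the array is full, then push. */
static void toy_free(struct toy_ac *ac, void *obj)
{
	if (ac->avail >= AC_LIMIT)
		flush_batch(ac);
	ac->entry[ac->avail++] = obj;
}

int main(void)
{
	struct toy_ac ac = { 0 };

	for (int i = 0; i < 20; i++)
		toy_free(&ac, malloc(32));
	printf("%u objects still cached after 20 frees\n", ac.avail);
	free_block(ac.entry, ac.avail);	/* final drain */
	return 0;
}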
