Diffstat (limited to 'mm/memcontrol.c')
 -rw-r--r--   mm/memcontrol.c | 2342
 1 file changed, 1186 insertions(+), 1156 deletions(-)
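
Context for the hunks below: among other cleanups, this patch moves cgroup event handling into memcontrol.c and introduces struct mem_cgroup_event, whose eventfd/poll plumbing backs the legacy cgroup-v1 notification files. For orientation only, here is a minimal userspace sketch of arming such a notification through the documented memory.pressure_level / cgroup.event_control interface; the mount point /sys/fs/cgroup/memory and the group name "mygroup" are illustrative assumptions, not part of the patch.

/*
 * Hypothetical example: register an eventfd for "low" memory pressure
 * notifications on a v1 memory cgroup and wait for one event.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/mygroup";	/* illustrative path */
	char path[256], cmd[64];
	uint64_t count;
	int efd, pfd, cfd;

	efd = eventfd(0, 0);
	if (efd < 0) {
		perror("eventfd");
		return 1;
	}

	snprintf(path, sizeof(path), "%s/memory.pressure_level", cg);
	pfd = open(path, O_RDONLY);
	if (pfd < 0) {
		perror("memory.pressure_level");
		return 1;
	}

	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	cfd = open(path, O_WRONLY);
	if (cfd < 0) {
		perror("cgroup.event_control");
		return 1;
	}

	/* "<event_fd> <target_fd> <args>" arms the notification */
	snprintf(cmd, sizeof(cmd), "%d %d low", efd, pfd);
	if (write(cfd, cmd, strlen(cmd)) < 0) {
		perror("event_control write");
		return 1;
	}

	/* blocks until the kernel signals the eventfd */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("memory pressure event, count %llu\n",
		       (unsigned long long)count);

	close(cfd);
	close(pfd);
	close(efd);
	return 0;
}

The diff itself follows unchanged.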
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1c52ddbc839..1f14a430c65 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -45,26 +45,29 @@  #include <linux/swapops.h>  #include <linux/spinlock.h>  #include <linux/eventfd.h> +#include <linux/poll.h>  #include <linux/sort.h>  #include <linux/fs.h>  #include <linux/seq_file.h> -#include <linux/vmalloc.h>  #include <linux/vmpressure.h>  #include <linux/mm_inline.h>  #include <linux/page_cgroup.h>  #include <linux/cpu.h>  #include <linux/oom.h> +#include <linux/lockdep.h> +#include <linux/file.h>  #include "internal.h"  #include <net/sock.h>  #include <net/ip.h>  #include <net/tcp_memcontrol.h> +#include "slab.h"  #include <asm/uaccess.h>  #include <trace/events/vmscan.h> -struct cgroup_subsys mem_cgroup_subsys __read_mostly; -EXPORT_SYMBOL(mem_cgroup_subsys); +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys);  #define MEM_CGROUP_RECLAIM_RETRIES	5  static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -77,7 +80,7 @@ int do_swap_account __read_mostly;  #ifdef CONFIG_MEMCG_SWAP_ENABLED  static int really_do_swap_account __initdata = 1;  #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata;  #endif  #else @@ -146,7 +149,7 @@ struct mem_cgroup_reclaim_iter {  	 * matches memcg->dead_count of the hierarchy root group.  	 */  	struct mem_cgroup *last_visited; -	unsigned long last_dead_count; +	int last_dead_count;  	/* scan generation, increased every round-trip */  	unsigned int generation; @@ -225,6 +228,46 @@ struct mem_cgroup_eventfd_list {  	struct eventfd_ctx *eventfd;  }; +/* + * cgroup_event represents events which userspace want to receive. + */ +struct mem_cgroup_event { +	/* +	 * memcg which the event belongs to. +	 */ +	struct mem_cgroup *memcg; +	/* +	 * eventfd to signal userspace about the event. +	 */ +	struct eventfd_ctx *eventfd; +	/* +	 * Each of these stored in a list by the cgroup. +	 */ +	struct list_head list; +	/* +	 * register_event() callback will be used to add new userspace +	 * waiter for changes related to this event.  Use eventfd_signal() +	 * on eventfd to send notification to userspace. +	 */ +	int (*register_event)(struct mem_cgroup *memcg, +			      struct eventfd_ctx *eventfd, const char *args); +	/* +	 * unregister_event() callback will be called when userspace closes +	 * the eventfd or on cgroup removing.  This callback must be set, +	 * if you want provide notification functionality. +	 */ +	void (*unregister_event)(struct mem_cgroup *memcg, +				 struct eventfd_ctx *eventfd); +	/* +	 * All fields below needed to unregister event when +	 * userspace closes eventfd. +	 */ +	poll_table pt; +	wait_queue_head_t *wqh; +	wait_queue_t wait; +	struct work_struct remove; +}; +  static void mem_cgroup_threshold(struct mem_cgroup *memcg);  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@ -311,13 +354,12 @@ struct mem_cgroup {  	atomic_t	dead_count;  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) -	struct tcp_memcontrol tcp_mem; +	struct cg_proto tcp_mem;  #endif  #if defined(CONFIG_MEMCG_KMEM) -	/* analogous to slab_common's slab_caches list. 
per-memcg */ +	/* analogous to slab_common's slab_caches list, but per-memcg; +	 * protected by memcg_slab_mutex */  	struct list_head memcg_slab_caches; -	/* Not a spinlock, we can take a lot of time walking the list */ -	struct mutex slab_caches_mutex;          /* Index in the kmem_cache->memcg_params->memcg_caches array */  	int kmemcg_id;  #endif @@ -329,27 +371,20 @@ struct mem_cgroup {  	atomic_t	numainfo_updating;  #endif +	/* List of events which userspace want to receive */ +	struct list_head event_list; +	spinlock_t event_list_lock; +  	struct mem_cgroup_per_node *nodeinfo[0];  	/* WARNING: nodeinfo must be the last member here */  }; -static size_t memcg_size(void) -{ -	return sizeof(struct mem_cgroup) + -		nr_node_ids * sizeof(struct mem_cgroup_per_node); -} -  /* internal only representation about the status of kmem accounting. */  enum { -	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ -	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ +	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */  	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */  }; -/* We account when limit is on, but only after call sites are patched */ -#define KMEM_ACCOUNTED_MASK \ -		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) -  #ifdef CONFIG_MEMCG_KMEM  static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)  { @@ -361,16 +396,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)  	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);  } -static void memcg_kmem_set_activated(struct mem_cgroup *memcg) -{ -	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); -} - -static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) -{ -	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); -} -  static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)  {  	/* @@ -488,14 +513,28 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)  	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;  } -struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)  { -	return &mem_cgroup_from_css(css)->vmpressure; +	return (memcg == root_mem_cgroup);  } -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. 
+ */ +#define MEM_CGROUP_ID_MAX	USHRT_MAX + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)  { -	return (memcg == root_mem_cgroup); +	return memcg->css.id; +} + +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ +	struct cgroup_subsys_state *css; + +	css = css_from_id(id, &memory_cgrp_subsys); +	return mem_cgroup_from_css(css);  }  /* Writing them here to avoid exposing memcg's inner layout */ @@ -527,7 +566,8 @@ void sock_update_memcg(struct sock *sk)  		memcg = mem_cgroup_from_task(current);  		cg_proto = sk->sk_prot->proto_cgroup(memcg);  		if (!mem_cgroup_is_root(memcg) && -		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { +		    memcg_proto_active(cg_proto) && +		    css_tryget_online(&memcg->css)) {  			sk->sk_cgrp = cg_proto;  		}  		rcu_read_unlock(); @@ -550,13 +590,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)  	if (!memcg || mem_cgroup_is_root(memcg))  		return NULL; -	return &memcg->tcp_mem.cg_proto; +	return &memcg->tcp_mem;  }  EXPORT_SYMBOL(tcp_proto_cgroup);  static void disarm_sock_keys(struct mem_cgroup *memcg)  { -	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) +	if (!memcg_proto_activated(&memcg->tcp_mem))  		return;  	static_key_slow_dec(&memcg_socket_limit_enabled);  } @@ -569,16 +609,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)  #ifdef CONFIG_MEMCG_KMEM  /*   * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. - * There are two main reasons for not using the css_id for this: - *  1) this works better in sparse environments, where we have a lot of memcgs, - *     but only a few kmem-limited. Or also, if we have, for instance, 200 - *     memcgs, and none but the 200th is kmem-limited, we'd have to have a - *     200 entry array for that. - * - *  2) In order not to violate the cgroup API, we would like to do all memory - *     allocation in ->create(). At that point, we haven't yet allocated the - *     css_id. Having a separate index prevents us from messing with the cgroup - *     core for this + * The main reason for not using cgroup id for this: + *  this works better in sparse environments, where we have a lot of memcgs, + *  but only a few kmem-limited. Or also, if we have, for instance, 200 + *  memcgs, and none but the 200th is kmem-limited, we'd have to have a + *  200 entry array for that.   *   * The current size of the caches array is stored in   * memcg_limited_groups_array_size.  It will double each time we have to @@ -593,14 +628,14 @@ int memcg_limited_groups_array_size;   * cgroups is a reasonable guess. In the future, it could be a parameter or   * tunable, but that is strictly not necessary.   * - * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get   * this constant directly from cgroup, but it is understandable that this is   * better kept as an internal representation in cgroup.c. In any case, the - * css_id space is not getting any smaller, and we don't have to necessarily + * cgrp_id space is not getting any smaller, and we don't have to necessarily   * increase ours as well if it increases.   
*/  #define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE 65535 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX  /*   * A lot of the calls to the cache allocation functions are expected to be @@ -638,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg)  static void drain_all_stock_async(struct mem_cgroup *memcg);  static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)  { -	VM_BUG_ON((unsigned)nid >= nr_node_ids); +	int nid = zone_to_nid(zone); +	int zid = zone_idx(zone); +  	return &memcg->nodeinfo[nid]->zoneinfo[zid];  } @@ -650,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)  }  static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)  {  	int nid = page_to_nid(page);  	int zid = page_zonenum(page); -	return mem_cgroup_zoneinfo(memcg, nid, zid); +	return &memcg->nodeinfo[nid]->zoneinfo[zid];  }  static struct mem_cgroup_tree_per_zone * @@ -673,11 +710,9 @@ soft_limit_tree_from_page(struct page *page)  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];  } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, -				struct mem_cgroup_per_zone *mz, -				struct mem_cgroup_tree_per_zone *mctz, -				unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, +					 struct mem_cgroup_tree_per_zone *mctz, +					 unsigned long long new_usage_in_excess)  {  	struct rb_node **p = &mctz->rb_root.rb_node;  	struct rb_node *parent = NULL; @@ -707,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,  	mz->on_tree = true;  } -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, -				struct mem_cgroup_per_zone *mz, -				struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, +					 struct mem_cgroup_tree_per_zone *mctz)  {  	if (!mz->on_tree)  		return; @@ -718,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,  	mz->on_tree = false;  } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, -				struct mem_cgroup_per_zone *mz, -				struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, +				       struct mem_cgroup_tree_per_zone *mctz)  {  	spin_lock(&mctz->lock); -	__mem_cgroup_remove_exceeded(memcg, mz, mctz); +	__mem_cgroup_remove_exceeded(mz, mctz);  	spin_unlock(&mctz->lock);  } @@ -734,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  	unsigned long long excess;  	struct mem_cgroup_per_zone *mz;  	struct mem_cgroup_tree_per_zone *mctz; -	int nid = page_to_nid(page); -	int zid = page_zonenum(page); -	mctz = soft_limit_tree_from_page(page); +	mctz = soft_limit_tree_from_page(page);  	/*  	 * Necessary to update all ancestors when hierarchy is used.  	 * because their event counter is not touched.  	 
*/  	for (; memcg; memcg = parent_mem_cgroup(memcg)) { -		mz = mem_cgroup_zoneinfo(memcg, nid, zid); +		mz = mem_cgroup_page_zoneinfo(memcg, page);  		excess = res_counter_soft_limit_excess(&memcg->res);  		/*  		 * We have to update the tree if mz is on RB-tree or @@ -753,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  			spin_lock(&mctz->lock);  			/* if on-tree, remove it */  			if (mz->on_tree) -				__mem_cgroup_remove_exceeded(memcg, mz, mctz); +				__mem_cgroup_remove_exceeded(mz, mctz);  			/*  			 * Insert again. mz->usage_in_excess will be updated.  			 * If excess is 0, no tree ops.  			 */ -			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); +			__mem_cgroup_insert_exceeded(mz, mctz, excess);  			spin_unlock(&mctz->lock);  		}  	} @@ -766,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)  { -	int node, zone; -	struct mem_cgroup_per_zone *mz;  	struct mem_cgroup_tree_per_zone *mctz; +	struct mem_cgroup_per_zone *mz; +	int nid, zid; -	for_each_node(node) { -		for (zone = 0; zone < MAX_NR_ZONES; zone++) { -			mz = mem_cgroup_zoneinfo(memcg, node, zone); -			mctz = soft_limit_tree_node_zone(node, zone); -			mem_cgroup_remove_exceeded(memcg, mz, mctz); +	for_each_node(nid) { +		for (zid = 0; zid < MAX_NR_ZONES; zid++) { +			mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; +			mctz = soft_limit_tree_node_zone(nid, zid); +			mem_cgroup_remove_exceeded(mz, mctz);  		}  	}  } @@ -797,9 +826,9 @@ retry:  	 * we will to add it back at the end of reclaim to its correct  	 * position in the tree.  	 */ -	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +	__mem_cgroup_remove_exceeded(mz, mctz);  	if (!res_counter_soft_limit_excess(&mz->memcg->res) || -		!css_tryget(&mz->memcg->css)) +	    !css_tryget_online(&mz->memcg->css))  		goto retry;  done:  	return mz; @@ -866,6 +895,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,  	unsigned long val = 0;  	int cpu; +	get_online_cpus();  	for_each_online_cpu(cpu)  		val += per_cpu(memcg->stat->events[idx], cpu);  #ifdef CONFIG_HOTPLUG_CPU @@ -873,6 +903,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,  	val += memcg->nocpu_base.events[idx];  	spin_unlock(&memcg->pcp_counter_lock);  #endif +	put_online_cpus();  	return val;  } @@ -880,8 +911,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,  					 struct page *page,  					 bool anon, int nr_pages)  { -	preempt_disable(); -  	/*  	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is  	 * counted as CACHE even if it's on ANON LRU. 
@@ -906,12 +935,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,  	}  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages); - -	preempt_enable();  } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)  {  	struct mem_cgroup_per_zone *mz; @@ -919,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)  	return mz->lru_size[lru];  } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, -			unsigned int lru_mask) -{ -	struct mem_cgroup_per_zone *mz; -	enum lru_list lru; -	unsigned long ret = 0; - -	mz = mem_cgroup_zoneinfo(memcg, nid, zid); - -	for_each_lru(lru) { -		if (BIT(lru) & lru_mask) -			ret += mz->lru_size[lru]; -	} -	return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, -			int nid, unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, +						  int nid, +						  unsigned int lru_mask)  { -	u64 total = 0; +	unsigned long nr = 0;  	int zid; -	for (zid = 0; zid < MAX_NR_ZONES; zid++) -		total += mem_cgroup_zone_nr_lru_pages(memcg, -						nid, zid, lru_mask); +	VM_BUG_ON((unsigned)nid >= nr_node_ids); -	return total; +	for (zid = 0; zid < MAX_NR_ZONES; zid++) { +		struct mem_cgroup_per_zone *mz; +		enum lru_list lru; + +		for_each_lru(lru) { +			if (!(BIT(lru) & lru_mask)) +				continue; +			mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; +			nr += mz->lru_size[lru]; +		} +	} +	return nr;  }  static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,  			unsigned int lru_mask)  { +	unsigned long nr = 0;  	int nid; -	u64 total = 0;  	for_each_node_state(nid, N_MEMORY) -		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); -	return total; +		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); +	return nr;  }  static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1031,26 +1049,28 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)  	if (unlikely(!p))  		return NULL; -	return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); +	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));  } -struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)  {  	struct mem_cgroup *memcg = NULL; -	if (!mm) -		return NULL; -	/* -	 * Because we have no locks, mm->owner's may be being moved to other -	 * cgroup. We use css_tryget() here even if this looks -	 * pessimistic (rather than adding locks here). -	 */  	rcu_read_lock();  	do { -		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); -		if (unlikely(!memcg)) -			break; -	} while (!css_tryget(&memcg->css)); +		/* +		 * Page cache insertions can happen withou an +		 * actual mm context, e.g. during disk probing +		 * on boot, loopback IO, acct() writes etc. +		 */ +		if (unlikely(!mm)) +			memcg = root_mem_cgroup; +		else { +			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); +			if (unlikely(!memcg)) +				memcg = root_mem_cgroup; +		} +	} while (!css_tryget_online(&memcg->css));  	rcu_read_unlock();  	return memcg;  } @@ -1076,16 +1096,23 @@ skip_node:  	 * skipped and we should continue the tree walk.  	 * last_visited css is safe to use because it is  	 * protected by css_get and the tree walk is rcu safe. 
+	 * +	 * We do not take a reference on the root of the tree walk +	 * because we might race with the root removal when it would +	 * be the only node in the iterated hierarchy and mem_cgroup_iter +	 * would end up in an endless loop because it expects that at +	 * least one valid node will be returned. Root cannot disappear +	 * because caller of the iterator should hold it already so +	 * skipping css reference should be safe.  	 */  	if (next_css) { -		struct mem_cgroup *mem = mem_cgroup_from_css(next_css); +		if ((next_css == &root->css) || +		    ((next_css->flags & CSS_ONLINE) && +		     css_tryget_online(next_css))) +			return mem_cgroup_from_css(next_css); -		if (css_tryget(&mem->css)) -			return mem; -		else { -			prev_css = next_css; -			goto skip_node; -		} +		prev_css = next_css; +		goto skip_node;  	}  	return NULL; @@ -1119,7 +1146,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,  	if (iter->last_dead_count == *sequence) {  		smp_rmb();  		position = iter->last_visited; -		if (position && !css_tryget(&position->css)) + +		/* +		 * We cannot take a reference to root because we might race +		 * with root removal and returning NULL would end up in +		 * an endless loop on the iterator user level when root +		 * would be returned all the time. +		 */ +		if (position && position != root && +		    !css_tryget_online(&position->css))  			position = NULL;  	}  	return position; @@ -1128,9 +1163,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,  static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,  				   struct mem_cgroup *last_visited,  				   struct mem_cgroup *new_position, +				   struct mem_cgroup *root,  				   int sequence)  { -	if (last_visited) +	/* root reference counting symmetric to mem_cgroup_iter_load */ +	if (last_visited && last_visited != root)  		css_put(&last_visited->css);  	/*  	 * We store the sequence count from the time @last_visited was @@ -1188,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,  		int uninitialized_var(seq);  		if (reclaim) { -			int nid = zone_to_nid(reclaim->zone); -			int zid = zone_idx(reclaim->zone);  			struct mem_cgroup_per_zone *mz; -			mz = mem_cgroup_zoneinfo(root, nid, zid); +			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);  			iter = &mz->reclaim_iter[reclaim->priority];  			if (prev && reclaim->generation != iter->generation) {  				iter->last_visited = NULL; @@ -1205,7 +1240,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,  		memcg = __mem_cgroup_iter_next(root, last_visited);  		if (reclaim) { -			mem_cgroup_iter_update(iter, last_visited, memcg, seq); +			mem_cgroup_iter_update(iter, last_visited, memcg, root, +					seq);  			if (!memcg)  				iter->generation++; @@ -1298,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,  		goto out;  	} -	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); +	mz = mem_cgroup_zone_zoneinfo(memcg, zone);  	lruvec = &mz->lruvec;  out:  	/* @@ -1357,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)  	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)  		pc->mem_cgroup = memcg = root_mem_cgroup; -	mz = page_cgroup_zoneinfo(memcg, page); +	mz = mem_cgroup_page_zoneinfo(memcg, page);  	lruvec = &mz->lruvec;  out:  	/* @@ -1405,7 +1441,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,  		return true;  	if (!root_memcg->use_hierarchy || !memcg)  		return false; -	return 
css_is_ancestor(&memcg->css, &root_memcg->css); +	return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);  }  static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, @@ -1428,7 +1464,7 @@ bool task_in_mem_cgroup(struct task_struct *task,  	p = find_lock_task_mm(task);  	if (p) { -		curr = try_get_mem_cgroup_from_mm(p->mm); +		curr = get_mem_cgroup_from_mm(p->mm);  		task_unlock(p);  	} else {  		/* @@ -1442,8 +1478,6 @@ bool task_in_mem_cgroup(struct task_struct *task,  			css_get(&curr->css);  		rcu_read_unlock();  	} -	if (!curr) -		return false;  	/*  	 * We should check use_hierarchy of "memcg" not "curr". Because checking  	 * use_hierarchy of "curr" here make this function true if hierarchy is @@ -1497,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)  int mem_cgroup_swappiness(struct mem_cgroup *memcg)  {  	/* root ? */ -	if (!css_parent(&memcg->css)) +	if (mem_cgroup_disabled() || !memcg->css.parent)  		return vm_swappiness;  	return memcg->swappiness; @@ -1541,23 +1575,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)  }  /* - * 2 routines for checking "mem" is under move_account() or not. - * - * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This - *			  is used for avoiding races in accounting.  If true, - *			  pc->mem_cgroup may be overwritten. + * A routine for checking "mem" is under move_account() or not.   * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - *			  under hierarchy of moving cgroups. This is for - *			  waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move".   */ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ -	VM_BUG_ON(!rcu_read_lock_held()); -	return atomic_read(&memcg->moving_account) > 0; -} -  static bool mem_cgroup_under_move(struct mem_cgroup *memcg)  {  	struct mem_cgroup *from; @@ -1600,7 +1623,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)   * Take this lock when   * - a code tries to modify page's memcg while it's USED.   * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too.   */  static void move_lock_mem_cgroup(struct mem_cgroup *memcg,  				  unsigned long *flags) @@ -1625,53 +1647,25 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,   */  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)  { -	struct cgroup *task_cgrp; -	struct cgroup *mem_cgrp; -	/* -	 * Need a buffer in BSS, can't rely on allocations. The code relies -	 * on the assumption that OOM is serialized for memory controller. -	 * If this assumption is broken, revisit this code. 
-	 */ -	static char memcg_name[PATH_MAX]; -	int ret; +	/* oom_info_lock ensures that parallel ooms do not interleave */ +	static DEFINE_MUTEX(oom_info_lock);  	struct mem_cgroup *iter;  	unsigned int i;  	if (!p)  		return; +	mutex_lock(&oom_info_lock);  	rcu_read_lock(); -	mem_cgrp = memcg->css.cgroup; -	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); - -	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); -	if (ret < 0) { -		/* -		 * Unfortunately, we are unable to convert to a useful name -		 * But we'll still print out the usage information -		 */ -		rcu_read_unlock(); -		goto done; -	} -	rcu_read_unlock(); - -	pr_info("Task in %s killed", memcg_name); +	pr_info("Task in "); +	pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); +	pr_info(" killed as a result of limit of "); +	pr_cont_cgroup_path(memcg->css.cgroup); +	pr_info("\n"); -	rcu_read_lock(); -	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); -	if (ret < 0) { -		rcu_read_unlock(); -		goto done; -	}  	rcu_read_unlock(); -	/* -	 * Continues from above, so we don't need an KERN_ level -	 */ -	pr_cont(" as a result of limit of %s\n", memcg_name); -done: -  	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",  		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,  		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, @@ -1686,13 +1680,8 @@ done:  		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));  	for_each_mem_cgroup_tree(iter, memcg) { -		pr_info("Memory cgroup stats"); - -		rcu_read_lock(); -		ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); -		if (!ret) -			pr_cont(" for %s", memcg_name); -		rcu_read_unlock(); +		pr_info("Memory cgroup stats for "); +		pr_cont_cgroup_path(iter->css.cgroup);  		pr_cont(":");  		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -1708,6 +1697,7 @@ done:  		pr_cont("\n");  	} +	mutex_unlock(&oom_info_lock);  }  /* @@ -1800,13 +1790,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,  				break;  			};  			points = oom_badness(task, memcg, NULL, totalpages); -			if (points > chosen_points) { -				if (chosen) -					put_task_struct(chosen); -				chosen = task; -				chosen_points = points; -				get_task_struct(chosen); -			} +			if (!points || points < chosen_points) +				continue; +			/* Prefer thread group leaders for display purposes */ +			if (points == chosen_points && +			    thread_group_leader(chosen)) +				continue; + +			if (chosen) +				put_task_struct(chosen); +			chosen = task; +			chosen_points = points; +			get_task_struct(chosen);  		}  		css_task_iter_end(&it);  	} @@ -2044,6 +2039,12 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,  	return total;  } +#ifdef CONFIG_LOCKDEP +static struct lockdep_map memcg_oom_lock_dep_map = { +	.name = "memcg_oom_lock", +}; +#endif +  static DEFINE_SPINLOCK(memcg_oom_lock);  /* @@ -2081,7 +2082,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)  			}  			iter->oom_lock = false;  		} -	} +	} else +		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);  	spin_unlock(&memcg_oom_lock); @@ -2093,6 +2095,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)  	struct mem_cgroup *iter;  	spin_lock(&memcg_oom_lock); +	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);  	for_each_mem_cgroup_tree(iter, memcg)  		iter->oom_lock = false;  	spin_unlock(&memcg_oom_lock); @@ -2159,110 +2162,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)  		memcg_wakeup_oom(memcg);  } -/* - * try to call OOM killer - */  static void mem_cgroup_oom(struct 
mem_cgroup *memcg, gfp_t mask, int order)  { -	bool locked; -	int wakeups; -  	if (!current->memcg_oom.may_oom)  		return; - -	current->memcg_oom.in_memcg_oom = 1; -  	/* -	 * As with any blocking lock, a contender needs to start -	 * listening for wakeups before attempting the trylock, -	 * otherwise it can miss the wakeup from the unlock and sleep -	 * indefinitely.  This is just open-coded because our locking -	 * is so particular to memcg hierarchies. +	 * We are in the middle of the charge context here, so we +	 * don't want to block when potentially sitting on a callstack +	 * that holds all kinds of filesystem and mm locks. +	 * +	 * Also, the caller may handle a failed allocation gracefully +	 * (like optional page cache readahead) and so an OOM killer +	 * invocation might not even be necessary. +	 * +	 * That's why we don't do anything here except remember the +	 * OOM context and then deal with it at the end of the page +	 * fault when the stack is unwound, the locks are released, +	 * and when we know whether the fault was overall successful.  	 */ -	wakeups = atomic_read(&memcg->oom_wakeups); -	mem_cgroup_mark_under_oom(memcg); - -	locked = mem_cgroup_oom_trylock(memcg); - -	if (locked) -		mem_cgroup_oom_notify(memcg); - -	if (locked && !memcg->oom_kill_disable) { -		mem_cgroup_unmark_under_oom(memcg); -		mem_cgroup_out_of_memory(memcg, mask, order); -		mem_cgroup_oom_unlock(memcg); -		/* -		 * There is no guarantee that an OOM-lock contender -		 * sees the wakeups triggered by the OOM kill -		 * uncharges.  Wake any sleepers explicitely. -		 */ -		memcg_oom_recover(memcg); -	} else { -		/* -		 * A system call can just return -ENOMEM, but if this -		 * is a page fault and somebody else is handling the -		 * OOM already, we need to sleep on the OOM waitqueue -		 * for this memcg until the situation is resolved. -		 * Which can take some time because it might be -		 * handled by a userspace task. -		 * -		 * However, this is the charge context, which means -		 * that we may sit on a large call stack and hold -		 * various filesystem locks, the mmap_sem etc. and we -		 * don't want the OOM handler to deadlock on them -		 * while we sit here and wait.  Store the current OOM -		 * context in the task_struct, then return -ENOMEM. -		 * At the end of the page fault handler, with the -		 * stack unwound, pagefault_out_of_memory() will check -		 * back with us by calling -		 * mem_cgroup_oom_synchronize(), possibly putting the -		 * task to sleep. -		 */ -		current->memcg_oom.oom_locked = locked; -		current->memcg_oom.wakeups = wakeups; -		css_get(&memcg->css); -		current->memcg_oom.wait_on_memcg = memcg; -	} +	css_get(&memcg->css); +	current->memcg_oom.memcg = memcg; +	current->memcg_oom.gfp_mask = mask; +	current->memcg_oom.order = order;  }  /**   * mem_cgroup_oom_synchronize - complete memcg OOM handling + * @handle: actually kill/wait or just clean up the OOM state   * - * This has to be called at the end of a page fault if the the memcg - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * This has to be called at the end of a page fault if the memcg OOM + * handler was enabled.   * - * Memcg supports userspace OOM handling, so failed allocations must + * Memcg supports userspace OOM handling where failed allocations must   * sleep on a waitqueue until the userspace task resolves the   * situation.  
Sleeping directly in the charge context with all kinds   * of locks held is not a good idea, instead we remember an OOM state   * in the task and mem_cgroup_oom_synchronize() has to be called at - * the end of the page fault to put the task to sleep and clean up the - * OOM state. + * the end of the page fault to complete the OOM handling.   *   * Returns %true if an ongoing memcg OOM situation was detected and - * finalized, %false otherwise. + * completed, %false otherwise.   */ -bool mem_cgroup_oom_synchronize(void) +bool mem_cgroup_oom_synchronize(bool handle)  { +	struct mem_cgroup *memcg = current->memcg_oom.memcg;  	struct oom_wait_info owait; -	struct mem_cgroup *memcg; +	bool locked;  	/* OOM is global, do not handle */ -	if (!current->memcg_oom.in_memcg_oom) -		return false; - -	/* -	 * We invoked the OOM killer but there is a chance that a kill -	 * did not free up any charges.  Everybody else might already -	 * be sleeping, so restart the fault and keep the rampage -	 * going until some charges are released. -	 */ -	memcg = current->memcg_oom.wait_on_memcg;  	if (!memcg) -		goto out; +		return false; -	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) -		goto out_memcg; +	if (!handle) +		goto cleanup;  	owait.memcg = memcg;  	owait.wait.flags = 0; @@ -2271,13 +2223,25 @@ bool mem_cgroup_oom_synchronize(void)  	INIT_LIST_HEAD(&owait.wait.task_list);  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); -	/* Only sleep if we didn't miss any wakeups since OOM */ -	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) +	mem_cgroup_mark_under_oom(memcg); + +	locked = mem_cgroup_oom_trylock(memcg); + +	if (locked) +		mem_cgroup_oom_notify(memcg); + +	if (locked && !memcg->oom_kill_disable) { +		mem_cgroup_unmark_under_oom(memcg); +		finish_wait(&memcg_oom_waitq, &owait.wait); +		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, +					 current->memcg_oom.order); +	} else {  		schedule(); -	finish_wait(&memcg_oom_waitq, &owait.wait); -out_memcg: -	mem_cgroup_unmark_under_oom(memcg); -	if (current->memcg_oom.oom_locked) { +		mem_cgroup_unmark_under_oom(memcg); +		finish_wait(&memcg_oom_waitq, &owait.wait); +	} + +	if (locked) {  		mem_cgroup_oom_unlock(memcg);  		/*  		 * There is no guarantee that an OOM-lock contender @@ -2286,20 +2250,18 @@ out_memcg:  		 */  		memcg_oom_recover(memcg);  	} +cleanup: +	current->memcg_oom.memcg = NULL;  	css_put(&memcg->css); -	current->memcg_oom.wait_on_memcg = NULL; -out: -	current->memcg_oom.in_memcg_oom = 0;  	return true;  }  /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics.   *   * Notes: Race condition   * - * We usually use page_cgroup_lock() for accessing page_cgroup member but + * We usually use lock_page_cgroup() for accessing page_cgroup member but   * it tends to be costly. But considering some conditions, we doesn't need   * to do so _always_.   * @@ -2313,8 +2275,8 @@ out:   * by flags.   *   * Considering "move", this is an only case we see a race. To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * small, we check memcg->moving_account and detect there are possibility + * of race or not. If there is, we take a lock.   
*/  void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2332,9 +2294,10 @@ again:  	 * If this memory cgroup is not under account moving, we don't  	 * need to take move_lock_mem_cgroup(). Because we already hold  	 * rcu_read_lock(), any calls to move_account will be delayed until -	 * rcu_read_unlock() if mem_cgroup_stolen() == true. +	 * rcu_read_unlock().  	 */ -	if (!mem_cgroup_stolen(memcg)) +	VM_BUG_ON(!rcu_read_lock_held()); +	if (atomic_read(&memcg->moving_account) <= 0)  		return;  	move_lock_mem_cgroup(memcg, flags); @@ -2442,7 +2405,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)   */  static void drain_local_stock(struct work_struct *dummy)  { -	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); +	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);  	drain_stock(stock);  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);  } @@ -2589,7 +2552,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,  } -/* See __mem_cgroup_try_charge() for details */ +/* See mem_cgroup_try_charge() for details */  enum {  	CHARGE_OK,		/* success */  	CHARGE_RETRY,		/* need to retry but retry is not bad */ @@ -2662,113 +2625,52 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  	return CHARGE_NOMEM;  } -/* - * __mem_cgroup_try_charge() does - * 1. detect memcg to be charged against from passed *mm and *ptr, - * 2. update res_counter - * 3. call memory reclaim if necessary. - * - * In some special case, if the task is fatal, fatal_signal_pending() or - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon - * as possible without any hazards. 2: all pages should have a valid - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg - * pointer, that is treated as a charge to root_mem_cgroup. - * - * So __mem_cgroup_try_charge() will return - *  0       ...  on success, filling *ptr with a valid memcg pointer. - *  -ENOMEM ...  charge failure because of resource limits. - *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup. +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails   * - * Unlike the exported interface, an "oom" parameter is added. if oom==true, - * the oom-killer can be invoked. + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.   */ -static int __mem_cgroup_try_charge(struct mm_struct *mm, -				   gfp_t gfp_mask, -				   unsigned int nr_pages, -				   struct mem_cgroup **ptr, -				   bool oom) +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, +				 gfp_t gfp_mask, +				 unsigned int nr_pages, +				 bool oom)  {  	unsigned int batch = max(CHARGE_BATCH, nr_pages);  	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; -	struct mem_cgroup *memcg = NULL;  	int ret; +	if (mem_cgroup_is_root(memcg)) +		goto done;  	/* -	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage -	 * in system level. So, allow to go ahead dying process in addition to -	 * MEMDIE process. +	 * Unlike in global OOM situations, memcg is not in a physical +	 * memory shortage.  Allow dying and OOM-killed tasks to +	 * bypass the last charges so that they can exit quickly and +	 * free their memory.  	 
*/ -	if (unlikely(test_thread_flag(TIF_MEMDIE) -		     || fatal_signal_pending(current))) +	if (unlikely(test_thread_flag(TIF_MEMDIE) || +		     fatal_signal_pending(current) || +		     current->flags & PF_EXITING))  		goto bypass; -	/* -	 * We always charge the cgroup the mm_struct belongs to. -	 * The mm_struct's mem_cgroup changes on task migration if the -	 * thread group leader migrates. It's possible that mm is not -	 * set, if so charge the root memcg (happens for pagecache usage). -	 */ -	if (!*ptr && !mm) -		*ptr = root_mem_cgroup; -again: -	if (*ptr) { /* css should be a valid one */ -		memcg = *ptr; -		if (mem_cgroup_is_root(memcg)) -			goto done; -		if (consume_stock(memcg, nr_pages)) -			goto done; -		css_get(&memcg->css); -	} else { -		struct task_struct *p; +	if (unlikely(task_in_memcg_oom(current))) +		goto nomem; -		rcu_read_lock(); -		p = rcu_dereference(mm->owner); -		/* -		 * Because we don't have task_lock(), "p" can exit. -		 * In that case, "memcg" can point to root or p can be NULL with -		 * race with swapoff. Then, we have small risk of mis-accouning. -		 * But such kind of mis-account by race always happens because -		 * we don't have cgroup_mutex(). It's overkill and we allo that -		 * small race, here. -		 * (*) swapoff at el will charge against mm-struct not against -		 * task-struct. So, mm->owner can be NULL. -		 */ -		memcg = mem_cgroup_from_task(p); -		if (!memcg) -			memcg = root_mem_cgroup; -		if (mem_cgroup_is_root(memcg)) { -			rcu_read_unlock(); -			goto done; -		} -		if (consume_stock(memcg, nr_pages)) { -			/* -			 * It seems dagerous to access memcg without css_get(). -			 * But considering how consume_stok works, it's not -			 * necessary. If consume_stock success, some charges -			 * from this memcg are cached on this cpu. So, we -			 * don't need to call css_get()/css_tryget() before -			 * calling consume_stock(). -			 */ -			rcu_read_unlock(); -			goto done; -		} -		/* after here, we may be blocked. we need to get refcnt */ -		if (!css_tryget(&memcg->css)) { -			rcu_read_unlock(); -			goto again; -		} -		rcu_read_unlock(); -	} +	if (gfp_mask & __GFP_NOFAIL) +		oom = false; +again: +	if (consume_stock(memcg, nr_pages)) +		goto done;  	do {  		bool invoke_oom = oom && !nr_oom_retries;  		/* If killed, bypass charge */ -		if (fatal_signal_pending(current)) { -			css_put(&memcg->css); +		if (fatal_signal_pending(current))  			goto bypass; -		}  		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,  					   nr_pages, invoke_oom); @@ -2777,17 +2679,12 @@ again:  			break;  		case CHARGE_RETRY: /* not in OOM situation but retry */  			batch = nr_pages; -			css_put(&memcg->css); -			memcg = NULL;  			goto again;  		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ -			css_put(&memcg->css);  			goto nomem;  		case CHARGE_NOMEM: /* OOM routine works */ -			if (!oom || invoke_oom) { -				css_put(&memcg->css); +			if (!oom || invoke_oom)  				goto nomem; -			}  			nr_oom_retries--;  			break;  		} @@ -2795,18 +2692,44 @@ again:  	if (batch > nr_pages)  		refill_stock(memcg, batch - nr_pages); -	css_put(&memcg->css);  done: -	*ptr = memcg;  	return 0;  nomem: -	*ptr = NULL; -	return -ENOMEM; +	if (!(gfp_mask & __GFP_NOFAIL)) +		return -ENOMEM;  bypass: -	*ptr = root_mem_cgroup;  	return -EINTR;  } +/** + * mem_cgroup_try_charge_mm - try charging a mm + * @mm: mm_struct to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails + * + * Returns the charged mem_cgroup associated with the given mm_struct or + * NULL the charge failed. 
+ */ +static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, +				 gfp_t gfp_mask, +				 unsigned int nr_pages, +				 bool oom) + +{ +	struct mem_cgroup *memcg; +	int ret; + +	memcg = get_mem_cgroup_from_mm(mm); +	ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); +	css_put(&memcg->css); +	if (ret == -EINTR) +		memcg = root_mem_cgroup; +	else if (ret) +		memcg = NULL; + +	return memcg; +} +  /*   * Somemtimes we have to undo a charge we got by try_charge().   * This function is for that and do uncharge, put css's refcnt. @@ -2844,21 +2767,16 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,  /*   * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock().  The caller is responsible for calling css_tryget if - * the mem_cgroup is used for charging. (dropping refcnt from swap can be - * called against removed memcg.) + * rcu_read_lock().  The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.)   */  static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)  { -	struct cgroup_subsys_state *css; -  	/* ID 0 is unused ID */  	if (!id)  		return NULL; -	css = css_lookup(&mem_cgroup_subsys, id); -	if (!css) -		return NULL; -	return mem_cgroup_from_css(css); +	return mem_cgroup_from_id(id);  }  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@ -2868,20 +2786,20 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  	unsigned short id;  	swp_entry_t ent; -	VM_BUG_ON(!PageLocked(page)); +	VM_BUG_ON_PAGE(!PageLocked(page), page);  	pc = lookup_page_cgroup(page);  	lock_page_cgroup(pc);  	if (PageCgroupUsed(pc)) {  		memcg = pc->mem_cgroup; -		if (memcg && !css_tryget(&memcg->css)) +		if (memcg && !css_tryget_online(&memcg->css))  			memcg = NULL;  	} else if (PageSwapCache(page)) {  		ent.val = page_private(page);  		id = lookup_swap_cgroup_id(ent);  		rcu_read_lock();  		memcg = mem_cgroup_lookup(id); -		if (memcg && !css_tryget(&memcg->css)) +		if (memcg && !css_tryget_online(&memcg->css))  			memcg = NULL;  		rcu_read_unlock();  	} @@ -2902,7 +2820,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	bool anon;  	lock_page_cgroup(pc); -	VM_BUG_ON(PageCgroupUsed(pc)); +	VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);  	/*  	 * we don't need page_cgroup_lock about tail pages, becase they are not  	 * accessed by any other context at this point. @@ -2937,7 +2855,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	if (lrucare) {  		if (was_on_lru) {  			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); -			VM_BUG_ON(PageLRU(page)); +			VM_BUG_ON_PAGE(PageLRU(page), page);  			SetPageLRU(page);  			add_page_to_lru_list(page, lruvec, page_lru(page));  		} @@ -2963,10 +2881,18 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  static DEFINE_MUTEX(set_limit_mutex);  #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 
+ */ +static DEFINE_MUTEX(memcg_slab_mutex); + +static DEFINE_MUTEX(activate_kmem_mutex); +  static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)  {  	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && -		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); +		memcg_kmem_is_active(memcg);  }  /* @@ -2979,14 +2905,13 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)  	VM_BUG_ON(p->is_root_cache);  	cachep = p->root_cache; -	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; +	return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));  }  #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, -				    struct cftype *cft, struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));  	struct memcg_cache_params *params;  	if (!memcg_can_account_kmem(memcg)) @@ -2994,10 +2919,10 @@ static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,  	print_slabinfo_header(m); -	mutex_lock(&memcg->slab_caches_mutex); +	mutex_lock(&memcg_slab_mutex);  	list_for_each_entry(params, &memcg->memcg_slab_caches, list)  		cache_show(memcg_params_to_cache(params), m); -	mutex_unlock(&memcg->slab_caches_mutex); +	mutex_unlock(&memcg_slab_mutex);  	return 0;  } @@ -3006,27 +2931,17 @@ static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,  static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)  {  	struct res_counter *fail_res; -	struct mem_cgroup *_memcg;  	int ret = 0; -	bool may_oom;  	ret = res_counter_charge(&memcg->kmem, size, &fail_res);  	if (ret)  		return ret; -	/* -	 * Conditions under which we can wait for the oom_killer. Those are -	 * the same conditions tested by the core page allocator -	 */ -	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); - -	_memcg = memcg; -	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, -				      &_memcg, may_oom); - +	ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, +				    oom_gfp_allowed(gfp));  	if (ret == -EINTR)  {  		/* -		 * __mem_cgroup_try_charge() chosed to bypass to root due to +		 * mem_cgroup_try_charge() chosed to bypass to root due to  		 * OOM kill or fatal signal.  Since our only options are to  		 * either fail the allocation or charge it to this cgroup, do  		 * it as a temporary condition. But we can't fail. From a @@ -3036,7 +2951,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)  		 *  		 * This condition will only trigger if the task entered  		 * memcg_charge_kmem in a sane state, but was OOM-killed during -		 * __mem_cgroup_try_charge() above. Tasks that were already +		 * mem_cgroup_try_charge() above. Tasks that were already  		 * dying when the allocation triggers should have been already  		 * directed to the root cgroup in memcontrol.h  		 */ @@ -3073,16 +2988,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)  		css_put(&memcg->css);  } -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) -{ -	if (!memcg) -		return; - -	mutex_lock(&memcg->slab_caches_mutex); -	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); -	mutex_unlock(&memcg->slab_caches_mutex); -} -  /*   * helper for acessing a memcg's index. It will be used as an index in the   * child cache array in kmem_cache, and also to derive its name. 
This function @@ -3093,43 +2998,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)  	return memcg ? memcg->kmemcg_id : -1;  } -/* - * This ends up being protected by the set_limit mutex, during normal - * operation, because that is its main call site. - * - * But when we create a new cache, we can call this as well if its parent - * is kmem-limited. That will have to hold set_limit_mutex as well. - */ -int memcg_update_cache_sizes(struct mem_cgroup *memcg) -{ -	int num, ret; - -	num = ida_simple_get(&kmem_limited_groups, -				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); -	if (num < 0) -		return num; -	/* -	 * After this point, kmem_accounted (that we test atomically in -	 * the beginning of this conditional), is no longer 0. This -	 * guarantees only one process will set the following boolean -	 * to true. We don't need test_and_set because we're protected -	 * by the set_limit_mutex anyway. -	 */ -	memcg_kmem_set_activated(memcg); - -	ret = memcg_update_all_caches(num+1); -	if (ret) { -		ida_simple_remove(&kmem_limited_groups, num); -		memcg_kmem_clear_activated(memcg); -		return ret; -	} - -	memcg->kmemcg_id = num; -	INIT_LIST_HEAD(&memcg->memcg_slab_caches); -	mutex_init(&memcg->slab_caches_mutex); -	return 0; -} -  static size_t memcg_caches_array_size(int num_groups)  {  	ssize_t size; @@ -3156,28 +3024,25 @@ void memcg_update_array_size(int num)  		memcg_limited_groups_array_size = memcg_caches_array_size(num);  } -static void kmem_cache_destroy_work_func(struct work_struct *w); -  int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  {  	struct memcg_cache_params *cur_params = s->memcg_params; -	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); +	VM_BUG_ON(!is_root_cache(s));  	if (num_groups > memcg_limited_groups_array_size) {  		int i; +		struct memcg_cache_params *new_params;  		ssize_t size = memcg_caches_array_size(num_groups);  		size *= sizeof(void *);  		size += offsetof(struct memcg_cache_params, memcg_caches); -		s->memcg_params = kzalloc(size, GFP_KERNEL); -		if (!s->memcg_params) { -			s->memcg_params = cur_params; +		new_params = kzalloc(size, GFP_KERNEL); +		if (!new_params)  			return -ENOMEM; -		} -		s->memcg_params->is_root_cache = true; +		new_params->is_root_cache = true;  		/*  		 * There is the chance it will be bigger than @@ -3191,7 +3056,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  		for (i = 0; i < memcg_limited_groups_array_size; i++) {  			if (!cur_params->memcg_caches[i])  				continue; -			s->memcg_params->memcg_caches[i] = +			new_params->memcg_caches[i] =  						cur_params->memcg_caches[i];  		} @@ -3204,13 +3069,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  		 * bigger than the others. And all updates will reset this  		 * anyway.  		 
*/ -		kfree(cur_params); +		rcu_assign_pointer(s->memcg_params, new_params); +		if (cur_params) +			kfree_rcu(cur_params, rcu_head);  	}  	return 0;  } -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, -			 struct kmem_cache *root_cache) +int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, +			     struct kmem_cache *root_cache)  {  	size_t size; @@ -3230,43 +3097,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,  	if (memcg) {  		s->memcg_params->memcg = memcg;  		s->memcg_params->root_cache = root_cache; -		INIT_WORK(&s->memcg_params->destroy, -				kmem_cache_destroy_work_func); +		css_get(&memcg->css);  	} else  		s->memcg_params->is_root_cache = true;  	return 0;  } -void memcg_release_cache(struct kmem_cache *s) +void memcg_free_cache_params(struct kmem_cache *s)  { -	struct kmem_cache *root; -	struct mem_cgroup *memcg; +	if (!s->memcg_params) +		return; +	if (!s->memcg_params->is_root_cache) +		css_put(&s->memcg_params->memcg->css); +	kfree(s->memcg_params); +} + +static void memcg_register_cache(struct mem_cgroup *memcg, +				 struct kmem_cache *root_cache) +{ +	static char memcg_name_buf[NAME_MAX + 1]; /* protected by +						     memcg_slab_mutex */ +	struct kmem_cache *cachep;  	int id; +	lockdep_assert_held(&memcg_slab_mutex); + +	id = memcg_cache_id(memcg); +  	/* -	 * This happens, for instance, when a root cache goes away before we -	 * add any memcg. +	 * Since per-memcg caches are created asynchronously on first +	 * allocation (see memcg_kmem_get_cache()), several threads can try to +	 * create the same cache, but only one of them may succeed.  	 */ -	if (!s->memcg_params) +	if (cache_from_memcg_idx(root_cache, id))  		return; -	if (s->memcg_params->is_root_cache) -		goto out; +	cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); +	cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); +	/* +	 * If we could not create a memcg cache, do not complain, because +	 * that's not critical at all as we can always proceed with the root +	 * cache. +	 */ +	if (!cachep) +		return; -	memcg = s->memcg_params->memcg; -	id  = memcg_cache_id(memcg); +	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); -	root = s->memcg_params->root_cache; -	root->memcg_params->memcg_caches[id] = NULL; +	/* +	 * Since readers won't lock (see cache_from_memcg_idx()), we need a +	 * barrier here to ensure nobody will see the kmem_cache partially +	 * initialized. 
+	 */ +	smp_wmb(); -	mutex_lock(&memcg->slab_caches_mutex); -	list_del(&s->memcg_params->list); -	mutex_unlock(&memcg->slab_caches_mutex); +	BUG_ON(root_cache->memcg_params->memcg_caches[id]); +	root_cache->memcg_params->memcg_caches[id] = cachep; +} -	css_put(&memcg->css); -out: -	kfree(s->memcg_params); +static void memcg_unregister_cache(struct kmem_cache *cachep) +{ +	struct kmem_cache *root_cache; +	struct mem_cgroup *memcg; +	int id; + +	lockdep_assert_held(&memcg_slab_mutex); + +	BUG_ON(is_root_cache(cachep)); + +	root_cache = cachep->memcg_params->root_cache; +	memcg = cachep->memcg_params->memcg; +	id = memcg_cache_id(memcg); + +	BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); +	root_cache->memcg_params->memcg_caches[id] = NULL; + +	list_del(&cachep->memcg_params->list); + +	kmem_cache_destroy(cachep);  }  /* @@ -3300,241 +3209,74 @@ static inline void memcg_resume_kmem_account(void)  	current->memcg_kmem_skip_account--;  } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ -	struct kmem_cache *cachep; -	struct memcg_cache_params *p; - -	p = container_of(w, struct memcg_cache_params, destroy); - -	cachep = memcg_params_to_cache(p); - -	/* -	 * If we get down to 0 after shrink, we could delete right away. -	 * However, memcg_release_pages() already puts us back in the workqueue -	 * in that case. If we proceed deleting, we'll get a dangling -	 * reference, and removing the object from the workqueue in that case -	 * is unnecessary complication. We are not a fast path. -	 * -	 * Note that this case is fundamentally different from racing with -	 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in -	 * kmem_cache_shrink, not only we would be reinserting a dead cache -	 * into the queue, but doing so from inside the worker racing to -	 * destroy it. -	 * -	 * So if we aren't down to zero, we'll just schedule a worker and try -	 * again -	 */ -	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { -		kmem_cache_shrink(cachep); -		if (atomic_read(&cachep->memcg_params->nr_pages) == 0) -			return; -	} else -		kmem_cache_destroy(cachep); -} - -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ -	if (!cachep->memcg_params->dead) -		return; - -	/* -	 * There are many ways in which we can get here. -	 * -	 * We can get to a memory-pressure situation while the delayed work is -	 * still pending to run. The vmscan shrinkers can then release all -	 * cache memory and get us to destruction. If this is the case, we'll -	 * be executed twice, which is a bug (the second time will execute over -	 * bogus data). In this case, cancelling the work should be fine. -	 * -	 * But we can also get here from the worker itself, if -	 * kmem_cache_shrink is enough to shake all the remaining objects and -	 * get the page count to 0. In this case, we'll deadlock if we try to -	 * cancel the work (the worker runs with an internal lock held, which -	 * is the same lock we would hold for cancel_work_sync().) -	 * -	 * Since we can't possibly know who got us here, just refrain from -	 * running if there is already work pending -	 */ -	if (work_pending(&cachep->memcg_params->destroy)) -		return; -	/* -	 * We have to defer the actual destroying to a workqueue, because -	 * we might currently be in a context that cannot sleep. -	 */ -	schedule_work(&cachep->memcg_params->destroy); -} - -/* - * This lock protects updaters, not readers. We want readers to be as fast as - * they can, and they will either see NULL or a valid cache value. 
Our model - * allow them to see NULL, in which case the root memcg will be selected. - * - * We need this lock because multiple allocations to the same cache from a non - * will span more than one worker. Only one of them can create the cache. - */ -static DEFINE_MUTEX(memcg_cache_mutex); - -/* - * Called with memcg_cache_mutex held - */ -static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, -					 struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s)  { -	struct kmem_cache *new; -	static char *tmp_name = NULL; - -	lockdep_assert_held(&memcg_cache_mutex); - -	/* -	 * kmem_cache_create_memcg duplicates the given name and -	 * cgroup_name for this name requires RCU context. -	 * This static temporary buffer is used to prevent from -	 * pointless shortliving allocation. -	 */ -	if (!tmp_name) { -		tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); -		if (!tmp_name) -			return NULL; -	} - -	rcu_read_lock(); -	snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, -			 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); -	rcu_read_unlock(); - -	new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, -				      (s->flags & ~SLAB_PANIC), s->ctor, s); - -	if (new) -		new->allocflags |= __GFP_KMEMCG; - -	return new; -} - -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, -						  struct kmem_cache *cachep) -{ -	struct kmem_cache *new_cachep; -	int idx; +	struct kmem_cache *c; +	int i, failed = 0; -	BUG_ON(!memcg_can_account_kmem(memcg)); +	mutex_lock(&memcg_slab_mutex); +	for_each_memcg_cache_index(i) { +		c = cache_from_memcg_idx(s, i); +		if (!c) +			continue; -	idx = memcg_cache_id(memcg); +		memcg_unregister_cache(c); -	mutex_lock(&memcg_cache_mutex); -	new_cachep = cachep->memcg_params->memcg_caches[idx]; -	if (new_cachep) { -		css_put(&memcg->css); -		goto out; -	} - -	new_cachep = kmem_cache_dup(memcg, cachep); -	if (new_cachep == NULL) { -		new_cachep = cachep; -		css_put(&memcg->css); -		goto out; +		if (cache_from_memcg_idx(s, i)) +			failed++;  	} - -	atomic_set(&new_cachep->memcg_params->nr_pages , 0); - -	cachep->memcg_params->memcg_caches[idx] = new_cachep; -	/* -	 * the readers won't lock, make sure everybody sees the updated value, -	 * so they won't put stuff in the queue again for no reason -	 */ -	wmb(); -out: -	mutex_unlock(&memcg_cache_mutex); -	return new_cachep; +	mutex_unlock(&memcg_slab_mutex); +	return failed;  } -void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg)  { -	struct kmem_cache *c; -	int i; +	struct kmem_cache *cachep; +	struct memcg_cache_params *params, *tmp; -	if (!s->memcg_params) -		return; -	if (!s->memcg_params->is_root_cache) +	if (!memcg_kmem_is_active(memcg))  		return; -	/* -	 * If the cache is being destroyed, we trust that there is no one else -	 * requesting objects from it. Even if there are, the sanity checks in -	 * kmem_cache_destroy should caught this ill-case. -	 * -	 * Still, we don't want anyone else freeing memcg_caches under our -	 * noses, which can happen if a new memcg comes to life. As usual, -	 * we'll take the set_limit_mutex to protect ourselves against this. -	 */ -	mutex_lock(&set_limit_mutex); -	for (i = 0; i < memcg_limited_groups_array_size; i++) { -		c = s->memcg_params->memcg_caches[i]; -		if (!c) -			continue; - -		/* -		 * We will now manually delete the caches, so to avoid races -		 * we need to cancel all pending destruction workers and -		 * proceed with destruction ourselves. 
-		 * -		 * kmem_cache_destroy() will call kmem_cache_shrink internally, -		 * and that could spawn the workers again: it is likely that -		 * the cache still have active pages until this very moment. -		 * This would lead us back to mem_cgroup_destroy_cache. -		 * -		 * But that will not execute at all if the "dead" flag is not -		 * set, so flip it down to guarantee we are in control. -		 */ -		c->memcg_params->dead = false; -		cancel_work_sync(&c->memcg_params->destroy); -		kmem_cache_destroy(c); +	mutex_lock(&memcg_slab_mutex); +	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { +		cachep = memcg_params_to_cache(params); +		kmem_cache_shrink(cachep); +		if (atomic_read(&cachep->memcg_params->nr_pages) == 0) +			memcg_unregister_cache(cachep);  	} -	mutex_unlock(&set_limit_mutex); +	mutex_unlock(&memcg_slab_mutex);  } -struct create_work { +struct memcg_register_cache_work {  	struct mem_cgroup *memcg;  	struct kmem_cache *cachep;  	struct work_struct work;  }; -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_register_cache_func(struct work_struct *w)  { -	struct kmem_cache *cachep; -	struct memcg_cache_params *params; - -	if (!memcg_kmem_is_active(memcg)) -		return; +	struct memcg_register_cache_work *cw = +		container_of(w, struct memcg_register_cache_work, work); +	struct mem_cgroup *memcg = cw->memcg; +	struct kmem_cache *cachep = cw->cachep; -	mutex_lock(&memcg->slab_caches_mutex); -	list_for_each_entry(params, &memcg->memcg_slab_caches, list) { -		cachep = memcg_params_to_cache(params); -		cachep->memcg_params->dead = true; -		schedule_work(&cachep->memcg_params->destroy); -	} -	mutex_unlock(&memcg->slab_caches_mutex); -} +	mutex_lock(&memcg_slab_mutex); +	memcg_register_cache(memcg, cachep); +	mutex_unlock(&memcg_slab_mutex); -static void memcg_create_cache_work_func(struct work_struct *w) -{ -	struct create_work *cw; - -	cw = container_of(w, struct create_work, work); -	memcg_create_kmem_cache(cw->memcg, cw->cachep); +	css_put(&memcg->css);  	kfree(cw);  }  /*   * Enqueue the creation of a per-memcg kmem_cache.   */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, -					 struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, +					    struct kmem_cache *cachep)  { -	struct create_work *cw; +	struct memcg_register_cache_work *cw; -	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); +	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);  	if (cw == NULL) {  		css_put(&memcg->css);  		return; @@ -3543,17 +3285,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,  	cw->memcg = memcg;  	cw->cachep = cachep; -	INIT_WORK(&cw->work, memcg_create_cache_work_func); +	INIT_WORK(&cw->work, memcg_register_cache_func);  	schedule_work(&cw->work);  } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, -				       struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, +					  struct kmem_cache *cachep)  {  	/*  	 * We need to stop accounting when we kmalloc, because if the  	 * corresponding kmalloc cache is not yet created, the first allocation -	 * in __memcg_create_cache_enqueue will recurse. +	 * in __memcg_schedule_register_cache will recurse.  	 *  	 * However, it is better to enclose the whole function. 
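A user-space picture of the recursion guard being described in this comment: a per-thread depth counter makes any allocation issued inside the wrapped region bypass accounting, so the path that registers a cache can never re-enter itself. All names below are illustrative, not kernel code.

#include <stdio.h>
#include <stdlib.h>

static _Thread_local int kmem_skip_account;     /* per-task skip counter */

static void stop_account(void)   { kmem_skip_account++; }
static void resume_account(void) { kmem_skip_account--; }

static void *accounted_alloc(size_t size)
{
        if (!kmem_skip_account)
                printf("charging %zu bytes\n", size);   /* the accounting step */
        return malloc(size);
}

static void schedule_register_cache(void)
{
        stop_account();                 /* wrap the whole function ... */
        free(accounted_alloc(128));     /* ... so internal allocations are skipped */
        resume_account();
}

int main(void)
{
        schedule_register_cache();      /* prints nothing */
        free(accounted_alloc(64));      /* prints "charging 64 bytes" */
        return 0;
}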
Depending on  	 * the debugging options enabled, INIT_WORK(), for instance, can @@ -3562,9 +3304,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,  	 * the safest choice is to do it like this, wrapping the whole function.  	 */  	memcg_stop_kmem_account(); -	__memcg_create_cache_enqueue(memcg, cachep); +	__memcg_schedule_register_cache(memcg, cachep);  	memcg_resume_kmem_account();  } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ +	int res; + +	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, +				PAGE_SIZE << order); +	if (!res) +		atomic_add(1 << order, &cachep->memcg_params->nr_pages); +	return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ +	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); +	atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} +  /*   * Return the kmem_cache we're supposed to use for a slab allocation.   * We try to use the current memcg's version of the cache. @@ -3582,7 +3342,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,  					  gfp_t gfp)  {  	struct mem_cgroup *memcg; -	int idx; +	struct kmem_cache *memcg_cachep;  	VM_BUG_ON(!cachep->memcg_params);  	VM_BUG_ON(!cachep->memcg_params->is_root_cache); @@ -3596,20 +3356,14 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,  	if (!memcg_can_account_kmem(memcg))  		goto out; -	idx = memcg_cache_id(memcg); - -	/* -	 * barrier to mare sure we're always seeing the up to date value.  The -	 * code updating memcg_caches will issue a write barrier to match this. -	 */ -	read_barrier_depends(); -	if (likely(cachep->memcg_params->memcg_caches[idx])) { -		cachep = cachep->memcg_params->memcg_caches[idx]; +	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); +	if (likely(memcg_cachep)) { +		cachep = memcg_cachep;  		goto out;  	}  	/* The corresponding put will be done in the workqueue. */ -	if (!css_tryget(&memcg->css)) +	if (!css_tryget_online(&memcg->css))  		goto out;  	rcu_read_unlock(); @@ -3621,22 +3375,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,  	 *  	 * However, there are some clashes that can arrive from locking.  	 * For instance, because we acquire the slab_mutex while doing -	 * kmem_cache_dup, this means no further allocation could happen -	 * with the slab_mutex held. -	 * -	 * Also, because cache creation issue get_online_cpus(), this -	 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, -	 * that ends up reversed during cpu hotplug. (cpuset allocates -	 * a bunch of GFP_KERNEL memory during cpuup). Due to all that, -	 * better to defer everything. +	 * memcg_create_kmem_cache, this means no further allocation +	 * could happen with the slab_mutex held. So it's better to +	 * defer everything.  	 */ -	memcg_create_cache_enqueue(memcg, cachep); +	memcg_schedule_register_cache(memcg, cachep);  	return cachep;  out:  	rcu_read_unlock();  	return cachep;  } -EXPORT_SYMBOL(__memcg_kmem_get_cache);  /*   * We need to verify if the allocation against current->mm->owner's memcg is @@ -3663,11 +3411,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)  	/*  	 * Disabling accounting is only relevant for some specific memcg  	 * internal allocations. Therefore we would initially not have such -	 * check here, since direct calls to the page allocator that are marked -	 * with GFP_KMEMCG only happen outside memcg core. 
We are mostly -	 * concerned with cache allocations, and by having this test at -	 * memcg_kmem_get_cache, we are already able to relay the allocation to -	 * the root cache and bypass the memcg cache altogether. +	 * check here, since direct calls to the page allocator that are +	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen +	 * outside memcg core. We are mostly concerned with cache allocations, +	 * and by having this test at memcg_kmem_get_cache, we are already able +	 * to relay the allocation to the root cache and bypass the memcg cache +	 * altogether.  	 *  	 * There is one exception, though: the SLUB allocator does not create  	 * large order caches, but rather service large kmallocs directly from @@ -3687,15 +3436,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)  	if (!current->mm || current->memcg_kmem_skip_account)  		return true; -	memcg = try_get_mem_cgroup_from_mm(current->mm); - -	/* -	 * very rare case described in mem_cgroup_from_task. Unfortunately there -	 * isn't much we can do without complicating this too much, and it would -	 * be gfp-dependent anyway. Just let it go -	 */ -	if (unlikely(!memcg)) -		return true; +	memcg = get_mem_cgroup_from_mm(current->mm);  	if (!memcg_can_account_kmem(memcg)) {  		css_put(&memcg->css); @@ -3758,11 +3499,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)  	if (!memcg)  		return; -	VM_BUG_ON(mem_cgroup_is_root(memcg)); +	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);  	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);  }  #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)  {  }  #endif /* CONFIG_MEMCG_KMEM */ @@ -3798,20 +3539,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)  }  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline -void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, -					struct mem_cgroup *to, -					unsigned int nr_pages, -					enum mem_cgroup_stat_index idx) -{ -	/* Update stat data for mem_cgroup */ -	preempt_disable(); -	WARN_ON_ONCE(from->stat->count[idx] < nr_pages); -	__this_cpu_add(from->stat->count[idx], -nr_pages); -	__this_cpu_add(to->stat->count[idx], nr_pages); -	preempt_enable(); -} -  /**   * mem_cgroup_move_account - move account of the page   * @page: the page @@ -3838,7 +3565,7 @@ static int mem_cgroup_move_account(struct page *page,  	bool anon = PageAnon(page);  	VM_BUG_ON(from == to); -	VM_BUG_ON(PageLRU(page)); +	VM_BUG_ON_PAGE(PageLRU(page), page);  	/*  	 * The page is isolated from LRU. So, collapse function  	 * will not handle this page. But page splitting can happen. 
@@ -3857,13 +3584,19 @@ static int mem_cgroup_move_account(struct page *page,  	move_lock_mem_cgroup(from, &flags); -	if (!anon && page_mapped(page)) -		mem_cgroup_move_account_page_stat(from, to, nr_pages, -			MEM_CGROUP_STAT_FILE_MAPPED); +	if (!anon && page_mapped(page)) { +		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], +			       nr_pages); +		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], +			       nr_pages); +	} -	if (PageWriteback(page)) -		mem_cgroup_move_account_page_stat(from, to, nr_pages, -			MEM_CGROUP_STAT_WRITEBACK); +	if (PageWriteback(page)) { +		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], +			       nr_pages); +		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], +			       nr_pages); +	}  	mem_cgroup_charge_statistics(from, page, anon, -nr_pages); @@ -3931,7 +3664,7 @@ static int mem_cgroup_move_parent(struct page *page,  		parent = root_mem_cgroup;  	if (nr_pages > 1) { -		VM_BUG_ON(!PageTransHuge(page)); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page);  		flags = compound_lock_irqsave(page);  	} @@ -3949,23 +3682,23 @@ out:  	return ret;  } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit - */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, -				gfp_t gfp_mask, enum charge_type ctype) +int mem_cgroup_charge_anon(struct page *page, +			      struct mm_struct *mm, gfp_t gfp_mask)  { -	struct mem_cgroup *memcg = NULL;  	unsigned int nr_pages = 1; +	struct mem_cgroup *memcg;  	bool oom = true; -	int ret; + +	if (mem_cgroup_disabled()) +		return 0; + +	VM_BUG_ON_PAGE(page_mapped(page), page); +	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); +	VM_BUG_ON(!mm);  	if (PageTransHuge(page)) {  		nr_pages <<= compound_order(page); -		VM_BUG_ON(!PageTransHuge(page)); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page);  		/*  		 * Never OOM-kill a process for a huge page.  The  		 * fault handler will fall back to regular pages. @@ -3973,25 +3706,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,  		oom = false;  	} -	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); -	if (ret == -ENOMEM) -		return ret; -	__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); +	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); +	if (!memcg) +		return -ENOMEM; +	__mem_cgroup_commit_charge(memcg, page, nr_pages, +				   MEM_CGROUP_CHARGE_TYPE_ANON, false);  	return 0;  } -int mem_cgroup_newpage_charge(struct page *page, -			      struct mm_struct *mm, gfp_t gfp_mask) -{ -	if (mem_cgroup_disabled()) -		return 0; -	VM_BUG_ON(page_mapped(page)); -	VM_BUG_ON(page->mapping && !PageAnon(page)); -	VM_BUG_ON(!mm); -	return mem_cgroup_charge_common(page, mm, gfp_mask, -					MEM_CGROUP_CHARGE_TYPE_ANON); -} -  /*   * While swap-in, try_charge -> commit or cancel, the page is locked.   * And when try_charge() successfully returns, one refcnt to memcg without @@ -4003,7 +3725,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,  					  gfp_t mask,  					  struct mem_cgroup **memcgp)  { -	struct mem_cgroup *memcg; +	struct mem_cgroup *memcg = NULL;  	struct page_cgroup *pc;  	int ret; @@ -4016,31 +3738,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,  	 * in turn serializes uncharging.  	 
*/  	if (PageCgroupUsed(pc)) -		return 0; -	if (!do_swap_account) -		goto charge_cur_mm; -	memcg = try_get_mem_cgroup_from_page(page); +		goto out; +	if (do_swap_account) +		memcg = try_get_mem_cgroup_from_page(page);  	if (!memcg) -		goto charge_cur_mm; -	*memcgp = memcg; -	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); +		memcg = get_mem_cgroup_from_mm(mm); +	ret = mem_cgroup_try_charge(memcg, mask, 1, true);  	css_put(&memcg->css);  	if (ret == -EINTR) -		ret = 0; -	return ret; -charge_cur_mm: -	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); -	if (ret == -EINTR) -		ret = 0; -	return ret; +		memcg = root_mem_cgroup; +	else if (ret) +		return ret; +out: +	*memcgp = memcg; +	return 0;  }  int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,  				 gfp_t gfp_mask, struct mem_cgroup **memcgp)  { -	*memcgp = NULL; -	if (mem_cgroup_disabled()) +	if (mem_cgroup_disabled()) { +		*memcgp = NULL;  		return 0; +	}  	/*  	 * A racing thread's fault, or swapoff, may have already  	 * updated the pte, and even removed page from swap cache: in @@ -4048,12 +3768,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,  	 * there's also a KSM case which does need to charge the page.  	 */  	if (!PageSwapCache(page)) { -		int ret; +		struct mem_cgroup *memcg; -		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); -		if (ret == -EINTR) -			ret = 0; -		return ret; +		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); +		if (!memcg) +			return -ENOMEM; +		*memcgp = memcg; +		return 0;  	}  	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);  } @@ -4097,11 +3818,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page,  					  MEM_CGROUP_CHARGE_TYPE_ANON);  } -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,  				gfp_t gfp_mask)  { -	struct mem_cgroup *memcg = NULL;  	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; +	struct mem_cgroup *memcg;  	int ret;  	if (mem_cgroup_disabled()) @@ -4109,15 +3830,20 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,  	if (PageCompound(page))  		return 0; -	if (!PageSwapCache(page)) -		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); -	else { /* page is swapcache/shmem */ +	if (PageSwapCache(page)) { /* shmem */  		ret = __mem_cgroup_try_charge_swapin(mm, page,  						     gfp_mask, &memcg); -		if (!ret) -			__mem_cgroup_commit_charge_swapin(page, memcg, type); +		if (ret) +			return ret; +		__mem_cgroup_commit_charge_swapin(page, memcg, type); +		return 0;  	} -	return ret; + +	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); +	if (!memcg) +		return -ENOMEM; +	__mem_cgroup_commit_charge(memcg, page, 1, type, false); +	return 0;  }  static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, @@ -4190,7 +3916,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,  	if (PageTransHuge(page)) {  		nr_pages <<= compound_order(page); -		VM_BUG_ON(!PageTransHuge(page)); +		VM_BUG_ON_PAGE(!PageTransHuge(page), page);  	}  	/*  	 * Check if our page_cgroup is valid @@ -4282,7 +4008,7 @@ void mem_cgroup_uncharge_page(struct page *page)  	/* early check. 
*/  	if (page_mapped(page))  		return; -	VM_BUG_ON(page->mapping && !PageAnon(page)); +	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);  	/*  	 * If the page is in swap cache, uncharge should be deferred  	 * to the swap path, which also properly accounts swap usage @@ -4302,8 +4028,8 @@ void mem_cgroup_uncharge_page(struct page *page)  void mem_cgroup_uncharge_cache_page(struct page *page)  { -	VM_BUG_ON(page_mapped(page)); -	VM_BUG_ON(page->mapping); +	VM_BUG_ON_PAGE(page_mapped(page), page); +	VM_BUG_ON_PAGE(page->mapping, page);  	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);  } @@ -4375,7 +4101,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)  	 * css_get() was called in uncharge().  	 */  	if (do_swap_account && swapout && memcg) -		swap_cgroup_record(ent, css_id(&memcg->css)); +		swap_cgroup_record(ent, mem_cgroup_id(memcg));  }  #endif @@ -4397,8 +4123,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)  	memcg = mem_cgroup_lookup(id);  	if (memcg) {  		/* -		 * We uncharge this because swap is freed. -		 * This memcg can be obsolete one. We avoid calling css_tryget +		 * We uncharge this because swap is freed.  This memcg can +		 * be obsolete one. We avoid calling css_tryget_online().  		 */  		if (!mem_cgroup_is_root(memcg))  			res_counter_uncharge(&memcg->memsw, PAGE_SIZE); @@ -4427,8 +4153,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,  {  	unsigned short old_id, new_id; -	old_id = css_id(&from->css); -	new_id = css_id(&to->css); +	old_id = mem_cgroup_id(from); +	new_id = mem_cgroup_id(to);  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {  		mem_cgroup_swap_statistics(from, false); @@ -4852,7 +4578,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  					break;  			} while (1);  		} -		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +		__mem_cgroup_remove_exceeded(mz, mctz);  		excess = res_counter_soft_limit_excess(&mz->memcg->res);  		/*  		 * One school of thought says that we should not add @@ -4863,7 +4589,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  		 * term TODO.  		 */  		/* If excess == 0, no tree ops */ -		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); +		__mem_cgroup_insert_exceeded(mz, mctz, excess);  		spin_unlock(&mctz->lock);  		css_put(&mz->memcg->css);  		loop++; @@ -4930,9 +4656,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,  		if (mem_cgroup_move_parent(page, pc, memcg)) {  			/* found lock contention or "pc" is obsolete. */  			busy = page; -			cond_resched();  		} else  			busy = NULL; +		cond_resched();  	} while (!list_empty(list));  } @@ -4984,30 +4710,27 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)  }  /* - * This mainly exists for tests during the setting of set of use_hierarchy. - * Since this is the very setting we are changing, the current hierarchy value - * is meaningless + * Test whether @memcg has children, dead or alive.  Note that this + * function doesn't care whether @memcg has use_hierarchy enabled and + * returns %true if there are child csses according to the cgroup + * hierarchy.  Testing use_hierarchy is the caller's responsiblity.   
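Earlier in this hunk the swap records move from css ids to mem_cgroup_id(), one small integer per swap entry, and handing a charge over between groups is a compare-and-swap on that id (swap_cgroup_cmpxchg()). A user-space C11 sketch of that ownership handoff; the slot count and helper names are invented for illustration:

#include <stdatomic.h>
#include <stdio.h>

#define NR_SLOTS 1024                   /* one owner id per swap slot */
static _Atomic unsigned short swap_owner[NR_SLOTS];

static void record(unsigned int slot, unsigned short id)
{
        atomic_store(&swap_owner[slot], id);
}

static int move_charge(unsigned int slot, unsigned short from, unsigned short to)
{
        unsigned short expected = from;
        /* succeeds only if the slot is still owned by "from" */
        return atomic_compare_exchange_strong(&swap_owner[slot], &expected, to);
}

int main(void)
{
        record(42, 3);
        printf("move 3->5: %d\n", move_charge(42, 3, 5));   /* 1: moved */
        printf("move 3->7: %d\n", move_charge(42, 3, 7));   /* 0: owner is now 5 */
        return 0;
}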
*/ -static inline bool __memcg_has_children(struct mem_cgroup *memcg) +static inline bool memcg_has_children(struct mem_cgroup *memcg)  { -	struct cgroup_subsys_state *pos; +	bool ret; -	/* bounce at first found */ -	css_for_each_child(pos, &memcg->css) -		return true; -	return false; -} +	/* +	 * The lock does not prevent addition or deletion of children, but +	 * it prevents a new child from being initialized based on this +	 * parent in css_online(), so it's enough to decide whether +	 * hierarchically inherited attributes can still be changed or not. +	 */ +	lockdep_assert_held(&memcg_create_mutex); -/* - * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed - * to be already dead (as in mem_cgroup_force_empty, for instance).  This is - * from mem_cgroup_count_children(), in the sense that we don't really care how - * many children we have; we only need to know if we have any.  It also counts - * any memcg without hierarchy as infertile. - */ -static inline bool memcg_has_children(struct mem_cgroup *memcg) -{ -	return memcg->use_hierarchy && __memcg_has_children(memcg); +	rcu_read_lock(); +	ret = css_next_child(NULL, &memcg->css); +	rcu_read_unlock(); +	return ret;  }  /* @@ -5019,11 +4742,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)  {  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; -	struct cgroup *cgrp = memcg->css.cgroup; - -	/* returns EBUSY if there is a task or if we come here twice. */ -	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) -		return -EBUSY;  	/* we call try-to-free pages for make this cgroup empty */  	lru_add_drain_all(); @@ -5043,20 +4761,19 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)  		}  	} -	lru_add_drain(); -	mem_cgroup_reparent_charges(memcg);  	return 0;  } -static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, -					unsigned int event) +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, +					    char *buf, size_t nbytes, +					    loff_t off)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));  	if (mem_cgroup_is_root(memcg))  		return -EINVAL; -	return mem_cgroup_force_empty(memcg); +	return mem_cgroup_force_empty(memcg) ?: nbytes;  }  static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, @@ -5070,7 +4787,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,  {  	int retval = 0;  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); +	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);  	mutex_lock(&memcg_create_mutex); @@ -5087,7 +4804,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,  	 */  	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&  				(val == 1 || val == 0)) { -		if (!__memcg_has_children(memcg)) +		if (!memcg_has_children(memcg))  			memcg->use_hierarchy = val;  		else  			retval = -EBUSY; @@ -5140,14 +4857,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)  	return val << PAGE_SHIFT;  } -static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, -			       struct cftype *cft, struct file *file, -			       char __user *buf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, +				   struct cftype *cft)  {  	struct mem_cgroup *memcg = 
mem_cgroup_from_css(css); -	char str[64];  	u64 val; -	int name, len; +	int name;  	enum res_type type;  	type = MEMFILE_TYPE(cft->private); @@ -5173,15 +4888,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,  		BUG();  	} -	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); -	return simple_read_from_buffer(buf, nbytes, ppos, str, len); +	return val;  } -static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) -{ -	int ret = -EINVAL;  #ifdef CONFIG_MEMCG_KMEM -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +/* should be called with activate_kmem_mutex held */ +static int __memcg_activate_kmem(struct mem_cgroup *memcg, +				 unsigned long long limit) +{ +	int err = 0; +	int memcg_id; + +	if (memcg_kmem_is_active(memcg)) +		return 0; + +	/* +	 * We are going to allocate memory for data shared by all memory +	 * cgroups so let's stop accounting here. +	 */ +	memcg_stop_kmem_account(); +  	/*  	 * For simplicity, we won't allow this to be disabled.  It also can't  	 * be changed if the cgroup has children already, or if tasks had @@ -5195,89 +4921,121 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)  	 * of course permitted.  	 */  	mutex_lock(&memcg_create_mutex); -	mutex_lock(&set_limit_mutex); -	if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { -		if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { -			ret = -EBUSY; -			goto out; -		} -		ret = res_counter_set_limit(&memcg->kmem, val); -		VM_BUG_ON(ret); +	if (cgroup_has_tasks(memcg->css.cgroup) || +	    (memcg->use_hierarchy && memcg_has_children(memcg))) +		err = -EBUSY; +	mutex_unlock(&memcg_create_mutex); +	if (err) +		goto out; -		ret = memcg_update_cache_sizes(memcg); -		if (ret) { -			res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); -			goto out; -		} -		static_key_slow_inc(&memcg_kmem_enabled_key); -		/* -		 * setting the active bit after the inc will guarantee no one -		 * starts accounting before all call sites are patched -		 */ -		memcg_kmem_set_active(memcg); -	} else -		ret = res_counter_set_limit(&memcg->kmem, val); +	memcg_id = ida_simple_get(&kmem_limited_groups, +				  0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); +	if (memcg_id < 0) { +		err = memcg_id; +		goto out; +	} + +	/* +	 * Make sure we have enough space for this cgroup in each root cache's +	 * memcg_params. +	 */ +	mutex_lock(&memcg_slab_mutex); +	err = memcg_update_all_caches(memcg_id + 1); +	mutex_unlock(&memcg_slab_mutex); +	if (err) +		goto out_rmid; + +	memcg->kmemcg_id = memcg_id; +	INIT_LIST_HEAD(&memcg->memcg_slab_caches); + +	/* +	 * We couldn't have accounted to this cgroup, because it hasn't got the +	 * active bit set yet, so this should succeed. +	 */ +	err = res_counter_set_limit(&memcg->kmem, limit); +	VM_BUG_ON(err); + +	static_key_slow_inc(&memcg_kmem_enabled_key); +	/* +	 * Setting the active bit after enabling static branching will +	 * guarantee no one starts accounting before all call sites are +	 * patched. 
+	 */ +	memcg_kmem_set_active(memcg);  out: -	mutex_unlock(&set_limit_mutex); -	mutex_unlock(&memcg_create_mutex); -#endif +	memcg_resume_kmem_account(); +	return err; + +out_rmid: +	ida_simple_remove(&kmem_limited_groups, memcg_id); +	goto out; +} + +static int memcg_activate_kmem(struct mem_cgroup *memcg, +			       unsigned long long limit) +{ +	int ret; + +	mutex_lock(&activate_kmem_mutex); +	ret = __memcg_activate_kmem(memcg, limit); +	mutex_unlock(&activate_kmem_mutex); +	return ret; +} + +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, +				   unsigned long long val) +{ +	int ret; + +	if (!memcg_kmem_is_active(memcg)) +		ret = memcg_activate_kmem(memcg, val); +	else +		ret = res_counter_set_limit(&memcg->kmem, val);  	return ret;  } -#ifdef CONFIG_MEMCG_KMEM  static int memcg_propagate_kmem(struct mem_cgroup *memcg)  {  	int ret = 0;  	struct mem_cgroup *parent = parent_mem_cgroup(memcg); -	if (!parent) -		goto out; -	memcg->kmem_account_flags = parent->kmem_account_flags; -	/* -	 * When that happen, we need to disable the static branch only on those -	 * memcgs that enabled it. To achieve this, we would be forced to -	 * complicate the code by keeping track of which memcgs were the ones -	 * that actually enabled limits, and which ones got it from its -	 * parents. -	 * -	 * It is a lot simpler just to do static_key_slow_inc() on every child -	 * that is accounted. -	 */ -	if (!memcg_kmem_is_active(memcg)) -		goto out; +	if (!parent) +		return 0; +	mutex_lock(&activate_kmem_mutex);  	/* -	 * __mem_cgroup_free() will issue static_key_slow_dec() because this -	 * memcg is active already. If the later initialization fails then the -	 * cgroup core triggers the cleanup so we do not have to do it here. +	 * If the parent cgroup is not kmem-active now, it cannot be activated +	 * after this point, because it has at least one child already.  	 */ -	static_key_slow_inc(&memcg_kmem_enabled_key); - -	mutex_lock(&set_limit_mutex); -	memcg_stop_kmem_account(); -	ret = memcg_update_cache_sizes(memcg); -	memcg_resume_kmem_account(); -	mutex_unlock(&set_limit_mutex); -out: +	if (memcg_kmem_is_active(parent)) +		ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); +	mutex_unlock(&activate_kmem_mutex);  	return ret;  } +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, +				   unsigned long long val) +{ +	return -EINVAL; +}  #endif /* CONFIG_MEMCG_KMEM */  /*   * The user of this function is...   * RES_LIMIT.   
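The RES_LIMIT branch of the write handler that follows feeds the buffer to res_counter_memparse_write_strategy(), which accepts human-readable sizes such as "512M" (and "-1" for the unlimited value). A rough user-space equivalent of that parsing, for illustration only:

#include <stdio.h>
#include <stdlib.h>

/* minimal memparse-style helper: "4096", "64K", "512M", "1G", or "-1" */
static int parse_limit(const char *buf, unsigned long long *val)
{
        char *end;
        unsigned long long v;

        if (buf[0] == '-' && buf[1] == '1' && buf[2] == '\0') {
                *val = ~0ULL;           /* stand-in for RES_COUNTER_MAX */
                return 0;
        }

        v = strtoull(buf, &end, 10);
        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10; end++; break;
        case '\0': break;
        default: return -1;
        }
        if (*end != '\0')
                return -1;
        *val = v;
        return 0;
}

int main(void)
{
        unsigned long long v;
        if (parse_limit("512M", &v) == 0)
                printf("%llu\n", v);    /* 536870912 */
        return 0;
}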
*/ -static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, -			    const char *buffer) +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, +				char *buf, size_t nbytes, loff_t off)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));  	enum res_type type;  	int name;  	unsigned long long val;  	int ret; -	type = MEMFILE_TYPE(cft->private); -	name = MEMFILE_ATTR(cft->private); +	buf = strstrip(buf); +	type = MEMFILE_TYPE(of_cft(of)->private); +	name = MEMFILE_ATTR(of_cft(of)->private);  	switch (name) {  	case RES_LIMIT: @@ -5286,7 +5044,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  			break;  		}  		/* This function does all necessary parse...reuse it */ -		ret = res_counter_memparse_write_strategy(buffer, &val); +		ret = res_counter_memparse_write_strategy(buf, &val);  		if (ret)  			break;  		if (type == _MEM) @@ -5294,12 +5052,12 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  		else if (type == _MEMSWAP)  			ret = mem_cgroup_resize_memsw_limit(memcg, val);  		else if (type == _KMEM) -			ret = memcg_update_kmem_limit(css, val); +			ret = memcg_update_kmem_limit(memcg, val);  		else  			return -EINVAL;  		break;  	case RES_SOFT_LIMIT: -		ret = res_counter_memparse_write_strategy(buffer, &val); +		ret = res_counter_memparse_write_strategy(buf, &val);  		if (ret)  			break;  		/* @@ -5316,7 +5074,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  		ret = -EINVAL; /* should be BUG() ? */  		break;  	} -	return ret; +	return ret ?: nbytes;  }  static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, @@ -5329,8 +5087,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,  	if (!memcg->use_hierarchy)  		goto out; -	while (css_parent(&memcg->css)) { -		memcg = mem_cgroup_from_css(css_parent(&memcg->css)); +	while (memcg->css.parent) { +		memcg = mem_cgroup_from_css(memcg->css.parent);  		if (!memcg->use_hierarchy)  			break;  		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5343,14 +5101,15 @@ out:  	*memsw_limit = min_memsw_limit;  } -static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, +				size_t nbytes, loff_t off)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));  	int name;  	enum res_type type; -	type = MEMFILE_TYPE(event); -	name = MEMFILE_ATTR(event); +	type = MEMFILE_TYPE(of_cft(of)->private); +	name = MEMFILE_ATTR(of_cft(of)->private);  	switch (name) {  	case RES_MAX_USAGE: @@ -5375,7 +5134,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)  		break;  	} -	return 0; +	return nbytes;  }  static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, @@ -5411,48 +5170,52 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,  #endif  #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup_subsys_state *css, -				struct cftype *cft, struct seq_file *m) +static int memcg_numa_stat_show(struct seq_file *m, void *v)  { -	int nid; -	unsigned long total_nr, file_nr, anon_nr, unevictable_nr; -	unsigned long node_nr; -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); - -	total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); -	seq_printf(m, "total=%lu", total_nr); -	for_each_node_state(nid, N_MEMORY) 
{ -		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); -		seq_printf(m, " N%d=%lu", nid, node_nr); -	} -	seq_putc(m, '\n'); - -	file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); -	seq_printf(m, "file=%lu", file_nr); -	for_each_node_state(nid, N_MEMORY) { -		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, -				LRU_ALL_FILE); -		seq_printf(m, " N%d=%lu", nid, node_nr); -	} -	seq_putc(m, '\n'); +	struct numa_stat { +		const char *name; +		unsigned int lru_mask; +	}; -	anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); -	seq_printf(m, "anon=%lu", anon_nr); -	for_each_node_state(nid, N_MEMORY) { -		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, -				LRU_ALL_ANON); -		seq_printf(m, " N%d=%lu", nid, node_nr); +	static const struct numa_stat stats[] = { +		{ "total", LRU_ALL }, +		{ "file", LRU_ALL_FILE }, +		{ "anon", LRU_ALL_ANON }, +		{ "unevictable", BIT(LRU_UNEVICTABLE) }, +	}; +	const struct numa_stat *stat; +	int nid; +	unsigned long nr; +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + +	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { +		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); +		seq_printf(m, "%s=%lu", stat->name, nr); +		for_each_node_state(nid, N_MEMORY) { +			nr = mem_cgroup_node_nr_lru_pages(memcg, nid, +							  stat->lru_mask); +			seq_printf(m, " N%d=%lu", nid, nr); +		} +		seq_putc(m, '\n'); +	} + +	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { +		struct mem_cgroup *iter; + +		nr = 0; +		for_each_mem_cgroup_tree(iter, memcg) +			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); +		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); +		for_each_node_state(nid, N_MEMORY) { +			nr = 0; +			for_each_mem_cgroup_tree(iter, memcg) +				nr += mem_cgroup_node_nr_lru_pages( +					iter, nid, stat->lru_mask); +			seq_printf(m, " N%d=%lu", nid, nr); +		} +		seq_putc(m, '\n');  	} -	seq_putc(m, '\n'); -	unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); -	seq_printf(m, "unevictable=%lu", unevictable_nr); -	for_each_node_state(nid, N_MEMORY) { -		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, -				BIT(LRU_UNEVICTABLE)); -		seq_printf(m, " N%d=%lu", nid, node_nr); -	} -	seq_putc(m, '\n');  	return 0;  }  #endif /* CONFIG_NUMA */ @@ -5462,10 +5225,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);  } -static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, -				 struct seq_file *m) +static int memcg_stat_show(struct seq_file *m, void *v)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));  	struct mem_cgroup *mi;  	unsigned int i; @@ -5531,7 +5293,7 @@ static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,  		for_each_online_node(nid)  			for (zid = 0; zid < MAX_NR_ZONES; zid++) { -				mz = mem_cgroup_zoneinfo(memcg, nid, zid); +				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];  				rstat = &mz->lruvec.reclaim_stat;  				recent_rotated[0] += rstat->recent_rotated[0]; @@ -5561,22 +5323,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,  				       struct cftype *cft, u64 val)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - -	if (val > 100 || !parent) -		return -EINVAL; - -	mutex_lock(&memcg_create_mutex); -	/* If under hierarchy, only empty-root can set this value */ -	if 
((parent->use_hierarchy) || memcg_has_children(memcg)) { -		mutex_unlock(&memcg_create_mutex); +	if (val > 100)  		return -EINVAL; -	} - -	memcg->swappiness = val; -	mutex_unlock(&memcg_create_mutex); +	if (css->parent) +		memcg->swappiness = val; +	else +		vm_swappiness = val;  	return 0;  } @@ -5661,8 +5415,12 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)  {  	struct mem_cgroup_eventfd_list *ev; +	spin_lock(&memcg_oom_lock); +  	list_for_each_entry(ev, &memcg->oom_notify, list)  		eventfd_signal(ev->eventfd, 1); + +	spin_unlock(&memcg_oom_lock);  	return 0;  } @@ -5674,13 +5432,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)  		mem_cgroup_oom_notify_cb(iter);  } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args, enum res_type type)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_thresholds *thresholds;  	struct mem_cgroup_threshold_ary *new; -	enum res_type type = MEMFILE_TYPE(cft->private);  	u64 threshold, usage;  	int i, size, ret; @@ -5757,13 +5513,23 @@ unlock:  	return ret;  } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args) +{ +	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args) +{ +	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, enum res_type type)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_thresholds *thresholds;  	struct mem_cgroup_threshold_ary *new; -	enum res_type type = MEMFILE_TYPE(cft->private);  	u64 usage;  	int i, j, size; @@ -5836,14 +5602,23 @@ unlock:  	mutex_unlock(&memcg->thresholds_lock);  } -static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd) +{ +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd) +{ +	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd, const char *args)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_eventfd_list *event; -	enum res_type type = MEMFILE_TYPE(cft->private); -	BUG_ON(type != _OOM_TYPE);  	event = kmalloc(sizeof(*event),	GFP_KERNEL);  	if (!event)  		return -ENOMEM; @@ -5861,14 +5636,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,  	return 0;  } -static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, -	struct cftype *cft, struct eventfd_ctx *eventfd) +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, +	struct eventfd_ctx *eventfd)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct 
mem_cgroup_eventfd_list *ev, *tmp; -	enum res_type type = MEMFILE_TYPE(cft->private); - -	BUG_ON(type != _OOM_TYPE);  	spin_lock(&memcg_oom_lock); @@ -5882,17 +5653,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,  	spin_unlock(&memcg_oom_lock);  } -static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, -	struct cftype *cft,  struct cgroup_map_cb *cb) +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); - -	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); +	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); -	if (atomic_read(&memcg->under_oom)) -		cb->fill(cb, "under_oom", 1); -	else -		cb->fill(cb, "under_oom", 0); +	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); +	seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));  	return 0;  } @@ -5900,22 +5666,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,  	struct cftype *cft, u64 val)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));  	/* cannot set to root cgroup and only 0 and 1 are allowed */ -	if (!parent || !((val == 0) || (val == 1))) +	if (!css->parent || !((val == 0) || (val == 1)))  		return -EINVAL; -	mutex_lock(&memcg_create_mutex); -	/* oom-kill-disable is a flag for subhierarchy. */ -	if ((parent->use_hierarchy) || memcg_has_children(memcg)) { -		mutex_unlock(&memcg_create_mutex); -		return -EINVAL; -	}  	memcg->oom_kill_disable = val;  	if (!val)  		memcg_oom_recover(memcg); -	mutex_unlock(&memcg_create_mutex); +  	return 0;  } @@ -5955,10 +5714,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)  	 * which is then paired with css_put during uncharge resp. here.  	 *  	 * Although this might sound strange as this path is called from -	 * css_offline() when the referencemight have dropped down to 0 -	 * and shouldn't be incremented anymore (css_tryget would fail) -	 * we do not have other options because of the kmem allocations -	 * lifetime. +	 * css_offline() when the referencemight have dropped down to 0 and +	 * shouldn't be incremented anymore (css_tryget_online() would +	 * fail) we do not have other options because of the kmem +	 * allocations lifetime.  	 */  	css_get(&memcg->css); @@ -5985,45 +5744,266 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)  }  #endif +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered.  It tries to support fully configurable + * events for each user.  Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void memcg_event_remove(struct work_struct *work) +{ +	struct mem_cgroup_event *event = +		container_of(work, struct mem_cgroup_event, remove); +	struct mem_cgroup *memcg = event->memcg; + +	remove_wait_queue(event->wqh, &event->wait); + +	event->unregister_event(memcg, event->eventfd); + +	/* Notify userspace the event is going away. */ +	eventfd_signal(event->eventfd, 1); + +	eventfd_ctx_put(event->eventfd); +	kfree(event); +	css_put(&memcg->css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. 
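Every notification in this file ultimately calls eventfd_signal(ctx, 1), which adds one to the eventfd's 64-bit counter; a user-space read() returns the accumulated count and resets it. A minimal demonstration of those semantics (plain Linux eventfd, nothing memcg-specific):

#include <sys/eventfd.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint64_t n = 1;
        int fd = eventfd(0, 0);

        if (fd < 0)
                return 1;

        write(fd, &n, sizeof(n));       /* what eventfd_signal(ctx, 1) ... */
        write(fd, &n, sizeof(n));       /* ... amounts to, twice */

        read(fd, &n, sizeof(n));        /* returns the sum and resets it */
        printf("accumulated signals: %llu\n", (unsigned long long)n);   /* 2 */
        close(fd);
        return 0;
}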
+ * + * Called with wqh->lock held and interrupts disabled. + */ +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +			    int sync, void *key) +{ +	struct mem_cgroup_event *event = +		container_of(wait, struct mem_cgroup_event, wait); +	struct mem_cgroup *memcg = event->memcg; +	unsigned long flags = (unsigned long)key; + +	if (flags & POLLHUP) { +		/* +		 * If the event has been detached at cgroup removal, we +		 * can simply return knowing the other side will cleanup +		 * for us. +		 * +		 * We can't race against event freeing since the other +		 * side will require wqh->lock via remove_wait_queue(), +		 * which we hold. +		 */ +		spin_lock(&memcg->event_list_lock); +		if (!list_empty(&event->list)) { +			list_del_init(&event->list); +			/* +			 * We are in atomic context, but cgroup_event_remove() +			 * may sleep, so we have to call it in workqueue. +			 */ +			schedule_work(&event->remove); +		} +		spin_unlock(&memcg->event_list_lock); +	} + +	return 0; +} + +static void memcg_event_ptable_queue_proc(struct file *file, +		wait_queue_head_t *wqh, poll_table *pt) +{ +	struct mem_cgroup_event *event = +		container_of(pt, struct mem_cgroup_event, pt); + +	event->wqh = wqh; +	add_wait_queue(wqh, &event->wait); +} + +/* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by control file implementation. + */ +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, +					 char *buf, size_t nbytes, loff_t off) +{ +	struct cgroup_subsys_state *css = of_css(of); +	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup_event *event; +	struct cgroup_subsys_state *cfile_css; +	unsigned int efd, cfd; +	struct fd efile; +	struct fd cfile; +	const char *name; +	char *endp; +	int ret; + +	buf = strstrip(buf); + +	efd = simple_strtoul(buf, &endp, 10); +	if (*endp != ' ') +		return -EINVAL; +	buf = endp + 1; + +	cfd = simple_strtoul(buf, &endp, 10); +	if ((*endp != ' ') && (*endp != '\0')) +		return -EINVAL; +	buf = endp + 1; + +	event = kzalloc(sizeof(*event), GFP_KERNEL); +	if (!event) +		return -ENOMEM; + +	event->memcg = memcg; +	INIT_LIST_HEAD(&event->list); +	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); +	init_waitqueue_func_entry(&event->wait, memcg_event_wake); +	INIT_WORK(&event->remove, memcg_event_remove); + +	efile = fdget(efd); +	if (!efile.file) { +		ret = -EBADF; +		goto out_kfree; +	} + +	event->eventfd = eventfd_ctx_fileget(efile.file); +	if (IS_ERR(event->eventfd)) { +		ret = PTR_ERR(event->eventfd); +		goto out_put_efile; +	} + +	cfile = fdget(cfd); +	if (!cfile.file) { +		ret = -EBADF; +		goto out_put_eventfd; +	} + +	/* the process need read permission on control file */ +	/* AV: shouldn't we check that it's been opened for read instead? */ +	ret = inode_permission(file_inode(cfile.file), MAY_READ); +	if (ret < 0) +		goto out_put_cfile; + +	/* +	 * Determine the event callbacks and set them in @event.  This used +	 * to be done via struct cftype but cgroup core no longer knows +	 * about these events.  The following is crude but the whole thing +	 * is for compatibility anyway. +	 * +	 * DO NOT ADD NEW FILES. 
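From the user-space side, the string written to cgroup.event_control is exactly "<event_fd> <fd of the control file> <args>". The sketch below arms a usage threshold on a hypothetical v1 memory cgroup; the mount point and cgroup name are assumptions, and error handling is minimal:

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define CG "/sys/fs/cgroup/memory/demo"         /* assumed v1 memcg path */

int main(void)
{
        char cmd[64];
        uint64_t hits;
        int efd = eventfd(0, 0);
        int ufd = open(CG "/memory.usage_in_bytes", O_RDONLY);
        int cfd = open(CG "/cgroup.event_control", O_WRONLY);

        if (efd < 0 || ufd < 0 || cfd < 0)
                return 1;

        /* "<event_fd> <control_fd> <args>"; for usage_in_bytes the arg is a
         * threshold in bytes (64 MiB here) */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, cmd, strlen(cmd)) < 0)
                return 1;

        read(efd, &hits, sizeof(hits));         /* blocks until the threshold fires */
        printf("threshold crossed (%llu signals)\n", (unsigned long long)hits);
        return 0;
}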
+	 */ +	name = cfile.file->f_dentry->d_name.name; + +	if (!strcmp(name, "memory.usage_in_bytes")) { +		event->register_event = mem_cgroup_usage_register_event; +		event->unregister_event = mem_cgroup_usage_unregister_event; +	} else if (!strcmp(name, "memory.oom_control")) { +		event->register_event = mem_cgroup_oom_register_event; +		event->unregister_event = mem_cgroup_oom_unregister_event; +	} else if (!strcmp(name, "memory.pressure_level")) { +		event->register_event = vmpressure_register_event; +		event->unregister_event = vmpressure_unregister_event; +	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { +		event->register_event = memsw_cgroup_usage_register_event; +		event->unregister_event = memsw_cgroup_usage_unregister_event; +	} else { +		ret = -EINVAL; +		goto out_put_cfile; +	} + +	/* +	 * Verify @cfile should belong to @css.  Also, remaining events are +	 * automatically removed on cgroup destruction but the removal is +	 * asynchronous, so take an extra ref on @css. +	 */ +	cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, +					       &memory_cgrp_subsys); +	ret = -EINVAL; +	if (IS_ERR(cfile_css)) +		goto out_put_cfile; +	if (cfile_css != css) { +		css_put(cfile_css); +		goto out_put_cfile; +	} + +	ret = event->register_event(memcg, event->eventfd, buf); +	if (ret) +		goto out_put_css; + +	efile.file->f_op->poll(efile.file, &event->pt); + +	spin_lock(&memcg->event_list_lock); +	list_add(&event->list, &memcg->event_list); +	spin_unlock(&memcg->event_list_lock); + +	fdput(cfile); +	fdput(efile); + +	return nbytes; + +out_put_css: +	css_put(css); +out_put_cfile: +	fdput(cfile); +out_put_eventfd: +	eventfd_ctx_put(event->eventfd); +out_put_efile: +	fdput(efile); +out_kfree: +	kfree(event); + +	return ret; +} +  static struct cftype mem_cgroup_files[] = {  	{  		.name = "usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE), -		.read = mem_cgroup_read, -		.register_event = mem_cgroup_usage_register_event, -		.unregister_event = mem_cgroup_usage_unregister_event, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), -		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.write = mem_cgroup_reset, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), -		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.write = mem_cgroup_write, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "soft_limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), -		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.write = mem_cgroup_write, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "failcnt",  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), -		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.write = mem_cgroup_reset, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "stat", -		.read_seq_string = memcg_stat_show, +		.seq_show = memcg_stat_show,  	},  	{  		.name = "force_empty", -		.trigger = mem_cgroup_force_empty_write, +		.write = mem_cgroup_force_empty_write,  	},  	{  		.name = "use_hierarchy", @@ -6032,6 +6012,12 @@ static struct cftype mem_cgroup_files[] = {  		.read_u64 = mem_cgroup_hierarchy_read,  	},  	{ +		.name = "cgroup.event_control",		/* XXX: for compat */ +		.write = memcg_write_event_control, +		.flags = CFTYPE_NO_PREFIX, +		.mode = S_IWUGO, +	}, +	{  		.name = "swappiness",  		.read_u64 = mem_cgroup_swappiness_read,  		
.write_u64 = mem_cgroup_swappiness_write, @@ -6043,51 +6029,47 @@ static struct cftype mem_cgroup_files[] = {  	},  	{  		.name = "oom_control", -		.read_map = mem_cgroup_oom_control_read, +		.seq_show = mem_cgroup_oom_control_read,  		.write_u64 = mem_cgroup_oom_control_write, -		.register_event = mem_cgroup_oom_register_event, -		.unregister_event = mem_cgroup_oom_unregister_event,  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),  	},  	{  		.name = "pressure_level", -		.register_event = vmpressure_register_event, -		.unregister_event = vmpressure_unregister_event,  	},  #ifdef CONFIG_NUMA  	{  		.name = "numa_stat", -		.read_seq_string = memcg_numa_stat_show, +		.seq_show = memcg_numa_stat_show,  	},  #endif  #ifdef CONFIG_MEMCG_KMEM  	{  		.name = "kmem.limit_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), -		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.write = mem_cgroup_write, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "kmem.usage_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), -		.read = mem_cgroup_read, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "kmem.failcnt",  		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), -		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.write = mem_cgroup_reset, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "kmem.max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), -		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.write = mem_cgroup_reset, +		.read_u64 = mem_cgroup_read_u64,  	},  #ifdef CONFIG_SLABINFO  	{  		.name = "kmem.slabinfo", -		.read_seq_string = mem_cgroup_slabinfo_read, +		.seq_show = mem_cgroup_slabinfo_read,  	},  #endif  #endif @@ -6099,27 +6081,25 @@ static struct cftype memsw_cgroup_files[] = {  	{  		.name = "memsw.usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), -		.read = mem_cgroup_read, -		.register_event = mem_cgroup_usage_register_event, -		.unregister_event = mem_cgroup_usage_unregister_event, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), -		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.write = mem_cgroup_reset, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), -		.write_string = mem_cgroup_write, -		.read = mem_cgroup_read, +		.write = mem_cgroup_write, +		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.failcnt",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), -		.trigger = mem_cgroup_reset, -		.read = mem_cgroup_read, +		.write = mem_cgroup_reset, +		.read_u64 = mem_cgroup_read_u64,  	},  	{ },	/* terminate */  }; @@ -6162,14 +6142,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  static struct mem_cgroup *mem_cgroup_alloc(void)  {  	struct mem_cgroup *memcg; -	size_t size = memcg_size(); +	size_t size; -	/* Can be very big if nr_node_ids is very big */ -	if (size < PAGE_SIZE) -		memcg = kzalloc(size, GFP_KERNEL); -	else -		memcg = vzalloc(size); +	size = sizeof(struct mem_cgroup); +	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); +	memcg = kzalloc(size, GFP_KERNEL);  	if (!memcg)  		return NULL; @@ -6180,10 +6158,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)  	return memcg;  out_free: -	if (size < PAGE_SIZE) -		kfree(memcg); -	else -		vfree(memcg); +	kfree(memcg);  	return NULL;  } @@ -6201,10 +6176,8 @@ out_free:  
static void __mem_cgroup_free(struct mem_cgroup *memcg)  {  	int node; -	size_t size = memcg_size();  	mem_cgroup_remove_from_trees(memcg); -	free_css_id(&mem_cgroup_subsys, &memcg->css);  	for_each_node(node)  		free_mem_cgroup_per_zone_info(memcg, node); @@ -6223,10 +6196,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)  	 * the cgroup_lock.  	 */  	disarm_static_keys(memcg); -	if (size < PAGE_SIZE) -		kfree(memcg); -	else -		vfree(memcg); +	kfree(memcg);  }  /* @@ -6292,6 +6262,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)  	mutex_init(&memcg->thresholds_lock);  	spin_lock_init(&memcg->move_lock);  	vmpressure_init(&memcg->vmpressure); +	INIT_LIST_HEAD(&memcg->event_list); +	spin_lock_init(&memcg->event_list_lock);  	return &memcg->css; @@ -6304,8 +6276,10 @@ static int  mem_cgroup_css_online(struct cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); -	int error = 0; +	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); + +	if (css->id > MEM_CGROUP_ID_MAX) +		return -ENOSPC;  	if (!parent)  		return 0; @@ -6335,12 +6309,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)  		 * unfortunate state in our controller.  		 */  		if (parent != root_mem_cgroup) -			mem_cgroup_subsys.broken_hierarchy = true; +			memory_cgrp_subsys.broken_hierarchy = true;  	} - -	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);  	mutex_unlock(&memcg_create_mutex); -	return error; + +	return memcg_init_kmem(memcg, &memory_cgrp_subsys);  }  /* @@ -6364,18 +6337,75 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup_event *event, *tmp; +	struct cgroup_subsys_state *iter; + +	/* +	 * Unregister events and notify userspace. +	 * Notify userspace about cgroup removing only after rmdir of cgroup +	 * directory to avoid race between userspace and kernelspace. +	 */ +	spin_lock(&memcg->event_list_lock); +	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { +		list_del_init(&event->list); +		schedule_work(&event->remove); +	} +	spin_unlock(&memcg->event_list_lock);  	kmem_cgroup_css_offline(memcg);  	mem_cgroup_invalidate_reclaim_iterators(memcg); -	mem_cgroup_reparent_charges(memcg); -	mem_cgroup_destroy_all_caches(memcg); + +	/* +	 * This requires that offlining is serialized.  Right now that is +	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. +	 */ +	css_for_each_descendant_post(iter, css) +		mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); + +	memcg_unregister_all_caches(memcg);  	vmpressure_cleanup(&memcg->vmpressure);  }  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	/* +	 * XXX: css_offline() would be where we should reparent all +	 * memory to prepare the cgroup for destruction.  However, +	 * memcg does not do css_tryget_online() and res_counter charging +	 * under the same RCU lock region, which means that charging +	 * could race with offlining.  Offlining only happens to +	 * cgroups with no tasks in them but charges can show up +	 * without any tasks from the swapin path when the target +	 * memcg is looked up from the swapout record and not from the +	 * current task as it usually is.  
@@ -6425,8 +6455,7 @@ one_by_one:
 			batch_count = PRECHARGE_COUNT_AT_ONCE;
 			cond_resched();
 		}
-		ret = __mem_cgroup_try_charge(NULL,
-					GFP_KERNEL, 1, &memcg, false);
+		ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
 		if (ret)
 			/* mem_cgroup_clear_mc() will do uncharge later */
 			return ret;
@@ -6530,16 +6559,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		pgoff = pte_to_pgoff(ptent);
 	/* page is moved even if it's not RSS of this task(page-faulted). */
-	page = find_get_page(mapping, pgoff);
-
 #ifdef CONFIG_SWAP
 	/* shmem/tmpfs may report page out on swap: account for that too. */
-	if (radix_tree_exceptional_entry(page)) {
-		swp_entry_t swap = radix_to_swp_entry(page);
-		if (do_swap_account)
-			*entry = swap;
-		page = find_get_page(swap_address_space(swap), swap.val);
-	}
+	if (shmem_mapping(mapping)) {
+		page = find_get_entry(mapping, pgoff);
+		if (radix_tree_exceptional_entry(page)) {
+			swp_entry_t swp = radix_to_swp_entry(page);
+			if (do_swap_account)
+				*entry = swp;
+			page = find_get_page(swap_address_space(swp), swp.val);
+		}
+	} else
+		page = find_get_page(mapping, pgoff);
+#else
+	page = find_get_page(mapping, pgoff);
 #endif
 	return page;
 }
@@ -6578,7 +6611,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 	}
 	/* There is a swap entry and a page doesn't exist or isn't charged */
 	if (ent.val && !ret &&
-			css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
+	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
 		ret = MC_TARGET_SWAP;
 		if (target)
 			target->ent = ent;
@@ -6600,7 +6633,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 	enum mc_target_type ret = MC_TARGET_NONE;
 	page = pmd_page(pmd);
-	VM_BUG_ON(!page || !PageHead(page));
+	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
 	if (!move_anon())
 		return ret;
 	pc = lookup_page_cgroup(page);
@@ -6629,10 +6662,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	pte_t *pte;
 	spinlock_t *ptl;
-	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
-		spin_unlock(&vma->vm_mm->page_table_lock);
+		spin_unlock(ptl);
 		return 0;
 	}
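
Note: mem_cgroup_count_precharge_pte_range() above now gets the page-table lock back from pmd_trans_huge_lock() through &ptl and unlocks exactly that, instead of assuming the mm-wide page_table_lock. The sketch below shows the same "helper reports which lock it took" pattern with a hypothetical table of bucket locks; bucket_lock_and_get() and NR_BUCKETS are illustrative only.

#include <pthread.h>
#include <stdio.h>

#define NR_BUCKETS 8

static pthread_mutex_t bucket_lock[NR_BUCKETS];
static long bucket_val[NR_BUCKETS];

/* Lock the bucket covering @key and report back which lock is now held. */
static long *bucket_lock_and_get(unsigned long key, pthread_mutex_t **ptl)
{
	unsigned long idx = key % NR_BUCKETS;

	*ptl = &bucket_lock[idx];
	pthread_mutex_lock(*ptl);
	return &bucket_val[idx];
}

int main(void)
{
	pthread_mutex_t *ptl;
	long *val;

	for (int i = 0; i < NR_BUCKETS; i++)
		pthread_mutex_init(&bucket_lock[i], NULL);

	val = bucket_lock_and_get(42, &ptl);
	*val += 1;
	pthread_mutex_unlock(ptl);	/* release exactly the lock that was taken */

	printf("bucket value: %ld\n", bucket_val[42 % NR_BUCKETS]);
	return 0;
}
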
@@ -6821,9 +6854,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	 *    to be unlocked in __split_huge_page_splitting(), where the main
 	 *    part of thp split is not executed yet.
 	 */
-	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		if (mc.precharge < HPAGE_PMD_NR) {
-			spin_unlock(&vma->vm_mm->page_table_lock);
+			spin_unlock(ptl);
 			return 0;
 		}
 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6840,7 +6873,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 			}
 			put_page(page);
 		}
-		spin_unlock(&vma->vm_mm->page_table_lock);
+		spin_unlock(ptl);
 		return 0;
 	}
@@ -6985,9 +7018,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 		mem_cgroup_from_css(root_css)->use_hierarchy = true;
 }
-struct cgroup_subsys mem_cgroup_subsys = {
-	.name = "memory",
-	.subsys_id = mem_cgroup_subsys_id,
+struct cgroup_subsys memory_cgrp_subsys = {
 	.css_alloc = mem_cgroup_css_alloc,
 	.css_online = mem_cgroup_css_online,
 	.css_offline = mem_cgroup_css_offline,
@@ -6998,7 +7029,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.bind = mem_cgroup_bind,
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
-	.use_id = 1,
 };
 #ifdef CONFIG_MEMCG_SWAP
@@ -7014,7 +7044,7 @@ __setup("swapaccount=", enable_swap_account);
 static void __init memsw_file_init(void)
 {
-	WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+	WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
 }
 static void __init enable_swap_cgroup(void)
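
Note: the last hunk keeps the "swapaccount=" boot parameter wired up through __setup() while registering the memsw files against the renamed memory_cgrp_subsys. The sketch below only mimics, in userspace terms, what such a key=value boot option toggles; parse_swapaccount() and the sample option string are made up for illustration and are not the kernel's enable_swap_account().

#include <stdio.h>
#include <string.h>

static int really_do_swap_account = 1;

/* Hypothetical userspace stand-in for a "swapaccount=" option handler. */
static void parse_swapaccount(const char *val)
{
	if (!strcmp(val, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(val, "0"))
		really_do_swap_account = 0;
}

int main(void)
{
	const char *opt = "swapaccount=0";	/* sample option string, made up */
	const char *key = "swapaccount=";

	if (!strncmp(opt, key, strlen(key)))
		parse_swapaccount(opt + strlen(key));

	printf("swap accounting %s\n",
	       really_do_swap_account ? "enabled" : "disabled");
	return 0;
}
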
