aboutsummaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c555
1 files changed, 210 insertions, 345 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b6f91d61b3a..a2c7bcb0e6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -80,7 +80,7 @@ int do_swap_account __read_mostly;
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
-static int really_do_swap_account __initdata = 0;
+static int really_do_swap_account __initdata;
#endif
#else
@@ -357,10 +357,9 @@ struct mem_cgroup {
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* analogous to slab_common's slab_caches list. per-memcg */
+ /* analogous to slab_common's slab_caches list, but per-memcg;
+ * protected by memcg_slab_mutex */
struct list_head memcg_slab_caches;
- /* Not a spinlock, we can take a lot of time walking the list */
- struct mutex slab_caches_mutex;
/* Index in the kmem_cache->memcg_params->memcg_caches array */
int kmemcg_id;
#endif
@@ -674,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
+mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
+ int nid = zone_to_nid(zone);
+ int zid = zone_idx(zone);
+
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
@@ -686,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
}
static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
int nid = page_to_nid(page);
int zid = page_zonenum(page);
- return mem_cgroup_zoneinfo(memcg, nid, zid);
+ return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
static struct mem_cgroup_tree_per_zone *
@@ -709,11 +710,9 @@ soft_limit_tree_from_page(struct page *page)
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
-static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz,
- unsigned long long new_usage_in_excess)
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -743,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
mz->on_tree = true;
}
-static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
{
if (!mz->on_tree)
return;
@@ -754,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
mz->on_tree = false;
}
-static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
{
spin_lock(&mctz->lock);
- __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
spin_unlock(&mctz->lock);
}
@@ -770,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
unsigned long long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
- int nid = page_to_nid(page);
- int zid = page_zonenum(page);
- mctz = soft_limit_tree_from_page(page);
+ mctz = soft_limit_tree_from_page(page);
/*
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ mz = mem_cgroup_page_zoneinfo(memcg, page);
excess = res_counter_soft_limit_excess(&memcg->res);
/*
* We have to update the tree if mz is on RB-tree or
@@ -789,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
- __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
- __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock(&mctz->lock);
}
}
@@ -802,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
- int node, zone;
- struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
+ struct mem_cgroup_per_zone *mz;
+ int nid, zid;
- for_each_node(node) {
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = mem_cgroup_zoneinfo(memcg, node, zone);
- mctz = soft_limit_tree_node_zone(node, zone);
- mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ for_each_node(nid) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ mctz = soft_limit_tree_node_zone(nid, zid);
+ mem_cgroup_remove_exceeded(mz, mctz);
}
}
}
@@ -833,7 +826,7 @@ retry:
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
- __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
!css_tryget_online(&mz->memcg->css))
goto retry;
@@ -944,8 +937,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-unsigned long
-mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
@@ -953,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return mz->lru_size[lru];
}
-static unsigned long
-mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
- unsigned int lru_mask)
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid,
+ unsigned int lru_mask)
{
- struct mem_cgroup_per_zone *mz;
- enum lru_list lru;
- unsigned long ret = 0;
-
- mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
- for_each_lru(lru) {
- if (BIT(lru) & lru_mask)
- ret += mz->lru_size[lru];
- }
- return ret;
-}
-
-static unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- u64 total = 0;
+ unsigned long nr = 0;
int zid;
- for (zid = 0; zid < MAX_NR_ZONES; zid++)
- total += mem_cgroup_zone_nr_lru_pages(memcg,
- nid, zid, lru_mask);
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
- return total;
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct mem_cgroup_per_zone *mz;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ nr += mz->lru_size[lru];
+ }
+ }
+ return nr;
}
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)
{
+ unsigned long nr = 0;
int nid;
- u64 total = 0;
for_each_node_state(nid, N_MEMORY)
- total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return total;
+ nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
+ return nr;
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -1074,9 +1058,18 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
rcu_read_lock();
do {
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
+ /*
+ * Page cache insertions can happen withou an
+ * actual mm context, e.g. during disk probing
+ * on boot, loopback IO, acct() writes etc.
+ */
+ if (unlikely(!mm))
memcg = root_mem_cgroup;
+ else {
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ memcg = root_mem_cgroup;
+ }
} while (!css_tryget_online(&memcg->css));
rcu_read_unlock();
return memcg;
@@ -1232,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
int uninitialized_var(seq);
if (reclaim) {
- int nid = zone_to_nid(reclaim->zone);
- int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;
- mz = mem_cgroup_zoneinfo(root, nid, zid);
+ mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
iter->last_visited = NULL;
@@ -1343,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
goto out;
}
- mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
+ mz = mem_cgroup_zone_zoneinfo(memcg, zone);
lruvec = &mz->lruvec;
out:
/*
@@ -1402,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
- mz = page_cgroup_zoneinfo(memcg, page);
+ mz = mem_cgroup_page_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
@@ -1540,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
/* root ? */
- if (!memcg->css.parent)
+ if (mem_cgroup_disabled() || !memcg->css.parent)
return vm_swappiness;
return memcg->swappiness;
@@ -1584,23 +1575,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
}
/*
- * 2 routines for checking "mem" is under move_account() or not.
- *
- * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
- * is used for avoiding races in accounting. If true,
- * pc->mem_cgroup may be overwritten.
+ * A routine for checking "mem" is under move_account() or not.
*
- * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
- * under hierarchy of moving cgroups. This is for
- * waiting at hith-memory prressure caused by "move".
+ * Checking a cgroup is mc.from or mc.to or under hierarchy of
+ * moving cgroups. This is for waiting at high-memory pressure
+ * caused by "move".
*/
-
-static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
-{
- VM_BUG_ON(!rcu_read_lock_held());
- return atomic_read(&memcg->moving_account) > 0;
-}
-
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
struct mem_cgroup *from;
@@ -1643,7 +1623,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
* Take this lock when
* - a code tries to modify page's memcg while it's USED.
* - a code tries to modify page state accounting in a memcg.
- * see mem_cgroup_stolen(), too.
*/
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
@@ -2278,12 +2257,11 @@ cleanup:
}
/*
- * Currently used to update mapped file statistics, but the routine can be
- * generalized to update other statistics as well.
+ * Used to update mapped file or writeback or other statistics.
*
* Notes: Race condition
*
- * We usually use page_cgroup_lock() for accessing page_cgroup member but
+ * We usually use lock_page_cgroup() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
@@ -2297,8 +2275,8 @@ cleanup:
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
- * small, we check mm->moving_account and detect there are possibility of race
- * If there is, we take a lock.
+ * small, we check memcg->moving_account and detect there are possibility
+ * of race or not. If there is, we take a lock.
*/
void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2316,9 +2294,10 @@ again:
* If this memory cgroup is not under account moving, we don't
* need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
- * rcu_read_unlock() if mem_cgroup_stolen() == true.
+ * rcu_read_unlock().
*/
- if (!mem_cgroup_stolen(memcg))
+ VM_BUG_ON(!rcu_read_lock_held());
+ if (atomic_read(&memcg->moving_account) <= 0)
return;
move_lock_mem_cgroup(memcg, flags);
@@ -2426,7 +2405,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
*/
static void drain_local_stock(struct work_struct *dummy)
{
- struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
+ struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
@@ -2673,7 +2652,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
* free their memory.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE) ||
- fatal_signal_pending(current)))
+ fatal_signal_pending(current) ||
+ current->flags & PF_EXITING))
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
@@ -2901,6 +2881,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
+ * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
+ */
+static DEFINE_MUTEX(memcg_slab_mutex);
+
static DEFINE_MUTEX(activate_kmem_mutex);
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
@@ -2933,10 +2919,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
print_slabinfo_header(m);
- mutex_lock(&memcg->slab_caches_mutex);
+ mutex_lock(&memcg_slab_mutex);
list_for_each_entry(params, &memcg->memcg_slab_caches, list)
cache_show(memcg_params_to_cache(params), m);
- mutex_unlock(&memcg->slab_caches_mutex);
+ mutex_unlock(&memcg_slab_mutex);
return 0;
}
@@ -3038,8 +3024,6 @@ void memcg_update_array_size(int num)
memcg_limited_groups_array_size = memcg_caches_array_size(num);
}
-static void kmem_cache_destroy_work_func(struct work_struct *w);
-
int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
@@ -3092,29 +3076,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
return 0;
}
-char *memcg_create_cache_name(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
-{
- static char *buf = NULL;
-
- /*
- * We need a mutex here to protect the shared buffer. Since this is
- * expected to be called only on cache creation, we can employ the
- * slab_mutex for that purpose.
- */
- lockdep_assert_held(&slab_mutex);
-
- if (!buf) {
- buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!buf)
- return NULL;
- }
-
- cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
- return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- memcg_cache_id(memcg), buf);
-}
-
int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
struct kmem_cache *root_cache)
{
@@ -3136,8 +3097,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
if (memcg) {
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
- INIT_WORK(&s->memcg_params->destroy,
- kmem_cache_destroy_work_func);
css_get(&memcg->css);
} else
s->memcg_params->is_root_cache = true;
@@ -3154,24 +3113,37 @@ void memcg_free_cache_params(struct kmem_cache *s)
kfree(s->memcg_params);
}
-void memcg_register_cache(struct kmem_cache *s)
+static void memcg_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
{
- struct kmem_cache *root;
- struct mem_cgroup *memcg;
+ static char memcg_name_buf[NAME_MAX + 1]; /* protected by
+ memcg_slab_mutex */
+ struct kmem_cache *cachep;
int id;
- if (is_root_cache(s))
+ lockdep_assert_held(&memcg_slab_mutex);
+
+ id = memcg_cache_id(memcg);
+
+ /*
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can try to
+ * create the same cache, but only one of them may succeed.
+ */
+ if (cache_from_memcg_idx(root_cache, id))
return;
+ cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
+ cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
/*
- * Holding the slab_mutex assures nobody will touch the memcg_caches
- * array while we are modifying it.
+ * If we could not create a memcg cache, do not complain, because
+ * that's not critical at all as we can always proceed with the root
+ * cache.
*/
- lockdep_assert_held(&slab_mutex);
+ if (!cachep)
+ return;
- root = s->memcg_params->root_cache;
- memcg = s->memcg_params->memcg;
- id = memcg_cache_id(memcg);
+ list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
@@ -3180,49 +3152,30 @@ void memcg_register_cache(struct kmem_cache *s)
*/
smp_wmb();
- /*
- * Initialize the pointer to this cache in its parent's memcg_params
- * before adding it to the memcg_slab_caches list, otherwise we can
- * fail to convert memcg_params_to_cache() while traversing the list.
- */
- VM_BUG_ON(root->memcg_params->memcg_caches[id]);
- root->memcg_params->memcg_caches[id] = s;
-
- mutex_lock(&memcg->slab_caches_mutex);
- list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
- mutex_unlock(&memcg->slab_caches_mutex);
+ BUG_ON(root_cache->memcg_params->memcg_caches[id]);
+ root_cache->memcg_params->memcg_caches[id] = cachep;
}
-void memcg_unregister_cache(struct kmem_cache *s)
+static void memcg_unregister_cache(struct kmem_cache *cachep)
{
- struct kmem_cache *root;
+ struct kmem_cache *root_cache;
struct mem_cgroup *memcg;
int id;
- if (is_root_cache(s))
- return;
+ lockdep_assert_held(&memcg_slab_mutex);
- /*
- * Holding the slab_mutex assures nobody will touch the memcg_caches
- * array while we are modifying it.
- */
- lockdep_assert_held(&slab_mutex);
+ BUG_ON(is_root_cache(cachep));
- root = s->memcg_params->root_cache;
- memcg = s->memcg_params->memcg;
+ root_cache = cachep->memcg_params->root_cache;
+ memcg = cachep->memcg_params->memcg;
id = memcg_cache_id(memcg);
- mutex_lock(&memcg->slab_caches_mutex);
- list_del(&s->memcg_params->list);
- mutex_unlock(&memcg->slab_caches_mutex);
+ BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
+ root_cache->memcg_params->memcg_caches[id] = NULL;
- /*
- * Clear the pointer to this cache in its parent's memcg_params only
- * after removing it from the memcg_slab_caches list, otherwise we can
- * fail to convert memcg_params_to_cache() while traversing the list.
- */
- VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
- root->memcg_params->memcg_caches[id] = NULL;
+ list_del(&cachep->memcg_params->list);
+
+ kmem_cache_destroy(cachep);
}
/*
@@ -3256,144 +3209,61 @@ static inline void memcg_resume_kmem_account(void)
current->memcg_kmem_skip_account--;
}
-static void kmem_cache_destroy_work_func(struct work_struct *w)
-{
- struct kmem_cache *cachep;
- struct memcg_cache_params *p;
-
- p = container_of(w, struct memcg_cache_params, destroy);
-
- cachep = memcg_params_to_cache(p);
-
- /*
- * If we get down to 0 after shrink, we could delete right away.
- * However, memcg_release_pages() already puts us back in the workqueue
- * in that case. If we proceed deleting, we'll get a dangling
- * reference, and removing the object from the workqueue in that case
- * is unnecessary complication. We are not a fast path.
- *
- * Note that this case is fundamentally different from racing with
- * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
- * kmem_cache_shrink, not only we would be reinserting a dead cache
- * into the queue, but doing so from inside the worker racing to
- * destroy it.
- *
- * So if we aren't down to zero, we'll just schedule a worker and try
- * again
- */
- if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
- kmem_cache_shrink(cachep);
- else
- kmem_cache_destroy(cachep);
-}
-
-void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
-{
- if (!cachep->memcg_params->dead)
- return;
-
- /*
- * There are many ways in which we can get here.
- *
- * We can get to a memory-pressure situation while the delayed work is
- * still pending to run. The vmscan shrinkers can then release all
- * cache memory and get us to destruction. If this is the case, we'll
- * be executed twice, which is a bug (the second time will execute over
- * bogus data). In this case, cancelling the work should be fine.
- *
- * But we can also get here from the worker itself, if
- * kmem_cache_shrink is enough to shake all the remaining objects and
- * get the page count to 0. In this case, we'll deadlock if we try to
- * cancel the work (the worker runs with an internal lock held, which
- * is the same lock we would hold for cancel_work_sync().)
- *
- * Since we can't possibly know who got us here, just refrain from
- * running if there is already work pending
- */
- if (work_pending(&cachep->memcg_params->destroy))
- return;
- /*
- * We have to defer the actual destroying to a workqueue, because
- * we might currently be in a context that cannot sleep.
- */
- schedule_work(&cachep->memcg_params->destroy);
-}
-
-int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+int __memcg_cleanup_cache_params(struct kmem_cache *s)
{
struct kmem_cache *c;
int i, failed = 0;
- /*
- * If the cache is being destroyed, we trust that there is no one else
- * requesting objects from it. Even if there are, the sanity checks in
- * kmem_cache_destroy should caught this ill-case.
- *
- * Still, we don't want anyone else freeing memcg_caches under our
- * noses, which can happen if a new memcg comes to life. As usual,
- * we'll take the activate_kmem_mutex to protect ourselves against
- * this.
- */
- mutex_lock(&activate_kmem_mutex);
+ mutex_lock(&memcg_slab_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
continue;
- /*
- * We will now manually delete the caches, so to avoid races
- * we need to cancel all pending destruction workers and
- * proceed with destruction ourselves.
- *
- * kmem_cache_destroy() will call kmem_cache_shrink internally,
- * and that could spawn the workers again: it is likely that
- * the cache still have active pages until this very moment.
- * This would lead us back to mem_cgroup_destroy_cache.
- *
- * But that will not execute at all if the "dead" flag is not
- * set, so flip it down to guarantee we are in control.
- */
- c->memcg_params->dead = false;
- cancel_work_sync(&c->memcg_params->destroy);
- kmem_cache_destroy(c);
+ memcg_unregister_cache(c);
if (cache_from_memcg_idx(s, i))
failed++;
}
- mutex_unlock(&activate_kmem_mutex);
+ mutex_unlock(&memcg_slab_mutex);
return failed;
}
-static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
{
struct kmem_cache *cachep;
- struct memcg_cache_params *params;
+ struct memcg_cache_params *params, *tmp;
if (!memcg_kmem_is_active(memcg))
return;
- mutex_lock(&memcg->slab_caches_mutex);
- list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ mutex_lock(&memcg_slab_mutex);
+ list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
cachep = memcg_params_to_cache(params);
- cachep->memcg_params->dead = true;
- schedule_work(&cachep->memcg_params->destroy);
+ kmem_cache_shrink(cachep);
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+ memcg_unregister_cache(cachep);
}
- mutex_unlock(&memcg->slab_caches_mutex);
+ mutex_unlock(&memcg_slab_mutex);
}
-struct create_work {
+struct memcg_register_cache_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
-static void memcg_create_cache_work_func(struct work_struct *w)
+static void memcg_register_cache_func(struct work_struct *w)
{
- struct create_work *cw = container_of(w, struct create_work, work);
+ struct memcg_register_cache_work *cw =
+ container_of(w, struct memcg_register_cache_work, work);
struct mem_cgroup *memcg = cw->memcg;
struct kmem_cache *cachep = cw->cachep;
- kmem_cache_create_memcg(memcg, cachep);
+ mutex_lock(&memcg_slab_mutex);
+ memcg_register_cache(memcg, cachep);
+ mutex_unlock(&memcg_slab_mutex);
+
css_put(&memcg->css);
kfree(cw);
}
@@ -3401,12 +3271,12 @@ static void memcg_create_cache_work_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
- struct create_work *cw;
+ struct memcg_register_cache_work *cw;
- cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+ cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
if (cw == NULL) {
css_put(&memcg->css);
return;
@@ -3415,17 +3285,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
cw->memcg = memcg;
cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_create_cache_work_func);
+ INIT_WORK(&cw->work, memcg_register_cache_func);
schedule_work(&cw->work);
}
-static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_create_cache_enqueue will recurse.
+ * in __memcg_schedule_register_cache will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
@@ -3434,9 +3304,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
* the safest choice is to do it like this, wrapping the whole function.
*/
memcg_stop_kmem_account();
- __memcg_create_cache_enqueue(memcg, cachep);
+ __memcg_schedule_register_cache(memcg, cachep);
memcg_resume_kmem_account();
}
+
+int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
+{
+ int res;
+
+ res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
+ PAGE_SIZE << order);
+ if (!res)
+ atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+ return res;
+}
+
+void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
+{
+ memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
+ atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+}
+
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
@@ -3487,22 +3375,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
*
* However, there are some clashes that can arrive from locking.
* For instance, because we acquire the slab_mutex while doing
- * kmem_cache_dup, this means no further allocation could happen
- * with the slab_mutex held.
- *
- * Also, because cache creation issue get_online_cpus(), this
- * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
- * that ends up reversed during cpu hotplug. (cpuset allocates
- * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
- * better to defer everything.
+ * memcg_create_kmem_cache, this means no further allocation
+ * could happen with the slab_mutex held. So it's better to
+ * defer everything.
*/
- memcg_create_cache_enqueue(memcg, cachep);
+ memcg_schedule_register_cache(memcg, cachep);
return cachep;
out:
rcu_read_unlock();
return cachep;
}
-EXPORT_SYMBOL(__memcg_kmem_get_cache);
/*
* We need to verify if the allocation against current->mm->owner's memcg is
@@ -3529,11 +3411,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
/*
* Disabling accounting is only relevant for some specific memcg
* internal allocations. Therefore we would initially not have such
- * check here, since direct calls to the page allocator that are marked
- * with GFP_KMEMCG only happen outside memcg core. We are mostly
- * concerned with cache allocations, and by having this test at
- * memcg_kmem_get_cache, we are already able to relay the allocation to
- * the root cache and bypass the memcg cache altogether.
+ * check here, since direct calls to the page allocator that are
+ * accounted to kmemcg (alloc_kmem_pages and friends) only happen
+ * outside memcg core. We are mostly concerned with cache allocations,
+ * and by having this test at memcg_kmem_get_cache, we are already able
+ * to relay the allocation to the root cache and bypass the memcg cache
+ * altogether.
*
* There is one exception, though: the SLUB allocator does not create
* large order caches, but rather service large kmallocs directly from
@@ -3620,7 +3503,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
-static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -3956,17 +3839,9 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
return 0;
}
- /*
- * Page cache insertions can happen without an actual mm
- * context, e.g. during disk probing on boot.
- */
- if (unlikely(!mm))
- memcg = root_mem_cgroup;
- else {
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
- if (!memcg)
- return -ENOMEM;
- }
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
__mem_cgroup_commit_charge(memcg, page, 1, type, false);
return 0;
}
@@ -4703,7 +4578,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
} while (1);
}
- __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
excess = res_counter_soft_limit_excess(&mz->memcg->res);
/*
* One school of thought says that we should not add
@@ -4714,7 +4589,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* term TODO.
*/
/* If excess == 0, no tree ops */
- __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->memcg->css);
loop++;
@@ -4781,9 +4656,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
if (mem_cgroup_move_parent(page, pc, memcg)) {
/* found lock contention or "pc" is obsolete. */
busy = page;
- cond_resched();
} else
busy = NULL;
+ cond_resched();
} while (!list_empty(list));
}
@@ -5064,13 +4939,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
* Make sure we have enough space for this cgroup in each root cache's
* memcg_params.
*/
+ mutex_lock(&memcg_slab_mutex);
err = memcg_update_all_caches(memcg_id + 1);
+ mutex_unlock(&memcg_slab_mutex);
if (err)
goto out_rmid;
memcg->kmemcg_id = memcg_id;
INIT_LIST_HEAD(&memcg->memcg_slab_caches);
- mutex_init(&memcg->slab_caches_mutex);
/*
* We couldn't have accounted to this cgroup, because it hasn't got the
@@ -5417,7 +5293,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
rstat = &mz->lruvec.reclaim_stat;
recent_rotated[0] += rstat->recent_rotated[0];
@@ -5447,22 +5323,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent);
-
- if (val > 100 || !parent)
- return -EINVAL;
- mutex_lock(&memcg_create_mutex);
-
- /* If under hierarchy, only empty-root can set this value */
- if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
- mutex_unlock(&memcg_create_mutex);
+ if (val > 100)
return -EINVAL;
- }
- memcg->swappiness = val;
-
- mutex_unlock(&memcg_create_mutex);
+ if (css->parent)
+ memcg->swappiness = val;
+ else
+ vm_swappiness = val;
return 0;
}
@@ -5794,22 +5662,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent);
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!parent || !((val == 0) || (val == 1)))
+ if (!css->parent || !((val == 0) || (val == 1)))
return -EINVAL;
- mutex_lock(&memcg_create_mutex);
- /* oom-kill-disable is a flag for subhierarchy. */
- if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
- mutex_unlock(&memcg_create_mutex);
- return -EINVAL;
- }
memcg->oom_kill_disable = val;
if (!val)
memcg_oom_recover(memcg);
- mutex_unlock(&memcg_create_mutex);
+
return 0;
}
@@ -6498,7 +6359,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
css_for_each_descendant_post(iter, css)
mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
- mem_cgroup_destroy_all_caches(memcg);
+ memcg_unregister_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
@@ -6694,16 +6555,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
- page = find_get_page(mapping, pgoff);
-
#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swap = radix_to_swp_entry(page);
- if (do_swap_account)
- *entry = swap;
- page = find_get_page(swap_address_space(swap), swap.val);
- }
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ if (do_swap_account)
+ *entry = swp;
+ page = find_get_page(swap_address_space(swp), swp.val);
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
#endif
return page;
}