diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 555 |
1 files changed, 210 insertions, 345 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b6f91d61b3a..a2c7bcb0e6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -80,7 +80,7 @@ int do_swap_account __read_mostly; #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata; #endif #else @@ -357,10 +357,9 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list. per-memcg */ + /* analogous to slab_common's slab_caches list, but per-memcg; + * protected by memcg_slab_mutex */ struct list_head memcg_slab_caches; - /* Not a spinlock, we can take a lot of time walking the list */ - struct mutex slab_caches_mutex; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -674,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg) static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { - VM_BUG_ON((unsigned)nid >= nr_node_ids); + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } @@ -686,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(memcg, nid, zid); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } static struct mem_cgroup_tree_per_zone * @@ -709,11 +710,9 @@ soft_limit_tree_from_page(struct page *page) return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -743,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, mz->on_tree = true; } -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { if (!mz->on_tree) return; @@ -754,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, mz->on_tree = false; } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); spin_unlock(&mctz->lock); } @@ -770,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) unsigned long long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree_from_page(page); /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = mem_cgroup_page_zoneinfo(memcg, page); excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or @@ -789,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); } } @@ -802,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - int node, zone; - struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_zone *mz; + int nid, zid; - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + mctz = soft_limit_tree_node_zone(nid, zid); + mem_cgroup_remove_exceeded(mz, mctz); } } } @@ -833,7 +826,7 @@ retry: * we will to add it back at the end of reclaim to its correct * position in the tree. */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || !css_tryget_online(&mz->memcg->css)) goto retry; @@ -944,8 +937,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -953,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, - unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, + unsigned int lru_mask) { - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - unsigned long ret = 0; - - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - for_each_lru(lru) { - if (BIT(lru) & lru_mask) - ret += mz->lru_size[lru]; - } - return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - u64 total = 0; + unsigned long nr = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(memcg, - nid, zid, lru_mask); + VM_BUG_ON((unsigned)nid >= nr_node_ids); - return total; + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct mem_cgroup_per_zone *mz; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + nr += mz->lru_size[lru]; + } + } + return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { + unsigned long nr = 0; int nid; - u64 total = 0; for_each_node_state(nid, N_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return total; + nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + return nr; } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1074,9 +1058,18 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_lock(); do { - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) + /* + * Page cache insertions can happen withou an + * actual mm context, e.g. during disk probing + * on boot, loopback IO, acct() writes etc. + */ + if (unlikely(!mm)) memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + memcg = root_mem_cgroup; + } } while (!css_tryget_online(&memcg->css)); rcu_read_unlock(); return memcg; @@ -1232,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, int uninitialized_var(seq); if (reclaim) { - int nid = zone_to_nid(reclaim->zone); - int zid = zone_idx(reclaim->zone); struct mem_cgroup_per_zone *mz; - mz = mem_cgroup_zoneinfo(root, nid, zid); + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; @@ -1343,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, goto out; } - mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + mz = mem_cgroup_zone_zoneinfo(memcg, zone); lruvec = &mz->lruvec; out: /* @@ -1402,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; - mz = page_cgroup_zoneinfo(memcg, page); + mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1540,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* root ? */ - if (!memcg->css.parent) + if (mem_cgroup_disabled() || !memcg->css.parent) return vm_swappiness; return memcg->swappiness; @@ -1584,23 +1575,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) } /* - * 2 routines for checking "mem" is under move_account() or not. - * - * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This - * is used for avoiding races in accounting. If true, - * pc->mem_cgroup may be overwritten. + * A routine for checking "mem" is under move_account() or not. * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - * under hierarchy of moving cgroups. This is for - * waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move". */ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ - VM_BUG_ON(!rcu_read_lock_held()); - return atomic_read(&memcg->moving_account) > 0; -} - static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; @@ -1643,7 +1623,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) * Take this lock when * - a code tries to modify page's memcg while it's USED. * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too. */ static void move_lock_mem_cgroup(struct mem_cgroup *memcg, unsigned long *flags) @@ -2278,12 +2257,11 @@ cleanup: } /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics. * * Notes: Race condition * - * We usually use page_cgroup_lock() for accessing page_cgroup member but + * We usually use lock_page_cgroup() for accessing page_cgroup member but * it tends to be costly. But considering some conditions, we doesn't need * to do so _always_. * @@ -2297,8 +2275,8 @@ cleanup: * by flags. * * Considering "move", this is an only case we see a race. To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * small, we check memcg->moving_account and detect there are possibility + * of race or not. If there is, we take a lock. */ void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2316,9 +2294,10 @@ again: * If this memory cgroup is not under account moving, we don't * need to take move_lock_mem_cgroup(). Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock() if mem_cgroup_stolen() == true. + * rcu_read_unlock(). */ - if (!mem_cgroup_stolen(memcg)) + VM_BUG_ON(!rcu_read_lock_held()); + if (atomic_read(&memcg->moving_account) <= 0) return; move_lock_mem_cgroup(memcg, flags); @@ -2426,7 +2405,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } @@ -2673,7 +2652,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, * free their memory. */ if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current))) + fatal_signal_pending(current) || + current->flags & PF_EXITING)) goto bypass; if (unlikely(task_in_memcg_oom(current))) @@ -2901,6 +2881,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. + */ +static DEFINE_MUTEX(memcg_slab_mutex); + static DEFINE_MUTEX(activate_kmem_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -2933,10 +2919,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) print_slabinfo_header(m); - mutex_lock(&memcg->slab_caches_mutex); + mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); return 0; } @@ -3038,8 +3024,6 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } -static void kmem_cache_destroy_work_func(struct work_struct *w); - int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3092,29 +3076,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - static char *buf = NULL; - - /* - * We need a mutex here to protect the shared buffer. Since this is - * expected to be called only on cache creation, we can employ the - * slab_mutex for that purpose. - */ - lockdep_assert_held(&slab_mutex); - - if (!buf) { - buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!buf) - return NULL; - } - - cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); - return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), buf); -} - int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@ -3136,8 +3097,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3154,24 +3113,37 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -void memcg_register_cache(struct kmem_cache *s) +static void memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *root; - struct mem_cgroup *memcg; + static char memcg_name_buf[NAME_MAX + 1]; /* protected by + memcg_slab_mutex */ + struct kmem_cache *cachep; int id; - if (is_root_cache(s)) + lockdep_assert_held(&memcg_slab_mutex); + + id = memcg_cache_id(memcg); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, id)) return; + cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); + cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. */ - lockdep_assert_held(&slab_mutex); + if (!cachep) + return; - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3180,49 +3152,30 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); - /* - * Initialize the pointer to this cache in its parent's memcg_params - * before adding it to the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = s; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id]); + root_cache->memcg_params->memcg_caches[id] = cachep; } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_unregister_cache(struct kmem_cache *cachep) { - struct kmem_cache *root; + struct kmem_cache *root_cache; struct mem_cgroup *memcg; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); - /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. - */ - lockdep_assert_held(&slab_mutex); + BUG_ON(is_root_cache(cachep)); - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; + root_cache = cachep->memcg_params->root_cache; + memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg); - mutex_lock(&memcg->slab_caches_mutex); - list_del(&s->memcg_params->list); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; - /* - * Clear the pointer to this cache in its parent's memcg_params only - * after removing it from the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); - root->memcg_params->memcg_caches[id] = NULL; + list_del(&cachep->memcg_params->list); + + kmem_cache_destroy(cachep); } /* @@ -3256,144 +3209,61 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *p; - - p = container_of(w, struct memcg_cache_params, destroy); - - cachep = memcg_params_to_cache(p); - - /* - * If we get down to 0 after shrink, we could delete right away. - * However, memcg_release_pages() already puts us back in the workqueue - * in that case. If we proceed deleting, we'll get a dangling - * reference, and removing the object from the workqueue in that case - * is unnecessary complication. We are not a fast path. - * - * Note that this case is fundamentally different from racing with - * shrink_slab(): if memcg_cgroup_destroy_cache() is called in - * kmem_cache_shrink, not only we would be reinserting a dead cache - * into the queue, but doing so from inside the worker racing to - * destroy it. - * - * So if we aren't down to zero, we'll just schedule a worker and try - * again - */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) - kmem_cache_shrink(cachep); - else - kmem_cache_destroy(cachep); -} - -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ - if (!cachep->memcg_params->dead) - return; - - /* - * There are many ways in which we can get here. - * - * We can get to a memory-pressure situation while the delayed work is - * still pending to run. The vmscan shrinkers can then release all - * cache memory and get us to destruction. If this is the case, we'll - * be executed twice, which is a bug (the second time will execute over - * bogus data). In this case, cancelling the work should be fine. - * - * But we can also get here from the worker itself, if - * kmem_cache_shrink is enough to shake all the remaining objects and - * get the page count to 0. In this case, we'll deadlock if we try to - * cancel the work (the worker runs with an internal lock held, which - * is the same lock we would hold for cancel_work_sync().) - * - * Since we can't possibly know who got us here, just refrain from - * running if there is already work pending - */ - if (work_pending(&cachep->memcg_params->destroy)) - return; - /* - * We have to defer the actual destroying to a workqueue, because - * we might currently be in a context that cannot sleep. - */ - schedule_work(&cachep->memcg_params->destroy); -} - -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; - /* - * If the cache is being destroyed, we trust that there is no one else - * requesting objects from it. Even if there are, the sanity checks in - * kmem_cache_destroy should caught this ill-case. - * - * Still, we don't want anyone else freeing memcg_caches under our - * noses, which can happen if a new memcg comes to life. As usual, - * we'll take the activate_kmem_mutex to protect ourselves against - * this. - */ - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_slab_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) continue; - /* - * We will now manually delete the caches, so to avoid races - * we need to cancel all pending destruction workers and - * proceed with destruction ourselves. - * - * kmem_cache_destroy() will call kmem_cache_shrink internally, - * and that could spawn the workers again: it is likely that - * the cache still have active pages until this very moment. - * This would lead us back to mem_cgroup_destroy_cache. - * - * But that will not execute at all if the "dead" flag is not - * set, so flip it down to guarantee we are in control. - */ - c->memcg_params->dead = false; - cancel_work_sync(&c->memcg_params->destroy); - kmem_cache_destroy(c); + memcg_unregister_cache(c); if (cache_from_memcg_idx(s, i)) failed++; } - mutex_unlock(&activate_kmem_mutex); + mutex_unlock(&memcg_slab_mutex); return failed; } -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; - struct memcg_cache_params *params; + struct memcg_cache_params *params, *tmp; if (!memcg_kmem_is_active(memcg)) return; - mutex_lock(&memcg->slab_caches_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + mutex_lock(&memcg_slab_mutex); + list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - cachep->memcg_params->dead = true; - schedule_work(&cachep->memcg_params->destroy); + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + memcg_unregister_cache(cachep); } - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); } -struct create_work { +struct memcg_register_cache_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_create_cache_work_func(struct work_struct *w) +static void memcg_register_cache_func(struct work_struct *w) { - struct create_work *cw = container_of(w, struct create_work, work); + struct memcg_register_cache_work *cw = + container_of(w, struct memcg_register_cache_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - kmem_cache_create_memcg(memcg, cachep); + mutex_lock(&memcg_slab_mutex); + memcg_register_cache(memcg, cachep); + mutex_unlock(&memcg_slab_mutex); + css_put(&memcg->css); kfree(cw); } @@ -3401,12 +3271,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct create_work *cw; + struct memcg_register_cache_work *cw; - cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (cw == NULL) { css_put(&memcg->css); return; @@ -3415,17 +3285,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_create_cache_work_func); + INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_create_cache_enqueue will recurse. + * in __memcg_schedule_register_cache will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -3434,9 +3304,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ memcg_stop_kmem_account(); - __memcg_create_cache_enqueue(memcg, cachep); + __memcg_schedule_register_cache(memcg, cachep); memcg_resume_kmem_account(); } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ + int res; + + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, + PAGE_SIZE << order); + if (!res) + atomic_add(1 << order, &cachep->memcg_params->nr_pages); + return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ + memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); + atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} + /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3487,22 +3375,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * * However, there are some clashes that can arrive from locking. * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. So it's better to + * defer everything. */ - memcg_create_cache_enqueue(memcg, cachep); + memcg_schedule_register_cache(memcg, cachep); return cachep; out: rcu_read_unlock(); return cachep; } -EXPORT_SYMBOL(__memcg_kmem_get_cache); /* * We need to verify if the allocation against current->mm->owner's memcg is @@ -3529,11 +3411,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from @@ -3620,7 +3503,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -3956,17 +3839,9 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, return 0; } - /* - * Page cache insertions can happen without an actual mm - * context, e.g. during disk probing on boot. - */ - if (unlikely(!mm)) - memcg = root_mem_cgroup; - else { - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - } + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, 1, type, false); return 0; } @@ -4703,7 +4578,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (1); } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); excess = res_counter_soft_limit_excess(&mz->memcg->res); /* * One school of thought says that we should not add @@ -4714,7 +4589,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * term TODO. */ /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); css_put(&mz->memcg->css); loop++; @@ -4781,9 +4656,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (mem_cgroup_move_parent(page, pc, memcg)) { /* found lock contention or "pc" is obsolete. */ busy = page; - cond_resched(); } else busy = NULL; + cond_resched(); } while (!list_empty(list)); } @@ -5064,13 +4939,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * Make sure we have enough space for this cgroup in each root cache's * memcg_params. */ + mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(memcg_id + 1); + mutex_unlock(&memcg_slab_mutex); if (err) goto out_rmid; memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); /* * We couldn't have accounted to this cgroup, because it hasn't got the @@ -5417,7 +5293,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; rstat = &mz->lruvec.reclaim_stat; recent_rotated[0] += rstat->recent_rotated[0]; @@ -5447,22 +5323,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent); - - if (val > 100 || !parent) - return -EINVAL; - mutex_lock(&memcg_create_mutex); - - /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); + if (val > 100) return -EINVAL; - } - memcg->swappiness = val; - - mutex_unlock(&memcg_create_mutex); + if (css->parent) + memcg->swappiness = val; + else + vm_swappiness = val; return 0; } @@ -5794,22 +5662,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!parent || !((val == 0) || (val == 1))) + if (!css->parent || !((val == 0) || (val == 1))) return -EINVAL; - mutex_lock(&memcg_create_mutex); - /* oom-kill-disable is a flag for subhierarchy. */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - mutex_unlock(&memcg_create_mutex); + return 0; } @@ -6498,7 +6359,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) css_for_each_descendant_post(iter, css) mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - mem_cgroup_destroy_all_caches(memcg); + memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -6694,16 +6555,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - page = find_get_page(mapping, pgoff); - #ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - if (do_swap_account) - *entry = swap; - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif return page; } |