diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:44:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:44:27 -0700 |
commit | ac4de9543aca59f2b763746647577302fbedd57e (patch) | |
tree | 40407750569ee030de56233c41c9a97f7e89cf67 /mm | |
parent | 26935fb06ee88f1188789807687c03041f3c70d9 (diff) | |
parent | de32a8177f64bc62e1b19c685dd391af664ab13f (diff) |
Merge branch 'akpm' (patches from Andrew Morton)
Merge more patches from Andrew Morton:
"The rest of MM. Plus one misc cleanup"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
mm/Kconfig: add MMU dependency for MIGRATION.
kernel: replace strict_strto*() with kstrto*()
mm, thp: count thp_fault_fallback anytime thp fault fails
thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
thp: do_huge_pmd_anonymous_page() cleanup
thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
mm: cleanup add_to_page_cache_locked()
thp: account anon transparent huge pages into NR_ANON_PAGES
truncate: drop 'oldsize' truncate_pagecache() parameter
mm: make lru_add_drain_all() selective
memcg: document cgroup dirty/writeback memory statistics
memcg: add per cgroup writeback pages accounting
memcg: check for proper lock held in mem_cgroup_update_page_stat
memcg: remove MEMCG_NR_FILE_MAPPED
memcg: reduce function dereference
memcg: avoid overflow caused by PAGE_ALIGN
memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
memcg: correct RESOURCE_MAX to ULLONG_MAX
mm: memcg: do not trap chargers with full callstack on OOM
mm: memcg: rework and document OOM waiting and wakeup
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/filemap.c | 59 | ||||
-rw-r--r-- | mm/huge_memory.c | 129 | ||||
-rw-r--r-- | mm/memcontrol.c | 871 | ||||
-rw-r--r-- | mm/memory.c | 52 | ||||
-rw-r--r-- | mm/oom_kill.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 15 | ||||
-rw-r--r-- | mm/rmap.c | 22 | ||||
-rw-r--r-- | mm/swap.c | 44 | ||||
-rw-r--r-- | mm/truncate.c | 9 | ||||
-rw-r--r-- | mm/vmscan.c | 83 |
11 files changed, 593 insertions, 702 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 6cdd2704330..026771a9b09 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -245,7 +245,7 @@ config COMPACTION config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA + depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in @@ -480,7 +480,7 @@ config FRONTSWAP config CMA bool "Contiguous Memory Allocator" - depends on HAVE_MEMBLOCK + depends on HAVE_MEMBLOCK && MMU select MIGRATION select MEMORY_ISOLATION help diff --git a/mm/filemap.c b/mm/filemap.c index e607728db4a..1e6aec4a2d2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) - goto out; + return error; error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); - if (error == 0) { - page_cache_get(page); - page->mapping = mapping; - page->index = offset; - - spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); - if (likely(!error)) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); - trace_mm_filemap_add_to_page_cache(page); - } else { - page->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); - page_cache_release(page); - } - radix_tree_preload_end(); - } else + if (error) { mem_cgroup_uncharge_cache_page(page); -out: + return error; + } + + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = radix_tree_insert(&mapping->page_tree, offset, page); + radix_tree_preload_end(); + if (unlikely(error)) + goto err_insert; + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); + return 0; +err_insert: + page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); return error; } EXPORT_SYMBOL(add_to_page_cache_locked); @@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; + bool memcg_oom; pgoff_t size; int ret = 0; @@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? + * Do we have something in the page cache already? Either + * way, try readahead, but disable the memcg OOM killer for it + * as readahead is optional and no errors are propagated up + * the fault stack. The OOM killer is enabled while trying to + * instantiate the faulting page individually below. */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ + memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); + mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ + memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); + mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d66010e0049..7489884682d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) +static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) { pmd_t entry; - entry = mk_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = mk_pmd(page, prot); entry = pmd_mkhuge(entry); return entry; } @@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; - entry = mk_huge_pmd(page, vma); + entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; - pte_t *pte; - if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + if (!(flags & FAULT_FLAG_WRITE) && + transparent_hugepage_use_zero_page()) { + pgtable_t pgtable; + struct page *zero_page; + bool set; + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - if (!(flags & FAULT_FLAG_WRITE) && - transparent_hugepage_use_zero_page()) { - pgtable_t pgtable; - struct page *zero_page; - bool set; - pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) - return VM_FAULT_OOM; - zero_page = get_huge_zero_page(); - if (unlikely(!zero_page)) { - pte_free(mm, pgtable); - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - spin_lock(&mm->page_table_lock); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(&mm->page_table_lock); - if (!set) { - pte_free(mm, pgtable); - put_huge_zero_page(); - } - return 0; - } - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - if (unlikely(!page)) { + zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) { + pte_free(mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - count_vm_event(THP_FAULT_ALLOC); - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { - put_page(page); - goto out; + return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, - page))) { - mem_cgroup_uncharge_page(page); - put_page(page); - goto out; + spin_lock(&mm->page_table_lock); + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_page); + spin_unlock(&mm->page_table_lock); + if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); } - return 0; } -out: - /* - * Use __pte_alloc instead of pte_alloc_map, because we can't - * run pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) - return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) - return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). - */ - pte = pte_offset_map(pmd, address); - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr, numa_node_id(), 0); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { + mem_cgroup_uncharge_page(page); + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + + count_vm_event(THP_FAULT_ALLOC); + return 0; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -1170,7 +1150,6 @@ alloc: new_page = NULL; if (unlikely(!new_page)) { - count_vm_event(THP_FAULT_FALLBACK); if (is_huge_zero_pmd(orig_pmd)) { ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, address, pmd, orig_pmd, haddr); @@ -1181,9 +1160,9 @@ alloc: split_huge_page(page); put_page(page); } + count_vm_event(THP_FAULT_FALLBACK); goto out; } - count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); @@ -1191,10 +1170,13 @@ alloc: split_huge_page(page); put_page(page); } + count_vm_event(THP_FAULT_FALLBACK); ret |= VM_FAULT_OOM; goto out; } + count_vm_event(THP_FAULT_ALLOC); + if (is_huge_zero_pmd(orig_pmd)) clear_huge_page(new_page, haddr, HPAGE_PMD_NR); else @@ -1215,7 +1197,8 @@ alloc: goto out_mn; } else { pmd_t entry; - entry = mk_huge_pmd(new_page, vma); + entry = mk_huge_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); @@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page, BUG_ON(atomic_read(&page->_count) <= 0); __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); - __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); ClearPageCompound(page); compound_unlock(page); @@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm, __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma); + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); /* * spin_lock() below is not the equivalent of smp_wmb(), so diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c6bd28edd53..d5ff3ce1302 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,7 +39,6 @@ #include <linux/limits.h> #include <linux/export.h> #include <linux/mutex.h> -#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0; #endif -/* - * Statistics for memory cgroup. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. - */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ - MEM_CGROUP_STAT_NSTATS, -}; - static const char * const mem_cgroup_stat_names[] = { "cache", "rss", "rss_huge", "mapped_file", + "writeback", "swap", }; @@ -175,10 +160,6 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; - struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ - /* the soft limit is exceeded*/ - bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -187,26 +168,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -/* - * Cgroups above their limits are maintained in a RB-Tree, independent of - * their hierarchy representation - */ - -struct mem_cgroup_tree_per_zone { - struct rb_root rb_root; - spinlock_t lock; -}; - -struct mem_cgroup_tree_per_node { - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; -}; - -struct mem_cgroup_tree { - struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; -}; - -static struct mem_cgroup_tree soft_limit_tree __read_mostly; - struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; u64 threshold; @@ -280,6 +241,7 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; + atomic_t oom_wakeups; int swappiness; /* OOM-Killer disable */ @@ -304,7 +266,7 @@ struct mem_cgroup { * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ - unsigned long move_charge_at_immigrate; + unsigned long move_charge_at_immigrate; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ @@ -341,6 +303,22 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif + /* + * Protects soft_contributed transitions. + * See mem_cgroup_update_soft_limit + */ + spinlock_t soft_lock; + + /* + * If true then this group has increased parents' children_in_excess + * when it got over the soft limit. + * When a group falls bellow the soft limit, parents' children_in_excess + * is decreased and soft_contributed changed to false. + */ + bool soft_contributed; + + /* Number of children that are in soft limit excess */ + atomic_t children_in_excess; struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ @@ -444,7 +422,6 @@ static bool move_file(void) * limit reclaim to prevent infinite loops, if they ever occur. */ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) return mem_cgroup_zoneinfo(memcg, nid, zid); } -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_node_zone(int nid, int zid) -{ - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_from_page(struct page *page) -{ - int nid = page_to_nid(page); - int zid = page_zonenum(page); - - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) -{ - struct rb_node **p = &mctz->rb_root.rb_node; - struct rb_node *parent = NULL; - struct mem_cgroup_per_zone *mz_node; - - if (mz->on_tree) - return; - - mz->usage_in_excess = new_usage_in_excess; - if (!mz->usage_in_excess) - return; - while (*p) { - parent = *p; - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, - tree_node); - if (mz->usage_in_excess < mz_node->usage_in_excess) - p = &(*p)->rb_left; - /* - * We can't avoid mem cgroups that are over their soft - * limit by the same amount - */ - else if (mz->usage_in_excess >= mz_node->usage_in_excess) - p = &(*p)->rb_right; - } - rb_link_node(&mz->tree_node, parent, p); - rb_insert_color(&mz->tree_node, &mctz->rb_root); - mz->on_tree = true; -} - -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - if (!mz->on_tree) - return; - rb_erase(&mz->tree_node, &mctz->rb_root); - mz->on_tree = false; -} - -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); - spin_unlock(&mctz->lock); -} - - -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) -{ - unsigned long long excess; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); - - /* - * Necessary to update all ancestors when hierarchy is used. - * because their event counter is not touched. - */ - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - excess = res_counter_soft_limit_excess(&memcg->res); - /* - * We have to update the tree if mz is on RB-tree or - * mem is over its softlimit. - */ - if (excess || mz->on_tree) { - spin_lock(&mctz->lock); - /* if on-tree, remove it */ - if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); - /* - * Insert again. mz->usage_in_excess will be updated. - * If excess is 0, no tree ops. - */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); - spin_unlock(&mctz->lock); - } - } -} - -static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) -{ - int node, zone; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); - } - } -} - -static struct mem_cgroup_per_zone * -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz; - -retry: - mz = NULL; - rightmost = rb_last(&mctz->rb_root); - if (!rightmost) - goto done; /* Nothing to reclaim from */ - - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); - /* - * Remove the node now but someone else can add it back, - * we will to add it back at the end of reclaim to its correct - * position in the tree. - */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); - if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) - goto retry; -done: - return mz; -} - -static struct mem_cgroup_per_zone * -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct mem_cgroup_per_zone *mz; - - spin_lock(&mctz->lock); - mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); - return mz; -} - /* * Implementation Note: reading percpu statistics for memcg. * @@ -1003,6 +822,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, } /* + * Called from rate-limited memcg_check_events when enough + * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure + * that all the parents up the hierarchy will be notified that this group + * is in excess or that it is not in excess anymore. mmecg->soft_contributed + * makes the transition a single action whenever the state flips from one to + * the other. + */ +static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) +{ + unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); + struct mem_cgroup *parent = memcg; + int delta = 0; + + spin_lock(&memcg->soft_lock); + if (excess) { + if (!memcg->soft_contributed) { + delta = 1; + memcg->soft_contributed = true; + } + } else { + if (memcg->soft_contributed) { + delta = -1; + memcg->soft_contributed = false; + } + } + + /* + * Necessary to update all ancestors when hierarchy is used + * because their event counter is not touched. + * We track children even outside the hierarchy for the root + * cgroup because tree walk starting at root should visit + * all cgroups and we want to prevent from pointless tree + * walk if no children is below the limit. + */ + while (delta && (parent = parent_mem_cgroup(parent))) + atomic_add(delta, &parent->children_in_excess); + if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) + atomic_add(delta, &root_mem_cgroup->children_in_excess); + spin_unlock(&memcg->soft_lock); +} + +/* * Check events in order. * */ @@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, page); + mem_cgroup_update_soft_limit(memcg); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +static enum mem_cgroup_filter_t +mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, + mem_cgroup_iter_filter cond) +{ + if (!cond) + return VISIT; + return cond(memcg, root); +} + /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited) + struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) { struct cgroup_subsys_state *prev_css, *next_css; @@ -1093,11 +963,31 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - if (css_tryget(&mem->css)) - return mem; - else { + switch (mem_cgroup_filter(mem, root, cond)) { + case SKIP: prev_css = next_css; goto skip_node; + case SKIP_TREE: + if (mem == root) + return NULL; + /* + * css_rightmost_descendant is not an optimal way to + * skip through a subtree (especially for imbalanced + * trees leaning to right) but that's what we have right + * now. More effective solution would be traversing + * right-up for first non-NULL without calling + * css_next_descendant_pre afterwards. + */ + prev_css = css_rightmost_descendant(next_css); + goto skip_node; + case VISIT: + if (css_tryget(&mem->css)) + return mem; + else { + prev_css = next_css; + goto skip_node; + } + break; } } @@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks + * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) + struct mem_cgroup_reclaim_cookie *reclaim, + mem_cgroup_iter_filter cond) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) - return NULL; + if (mem_cgroup_disabled()) { + /* first call must return non-NULL, second return NULL */ + return (struct mem_cgroup *)(unsigned long)!prev; + } if (!root) root = root_mem_cgroup; @@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - return root; + if (mem_cgroup_filter(root, root, cond) == VISIT) + return root; + return NULL; } rcu_read_lock(); @@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited); + memcg = __mem_cgroup_iter_next(root, last_visited, cond); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, reclaim->generation = iter->generation; } - if (prev && !memcg) + /* + * We have finished the whole tree walk or no group has been + * visited because filter told us to skip the root node. + */ + if (!memcg && (prev || (cond && !last_visited))) goto out_unlock; } out_unlock: @@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, return total; } +#if MAX_NUMNODES > 1 /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, return false; } -#if MAX_NUMNODES > 1 /* * Always updating the nodemask is not very good - even if we have an empty @@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) return node; } -/* - * Check all nodes whether it contains reclaimable pages or not. - * For quick scan, we make use of scan_nodes. This will allow us to skip - * unused nodes. But scan_nodes is lazily updated and may not cotain - * enough new information. We need to do double check. - */ -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - int nid; - - /* - * quick check...making use of scan_node. - * We can skip unused nodes. - */ - if (!nodes_empty(memcg->scan_nodes)) { - for (nid = first_node(memcg->scan_nodes); - nid < MAX_NUMNODES; - nid = next_node(nid, memcg->scan_nodes)) { - - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - } - /* - * Check rest of nodes. - */ - for_each_node_state(nid, N_MEMORY) { - if (node_isset(nid, memcg->scan_nodes)) - continue; - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - return false; -} - #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); -} #endif -static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - struct mem_cgroup *victim = NULL; - int total = 0; - int loop = 0; - unsigned long excess; - unsigned long nr_scanned; - struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, - .priority = 0, - }; - - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - - while (1) { - victim = mem_cgroup_iter(root_memcg, victim, &reclaim); - if (!victim) { - loop++; - if (loop >= 2) { - /* - * If we have not been able to reclaim - * anything, it might because there are - * no reclaimable pages under this hierarchy - */ - if (!total) - break; - /* - * We want to do more targeted reclaim. - * excess >> 2 is not to excessive so as to - * reclaim too much, nor too less that we keep - * coming back to reclaim from this cgroup - */ - if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) - break; - } - continue; - } - if (!mem_cgroup_reclaimable(victim, false)) - continue; - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, - zone, &nr_scanned); - *total_scanned += nr_scanned; - if (!res_counter_soft_limit_excess(&root_memcg->res)) +/* + * A group is eligible for the soft limit reclaim under the given root + * hierarchy if + * a) it is over its soft limit + * b) any parent up the hierarchy is over its soft limit + * + * If the given group doesn't have any children over the limit then it + * doesn't make any sense to iterate its subtree. + */ +enum mem_cgroup_filter_t +mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, + struct mem_cgroup *root) +{ + struct mem_cgroup *parent; + + if (!memcg) + memcg = root_mem_cgroup; + parent = memcg; + + if (res_counter_soft_limit_excess(&memcg->res)) + return VISIT; + + /* + * If any parent up to the root in the hierarchy is over its soft limit + * then we have to obey and reclaim from this group as well. + */ + while ((parent = parent_mem_cgroup(parent))) { + if (res_counter_soft_limit_excess(&parent->res)) + return VISIT; + if (parent == root) break; } - mem_cgroup_iter_break(root_memcg, victim); - return total; + + if (!atomic_read(&memcg->children_in_excess)) + return SKIP_TREE; + return SKIP; } +static DEFINE_SPINLOCK(memcg_oom_lock); + /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. - * Has to be called with memcg_oom_lock */ -static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) +static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; + spin_lock(&memcg_oom_lock); + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* @@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) iter->oom_lock = true; } - if (!failed) - return true; - - /* - * OK, we failed to lock the whole subtree so we have to clean up - * what we set up to the failing subtree - */ - for_each_mem_cgroup_tree(iter, memcg) { - if (iter == failed) { - mem_cgroup_iter_break(memcg, iter); - break; + if (failed) { + /* + * OK, we failed to lock the whole subtree so we have + * to clean up what we set up to the failing subtree + */ + for_each_mem_cgroup_tree(iter, memcg) { + if (iter == failed) { + mem_cgroup_iter_break(memcg, iter); + break; + } + iter->oom_lock = false; } - iter->oom_lock = false; } - return false; + + spin_unlock(&memcg_oom_lock); + + return !failed; } -/* - * Has to be called with memcg_oom_lock - */ -static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) +static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) { stru |