diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 82 | ||||
-rw-r--r-- | mm/failslab.c | 39 | ||||
-rw-r--r-- | mm/filemap.c | 118 | ||||
-rw-r--r-- | mm/highmem.c | 4 | ||||
-rw-r--r-- | mm/init-mm.c | 2 | ||||
-rw-r--r-- | mm/kmemleak.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 471 | ||||
-rw-r--r-- | mm/memory-failure.c | 92 | ||||
-rw-r--r-- | mm/mempolicy.c | 25 | ||||
-rw-r--r-- | mm/mincore.c | 11 | ||||
-rw-r--r-- | mm/oom_kill.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 269 | ||||
-rw-r--r-- | mm/page_alloc.c | 60 | ||||
-rw-r--r-- | mm/rmap.c | 4 | ||||
-rw-r--r-- | mm/shmem.c | 1493 | ||||
-rw-r--r-- | mm/slab.c | 99 | ||||
-rw-r--r-- | mm/slob.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 772 | ||||
-rw-r--r-- | mm/swapfile.c | 20 | ||||
-rw-r--r-- | mm/truncate.c | 8 | ||||
-rw-r--r-- | mm/vmalloc.c | 17 | ||||
-rw-r--r-- | mm/vmscan.c | 74 | ||||
-rw-r--r-- | mm/vmstat.c | 4 |
23 files changed, 1879 insertions, 1793 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8290b1e8825..d6edf8d14f9 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) +{ + if (wb1 < wb2) { + spin_lock(&wb1->list_lock); + spin_lock_nested(&wb2->list_lock, 1); + } else { + spin_lock(&wb2->list_lock); + spin_lock_nested(&wb1->list_lock, 1); + } +} + #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, - "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" - "BdiDirtyThresh: %8lu kB\n" - "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n" - "b_dirty: %8lu\n" - "b_io: %8lu\n" - "b_more_io: %8lu\n" - "bdi_list: %8u\n" - "state: %8lx\n", + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_dirty, nr_io, nr_more_io, + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), + nr_dirty, + nr_io, + nr_more_io, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void bdi_flush_io(struct backing_dev_info *bdi) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .range_cyclic = 1, - .nr_to_write = 1024, - }; - - writeback_inodes_wb(&bdi->wb, &wbc); -} - /* * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be @@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr) if (IS_ERR(task)) { /* * If thread creation fails, force writeout of - * the bdi from the thread. + * the bdi from the thread. Hopefully 1024 is + * large enough for efficient IO. */ - bdi_flush_io(bdi); + writeback_inodes_wb(&bdi->wb, 1024); } else { /* * The spinlock makes sure we do not lose @@ -629,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + spin_lock_init(&wb->list_lock); setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -654,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi) } bdi->dirty_exceeded = 0; + + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + err = prop_local_init_percpu(&bdi->completions); if (err) { @@ -677,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_wb_list_lock); + bdi_lock_two(&bdi->wb, dst); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); + spin_unlock(&dst->list_lock); } bdi_unregister(bdi); diff --git a/mm/failslab.c b/mm/failslab.c index c5f88f240dd..0dd7b8fec71 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -5,10 +5,6 @@ static struct { struct fault_attr attr; u32 ignore_gfp_wait; int cache_filter; -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - struct dentry *ignore_gfp_wait_file; - struct dentry *cache_filter_file; -#endif } failslab = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, @@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init failslab_debugfs_init(void) { - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; - int err; - - err = init_fault_attr_dentries(&failslab.attr, "failslab"); - if (err) - return err; - dir = failslab.attr.dentries.dir; + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - failslab.ignore_gfp_wait_file = - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait); + dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); - failslab.cache_filter_file = - debugfs_create_bool("cache-filter", mode, dir, - &failslab.cache_filter); + if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_wait)) + goto fail; + if (!debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter)) + goto fail; - if (!failslab.ignore_gfp_wait_file || - !failslab.cache_filter_file) { - err = -ENOMEM; - debugfs_remove(failslab.cache_filter_file); - debugfs_remove(failslab.ignore_gfp_wait_file); - cleanup_fault_attr_dentries(&failslab.attr); - } + return 0; +fail: + debugfs_remove_recursive(dir); - return err; + return -ENOMEM; } late_initcall(failslab_debugfs_init); diff --git a/mm/filemap.c b/mm/filemap.c index 10a17111327..7771871fa35 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,7 +33,6 @@ #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> -#include <linux/mm_inline.h> /* for page_is_file_cache() */ #include <linux/cleancache.h> #include "internal.h" @@ -78,7 +77,7 @@ * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * inode_wb_list_lock + * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * @@ -96,9 +95,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (page_remove_rmap->set_page_dirty) + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (zap_pte_range->set_page_dirty) + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * @@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int error; VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageSwapBacked(page)); error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); @@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - if (PageSwapBacked(page)) - __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; @@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, { int ret; - /* - * Splice_read and readahead add shmem/tmpfs pages into the page cache - * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the anon lru below, and mem_cgroup_cache_charge - * (called in add_to_page_cache) needs to know where they're going too. - */ - if (mapping_cap_swap_backed(mapping)) - SetPageSwapBacked(page); - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) { - if (page_is_file_cache(page)) - lru_cache_add_file(page); - else - lru_cache_add_anon(page); - } + if (ret == 0) + lru_cache_add_file(page); return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -714,9 +699,16 @@ repeat: page = radix_tree_deref_slot(pagep); if (unlikely(!page)) goto out; - if (radix_tree_deref_retry(page)) - goto repeat; - + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto repeat; + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + goto out; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_page(mapping, offset); - if (page) { + if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { @@ -835,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; - unsigned int nr_found; + unsigned int nr_found, nr_skip; rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, start, nr_pages); + (void ***)pages, NULL, start, nr_pages); ret = 0; + nr_skip = 0; for (i = 0; i < nr_found; i++) { struct page *page; repeat: @@ -849,13 +842,23 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) { - WARN_ON(start | i); - goto restart; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + WARN_ON(start | i); + goto restart; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so skip over it - + * we only reach this from invalidate_mapping_pages(). + */ + nr_skip++; + continue; } if (!page_cache_get_speculative(page)) @@ -875,7 +878,7 @@ repeat: * If all entries were removed before we could secure them, * try again, because callers stop trying once 0 is returned. */ - if (unlikely(!ret && nr_found)) + if (unlikely(!ret && nr_found > nr_skip)) goto restart; rcu_read_unlock(); return ret; @@ -903,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, index, nr_pages); + (void ***)pages, NULL, index, nr_pages); ret = 0; for (i = 0; i < nr_found; i++) { struct page *page; @@ -912,12 +915,22 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so stop looking for + * contiguous pages. + */ + break; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -977,12 +990,21 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * This function is never used on a shmem/tmpfs + * mapping, so a swap entry won't be found here. + */ + BUG(); + } if (!page_cache_get_speculative(page)) goto repeat; diff --git a/mm/highmem.c b/mm/highmem.c index 693394daa2e..5ef672c07f7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -326,7 +326,7 @@ static struct page_address_slot { spinlock_t lock; /* Protect this bucket's list */ } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; -static struct page_address_slot *page_slot(struct page *page) +static struct page_address_slot *page_slot(const struct page *page) { return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } @@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) * * Returns the page's virtual address. */ -void *page_address(struct page *page) +void *page_address(const struct page *page) { unsigned long flags; void *ret; diff --git a/mm/init-mm.c b/mm/init-mm.c index 4019979b263..a56a851908d 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -5,7 +5,7 @@ #include <linux/list.h> #include <linux/cpumask.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/pgtable.h> #include <asm/mmu.h> diff --git a/mm/kmemleak.c b/mm/kmemleak.c index aacee45616f..d6880f542f9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -96,7 +96,7 @@ #include <asm/sections.h> #include <asm/processor.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/kmemcheck.h> #include <linux/kmemleak.h> diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e013b8e57d2..3508777837c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,7 +35,6 @@ #include <linux/limits.h> #include <linux/mutex.h> #include <linux/rbtree.h> -#include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -246,10 +245,13 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - atomic_t oom_lock; + + bool oom_lock; + atomic_t under_oom; + atomic_t refcnt; - unsigned int swappiness; + int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -636,27 +638,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, preempt_enable(); } -static unsigned long -mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) +unsigned long +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, + unsigned int lru_mask) { struct mem_cgroup_per_zone *mz; + enum lru_list l; + unsigned long ret = 0; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + + for_each_lru(l) { + if (BIT(l) & lru_mask) + ret += MEM_CGROUP_ZSTAT(mz, l); + } + return ret; +} + +static unsigned long +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, + int nid, unsigned int lru_mask) +{ u64 total = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - total += MEM_CGROUP_ZSTAT(mz, idx); - } + for (zid = 0; zid < MAX_NR_ZONES; zid++) + total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); + return total; } -static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, - enum lru_list idx) + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, + unsigned int lru_mask) { int nid; u64 total = 0; - for_each_online_node(nid) - total += mem_cgroup_get_zonestat_node(mem, nid, idx); + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); return total; } @@ -1043,6 +1062,21 @@ void mem_cgroup_move_lists(struct page *page, mem_cgroup_add_lru_list(page, to); } +/* + * Checks whether given mem is same or in the root_mem's + * hierarchy subtree + */ +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, + struct mem_cgroup *mem) +{ + if (root_mem != mem) { + return (root_mem->use_hierarchy && + css_is_ancestor(&mem->css, &root_mem->css)); + } + + return true; +} + int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; @@ -1062,10 +1096,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) * enabled in "curr" and "curr" is a child of "mem" in *cgroup* * hierarchy(even if use_hierarchy is disabled in "mem"). */ - if (mem->use_hierarchy) - ret = css_is_ancestor(&curr->css, &mem->css); - else - ret = (curr == mem); + ret = mem_cgroup_same_or_subtree(mem, curr); css_put(&curr->css); return ret; } @@ -1077,8 +1108,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1117,109 +1148,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) unsigned long active; unsigned long inactive; - inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); - active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); + active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); return (active > inactive); } -unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - return MEM_CGROUP_ZSTAT(mz, lru); -} - -static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - unsigned long ret; - - ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + - mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); - - return ret; -} - -static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - unsigned long ret; - - ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + - mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); - return ret; -} - -#if MAX_NUMNODES > 1 -static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); - - return total; -} - -static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); - - return total; -} - -static unsigned long -mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) -{ - return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); -} - -static unsigned long -mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); - - return total; -} - -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - enum lru_list l; - u64 total = 0; - - for_each_lru(l) - total += mem_cgroup_get_zonestat_node(memcg, nid, l); - - return total; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) -{ - u64 total = 0; - int nid; - - for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid); - - return total; -} -#endif /* CONFIG_NUMA */ - struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { @@ -1329,7 +1263,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) return margin >> PAGE_SHIFT; } -static unsigned int get_swappiness(struct mem_cgroup *memcg) +int mem_cgroup_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1401,10 +1335,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) to = mc.to; if (!from) goto unlock; - if (from == mem || to == mem - || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) - || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) - ret = true; + + ret = mem_cgroup_same_or_subtree(mem, from) + || mem_cgroup_same_or_subtree(mem, to); unlock: spin_unlock(&mc.lock); return ret; @@ -1576,11 +1509,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, int nid, bool noswap) { - if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) + if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) return true; return false; @@ -1730,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && root_mem->memsw_is_minimum) + if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; while (1) { @@ -1776,12 +1709,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone, - &nr_scanned); + noswap, zone, &nr_scanned); *total_scanned += nr_scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, get_swappiness(victim)); + noswap); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -1803,38 +1735,77 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. + * Has to be called with memcg_oom_lock */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int x, lock_count = 0; - struct mem_cgroup *iter; + struct mem_cgroup *iter, *failed = NULL; + bool cond = true; - for_each_mem_cgroup_tree(iter, mem) { - x = atomic_inc_return(&iter->oom_lock); - lock_count = max(x, lock_count); + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + if (iter->oom_lock) { + /* + * this subtree of our hierarchy is already locked + * so we cannot give a lock. + */ + failed = iter; + cond = false; + } else + iter->oom_lock = true; } - if (lock_count == 1) + if (!failed) return true; + + /* + * OK, we failed to lock the whole subtree so we have to clean up + * what we set up to the failing subtree + */ + cond = true; + for_each_mem_cgroup_tree_cond(iter, mem, cond) { + if (iter == failed) { + cond = false; + continue; + } + iter->oom_lock = false; + } return false; } +/* + * Has to be called with memcg_oom_lock + */ static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { struct mem_cgroup *iter; + for_each_mem_cgroup_tree(iter, mem) + iter->oom_lock = false; + return 0; +} + +static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + atomic_inc(&iter->under_oom); +} + +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + /* * When a new child is created while the hierarchy is under oom, * mem_cgroup_oom_lock() may not be called. We have to use * atomic_add_unless() here. */ for_each_mem_cgroup_tree(iter, mem) - atomic_add_unless(&iter->oom_lock, -1, 0); - return 0; + atomic_add_unless(&iter->under_oom, -1, 0); } - -static DEFINE_MUTEX(memcg_oom_mutex); +static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { @@ -1845,25 +1816,20 @@ struct oom_wait_info { static int memcg_oom_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, + *oom_wait_mem; struct oom_wait_info *oom_wait_info; oom_wait_info = container_of(wait, struct oom_wait_info, wait); + oom_wait_mem = oom_wait_info->mem; - if (oom_wait_info->mem == wake_mem) - goto wakeup; - /* if no hierarchy, no match */ - if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) - return 0; /* * Both of oom_wait_info->mem and wake_mem are stable under us. * Then we can use css_is_ancestor without taking care of RCU. */ - if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && - !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) + && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) return 0; - -wakeup: return autoremove_wake_function(wait, mode, sync, arg); } @@ -1875,7 +1841,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) static void memcg_oom_recover(struct mem_cgroup *mem) { - if (mem && atomic_read(&mem->oom_lock)) + if (mem && atomic_read(&mem->under_oom)) memcg_wakeup_oom(mem); } @@ -1893,8 +1859,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); need_to_kill = true; + mem_cgroup_mark_under_oom(mem); + /* At first, try to OOM lock hierarchy under mem.*/ - mutex_lock(&memcg_oom_mutex); + spin_lock(&memcg_oom_lock); locked = mem_cgroup_oom_lock(mem); /* * Even if signal_pending(), we can't quit charge() loop without @@ -1906,7 +1874,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) need_to_kill = false; if (locked) mem_cgroup_oom_notify(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); @@ -1915,10 +1883,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); } - mutex_lock(&memcg_oom_mutex); - mem_cgroup_oom_unlock(mem); + spin_lock(&memcg_oom_lock); + if (locked) + mem_cgroup_oom_unlock(mem); memcg_wakeup_oom(mem); - mutex_unlock(&memcg_oom_mutex); + spin_unlock(&memcg_oom_lock); + + mem_cgroup_unmark_under_oom(mem); if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; @@ -2079,59 +2050,70 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) } /* - * Tries to drain stocked charges in other cpus. This function is asynchronous - * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. + * Drains all per-CPU charge caches for given root_mem resp. subtree + * of the hierarchy under it. sync flag says whether we should block + * until the work is done. */ -static void drain_all_stock_async(struct mem_cgroup *root_mem) +static void drain_a |