Diffstat (limited to 'mm/vmscan.c')
| -rw-r--r-- | mm/vmscan.c | 3033 |
1 file changed, 1778 insertions, 1255 deletions
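Before the diff itself, a minimal sketch of the shrinker interface this rework moves vmscan over to: separate count_objects()/scan_objects() callbacks, a per-node nr_deferred count allocated by register_shrinker() (which can therefore fail now), and SHRINK_STOP to terminate the batching loop in shrink_slab_node(). All demo_cache_* names are hypothetical and only illustrate how an out-of-tree cache might hook in under the post-series API; this code is not part of the patch below.

#include <linux/module.h>
#include <linux/shrinker.h>
#include <linux/spinlock.h>
#include <linux/list.h>

static LIST_HEAD(demo_cache_lru);
static unsigned long demo_cache_nr;	/* objects currently cached */
static DEFINE_SPINLOCK(demo_cache_lock);

/* Cheap estimate only: no reclaim work may be done here. */
static unsigned long demo_cache_count(struct shrinker *s,
				      struct shrink_control *sc)
{
	return demo_cache_nr;	/* 0 tells shrink_slab_node() to skip us */
}

/* Drop up to sc->nr_to_scan entries and report how many were freed. */
static unsigned long demo_cache_scan(struct shrinker *s,
				     struct shrink_control *sc)
{
	unsigned long freed = 0;

	if (!spin_trylock(&demo_cache_lock))
		return SHRINK_STOP;	/* tell the caller to back off */

	while (freed < sc->nr_to_scan && !list_empty(&demo_cache_lru)) {
		list_del(demo_cache_lru.next);
		demo_cache_nr--;
		freed++;
	}
	spin_unlock(&demo_cache_lock);
	return freed;
}

static struct shrinker demo_cache_shrinker = {
	.count_objects	= demo_cache_count,
	.scan_objects	= demo_cache_scan,
	.seeks		= DEFAULT_SEEKS,
};

static int __init demo_cache_init(void)
{
	/* register_shrinker() can now fail: nr_deferred is allocated here. */
	return register_shrinker(&demo_cache_shrinker);
}

static void __exit demo_cache_exit(void)
{
	unregister_shrinker(&demo_cache_shrinker);
}

module_init(demo_cache_init);
module_exit(demo_cache_exit);
MODULE_LICENSE("GPL");

Design note: count_objects() must stay cheap because it only feeds the total_scan estimate, while scan_objects() does the real work in nr_to_scan-sized batches and returns SHRINK_STOP when it cannot make progress, which is exactly the break condition in the new shrink_slab_node() loop shown in the diff.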
diff --git a/mm/vmscan.c b/mm/vmscan.c index 6771ea70bfe..0f16ffe8eb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/module.h> #include <linux/gfp.h> @@ -19,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> @@ -26,7 +29,6 @@ #include <linux/buffer_head.h> /* for try_to_release_page(), buffer_heads_over_limit */ #include <linux/mm_inline.h> -#include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> @@ -41,35 +43,21 @@ #include <linux/memcontrol.h> #include <linux/delayacct.h> #include <linux/sysctl.h> +#include <linux/oom.h> +#include <linux/prefetch.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> +#include <linux/balloon_compaction.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> -/* - * reclaim_mode determines how the inactive list is shrunk - * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages - * RECLAIM_MODE_ASYNC: Do not block - * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback - * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference - * page from the LRU and reclaim all pages within a - * naturally aligned range - * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of - * order-0 pages and then compact the zone - */ -typedef unsigned __bitwise__ reclaim_mode_t; -#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) -#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) -#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) -#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) -#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) - struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -93,18 +81,19 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - int swappiness; - int order; + /* Scan (total_size >> priority) pages at once */ + int priority; + + /* anon vs. file LRUs scanning "ratio" */ + int swappiness; + /* - * Intend to reclaim enough continuous memory rather than reclaim - * enough amount of memory. i.e, mode for high order allocation. + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. */ - reclaim_mode_t reclaim_mode; - - /* Which cgroup do we reclaim from */ - struct mem_cgroup *mem_cgroup; + struct mem_cgroup *target_mem_cgroup; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -147,45 +136,76 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. 
*/ int vm_swappiness = 60; -long vm_total_pages; /* The total number of pages which the VM controls */ +unsigned long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scanning_global_lru(sc) (!(sc)->mem_cgroup) +#ifdef CONFIG_MEMCG +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup; +} #else -#define scanning_global_lru(sc) (1) +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} #endif -static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, - struct scan_control *sc) +static unsigned long zone_reclaimable_pages(struct zone *zone) { - if (!scanning_global_lru(sc)) - return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); - return &zone->reclaim_stat; + return nr; } -static unsigned long zone_nr_lru_pages(struct zone *zone, - struct scan_control *sc, enum lru_list lru) +bool zone_reclaimable(struct zone *zone) { - if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); - - return zone_page_state(zone, NR_LRU_BASE + lru); + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; } +static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_lru_size(lruvec, lru); + + return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); +} /* - * Add a shrinker callback to be called from the vm + * Add a shrinker callback to be called from the vm. */ -void register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + size_t size = sizeof(*shrinker->nr_deferred); + + /* + * If we only have one possible node in the system anyway, save + * ourselves the trouble and disable NUMA aware behavior. This way we + * will save memory and some small loop time later. + */ + if (nr_node_ids == 1) + shrinker->flags &= ~SHRINKER_NUMA_AWARE; + + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); + return 0; } EXPORT_SYMBOL(register_shrinker); @@ -197,10 +217,123 @@ void unregister_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 + +static unsigned long +shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + unsigned long nr_pages_scanned, unsigned long lru_pages) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) + return 0; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. 
+ */ + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + + total_scan = nr; + delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta *= freeable; + do_div(delta, lru_pages + 1); + total_scan += delta; + if (total_scan < 0) { + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); + total_scan = freeable; + } + + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * freeable. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (total_scan > freeable * 2) + total_scan = freeable * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_pages_scanned, lru_pages, + freeable, delta, total_scan); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; + + cond_resched(); + } + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + return freed; +} + /* * Call the shrink functions to age shrinkable caches * @@ -220,101 +353,46 @@ EXPORT_SYMBOL(unregister_shrinker); * * Returns the number of slab objects which we shrunk. 
*/ -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) +unsigned long shrink_slab(struct shrink_control *shrinkctl, + unsigned long nr_pages_scanned, + unsigned long lru_pages) { struct shrinker *shrinker; - unsigned long ret = 0; + unsigned long freed = 0; - if (scanned == 0) - scanned = SWAP_CLUSTER_MAX; - - if (!down_read_trylock(&shrinker_rwsem)) - return 1; /* Assume we'll be able to shrink next time */ - - list_for_each_entry(shrinker, &shrinker_list, list) { - unsigned long long delta; - unsigned long total_scan; - unsigned long max_pass; - - max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); - delta = (4 * scanned) / shrinker->seeks; - delta *= max_pass; - do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", - shrinker->shrink, shrinker->nr); - shrinker->nr = max_pass; - } + if (nr_pages_scanned == 0) + nr_pages_scanned = SWAP_CLUSTER_MAX; + if (!down_read_trylock(&shrinker_rwsem)) { /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; + freed = 1; + goto out; + } - total_scan = shrinker->nr; - shrinker->nr = 0; + list_for_each_entry(shrinker, &shrinker_list, list) { + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { + shrinkctl->nid = 0; + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); + continue; + } - while (total_scan >= SHRINK_BATCH) { - long this_scan = SHRINK_BATCH; - int shrink_ret; - int nr_before; + for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { + if (node_online(shrinkctl->nid)) + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); - nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); - shrink_ret = (*shrinker->shrink)(shrinker, this_scan, - gfp_mask); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) - ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, this_scan); - total_scan -= this_scan; - - cond_resched(); } - - shrinker->nr += total_scan; } up_read(&shrinker_rwsem); - return ret; -} - -static void set_reclaim_mode(int priority, struct scan_control *sc, - bool sync) -{ - reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; - - /* - * Initially assume we are entering either lumpy reclaim or - * reclaim/compaction.Depending on the order, we will either set the - * sync mode or just reclaim order-0 pages later. 
- */ - if (COMPACTION_BUILD) - sc->reclaim_mode = RECLAIM_MODE_COMPACTION; - else - sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; - - /* - * Avoid using lumpy reclaim or reclaim/compaction if possible by - * restricting when its set to either costly allocations or when - * under memory pressure - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->reclaim_mode |= syncmode; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->reclaim_mode |= syncmode; - else - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; -} - -static void reset_reclaim_mode(struct scan_control *sc) -{ - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; +out: + cond_resched(); + return freed; } static inline int is_page_cache_freeable(struct page *page) @@ -336,10 +414,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi, return 1; if (bdi == current->backing_dev_info) return 1; - - /* lumpy reclaim for hugepage often need a lot of write */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - return 1; return 0; } @@ -358,7 +432,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page_nosync(page); + lock_page(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); @@ -390,7 +464,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in __generic_file_aio_write() against + * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * @@ -409,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -439,21 +513,11 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, return PAGE_ACTIVATE; } - /* - * Wait on writeback if requested to. This happens when - * direct reclaiming a large contiguous area and the - * first attempt to free a range of pages fails. - */ - if (PageWriteback(page) && - (sc->reclaim_mode & RECLAIM_MODE_SYNC)) - wait_on_page_writeback(page); - if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->reclaim_mode)); + trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -465,7 +529,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. 
*/ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -511,10 +576,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swapcache_free(swap, page); } else { void (*freepage)(struct page *); + void *shadow = NULL; freepage = mapping->a_ops->freepage; - - __remove_from_page_cache(page); + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optizimation, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. + */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -537,7 +615,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, page, false)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -560,39 +638,39 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - int lru; - int active = !!TestClearPageActive(page); + bool is_unevictable; int was_unevictable = PageUnevictable(page); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); redo: ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { /* * For evictable pages, we can use the cache. * In event of a race, worst case is we end up with an * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_lru_base_type(page); - lru_cache_add_lru(page, lru); + is_unevictable = false; + lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* - * When racing with an mlock clearing (page is - * unlocked), make sure that if the other thread does - * not observe our setting of PG_lru and fails - * isolation, we see PG_mlocked cleared below and move + * When racing with an mlock or AS_UNEVICTABLE clearing + * (page is unlocked) make sure that if the other thread + * does not observe our setting of PG_lru and fails + * isolation/check_move_unevictable_pages, + * we see PG_mlocked/AS_UNEVICTABLE cleared below and move * the page back to the evictable list. * - * The other side is TestClearPageMlocked(). + * The other side is TestClearPageMlocked() or shmem_lock(). */ smp_mb(); } @@ -602,7 +680,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. 
*/ - if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -613,9 +691,9 @@ redo: */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ @@ -634,13 +712,10 @@ static enum page_references page_check_references(struct page *page, int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, + &vm_flags); referenced_page = TestClearPageReferenced(page); - /* Lumpy reclaim - ignore references */ - if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - return PAGEREF_RECLAIM; - /* * Mlock lost the isolation race with us. Let try_to_unmap() * move the page to the unevictable list. @@ -649,7 +724,7 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; if (referenced_ptes) { - if (PageAnon(page)) + if (PageSwapBacked(page)) return PAGEREF_ACTIVATE; /* * All mapped pages start out with page table @@ -667,7 +742,13 @@ static enum page_references page_check_references(struct page *page, */ SetPageReferenced(page); - if (referenced_page) + if (referenced_page || referenced_ptes > 1) + return PAGEREF_ACTIVATE; + + /* + * Activate file-backed executable pages after first usage. + */ + if (vm_flags & VM_EXEC) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; @@ -680,22 +761,33 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } -static noinline_for_stack void free_page_list(struct list_head *free_pages) +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) { - struct pagevec freed_pvec; - struct page *page, *tmp; - - pagevec_init(&freed_pvec, 1); + struct address_space *mapping; - list_for_each_entry_safe(page, tmp, free_pages, lru) { - list_del(&page->lru); - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context. 
Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; } - pagevec_free(&freed_pvec); + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } /* @@ -703,22 +795,34 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) */ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, - struct scan_control *sc) + struct scan_control *sc, + enum ttu_flags ttu_flags, + unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, + unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, + bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; + unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; cond_resched(); + mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { - enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; + enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; cond_resched(); @@ -728,12 +832,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!trylock_page(page)) goto keep; - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) + if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) @@ -746,25 +850,103 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + (writeback && PageReclaim(page))) + nr_congested++; + + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. 
Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * Synchronous reclaim is performed in two passes, - * first an asynchronous pass over the list to - * start parallel writeback, and a second synchronous - * pass to wait for the IO to complete. Wait here - * for any page for which writeback has already - * started. - */ - if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && - may_enter_fs) + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + nr_immediate++; + goto keep_locked; + + /* Case 2 above */ + } else if (global_reclaim(sc) || + !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + /* + * This is slightly racy - end_page_writeback() + * might have just cleared PageReclaim, then + * setting PageReclaim here end up interpreted + * as PageReadahead - but that does not matter + * enough to care. What we do want is for this + * page to have PageReclaim set next time memcg + * reclaim reaches the tests above, so it will + * then wait_on_page_writeback() to avoid OOM; + * and it's also appropriate in global reclaim. + */ + SetPageReclaim(page); + nr_writeback++; + + goto keep_locked; + + /* Case 3 above */ + } else { wait_on_page_writeback(page); - else { - unlock_page(page); - goto keep_lumpy; } } - references = page_check_references(page, sc); + if (!force_reclaim) + references = page_check_references(page, sc); + switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -782,19 +964,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageAnon(page) && !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page)) + if (!add_to_swap(page, page_list)) goto activate_locked; may_enter_fs = 1; - } - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. 
*/ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, TTU_UNMAP)) { + switch (try_to_unmap(page, ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -807,7 +990,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - nr_dirty++; + /* + * Only kswapd can writeback filesystem pages to + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. + */ + if (page_is_file_cache(page) && + (!current_is_kswapd() || + !zone_is_reclaim_dirty(zone))) { + /* + * Immediately reclaim when written back. + * Similar in principal to deactivate_page() + * except we already have the page isolated + * and know it's dirty + */ + inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + SetPageReclaim(page); + + goto keep_locked; + } if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; @@ -819,13 +1020,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: - nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: if (PageWriteback(page)) - goto keep_lumpy; + goto keep; if (PageDirty(page)) goto keep; @@ -885,7 +1085,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!mapping || !__remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -911,41 +1111,63 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - reset_reclaim_mode(sc); continue; activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) try_to_free_swap(page); - VM_BUG_ON(PageActive(page)); + VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: - reset_reclaim_mode(sc); -keep_lumpy: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); + VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - /* - * Tag a zone as congested if all the dirty pages encountered were - * backed by a congested BDI. 
In this case, reclaimers should just - * back off and wait for congestion to clear because further reclaim - * will encounter the same problem - */ - if (nr_dirty == nr_congested && nr_dirty != 0) - zone_set_flag(zone, ZONE_CONGESTED); - - free_page_list(&free_pages); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; + *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; return nr_reclaimed; } +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + list_splice(&clean_pages, page_list); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -956,7 +1178,7 @@ keep_lumpy: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode) { int ret = -EINVAL; @@ -964,26 +1186,48 @@ int __isolate_lru_page(struct page *page, int mode, int file) if (!PageLRU(page)) return ret; - /* - * When checking the active state, we need to be sure we are - * dealing with comparible boolean values. Take the logical not - * of each. - */ - if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; - if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) - return ret; + ret = -EBUSY; /* - * When this function is being called for lumpy reclaim, we - * initially look into all LRU pages, active, inactive and - * unevictable; only give shrink_page_list evictable pages. + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated. 
This + * is used by reclaim when it is cannot write to backing storage + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages + * that it is possible to migrate without blocking */ - if (PageUnevictable(page)) - return ret; + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; - ret = -EBUSY; + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } + + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) + return ret; if (likely(get_page_unless_zero(page))) { /* @@ -1009,168 +1253,57 @@ int __isolate_lru_page(struct page *page, int mode, int file) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @src: The LRU list to pull pages off. + * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. - * @scanned: The number of pages that were scanned. - * @order: The caller's attempted allocation order + * @nr_scanned: The number of pages that were scanned. + * @sc: The scan_control struct for this reclaim session * @mode: One of the LRU isolation modes - * @file: True [1] if isolating file [!anon] pages + * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode, int file) + struct lruvec *lruvec, struct list_head *dst, + unsigned long *nr_scanned, struct scan_control *sc, + isolate_mode_t mode, enum lru_list lru) { + struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long nr_lumpy_taken = 0; - unsigned long nr_lumpy_dirty = 0; - unsigned long nr_lumpy_failed = 0; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; - unsigned long pfn; - unsigned long end_pfn; - unsigned long page_pfn; - int zone_id; + int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON(!PageLRU(page)); + VM_BUG_ON_PAGE(!PageLRU(page), page); - switch (__isolate_lru_page(page, mode, file)) { + switch (__isolate_lru_page(page, mode)) { case 0: + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_move(&page->lru, dst); - mem_cgroup_del_lru(page); - nr_taken += hpage_nr_pages(page); + nr_taken += nr_pages; break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); - mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: BUG(); } - - if (!order) - continue; - - /* - * Attempt to take all pages in the order aligned region - * surrounding the tag page. Only take those pages of - * the same active state as that tag page. We may safely - * round the target page pfn down to the requested order - * as the mem_map is guarenteed valid out to MAX_ORDER, - * where that page is in a different zone we will detect - * it from its zone id and abort this block scan. 
- */ - zone_id = page_zone_id(page); - page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << order) - 1); - end_pfn = pfn + (1 << order); - for (; pfn < end_pfn; pfn++) { - struct page *cursor_page; - - /* The target page is in the block, ignore it. */ - if (unlikely(pfn == page_pfn)) - continue; - - /* Avoid holes within the zone. */ - if (unlikely(!pfn_valid_within(pfn))) - break; - - cursor_page = pfn_to_page(pfn); - - /* Check that we have not crossed a zone boundary. */ - if (unlikely(page_zone_id(cursor_page) != zone_id)) - break; - - /* - * If we don't have enough swap space, reclaiming of - * anon page which don't already have a swap slot is - * pointless. - */ - if (nr_swap_pages <= 0 && PageAnon(cursor_page) && - !PageSwapCache(cursor_page)) - break; - - if (__isolate_lru_page(cursor_page, mode, file) == 0) { - list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(cursor_page); - nr_taken += hpage_nr_pages(page); - nr_lumpy_taken++; - if (PageDirty(cursor_page)) - nr_lumpy_dirty++; - scan++; - } else { - /* the page is freed already. */ - if (!page_count(cursor_page)) - continue; - break; - } - } - - /* If we break out of the loop above, lumpy reclaim failed */ - if (pfn < end_pfn) - nr_lumpy_failed++; } - *scanned = scan; - - trace_mm_vmscan_lru_isolate(order, - nr_to_scan, scan, - nr_taken, - nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode); + *nr_scanned = scan; + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + nr_taken, mode, is_file_lru(lru)); return nr_taken; } -static unsigned long isolate_pages_global(unsigned long nr, - struct list_head *dst, - unsigned long *scanned, int order, - int mode, struct zone *z, - int active, int file) -{ - int lru = LRU_BASE; - if (active) - lru += LRU_ACTIVE; - if (file) - lru += LRU_FILE; - return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, file); -} - -/* - * clear_active_flags() is a helper for shrink_active_list(), clearing - * any active bits from the pages in the list. - */ -static unsigned long clear_active_flags(struct list_head *page_list, - unsigned int *count) -{ - int nr_active = 0; - int lru; - struct page *page; - - list_for_each_entry(page, page_list, lru) { - int numpages = hpage_nr_pages(page); - lru = page_lru_base_type(page); - if (PageActive(page)) { - lru += LRU_ACTIVE; - ClearPageActive(page); - nr_active += numpages; - } - if (count) - count[lru] += numpages; - } - - return nr_active; -} - /** * isolate_lru_page - tries to isolate a page from its LRU list * @page: page to isolate from its LRU list @@ -1200,16 +1333,20 @@ int isolate_lru_page(struct page *page) { int ret = -EBUSY; + VM_BUG_ON_PAGE(!page_count(page), page); + if (PageLRU(page)) { struct zone *zone = page_zone(page); + struct lruvec *lruvec; spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { + lruvec = mem_cgroup_page_lruvec(page, zone); + if (PageLRU(page)) { int lru = page_lru(page); - ret = 0; + get_page(page); ClearPageLRU(page); - - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); + ret = 0; } spin_unlock_irq(&zone->lru_lock); } @@ -1217,7 +1354,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get resheduled. 
When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) @@ -1227,7 +1368,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!scanning_global_lru(sc)) + if (!global_reclaim(sc)) return 0; if (file) { @@ -1238,129 +1379,82 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; } -/* - * TODO: Try merging with migrations version of putback_lru_pages - */ static noinline_for_stack void -putback_lru_pages(struct zone *zone, struct scan_control *sc, - unsigned long nr_anon, unsigned long nr_file, - struct list_head *page_list) +putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { - struct page *page; - struct pagevec pvec; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - - pagevec_init(&pvec, 1); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + struct zone *zone = lruvec_zone(lruvec); + LIST_HEAD(pages_to_free); /* * Put back any unfreeable pages. */ - spin_lock(&zone->lru_lock); while (!list_empty(page_list)) { + struct page *page = lru_to_page(page_list); int lru; - page = lru_to_page(page_list); - VM_BUG_ON(PageLRU(page)); + + VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); putback_lru_page(page); spin_lock_irq(&zone->lru_lock); continue; } + + lruvec = mem_cgroup_page_lruvec(page, zone); + SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); + add_page_to_lru_list(page, lruvec, lru); + if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(page, lruvec, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, &pages_to_free); } } - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); - - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); -} -static noinline_for_stack void update_isolated_counts(struct zone *zone, - struct scan_control *sc, - unsigned long *nr_anon, - unsigned long *nr_file, - struct list_head *isolated_list) -{ - unsigned long nr_active; - unsigned int count[NR_LRU_LISTS] = { 0, }; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - - nr_active = clear_active_flags(isolated_list, count); - __count_vm_events(PGDEACTIVATE, nr_active); - - __mod_zone_page_state(zone, NR_ACTIVE_FILE, - -count[LRU_ACTIVE_FILE]); - __mod_zone_page_state(zone, NR_INACTIVE_FILE, - 
-count[LRU_INACTIVE_FILE]); - __mod_zone_page_state(zone, NR_ACTIVE_ANON, - -count[LRU_ACTIVE_ANON]); - __mod_zone_page_state(zone, NR_INACTIVE_ANON, - -count[LRU_INACTIVE_ANON]); - - *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); - - reclaim_stat->recent_scanned[0] += *nr_anon; - reclaim_stat->recent_scanned[1] += *nr_file; + /* + * To save our caller's stack, now use input list for pages to free. + */ + list_splice(&pages_to_free, page_list); } /* - * Returns true if the caller should wait to clean dirty/writeback pages. - * - * If we are direct reclaiming for contiguous pages and we do not reclaim - * everything in the list, try again and wait for writeback IO to complete. - * This will stall high-order allocations noticeably. Only do that when really - * need to free the pages under high memory pressure. + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. */ -static inline bool should_reclaim_stall(unsigned long nr_taken, - unsigned long nr_freed, - int priority, - struct scan_control *sc) +static int current_may_throttle(void) { - int lumpy_stall_priority; - - /* kswapd should not stall on sync IO */ - if (current_is_kswapd()) - return false; - - /* Only stall on lumpy reclaim */ - if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) - return false; - - /* If we have relaimed everything on the isolated list, no stall */ - if (nr_freed == nr_taken) - return false; - - /* - * For high-order allocations, there are two stall thresholds. - * High-cost allocations stall immediately where as lower - * order allocations such as stacks require the scanning - * priority to be much higher before stalling. 
- */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_stall_priority = DEF_PRIORITY; - else - lumpy_stall_priority = DEF_PRIORITY / 3; - - return priority <= lumpy_stall_priority; + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); } /* @@ -1368,15 +1462,22 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, * of reclaimed pages */ static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, - struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) { LIST_HEAD(page_list); unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_anon; - unsigned long nr_file; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_unqueued_dirty = 0; + unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1386,65 +1487,121 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - set_reclaim_mode(priority, sc, false); lru_add_drain(); + + if (!sc->may_unmap) + isolate_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + isolate_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + &nr_scanned, sc, isolate_mode, lru); + + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + + if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, - nr_scanned); + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else - __count_zone_vm_events(PGSCAN_DIRECT, zone, - nr_scanned); - } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, - 0, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. 
- */ + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } + spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) { - spin_unlock_irq(&zone->lru_lock); + if (nr_taken == 0) return 0; + + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); + + spin_lock_irq(&zone->lru_lock); + + reclaim_stat->recent_scanned[file] += nr_taken; + + if (global_reclaim(sc)) { + if (current_is_kswapd()) + __count_zone_vm_events(PGSTEAL_KSWAPD, zone, + nr_reclaimed); + else + __count_zone_vm_events(PGSTEAL_DIRECT, zone, + nr_reclaimed); } - update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); + putback_inactive_pages(lruvec, &page_list); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc); + free_hot_cold_page_list(&page_list, true); - /* Check if we should syncronously wait for writeback */ - if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc); - } + /* + * If reclaim is isolating dirty pages under writeback, it implies + * that the long-lived page allocation rate is exceeding the page + * laundering rate. Either the global limits are not being effective + * at throttling processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing device. The + * only option is to throttle from reclaim context which is not ideal + * as there is no guarantee the dirtying process is throttled in the + * same way balance_dirty_pages() manages. + * + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under pages flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. + */ + if (nr_writeback && nr_writeback == nr_taken) + zone_set_flag(zone, ZONE_WRITEBACK); - local_irq_disable(); - if (current_is_kswapd()) - __count_vm_events(KSWAPD_STEAL, nr_reclaimed); - __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); + /* + * memcg will stall in page writeback so only consider forcibly + * stalling for global reclaim + */ + if (global_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. + */ + if (nr_dirty && nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing + * pages from reclaim context. + */ + if (nr_unqueued_dirty == nr_taken) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); - putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + /* + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it implies + * that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_immediate && current_may_throttle()) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly. 
+ */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, - priority, - trace_shrink_flags(file, sc->reclaim_mode)); + sc->priority, + trace_shrink_flags(file)); return nr_reclaimed; } @@ -1466,32 +1623,39 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, * But we had to alter page->flags anyway. */ -static void move_active_pages_to_lru(struct zone *zone, +static void move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, + struct list_head *pages_to_free, enum lru_list lru) { + struct zone *zone = lruvec_zone(lruvec); unsigned long pgmoved = 0; - struct pagevec pvec; struct page *page; - - pagevec_init(&pvec, 1); + int nr_pages; while (!list_empty(list)) { page = lru_to_page(list); + lruvec = mem_cgroup_page_lruvec(page, zone); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved += hpage_nr_pages(page); + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + list_move(&page->lru, &lruvec->lists[lru]); + pgmoved += nr_pages; - if (!pagevec_add(&pvec, page) || list_empty(list)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(page, lruvec, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, pages_to_free); } } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -1499,45 +1663,42 @@ static void move_active_pages_to_lru(struct zone *zone, __count_vm_events(PGDEACTIVATE, pgmoved); } -static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority, int file) +static void shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc, + enum lru_list lru) { unsigned long nr_taken; - unsigned long pgscanned; + unsigned long nr_scanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned long nr_rotated = 0; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); lru_add_drain(); + + if (!sc->may_unmap) + isolate_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + isolate_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); - if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_pages, &l_hold, - &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, - 1, file); - zone->pages_scanned += pgscanned; - } else { - nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, - &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1, file); - /* - * mem_cgroup_isolate_pages() keeps track of - * scanned pages on its own. 
- */ - } + + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, isolate_mode, lru); + if (global_reclaim(sc)) + zone->pages_scanned += nr_scanned; reclaim_stat->recent_scanned[file] += nr_taken; - __count_zone_vm_events(PGREFILL, zone, pgscanned); - if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); - else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __count_zone_vm_events(PGREFILL, zone, nr_scanned); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); @@ -1546,12 +1707,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, page = lru_to_page(&l_hold); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } - if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + if (unlikely(buffer_heads_over_limit)) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + + if (page_referenced(page, 0, sc->target_mem_cgroup, + &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -1584,12 +1754,12 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(zone, &l_active, - LRU_ACTIVE + file * LRU_FILE); - move_active_pages_to_lru(zone, &l_inactive, - LRU_BASE + file * LRU_FILE); + move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP @@ -1608,16 +1778,13 @@ static int inactive_anon_is_low_global(struct zone *zone) /** * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @lruvec: LRU vector to check * * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_anon_is_low(struct lruvec *lruvec) { - int low; - /* * If we don't have swap space, anonymous page deactivation * is pointless. 
@@ -1625,34 +1792,21 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (!total_swap_pages) return 0; - if (scanning_global_lru(sc)) - low = inactive_anon_is_low_global(zone); - else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); - return low; + if (!mem_cgroup_disabled()) + return mem_cgroup_inactive_anon_is_low(lruvec); + + return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct zone *zone, - struct scan_control *sc) +static inline int inactive_anon_is_low(struct lruvec *lruvec) { return 0; } #endif -static int inactive_file_is_low_global(struct zone *zone) -{ - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_FILE); - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - - return (active > inactive); -} - /** * inactive_file_is_low - check if file pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @lruvec: LRU vector to check * * When the system is doing streaming IO, memory pressure here * ensures that active file pages get deactivated, until more @@ -1664,59 +1818,43 @@ static int inactive_file_is_low_global(struct zone *zone) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_file_is_low(struct lruvec *lruvec) { - int low; + unsigned long inactive; + unsigned long active; - if (scanning_global_lru(sc)) - low = inactive_file_is_low_global(zone); - else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); - return low; + inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); + active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + + return active > inactive; } -static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, - int file) +static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { - if (file) - return inactive_file_is_low(zone, sc); + if (is_file_lru(lru)) + return inactive_file_is_low(lruvec); else - return inactive_anon_is_low(zone, sc); + return inactive_anon_is_low(lruvec); } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct zone *zone, struct scan_control *sc, int priority) + struct lruvec *lruvec, struct scan_control *sc) { - int file = is_file_lru(lru); - if (is_active_lru(lru)) { - if (inactive_list_is_low(zone, sc, file)) - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (inactive_list_is_low(lruvec, lru)) + shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } - return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan) -{ - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= SWAP_CLUSTER_MAX) - *nr_saved_scan = 0; - else - nr = 0; - - return nr; -} +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; /* * Determine how aggressively the anon and file LRU lists should be @@ -1724,51 +1862,108 @@ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. 
* - * nr[0] = anon pages to scan; nr[1] = file pages to scan + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct zone *zone, struct scan_control *sc, - unsigned long *nr, int priority) +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) { - unsigned long anon, file, free; + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + u64 fraction[2]; + u64 denominator = 0; /* gcc */ + struct zone *zone = lruvec_zone(lruvec); unsigned long anon_prio, file_prio; + enum scan_balance scan_balance; + unsigned long anon, file; + bool force_scan = false; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - u64 fraction[2], denominator; - enum lru_list l; - int noswap = 0; + enum lru_list lru; + bool some_scanned; + int pass; + + /* + * If the zone or memcg is small, nr[l] can be 0. This + * results in no scanning on this priority and a potential + * priority drop. Global direct reclaim can go to the next + * zone and tends to have no problems. Global kswapd is for + * zone balancing and it needs to scan a minimum amount. When + * reclaiming for a memcg, a priority drop can cause high + * latencies, so it's better to scan a minimum amount there as + * well. + */ + if (current_is_kswapd() && !zone_reclaimable(zone)) + force_scan = true; + if (!global_reclaim(sc)) + force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - noswap = 1; - fraction[0] = 0; - fraction[1] = 1; - denominator = 1; + if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + scan_balance = SCAN_FILE; goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + /* + * Global reclaim will swap to prevent OOM even with no + * swappiness, but memcg users want to use this knob to + * disable swapping for individual groups completely when + * using the memory controller's swap limit feature would be + * too expensive. + */ + if (!global_reclaim(sc) && !sc->swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && sc->swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (global_reclaim(sc)) { + unsigned long free = zone_page_state(zone, NR_FREE_PAGES); - if (scanning_global_lru(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); - /* If we have very few page cache pages, - force-scan anon pages. 
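When none of the shortcuts apply, get_scan_count() falls into the SCAN_FRACT case and weights each LRU type by swappiness and by how much of its recently scanned memory rotated back to the active list. A standalone sketch of that arithmetic in plain C, with invented names and 64-bit math as in the original:

#include <stdio.h>

static void toy_scan_fractions(unsigned int swappiness,	/* 0..100 */
			       unsigned long anon_scanned, unsigned long anon_rotated,
			       unsigned long file_scanned, unsigned long file_rotated,
			       unsigned long long frac[2], unsigned long long *denom)
{
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - anon_prio;

	/* a list whose pages keep rotating back is "hot", so weight it down */
	unsigned long long ap = (unsigned long long)anon_prio * (anon_scanned + 1)
				/ (anon_rotated + 1);
	unsigned long long fp = (unsigned long long)file_prio * (file_scanned + 1)
				/ (file_rotated + 1);

	frac[0] = ap;
	frac[1] = fp;
	*denom = ap + fp + 1;
}

int main(void)
{
	unsigned long long frac[2], denom;

	toy_scan_fractions(60, 1000, 100, 4000, 200, frac, &denom);
	printf("anon %llu/%llu, file %llu/%llu\n", frac[0], denom, frac[1], denom);
	return 0;
}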
*/ if (unlikely(file + free <= high_wmark_pages(zone))) { - fraction[0] = 1; - fraction[1] = 0; - denominator = 1; + scan_balance = SCAN_ANON; goto out; } } /* + * There is enough inactive page cache, do not reclaim + * anything from the anonymous working set right now. + */ + if (!inactive_file_is_low(lruvec)) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; + file_prio = 200 - anon_prio; /* * OK, so we have swap space and a fair amount of page cache @@ -1797,10 +1992,10 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. */ - ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; spin_unlock_irq(&zone->lru_lock); @@ -1808,26 +2003,186 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(l) { - int file = is_file_lru(l); - unsigned long scan; - - scan = zone_nr_lru_pages(zone, sc, l); - if (priority || noswap) { - scan >>= priority; - scan = div64_u64(scan * fraction[file], denominator); + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; + + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; + + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; + /* + * Skip the second pass and don't force_scan, + * if we found something to scan. + */ + some_scanned |= !!scan; + } + } +} + +/* + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + */ +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; + bool scan_adjusted; + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. 
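The out: loop then converts those fractions into per-LRU scan targets. A compact sketch of one iteration for the SCAN_FRACT case, assuming a cluster size of 32 pages purely for illustration:

#define TOY_CLUSTER	32UL	/* illustrative stand-in for SWAP_CLUSTER_MAX */

static unsigned long toy_scan_target(unsigned long lru_size, int priority,
				     int force_scan,
				     unsigned long long frac, unsigned long long denom)
{
	unsigned long scan = lru_size >> priority;

	/* small memcgs/zones: guarantee a minimum batch on the second pass */
	if (!scan && force_scan)
		scan = lru_size < TOY_CLUSTER ? lru_size : TOY_CLUSTER;

	/* SCAN_FRACT: split the pressure according to the precomputed weights */
	return (unsigned long)((unsigned long long)scan * frac / denom);
}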
Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } } - nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l]); + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs are scanned + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + + throttle_vm_writeout(sc->gfp_mask); +} + +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) +{ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + sc->priority < DEF_PRIORITY - 2)) + return true; + + return false; } /* - * Reclaim/compaction depends on a number of pages being freed. To avoid - * disruption to the system, a small number of order-0 pages continue to be - * rotated and reclaimed in the normal fashion. 
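The proportional trim in shrink_lruvec() stops the smaller LRU type once the reclaim target has been met and cuts the larger one back to the same scanned proportion. A simplified two-variable sketch of that adjustment; the real code tracks all four evictable lists:

static void toy_trim_other_lru(unsigned long target_stopped, unsigned long left_stopped,
			       unsigned long target_other, unsigned long *left_other)
{
	/* percent of the stopped type's target that will stay unscanned */
	unsigned long percentage = left_stopped * 100 / (target_stopped + 1);
	unsigned long already = target_other - *left_other;
	unsigned long allowed = target_other * (100 - percentage) / 100;

	/* let the other type only catch up to the same scanned proportion */
	*left_other = allowed > already ? allowed - already : 0;
}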
However, by the time we get - * back to the allocator and call try_to_compact_zone(), we ensure that - * there are enough free pages for it to be likely successful + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_zone() that it will have enough free pages to succeed. + * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct zone *zone, unsigned long nr_reclaimed, @@ -1838,7 +2193,7 @@ static inline bool should_continue_reclaim(struct zone *zone, unsigned long inactive_lru_pages; /* If not in reclaim/compaction mode, stop */ - if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + if (!in_reclaim_compaction(sc)) return false; /* Consider stopping depending on scan and reclaim activity */ @@ -1869,8 +2224,9 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; @@ -1885,61 +2241,89 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -/* - * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. - */ -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { - unsigned long nr[NR_LRU_LISTS]; - unsigned long nr_to_scan; - enum lru_list l; unsigned long nr_reclaimed, nr_scanned; - unsigned long nr_to_reclaim = sc->nr_to_reclaim; -restart: - nr_reclaimed = 0; - nr_scanned = sc->nr_scanned; - get_scan_count(zone, sc, nr, priority); + do { + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = sc->priority, + }; + struct mem_cgroup *memcg; + + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || - nr[LRU_INACTIVE_FILE]) { - for_each_evictable_lru(l) { - if (nr[l]) { - nr_to_scan = min_t(unsigned long, - nr[l], SWAP_CLUSTER_MAX); - nr[l] -= nr_to_scan; - - nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct lruvec *lruvec; + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + + sc->swappiness = mem_cgroup_swappiness(memcg); + shrink_lruvec(lruvec, sc); + + /* + * Direct reclaim and kswapd have to scan all memory + * cgroups to fulfill the overall scan target for the + * zone. + * + * Limit reclaim, on the other hand, only cares about + * nr_to_reclaim pages to be reclaimed and it will + * retry with decreasing priority if one round over the + * whole hierarchy is not sufficient. + */ + if (!global_reclaim(sc) && + sc->nr_reclaimed >= sc->nr_to_reclaim) { + mem_cgroup_iter_break(root, memcg); + break; } - } - /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. 
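should_continue_reclaim() keeps reclaim/compaction going until roughly twice the requested allocation has been freed, and stops early once the inactive lists can no longer supply that much. A sketch of that core test:

static int toy_continue_for_compaction(unsigned int order,
				       unsigned long nr_reclaimed,
				       unsigned long inactive_lru_pages)
{
	unsigned long pages_for_compaction = 2UL << order;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

For an order-9 request (a 2MB huge page on x86), for example, this keeps reclaiming until about 1024 order-0 pages have been freed.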
- * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. - */ - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) - break; - } - sc->nr_reclaimed += nr_reclaimed; + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); + + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)); +} + +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* Do not consider compaction for orders reclaim is meant to satisfy */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + return false; /* - * Even if we did not try to evict anon pages at all, we want to - * rebalance the anon lru active/inactive ratio. + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page */ - if (inactive_anon_is_low(zone, sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); - /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(zone, nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)) - goto restart; + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone, sc->order)) + return watermark_ok; - throttle_vm_writeout(sc->gfp_mask); + /* If compaction is not ready to start, keep reclaiming */ + if (!compaction_suitable(zone, sc->order)) + return false; + + return watermark_ok; } /* @@ -1957,12 +2341,37 @@ restart: * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should consider retrying the allocation instead of + * further reclaim. 
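compaction_ready() asks for free memory equal to the high watermark plus a small balance gap plus the allocation itself before reclaim may stop in favour of compaction. A sketch of that buffer, using 100 for the balance-gap ratio (the value of KSWAPD_ZONE_BALANCE_GAP_RATIO around this time; treat it as illustrative here):

static unsigned long toy_compaction_watermark(unsigned long high_wmark,
					      unsigned long low_wmark,
					      unsigned long managed_pages,
					      unsigned int order)
{
	unsigned long ratio = 100;
	unsigned long gap = (managed_pages + ratio - 1) / ratio;	/* DIV_ROUND_UP */

	if (gap > low_wmark)	/* gap is the smaller of low_wmark and ~1% of the zone */
		gap = low_wmark;

	return high_wmark + gap + (2UL << order);
}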
*/ -static void shrink_zones(int priority, struct zonelist *zonelist, - struct scan_control *sc) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; + unsigned long lru_pages = 0; + bool aborted_reclaim = false; + struct reclaim_state *reclaim_state = current->reclaim_state; + gfp_t orig_mask; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + + /* + * If the number of buffer_heads in the machine exceeds the maximum + * allowed level, force direct reclaim to scan the highmem zone as + * highmem pages could be pinning lowmem pages storing buffer_heads + */ + orig_mask = sc->gfp_mask; + if (buffer_heads_over_limit) + sc->gfp_mask |= __GFP_HIGHMEM; + + nodes_clear(shrink.nodes_to_scan); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1972,33 +2381,79 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * Take care memory controller reclaiming has small influence * to global LRU. */ - if (scanning_global_lru(sc)) { + if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + + lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ + if (IS_ENABLED(CONFIG_COMPACTION)) { + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if ((zonelist_zone_idx(z) <= requested_highidx) + && compaction_ready(zone, sc)) { + aborted_reclaim = true; + continue; + } + } + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. + */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; + /* need some check for avoid more shrink_zone() */ } - shrink_zone(priority, zone, sc); + shrink_zone(zone, sc); } -} -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + /* + * Don't shrink slabs when reclaiming memory from over limit cgroups + * but do shrink slab at least once when aborting reclaim for + * compaction to avoid unevenly scanning file/anon LRU pages over slab + * pages. + */ + if (global_reclaim(sc)) { + shrink_slab(&shrink, sc->nr_scanned, lru_pages); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + + return aborted_reclaim; } -/* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. It can't handle OOM during hibernation. - * So let's check zone's unreclaimable in direct reclaim as well as kswapd. 
- */ +/* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - bool all_unreclaimable = true; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2006,13 +2461,11 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone_reclaimable(zone)) { - all_unreclaimable = false; - break; - } + if (zone_reclaimable(zone)) + return false; } - return all_unreclaimable; + return true; } /* @@ -2032,51 +2485,35 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc) + struct scan_control *sc) { - int priority; unsigned long total_scanned = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct zoneref *z; - struct zone *zone; unsigned long writeback_threshold; + bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); - if (scanning_global_lru(sc)) + if (global_reclaim(sc)) count_vm_event(ALLOCSTALL); - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + do { + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; - if (!priority) - disable_swap_token(); - shrink_zones(priority, zonelist, sc); - /* - * Don't shrink slabs when reclaiming memory from - * over limit cgroups - */ - if (scanning_global_lru(sc)) { - unsigned long lru_pages = 0; - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; + aborted_reclaim = shrink_zones(zonelist, sc); - lru_pages += zone_reclaimable_pages(zone); - } - - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; /* + * If we're getting trouble reclaiming, start doing + * writepage even in laptop mode. + */ + if (sc->priority < DEF_PRIORITY - 2) + sc->may_writepage = 1; + + /* * Try to write back as many pages as we just scanned. This * tends to cause slow streaming writers to write data to the * disk smoothly, at the dirtying rate, which is nice. But @@ -2085,52 +2522,187 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, + WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - - /* Take a nap, wait for some writeback to complete */ - if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) { - struct zone *preferred_zone; - - first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - &cpuset_current_mems_allowed, - &preferred_zone); - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); - } - } + } while (--sc->priority >= 0 && !aborted_reclaim); out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; + /* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. Thus bypassing all_unreclaimable + * check. 
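In do_try_to_free_pages() the flusher threads are only woken once total scanning clearly exceeds the reclaim target; the 1.5x factor keeps light reclaim from turning into a writeback storm. A one-line sketch of that threshold:

static int toy_should_wake_flushers(unsigned long total_scanned,
				    unsigned long nr_to_reclaim)
{
	/* writeback_threshold = nr_to_reclaim + nr_to_reclaim / 2 */
	return total_scanned > nr_to_reclaim + nr_to_reclaim / 2;
}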
+ */ + if (oom_killer_disabled) + return 0; + + /* Aborted reclaim to try compaction? don't OOM, then */ + if (aborted_reclaim) + return 1; + /* top priority shrink_zones still had more to do? don't OOM, then */ - if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) return 1; return 0; } +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +{ + struct zone *zone; + unsigned long pfmemalloc_reserve = 0; + unsigned long free_pages = 0; + int i; + bool wmark_ok; + + for (i = 0; i <= ZONE_NORMAL; i++) { + zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + + pfmemalloc_reserve += min_wmark_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + + wmark_ok = free_pages > pfmemalloc_reserve / 2; + + /* kswapd must be awake if processes are being throttled */ + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { + pgdat->classzone_idx = min(pgdat->classzone_idx, + (enum zone_type)ZONE_NORMAL); + wake_up_interruptible(&pgdat->kswapd_wait); + } + + return wmark_ok; +} + +/* + * Throttle direct reclaimers if backing storage is backed by the network + * and the PFMEMALLOC reserve for the preferred node is getting dangerously + * depleted. kswapd will continue to make progress and wake the processes + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. + */ +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + nodemask_t *nodemask) +{ + struct zoneref *z; + struct zone *zone; + pg_data_t *pgdat = NULL; + + /* + * Kernel threads should not be throttled as they may be indirectly + * responsible for cleaning pages necessary for reclaim to make forward + * progress. kjournald for example may enter direct reclaim while + * committing a transaction where throttling it could forcing other + * processes to block on log_wait_commit(). + */ + if (current->flags & PF_KTHREAD) + goto out; + + /* + * If a fatal signal is pending, this process should not throttle. + * It should return quickly so it can exit and free its memory + */ + if (fatal_signal_pending(current)) + goto out; + + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. 
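pfmemalloc_watermark_ok() sums the min watermarks of the zones up to ZONE_NORMAL and throttles direct reclaimers once free pages fall below half of that reserve. A sketch with plain arrays standing in for the node's zones:

static int toy_pfmemalloc_ok(const unsigned long *min_wmark,
			     const unsigned long *free_pages, int nr_zones)
{
	unsigned long reserve = 0, free = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {
		reserve += min_wmark[i];
		free += free_pages[i];
	}

	if (!reserve)	/* no reserves configured: never throttle */
		return 1;

	return free > reserve / 2;
}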
+ */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) + goto out; + + /* Account for the throttling */ + count_vm_event(PGSCAN_DIRECT_THROTTLE); + + /* + * If the caller cannot enter the filesystem, it's possible that it + * is due to the caller holding an FS lock or performing a journal + * transaction in the case of a filesystem like ext[3|4]. In this case, + * it is not safe to block on pfmemalloc_wait as kswapd could be + * blocked waiting on the same lock. Instead, throttle for up to a + * second before continuing. + */ + if (!(gfp_mask & __GFP_FS)) { + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat), HZ); + + goto check_pending; + } + + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat)); + +check_pending: + if (fatal_signal_pending(current)) + return true; + +out: + return false; +} + unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { unsigned long nr_reclaimed; struct scan_control sc = { - .gfp_mask = gfp_mask, + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .may_writepage = !laptop_mode, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .swappiness = vm_swappiness, .order = order, - .mem_cgroup = NULL, + .priority = DEF_PRIORITY, + .target_mem_cgroup = NULL, .nodemask = nodemask, }; + /* + * Do not enter reclaim if fatal signal was delivered while throttled. + * 1 is returned so that the page allocator does not OOM kill at this + * point. + */ + if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + return 1; + trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, gfp_mask); @@ -2142,26 +2714,30 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, return nr_reclaimed; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, - unsigned int swappiness, - struct zone *zone) + struct zone *zone, + unsigned long *nr_scanned) { struct scan_control sc = { + .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swappiness = swappiness, .order = 0, - .mem_cgroup = mem, + .priority = 0, + .swappiness = mem_cgroup_swappiness(memcg), + .target_mem_cgroup = memcg, }; + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.may_writepage, sc.gfp_mask); @@ -2172,34 +2748,42 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
*/ - shrink_zone(0, zone, &sc); + shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } -unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, - bool noswap, - unsigned int swappiness) + bool noswap) { struct zonelist *zonelist; unsigned long nr_reclaimed; + int nid; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .swappiness = swappiness, .order = 0, - .mem_cgroup = mem_cont, + .priority = DEF_PRIORITY, + .target_mem_cgroup = memcg, .nodemask = NULL, /* we don't care the placement */ + .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), }; - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - zonelist = NODE_DATA(numa_node_id())->node_zonelists; + /* + * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't + * take care of from where we get pages. So the node where we start the + * scan does not need to be the current node. + */ + nid = mem_cgroup_select_victim_node(memcg); + + zonelist = NODE_DATA(nid)->node_zonelists; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, @@ -2213,80 +2797,208 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +static void age_active_anon(struct zone *zone, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + + if (!total_swap_pages) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); +} + +static bool zone_balanced(struct zone *zone, int order, + unsigned long balance_gap, int classzone_idx) +{ + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + + balance_gap, classzone_idx, 0)) + return false; + + if (IS_ENABLED(CONFIG_COMPACTION) && order && + !compaction_suitable(zone, order)) + return false; + + return true; +} + /* - * pgdat_balanced is used when checking if a node is balanced for high-order - * allocations. Only zones that meet watermarks and are in a zone allowed - * by the callers classzone_idx are added to balanced_pages. The total of - * balanced pages must be at least 25% of the zones allowed by classzone_idx - * for the node to be considered balanced. Forcing all zones to be balanced - * for high orders can cause excessive reclaim when there are imbalanced zones. + * pgdat_balanced() is used when checking if a node is balanced. + * + * For order-0, all zones must be balanced! + * + * For high-order allocations only zones that meet watermarks and are in a + * zone allowed by the callers classzone_idx are added to balanced_pages. The + * total of balanced pages must be at least 25% of the zones allowed by + * classzone_idx for the node to be considered balanced. Forcing all zones to + * be balanced for high orders can cause excessive reclaim when there are + * imbalanced zones. * The choice of 25% is due to * o a 16M DMA zone that is balanced will not balance a zone on any * reasonable sized machine * o On all other machines, the top zone must be at least a reasonable - * precentage of the middle zones. For example, on 32-bit x86, highmem + * percentage of the middle zones. 
For example, on 32-bit x86, highmem * would need to be at least 256M for it to be balance a whole node. * Similarly, on x86-64 the Normal zone would need to be at least 1G * to balance a node on its own. These seemed like reasonable ratios. */ -static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, - int classzone_idx) -{ - unsigned long present_pages = 0; - int i; - - for (i = 0; i <= classzone_idx; i++) - present_pages += pgdat->node_zones[i].present_pages; - - return balanced_pages > (present_pages >> 2); -} - -/* is kswapd sleeping prematurely? */ -static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, - int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { + unsigned long managed_pages = 0; + unsigned long balanced_pages = 0; int i; - unsigned long balanced = 0; - bool all_zones_ok = true; - - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ - if (remaining) - return true; /* Check the watermark levels */ - for (i = 0; i < pgdat->nr_zones; i++) { + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) continue; + managed_pages += zone->managed_pages; + /* + * A special case here: + * * balance_pgdat() skips over all_unreclaimable after * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well if kswapd - * is to sleep + * they must be considered balanced here as well! */ - if (zone->all_unreclaimable) { - balanced += zone->present_pages; + if (!zone_reclaimable(zone)) { + balanced_pages += zone->managed_pages; continue; } - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - classzone_idx, 0)) - all_zones_ok = false; - else - balanced += zone->present_pages; + if (zone_balanced(zone, order, 0, i)) + balanced_pages += zone->managed_pages; + else if (!order) + return false; } - /* - * For high-order requests, the balanced zones must contain at least - * 25% of the nodes pages for kswapd to sleep. For order-0, all zones - * must be balanced - */ if (order) - return pgdat_balanced(pgdat, balanced, classzone_idx); + return balanced_pages >= (managed_pages >> 2); else - return !all_zones_ok; + return true; +} + +/* + * Prepare kswapd for sleeping. This verifies that there are no processes + * waiting in throttle_direct_reclaim() and that watermarks have been met. + * + * Returns true if kswapd is ready to sleep + */ +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) +{ + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return false; + + /* + * There is a potential race between when kswapd checks its watermarks + * and a process gets throttled. There is also a potential race if + * processes get throttled, kswapd wakes, a large process exits therby + * balancing the zones that causes kswapd to miss a wakeup. If kswapd + * is going to sleep, no process should be sleeping on pfmemalloc_wait + * so wake them now if necessary. If necessary, processes will wake + * kswapd and get throttled again + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait)) { + wake_up(&pgdat->pfmemalloc_wait); + return false; + } + + return pgdat_balanced(pgdat, order, classzone_idx); +} + +/* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. 
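pgdat_balanced() now distinguishes the two cases directly: order-0 requires every zone up to classzone_idx to be balanced, while higher orders only require that balanced zones hold at least a quarter of the node's managed pages. A sketch that ignores the unreclaimable-zone special case:

static int toy_pgdat_balanced(const unsigned long *managed,
			      const int *zone_is_balanced,
			      int nr_zones, unsigned int order)
{
	unsigned long total = 0, balanced = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {
		total += managed[i];
		if (zone_is_balanced[i])
			balanced += managed[i];
		else if (!order)
			return 0;	/* order-0: one bad zone fails the node */
	}

	return order ? balanced >= (total >> 2) : 1;
}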
+ * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, + struct scan_control *sc, + unsigned long lru_pages, + unsigned long *nr_attempted) +{ + int testorder = sc->order; + unsigned long balance_gap; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + bool lowmem_pressure; + + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order) != + COMPACT_SKIPPED) + testorder = 0; + + /* + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. + */ + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + + shrink_zone(zone, sc); + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + + reclaim_state->reclaimed_slab = 0; + shrink_slab(&shrink, sc->nr_scanned, lru_pages); + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + + zone_clear_flag(zone, ZONE_WRITEBACK); + + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (zone_reclaimable(zone) && + zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } + + return sc->nr_scanned >= sc->nr_to_reclaim; } /* @@ -2313,42 +3025,28 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - int all_zones_ok; - unsigned long balanced; - int priority; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long total_scanned; - struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, .may_unmap = 1, .may_swap = 1, - /* - * kswapd doesn't want to be bailed out while reclaim. because - * we want to put equal scanning pressure on each zone. 
- */ - .nr_to_reclaim = ULONG_MAX, - .swappiness = vm_swappiness, + .may_writepage = !laptop_mode, .order = order, - .mem_cgroup = NULL, + .target_mem_cgroup = NULL, }; -loop_again: - total_scanned = 0; - sc.nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + do { unsigned long lru_pages = 0; - int has_under_min_watermark_zone = 0; - - /* The swap token gets in the way of swapout... */ - if (!priority) - disable_swap_token(); + unsigned long nr_attempted = 0; + bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); - all_zones_ok = 1; - balanced = 0; + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -2360,34 +3058,71 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone, &sc)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, - &sc, priority, 0); + age_active_anon(zone, &sc); - if (!zone_watermark_ok_safe(zone, order, - high_wmark_pages(zone), 0, 0)) { + /* + * If the number of buffer_heads in the machine + * exceeds the maximum allowed level and this node + * has a highmem zone, force kswapd to reclaim from + * it to relieve lowmem pressure. + */ + if (buffer_heads_over_limit && is_highmem_idx(i)) { end_zone = i; - *classzone_idx = i; break; } + + if (!zone_balanced(zone, order, 0, 0)) { + end_zone = i; + break; + } else { + /* + * If balanced, clear the dirty and congested + * flags + */ + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } } + if (i < 0) goto out; for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + lru_pages += zone_reclaimable_pages(zone); + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. + */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * @@ -2397,177 +3132,80 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { - int compaction; struct zone *zone = pgdat->node_zones + i; - int nr_slab; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; sc.nr_scanned = 0; + nr_soft_scanned = 0; /* * Call soft limit reclaim before calling shrink_zone. - * For now we ignore the return value */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; /* - * We put equal pressure on every zone, unless one - * zone has way too many pages free already. 
+ * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - if (!zone_watermark_ok_safe(zone, order, - 8*high_wmark_pages(zone), end_zone, 0)) - shrink_zone(priority, zone, &sc); - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, - lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_scanned += sc.nr_scanned; - - compaction = 0; - if (order && - zone_watermark_ok(zone, 0, - high_wmark_pages(zone), - end_zone, 0) && - !zone_watermark_ok(zone, order, - high_wmark_pages(zone), - end_zone, 0)) { - compact_zone_order(zone, - order, - sc.gfp_mask, false, - COMPACT_MODE_KSWAPD); - compaction = 1; - } - - if (zone->all_unreclaimable) - continue; - if (!compaction && nr_slab == 0 && - !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - /* - * If we've done a decent amount of scanning and - * the reclaim ratio is low, start doing writepage - * even in laptop mode - */ - if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) - sc.may_writepage = 1; - - if (!zone_watermark_ok_safe(zone, order, - high_wmark_pages(zone), end_zone, 0)) { - all_zones_ok = 0; - /* - * We are still under min water mark. This - * means that we have a GFP_ATOMIC allocation - * failure risk. Hurry up! - */ - if (!zone_watermark_ok_safe(zone, order, - min_wmark_pages(zone), end_zone, 0)) - has_under_min_watermark_zone = 1; - } else { - /* - * If a zone reaches its high watermark, - * consider it to be no longer congested. It's - * possible there are dirty pages backed by - * congested BDIs but as pressure is relieved, - * spectulatively avoid congestion waits - */ - zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; - } - - } - if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) - break; /* kswapd: all done */ - /* - * OK, kswapd is getting into trouble. Take a nap, then take - * another pass across the zones. - */ - if (total_scanned && (priority < DEF_PRIORITY - 2)) { - if (has_under_min_watermark_zone) - count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); - else - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (kswapd_shrink_zone(zone, end_zone, &sc, + lru_pages, &nr_attempted)) + raise_priority = false; } /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. + * If the low watermark is met there is no need for processes + * to be throttled on pfmemalloc_wait as they should not be + * able to safely make forward progress. Wake them */ - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) - break; - } -out: - - /* - * order-0: All zones must meet high watermark for a balanced node - * high-order: Balanced zones must make up at least 25% of the node - * for the node to be balanced - */ - if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { - cond_resched(); - - try_to_freeze(); + if (waitqueue_active(&pgdat->pfmemalloc_wait) && + pfmemalloc_watermark_ok(pgdat)) + wake_up(&pgdat->pfmemalloc_wait); /* - * Fragmentation may mean that the system cannot be - * rebalanced for high-order allocations in all zones. - * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, - * it means the zones have been fully scanned and are still - * not balanced. 
For high-order allocations, there is - * little point trying all over again as kswapd may - * infinite loop. - * - * Instead, recheck all watermarks at order-0 as they - * are the most important. If watermarks are ok, kswapd will go - * back to sleep. High-order users can still perform direct - * reclaim if they wish. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) + if (order && sc.nr_reclaimed >= 2UL << order) order = sc.order = 0; - goto loop_again; - } - - /* - * If kswapd was reclaiming at a higher order, it has the option of - * sleeping without all zones being balanced. Before it does, it must - * ensure that the watermarks for order-0 on *all* zones are met and - * that the congestion flags are cleared. The congestion flag must - * be cleared as kswapd is the only mechanism that clears the flag - * and it is potentially going to sleep here. - */ - if (order) { - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (zone->all_unreclaimable && priority != DEF_PRIORITY) - continue; + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) + break; - /* Confirm the zone is balanced for order-0 */ - if (!zone_watermark_ok(zone, 0, - high_wmark_pages(zone), 0, 0)) { - order = sc.order = 0; - goto loop_again; - } + /* + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted + */ + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) + compact_pgdat(pgdat, order); - /* If balanced, clear the congested flag */ - zone_clear_flag(zone, ZONE_CONGESTED); - } - } + /* + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages + */ + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1 && + !pgdat_balanced(pgdat, order, *classzone_idx)); +out: /* - * Return the order we were reclaiming at so sleeping_prematurely() + * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, * if another caller entered the allocator slow path while kswapd * was awake, order will remain at the higher level @@ -2587,7 +3225,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -2597,7 +3235,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. 
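The rewritten balance_pgdat() is a single priority loop: priority only drops when a pass fails to scan enough or reclaims nothing, and a high-order wakeup falls back to order-0 once twice the request has been freed. A skeleton of that control flow with stand-in helpers (all toy_* functions are placeholders, not kernel interfaces):

static int toy_pass_scanned_enough(void)	{ return 0; }	/* placeholder */
static int toy_pass_reclaimed_pages(void)	{ return 1; }	/* placeholder */
static int toy_node_balanced(void)		{ return 1; }	/* placeholder */

static void toy_balance_pgdat(int def_priority, unsigned int *order,
			      unsigned long nr_reclaimed)
{
	int priority = def_priority;

	do {
		int raise_priority = 1;

		if (toy_pass_scanned_enough())
			raise_priority = 0;	/* enough progress, keep priority */

		/* give up on the high order once 2 << order pages are freed */
		if (*order && nr_reclaimed >= (2UL << *order))
			*order = 0;

		if (raise_priority || !toy_pass_reclaimed_pages())
			priority--;		/* only back off when stuck */
	} while (priority >= 1 && !toy_node_balanced());
}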
*/ - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -2609,7 +3247,18 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - schedule(); + + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + + if (!kthread_should_stop()) + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) @@ -2635,8 +3284,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ static int kswapd(void *p) { - unsigned long order; - int classzone_idx; + unsigned long order, new_order; + unsigned balanced_order; + int classzone_idx, new_classzone_idx; + int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2666,17 +3317,26 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = 0; - classzone_idx = MAX_NR_ZONES - 1; + order = new_order = 0; + balanced_order = 0; + classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; + balanced_classzone_idx = classzone_idx; for ( ; ; ) { - unsigned long new_order; - int new_classzone_idx; - int ret; - - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + bool ret; + + /* + * If the last balance_pgdat was unsuccessful it's unlikely a + * new request of a similar or harder type will succeed soon + * so consider going to sleep on the basis we reclaimed at + */ + if (balanced_classzone_idx >= new_classzone_idx && + balanced_order == new_order) { + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; + } + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' @@ -2685,11 +3345,14 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx); + kswapd_try_to_sleep(pgdat, balanced_order, + balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; + new_order = order; + new_classzone_idx = classzone_idx; pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + pgdat->classzone_idx = pgdat->nr_zones - 1; } ret = try_to_freeze(); @@ -2702,9 +3365,16 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - order = balance_pgdat(pgdat, order, &classzone_idx); + balanced_classzone_idx = classzone_idx; + balanced_order = balance_pgdat(pgdat, order, + &balanced_classzone_idx); } } + + tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); + current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + return 0; } @@ -2727,48 +3397,13 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + if 
(zone_balanced(zone, order, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } -/* - * The reclaimable count would be mostly accurate. - * The less reclaimable pages may be - * - mlocked pages, which will be moved to unevictable list when encountered - * - mapped pages, which may require several travels to be reclaimed - * - dirty pages, which is not "instantly" reclaimable - */ -unsigned long global_reclaimable_pages(void) -{ - int nr; - - nr = global_page_state(NR_ACTIVE_FILE) + - global_page_state(NR_INACTIVE_FILE); - - if (nr_swap_pages > 0) - nr += global_page_state(NR_ACTIVE_ANON) + - global_page_state(NR_INACTIVE_ANON); - - return nr; -} - -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; - - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - - if (nr_swap_pages > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - - return nr; -} - #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -2788,10 +3423,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .may_writepage = 1, .nr_to_reclaim = nr_to_reclaim, .hibernation_mode = 1, - .swappiness = vm_swappiness, .order = 0, + .priority = DEF_PRIORITY, }; - struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; @@ -2814,13 +3449,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { int nid; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); const struct cpumask *mask; @@ -2850,21 +3485,25 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); - ret = -1; + pr_err("Failed to start kswapd on node %d\n", nid); + ret = PTR_ERR(pgdat->kswapd); + pgdat->kswapd = NULL; } return ret; } /* - * Called by memory hotplug when all memory in a node is offlined. + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). 
*/ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; - if (kswapd) + if (kswapd) { kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; + } } static int __init kswapd_init(void) @@ -2872,7 +3511,7 @@ static int __init kswapd_init(void) int nid; swap_setup(); - for_each_node_state(nid, N_HIGH_MEMORY) + for_each_node_state(nid, N_MEMORY) kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0; @@ -2964,16 +3603,17 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int priority; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .nr_to_reclaim = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), - .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, + .priority = ZONE_RECLAIM_PRIORITY, + }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, }; unsigned long nr_slab_pages0, nr_slab_pages1; @@ -2993,11 +3633,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. */ - priority = ZONE_RECLAIM_PRIORITY; do { - shrink_zone(priority, zone, &sc); - priority--; - } while (priority >= 0 && sc.nr_reclaimed < nr_pages); + shrink_zone(zone, &sc); + } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); @@ -3008,15 +3646,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * number of slab pages and shake the slab until it is reduced * by the same nr_pages that we used for reclaiming unmapped * pages. - * - * Note that shrink_slab will free memory on all zones and may - * take a long time. */ + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); for (;;) { unsigned long lru_pages = zone_reclaimable_pages(zone); /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) + if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) break; /* Freed enough memory */ @@ -3060,7 +3697,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) return ZONE_RECLAIM_FULL; /* @@ -3095,181 +3732,81 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * page_evictable - test whether a page is evictable * @page: the page to test - * @vma: the VMA in which the page is or will be mapped, may be NULL * * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. The vma argument is !NULL when called from the - * fault path to determine how to instantate a new page. + * lists vs unevictable list. 
* * Reasons page might not be evictable: * (1) page's mapping marked unevictable * (2) page is part of an mlocked VMA * */ -int page_evictable(struct page *page, struct vm_area_struct *vma) +int page_evictable(struct page *page) { - - if (mapping_unevictable(page_mapping(page))) - return 0; - - if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) - return 0; - - return 1; + return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); } +#ifdef CONFIG_SHMEM /** - * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list - * @page: page to check evictability and move to appropriate lru list - * @zone: zone page is in - * - * Checks a page for evictability and moves the page to the appropriate - * zone lru list. + * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list + * @pages: array of pages to check + * @nr_pages: number of pages to check * - * Restrictions: zone->lru_lock must be held, page must be on LRU and must - * have PageUnevictable set. - */ -static void check_move_unevictable_page(struct page *page, struct zone *zone) -{ - VM_BUG_ON(PageActive(page)); - -retry: - ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { - enum lru_list l = page_lru_base_type(page); - - __dec_zone_state(zone, NR_UNEVICTABLE); - list_move(&page->lru, &zone->lru[l].list); - mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); - __inc_zone_state(zone, NR_INACTIVE_ANON + l); - __count_vm_event(UNEVICTABLE_PGRESCUED); - } else { - /* - * rotate unevictable list - */ - SetPageUnevictable(page); - list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); - mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); - if (page_evictable(page, NULL)) - goto retry; - } -} - -/** - * scan_mapping_unevictable_pages - scan an address space for evictable pages - * @mapping: struct address_space to scan for evictable pages + * Checks pages for evictability and moves them to the appropriate lru list. * - * Scan all pages in mapping. Check unevictable pages for - * evictability and move them to the appropriate zone lru list. + * This function is only used for SysV IPC SHM_UNLOCK. 
*/ -void scan_mapping_unevictable_pages(struct address_space *mapping) +void check_move_unevictable_pages(struct page **pages, int nr_pages) { - pgoff_t next = 0; - pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - struct zone *zone; - struct pagevec pvec; - - if (mapping->nrpages == 0) - return; - - pagevec_init(&pvec, 0); - while (next < end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - int i; - int pg_scanned = 0; + struct lruvec *lruvec; + struct zone *zone = NULL; + int pgscanned = 0; + int pgrescued = 0; + int i; - zone = NULL; + for (i = 0; i < nr_pages; i++) { + struct page *page = pages[i]; + struct zone *pagezone; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - struct zone *pagezone = page_zone(page); + pgscanned++; + pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + lruvec = mem_cgroup_page_lruvec(page, zone); - pg_scanned++; - if (page_index > next) - next = page_index; - next++; + if (!PageLRU(page) || !PageUnevictable(page)) + continue; - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } + if (page_evictable(page)) { + enum lru_list lru = page_lru_base_type(page); - if (PageLRU(page) && PageUnevictable(page)) - check_move_unevictable_page(page, zone); + VM_BUG_ON_PAGE(PageActive(page), page); + ClearPageUnevictable(page); + del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); + add_page_to_lru_list(page, lruvec, lru); + pgrescued++; } - if (zone) - spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); - - count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); } -} - -/** - * scan_zone_unevictable_pages - check unevictable list for evictable pages - * @zone - zone of which to scan the unevictable list - * - * Scan @zone's unevictable LRU lists to check for pages that have become - * evictable. Move those that have to @zone's inactive list where they - * become candidates for reclaim, unless shrink_inactive_zone() decides - * to reactivate them. Pages that are still unevictable are rotated - * back onto @zone's unevictable list. - */ -#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -static void scan_zone_unevictable_pages(struct zone *zone) -{ - struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; - unsigned long scan; - unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); - - while (nr_to_scan > 0) { - unsigned long batch_size = min(nr_to_scan, - SCAN_UNEVICTABLE_BATCH_SIZE); - - spin_lock_irq(&zone->lru_lock); - for (scan = 0; scan < batch_size; scan++) { - struct page *page = lru_to_page(l_unevictable); - - if (!trylock_page(page)) - continue; - - prefetchw_prev_lru_page(page, l_unevictable, flags); - - if (likely(PageLRU(page) && PageUnevictable(page))) - check_move_unevictable_page(page, zone); - - unlock_page(page); - } + if (zone) { + __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); + __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); spin_unlock_irq(&zone->lru_lock); - - nr_to_scan -= batch_size; } } +#endif /* CONFIG_SHMEM */ - -/** - * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages - * - * A really big hammer: scan all zones' unevictable LRU lists to check for - * pages that have become evictable. 
Move those back to the zones' - * inactive list where they become candidates for reclaim. - * This occurs when, e.g., we have unswappable pages on the unevictable lists, - * and we add swap to the system. As such, it runs in the context of a task - * that has possibly/probably made some previously unevictable pages - * evictable. - */ -static void scan_all_zones_unevictable_pages(void) +static void warn_scan_unevictable_pages(void) { - struct zone *zone; - - for_each_zone(zone) { - scan_zone_unevictable_pages(zone); - } + printk_once(KERN_WARNING + "%s: The scan_unevictable_pages sysctl/node-interface has been " + "disabled for lack of a legitimate use case. If you have " + "one, please send an email to linux-mm@kvack.org.\n", + current->comm); } /* @@ -3282,11 +3819,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + warn_scan_unevictable_pages(); proc_doulongvec_minmax(table, write, buffer, length, ppos); - - if (write && *(unsigned long *)table->data) - scan_all_zones_unevictable_pages(); - scan_unevictable_pages = 0; return 0; } @@ -3297,45 +3831,34 @@ int scan_unevictable_handler(struct ctl_table *table, int write, * a specified node's per zone unevictable lists for evictable pages. */ -static ssize_t read_scan_unevictable_node(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t read_scan_unevictable_node(struct device *dev, + struct device_attribute *attr, char *buf) { + warn_scan_unevictable_pages(); return sprintf(buf, "0\n"); /* always zero; should fit... */ } -static ssize_t write_scan_unevictable_node(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t write_scan_unevictable_node(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { - struct zone *node_zones = NODE_DATA(dev->id)->node_zones; - struct zone *zone; - unsigned long res; - unsigned long req = strict_strtoul(buf, 10, &res); - - if (!req) - return 1; /* zero is no-op */ - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - scan_zone_unevictable_pages(zone); - } + warn_scan_unevictable_pages(); return 1; } -static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, +static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, read_scan_unevictable_node, write_scan_unevictable_node); int scan_unevictable_register_node(struct node *node) { - return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); + return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); } void scan_unevictable_unregister_node(struct node *node) { - sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); + device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); } #endif |
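The batch interface introduced above replaces the old per-page and per-mapping scanners, and its kerneldoc notes it is only meant for the SysV IPC SHM_UNLOCK path. As a rough illustration of a caller, an address_space could be drained through it as sketched below; the helper name and the pagevec_lookup()-based loop are assumptions for illustration, not code from this patch.

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/swap.h>

/* Hypothetical SHM_UNLOCK-style caller of check_move_unevictable_pages(). */
static void unlock_mapping_sketch(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t index = 0;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
		/* continue the next lookup after the last page found */
		index = pvec.pages[pagevec_count(&pvec) - 1]->index + 1;
		/* rescue any pages that have become evictable, in one batch */
		check_move_unevictable_pages(pvec.pages, pagevec_count(&pvec));
		pagevec_release(&pvec);
		cond_resched();
	}
}

Working a pagevec at a time bounds how long zone->lru_lock is held inside check_move_unevictable_pages() and lets the caller reschedule between batches.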
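Separately, the kswapd() main-loop change earlier in this diff only lets kswapd pick up a fresh wakeup request from the pgdat once the previous balance_pgdat() attempt satisfied what was last asked of it; otherwise it keeps working on, or sleeps against, the old target. A compact restatement of that gate, with a hypothetical helper name used purely for illustration (the patch open-codes the test inside kswapd()):

#include <linux/types.h>

/*
 * Re-read pgdat->kswapd_max_order / pgdat->classzone_idx only when the
 * last balancing run reached the previously requested order and a
 * classzone index at least as high as the one previously requested.
 */
static bool kswapd_take_new_request(unsigned int balanced_order,
				    int balanced_classzone_idx,
				    unsigned int prev_order,
				    int prev_classzone_idx)
{
	return balanced_classzone_idx >= prev_classzone_idx &&
	       balanced_order == prev_order;
}

If the last attempt fell short, an immediate retry with a similar or harder request is unlikely to fare better, so kswapd prefers to sleep on what it did manage to balance rather than chase the new target right away.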
