diff options
Diffstat (limited to 'mm/vmscan.c')
| -rw-r--r-- | mm/vmscan.c | 3788 |
1 files changed, 2780 insertions, 1008 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 4046434046e..0f16ffe8eb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,14 +11,17 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/module.h> -#include <linux/slab.h> +#include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> @@ -26,58 +29,77 @@ #include <linux/buffer_head.h> /* for try_to_release_page(), buffer_heads_over_limit */ #include <linux/mm_inline.h> -#include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> +#include <linux/delayacct.h> +#include <linux/sysctl.h> +#include <linux/oom.h> +#include <linux/prefetch.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> +#include <linux/balloon_compaction.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/vmscan.h> + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + unsigned long hibernation_mode; + /* This context's GFP mask */ gfp_t gfp_mask; int may_writepage; + /* Can mapped pages be reclaimed? */ + int may_unmap; + /* Can pages be swapped as part of reclaim? */ int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. */ - int swap_cluster_max; - - int swappiness; + int order; - int all_unreclaimable; + /* Scan (total_size >> priority) pages at once */ + int priority; - int order; + /* anon vs. file LRUs scanning "ratio" */ + int swappiness; - /* Which cgroup do we reclaim from */ - struct mem_cgroup *mem_cgroup; + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; - /* Pluggable isolate pages callback */ - unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, - unsigned long *scanned, int order, int mode, - struct zone *z, struct mem_cgroup *mem_cont, - int active); + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -114,26 +136,76 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -long vm_total_pages; /* The total number of pages which the VM controls */ +unsigned long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scan_global_lru(sc) (!(sc)->mem_cgroup) +#ifdef CONFIG_MEMCG +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup; +} #else -#define scan_global_lru(sc) (1) +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} #endif +static unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + +static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_lru_size(lruvec, lru); + + return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); +} + /* - * Add a shrinker callback to be called from the vm + * Add a shrinker callback to be called from the vm. */ -void register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + size_t size = sizeof(*shrinker->nr_deferred); + + /* + * If we only have one possible node in the system anyway, save + * ourselves the trouble and disable NUMA aware behavior. This way we + * will save memory and some small loop time later. + */ + if (nr_node_ids == 1) + shrinker->flags &= ~SHRINKER_NUMA_AWARE; + + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); + return 0; } EXPORT_SYMBOL(register_shrinker); @@ -145,10 +217,123 @@ void unregister_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 + +static unsigned long +shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + unsigned long nr_pages_scanned, unsigned long lru_pages) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) + return 0; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + + total_scan = nr; + delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta *= freeable; + do_div(delta, lru_pages + 1); + total_scan += delta; + if (total_scan < 0) { + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); + total_scan = freeable; + } + + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * freeable. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (total_scan > freeable * 2) + total_scan = freeable * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_pages_scanned, lru_pages, + freeable, delta, total_scan); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; + + cond_resched(); + } + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + return freed; +} + /* * Call the shrink functions to age shrinkable caches * @@ -168,94 +353,60 @@ EXPORT_SYMBOL(unregister_shrinker); * * Returns the number of slab objects which we shrunk. */ -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) +unsigned long shrink_slab(struct shrink_control *shrinkctl, + unsigned long nr_pages_scanned, + unsigned long lru_pages) { struct shrinker *shrinker; - unsigned long ret = 0; - - if (scanned == 0) - scanned = SWAP_CLUSTER_MAX; + unsigned long freed = 0; - if (!down_read_trylock(&shrinker_rwsem)) - return 1; /* Assume we'll be able to shrink next time */ - - list_for_each_entry(shrinker, &shrinker_list, list) { - unsigned long long delta; - unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); - - delta = (4 * scanned) / shrinker->seeks; - delta *= max_pass; - do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { - printk(KERN_ERR "%s: nr=%ld\n", - __FUNCTION__, shrinker->nr); - shrinker->nr = max_pass; - } + if (nr_pages_scanned == 0) + nr_pages_scanned = SWAP_CLUSTER_MAX; + if (!down_read_trylock(&shrinker_rwsem)) { /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; - - total_scan = shrinker->nr; - shrinker->nr = 0; + freed = 1; + goto out; + } - while (total_scan >= SHRINK_BATCH) { - long this_scan = SHRINK_BATCH; - int shrink_ret; - int nr_before; + list_for_each_entry(shrinker, &shrinker_list, list) { + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { + shrinkctl->nid = 0; + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); + continue; + } - nr_before = (*shrinker->shrink)(0, gfp_mask); - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) - ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, this_scan); - total_scan -= this_scan; + for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { + if (node_online(shrinkctl->nid)) + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); - cond_resched(); } - - shrinker->nr += total_scan; } up_read(&shrinker_rwsem); - return ret; -} - -/* Called without lock on whether page is mapped, so answer is unstable */ -static inline int page_mapping_inuse(struct page *page) -{ - struct address_space *mapping; - - /* Page is in somebody's page tables. */ - if (page_mapped(page)) - return 1; - - /* Be more reluctant to reclaim swapcache than pagecache */ - if (PageSwapCache(page)) - return 1; - - mapping = page_mapping(page); - if (!mapping) - return 0; - - /* File is mmap'd by somebody? */ - return mapping_mapped(mapping); +out: + cond_resched(); + return freed; } static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 2; + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. + */ + return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -287,12 +438,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -310,7 +455,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -319,7 +464,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in generic_file_write() against + * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * @@ -327,7 +472,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -336,10 +480,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __FUNCTION__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -347,7 +491,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -357,7 +501,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; @@ -370,18 +513,11 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, return PAGE_ACTIVATE; } - /* - * Wait on writeback if requested to. This happens when - * direct reclaiming a large contiguous area and the - * first attempt to free a range of pages fails. - */ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) - wait_on_page_writeback(page); - if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } + trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -390,17 +526,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } /* - * Attempt to detach a locked page from its ->mapping. If it is dirty or if - * someone else has a ref on the page, abort and return 0. If it was - * successfully detached, return 1. Assumes the caller has a single ref on - * this page. + * Same as remove_mapping, but if the page is removed from the mapping, it + * gets returned with a refcount of 0. */ -int remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); /* * The non racy check for a busy page. * @@ -426,65 +561,286 @@ int remove_mapping(struct address_space *mapping, struct page *page) * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (unlikely(page_count(page) != 2)) + if (!page_freeze_refs(page, 2)) goto cannot_free; - smp_rmb(); - if (unlikely(PageDirty(page))) + /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + if (unlikely(PageDirty(page))) { + page_unfreeze_refs(page, 2); goto cannot_free; + } if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); - swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + spin_unlock_irq(&mapping->tree_lock); + swapcache_free(swap, page); + } else { + void (*freepage)(struct page *); + void *shadow = NULL; + + freepage = mapping->a_ops->freepage; + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optizimation, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. + */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + + if (freepage != NULL) + freepage(page); } - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } /* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ +int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (__remove_mapping(mapping, page, false)) { + /* + * Unfreezing the refcount with 1 rather than 2 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. + */ + page_unfreeze_refs(page, 1); + return 1; + } + return 0; +} + +/** + * putback_lru_page - put previously isolated page onto appropriate LRU list + * @page: page to be put back to appropriate lru list + * + * Add previously isolated @page to appropriate LRU list. + * Page may still be unevictable for other reasons. + * + * lru_lock must not be held, interrupts must be enabled. + */ +void putback_lru_page(struct page *page) +{ + bool is_unevictable; + int was_unevictable = PageUnevictable(page); + + VM_BUG_ON_PAGE(PageLRU(page), page); + +redo: + ClearPageUnevictable(page); + + if (page_evictable(page)) { + /* + * For evictable pages, we can use the cache. + * In event of a race, worst case is we end up with an + * unevictable page on [in]active list. + * We know how to handle that. + */ + is_unevictable = false; + lru_cache_add(page); + } else { + /* + * Put unevictable pages directly on zone's unevictable + * list. + */ + is_unevictable = true; + add_page_to_unevictable_list(page); + /* + * When racing with an mlock or AS_UNEVICTABLE clearing + * (page is unlocked) make sure that if the other thread + * does not observe our setting of PG_lru and fails + * isolation/check_move_unevictable_pages, + * we see PG_mlocked/AS_UNEVICTABLE cleared below and move + * the page back to the evictable list. + * + * The other side is TestClearPageMlocked() or shmem_lock(). + */ + smp_mb(); + } + + /* + * page's status can change while we move it among lru. If an evictable + * page is on unevictable list, it never be freed. To avoid that, + * check after we added it to the list, again. + */ + if (is_unevictable && page_evictable(page)) { + if (!isolate_lru_page(page)) { + put_page(page); + goto redo; + } + /* This means someone else dropped this page from LRU + * So, it will be freed or putback to LRU again. There is + * nothing to do here. + */ + } + + if (was_unevictable && !is_unevictable) + count_vm_event(UNEVICTABLE_PGRESCUED); + else if (!was_unevictable && is_unevictable) + count_vm_event(UNEVICTABLE_PGCULLED); + + put_page(page); /* drop ref from isolate */ +} + +enum page_references { + PAGEREF_RECLAIM, + PAGEREF_RECLAIM_CLEAN, + PAGEREF_KEEP, + PAGEREF_ACTIVATE, +}; + +static enum page_references page_check_references(struct page *page, + struct scan_control *sc) +{ + int referenced_ptes, referenced_page; + unsigned long vm_flags; + + referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, + &vm_flags); + referenced_page = TestClearPageReferenced(page); + + /* + * Mlock lost the isolation race with us. Let try_to_unmap() + * move the page to the unevictable list. + */ + if (vm_flags & VM_LOCKED) + return PAGEREF_RECLAIM; + + if (referenced_ptes) { + if (PageSwapBacked(page)) + return PAGEREF_ACTIVATE; + /* + * All mapped pages start out with page table + * references from the instantiating fault, so we need + * to look twice if a mapped file page is used more + * than once. + * + * Mark it and spare it for another trip around the + * inactive list. Another page table reference will + * lead to its activation. + * + * Note: the mark is set for activated pages as well + * so that recently deactivated but used pages are + * quickly recovered. + */ + SetPageReferenced(page); + + if (referenced_page || referenced_ptes > 1) + return PAGEREF_ACTIVATE; + + /* + * Activate file-backed executable pages after first usage. + */ + if (vm_flags & VM_EXEC) + return PAGEREF_ACTIVATE; + + return PAGEREF_KEEP; + } + + /* Reclaim if clean, defer dirty pages to writeback */ + if (referenced_page && !PageSwapBacked(page)) + return PAGEREF_RECLAIM_CLEAN; + + return PAGEREF_RECLAIM; +} + +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct address_space *mapping; + + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context. Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); +} + +/* * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct zone *zone, + struct scan_control *sc, + enum ttu_flags ttu_flags, + unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, + unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, + bool force_reclaim) { LIST_HEAD(ret_pages); - struct pagevec freed_pvec; + LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; + unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; cond_resched(); - pagevec_init(&freed_pvec, 1); + mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; int may_enter_fs; - int referenced; + enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; cond_resched(); page = lru_to_page(page_list); list_del(&page->lru); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; - VM_BUG_ON(PageActive(page)); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; - if (!sc->may_swap && page_mapped(page)) + if (unlikely(!page_evictable(page))) + goto cull_mlocked; + + if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ @@ -494,56 +850,167 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + (writeback && PageReclaim(page))) + nr_congested++; + + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * Synchronous reclaim is performed in two passes, - * first an asynchronous pass over the list to - * start parallel writeback, and a second synchronous - * pass to wait for the IO to complete. Wait here - * for any page for which writeback has already - * started. - */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) - wait_on_page_writeback(page); - else + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + nr_immediate++; goto keep_locked; + + /* Case 2 above */ + } else if (global_reclaim(sc) || + !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + /* + * This is slightly racy - end_page_writeback() + * might have just cleared PageReclaim, then + * setting PageReclaim here end up interpreted + * as PageReadahead - but that does not matter + * enough to care. What we do want is for this + * page to have PageReclaim set next time memcg + * reclaim reaches the tests above, so it will + * then wait_on_page_writeback() to avoid OOM; + * and it's also appropriate in global reclaim. + */ + SetPageReclaim(page); + nr_writeback++; + + goto keep_locked; + + /* Case 3 above */ + } else { + wait_on_page_writeback(page); + } } - referenced = page_referenced(page, 1, sc->mem_cgroup); - /* In active use or really unfreeable? Activate it. */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + if (!force_reclaim) + references = page_check_references(page, sc); + + switch (references) { + case PAGEREF_ACTIVATE: goto activate_locked; + case PAGEREF_KEEP: + goto keep_locked; + case PAGEREF_RECLAIM: + case PAGEREF_RECLAIM_CLEAN: + ; /* try to reclaim the page below */ + } -#ifdef CONFIG_SWAP /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) - if (!add_to_swap(page, GFP_ATOMIC)) + if (PageAnon(page) && !PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (!add_to_swap(page, page_list)) goto activate_locked; -#endif /* CONFIG_SWAP */ + may_enter_fs = 1; - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } } if (PageDirty(page)) { - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) + /* + * Only kswapd can writeback filesystem pages to + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. + */ + if (page_is_file_cache(page) && + (!current_is_kswapd() || + !zone_is_reclaim_dirty(zone))) { + /* + * Immediately reclaim when written back. + * Similar in principal to deactivate_page() + * except we already have the page isolated + * and know it's dirty + */ + inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + SetPageReclaim(page); + + goto keep_locked; + } + + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) goto keep_locked; @@ -551,19 +1018,22 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; if (PageDirty(page) || PageWriteback(page)) goto keep_locked; @@ -583,7 +1053,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. + * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * @@ -594,43 +1064,109 @@ static unsigned long shrink_page_list(struct list_head *page_list, * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) - goto free_it; + if (!mapping && page_count(page) == 1) { + unlock_page(page); + if (put_page_testzero(page)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this page shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed++; + continue; + } + } } - if (!mapping || !remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; + /* + * At this point, we have no other references and there is + * no way to pick any more up (removed from LRU, removed + * from pagecache). Can use non-atomic bitops now (and + * we obviously don't have to worry about waking up a process + * waiting on the page lock, because there are no references. + */ + __clear_page_locked(page); free_it: - unlock_page(page); nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) - __pagevec_release_nonlru(&freed_pvec); + + /* + * Is there need to periodically free_page_list? It would + * appear not as the counts should be low + */ + list_add(&page->lru, &free_pages); + continue; + +cull_mlocked: + if (PageSwapCache(page)) + try_to_free_swap(page); + unlock_page(page); + putback_lru_page(page); continue; activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (PageSwapCache(page) && vm_swap_full()) + try_to_free_swap(page); + VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + + free_hot_cold_page_list(&free_pages, true); + list_splice(&ret_pages, page_list); - if (pagevec_count(&freed_pvec)) - __pagevec_release_nonlru(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); + mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; + *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; return nr_reclaimed; } -/* LRU Isolation modes. */ -#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ -#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ -#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + list_splice(&clean_pages, page_list); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} /* * Attempt to remove the specified page from its LRU. Only take this page @@ -642,7 +1178,7 @@ keep: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, isolate_mode_t mode) { int ret = -EINVAL; @@ -650,15 +1186,49 @@ int __isolate_lru_page(struct page *page, int mode) if (!PageLRU(page)) return ret; + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) + return ret; + + ret = -EBUSY; + /* - * When checking the active state, we need to be sure we are - * dealing with comparible boolean values. Take the logical not - * of each. + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated. This + * is used by reclaim when it is cannot write to backing storage + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages + * that it is possible to migrate without blocking */ - if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; + + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } + + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; - ret = -EBUSY; if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're @@ -683,37 +1253,39 @@ int __isolate_lru_page(struct page *page, int mode) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @src: The LRU list to pull pages off. + * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. - * @scanned: The number of pages that were scanned. - * @order: The caller's attempted allocation order + * @nr_scanned: The number of pages that were scanned. + * @sc: The scan_control struct for this reclaim session * @mode: One of the LRU isolation modes + * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode) + struct lruvec *lruvec, struct list_head *dst, + unsigned long *nr_scanned, struct scan_control *sc, + isolate_mode_t mode, enum lru_list lru) { + struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; - unsigned long pfn; - unsigned long end_pfn; - unsigned long page_pfn; - int zone_id; + int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON(!PageLRU(page)); + VM_BUG_ON_PAGE(!PageLRU(page), page); switch (__isolate_lru_page(page, mode)) { case 0: + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_move(&page->lru, dst); - nr_taken++; + nr_taken += nr_pages; break; case -EBUSY: @@ -724,318 +1296,313 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, default: BUG(); } - - if (!order) - continue; - - /* - * Attempt to take all pages in the order aligned region - * surrounding the tag page. Only take those pages of - * the same active state as that tag page. We may safely - * round the target page pfn down to the requested order - * as the mem_map is guarenteed valid out to MAX_ORDER, - * where that page is in a different zone we will detect - * it from its zone id and abort this block scan. - */ - zone_id = page_zone_id(page); - page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << order) - 1); - end_pfn = pfn + (1 << order); - for (; pfn < end_pfn; pfn++) { - struct page *cursor_page; - - /* The target page is in the block, ignore it. */ - if (unlikely(pfn == page_pfn)) - continue; - - /* Avoid holes within the zone. */ - if (unlikely(!pfn_valid_within(pfn))) - break; - - cursor_page = pfn_to_page(pfn); - /* Check that we have not crossed a zone boundary. */ - if (unlikely(page_zone_id(cursor_page) != zone_id)) - continue; - switch (__isolate_lru_page(cursor_page, mode)) { - case 0: - list_move(&cursor_page->lru, dst); - nr_taken++; - scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; - } - } } - *scanned = scan; + *nr_scanned = scan; + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + nr_taken, mode, is_file_lru(lru)); return nr_taken; } -static unsigned long isolate_pages_global(unsigned long nr, - struct list_head *dst, - unsigned long *scanned, int order, - int mode, struct zone *z, - struct mem_cgroup *mem_cont, - int active) -{ - if (active) - return isolate_lru_pages(nr, &z->active_list, dst, - scanned, order, mode); - else - return isolate_lru_pages(nr, &z->inactive_list, dst, - scanned, order, mode); -} - -/* - * clear_active_flags() is a helper for shrink_active_list(), clearing - * any active bits from the pages in the list. +/** + * isolate_lru_page - tries to isolate a page from its LRU list + * @page: page to isolate from its LRU list + * + * Isolates a @page from an LRU list, clears PageLRU and adjusts the + * vmstat statistic corresponding to whatever LRU list the page was on. + * + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + * + * The returned page will have PageLRU() cleared. If it was found on + * the active list, it will have PageActive set. If it was found on + * the unevictable list, it will have the PageUnevictable bit set. That flag + * may need to be cleared by the caller before letting the page go. + * + * The vmstat statistic corresponding to the list on which the page was + * found will be decremented. + * + * Restrictions: + * (1) Must be called with an elevated refcount on the page. This is a + * fundamentnal difference from isolate_lru_pages (which is called + * without a stable reference). + * (2) the lru_lock must not be held. + * (3) interrupts must be enabled. */ -static unsigned long clear_active_flags(struct list_head *page_list) +int isolate_lru_page(struct page *page) { - int nr_active = 0; - struct page *page; + int ret = -EBUSY; - list_for_each_entry(page, page_list, lru) - if (PageActive(page)) { - ClearPageActive(page); - nr_active++; - } + VM_BUG_ON_PAGE(!page_count(page), page); + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + struct lruvec *lruvec; - return nr_active; + spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + if (PageLRU(page)) { + int lru = page_lru(page); + get_page(page); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, lru); + ret = 0; + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; } /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number - * of reclaimed pages + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get resheduled. When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. */ -static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc) +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) { - LIST_HEAD(page_list); - struct pagevec pvec; - unsigned long nr_scanned = 0; - unsigned long nr_reclaimed = 0; + unsigned long inactive, isolated; - pagevec_init(&pvec, 1); + if (current_is_kswapd()) + return 0; - lru_add_drain(); - spin_lock_irq(&zone->lru_lock); - do { - struct page *page; - unsigned long nr_taken; - unsigned long nr_scan; - unsigned long nr_freed; - unsigned long nr_active; - - nr_taken = sc->isolate_pages(sc->swap_cluster_max, - &page_list, &nr_scan, sc->order, - (sc->order > PAGE_ALLOC_COSTLY_ORDER)? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, 0); - nr_active = clear_active_flags(&page_list); - __count_vm_events(PGDEACTIVATE, nr_active); - - __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); - __mod_zone_page_state(zone, NR_INACTIVE, - -(nr_taken - nr_active)); - if (scan_global_lru(sc)) - zone->pages_scanned += nr_scan; - spin_unlock_irq(&zone->lru_lock); + if (!global_reclaim(sc)) + return 0; - nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } - /* - * If we are direct reclaiming for contiguous pages and we do - * not reclaim everything in the list, try again and wait - * for IO to complete. This will stall high-order allocations - * but that should be acceptable to the caller - */ - if (nr_freed < nr_taken && !current_is_kswapd() && - sc->order > PAGE_ALLOC_COSTLY_ORDER) { - congestion_wait(WRITE, HZ/10); + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again. - */ - nr_active = clear_active_flags(&page_list); - count_vm_events(PGDEACTIVATE, nr_active); + return isolated > inactive; +} + +static noinline_for_stack void +putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) +{ + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + struct zone *zone = lruvec_zone(lruvec); + LIST_HEAD(pages_to_free); - nr_freed += shrink_page_list(&page_list, sc, - PAGEOUT_IO_SYNC); + /* + * Put back any unfreeable pages. + */ + while (!list_empty(page_list)) { + struct page *page = lru_to_page(page_list); + int lru; + + VM_BUG_ON_PAGE(PageLRU(page), page); + list_del(&page->lru); + if (unlikely(!page_evictable(page))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; } - nr_reclaimed += nr_freed; - local_irq_disable(); - if (current_is_kswapd()) { - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); - __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scan_global_lru(sc)) - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + lruvec = mem_cgroup_page_lruvec(page, zone); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(page, lruvec, lru); - if (nr_taken == 0) - goto done; + if (is_active_lru(lru)) { + int file = is_file_lru(lru); + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; + } + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(page, lruvec, lru); - spin_lock(&zone->lru_lock); - /* - * Put back any unfreeable pages. - */ - while (!list_empty(&page_list)) { - page = lru_to_page(&page_list); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); - if (!pagevec_add(&pvec, page)) { + if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); + (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); - } + } else + list_add(&page->lru, &pages_to_free); } - } while (nr_scanned < max_scan); - spin_unlock(&zone->lru_lock); -done: - local_irq_enable(); - pagevec_release(&pvec); - return nr_reclaimed; + } + + /* + * To save our caller's stack, now use input list for pages to free. + */ + list_splice(&pages_to_free, page_list); } /* - * We are about to scan this zone at a certain priority level. If that priority - * level is smaller (ie: more urgent) than the previous priority, then note - * that priority level within the zone. This is done so that when the next - * process comes in to scan this zone, it will immediately start out at this - * priority level rather than having to build up its own scanning priority. - * Here, this priority affects only the reclaim-mapped threshold. + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. */ -static inline void note_zone_scanning_priority(struct zone *zone, int priority) +static int current_may_throttle(void) { - if (priority < zone->prev_priority) - zone->prev_priority = priority; -} - -static inline int zone_is_near_oom(struct zone *zone) -{ - return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE))*3; + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); } /* - * Determine we should try to reclaim mapped pages. - * This is called only when sc->mem_cgroup is NULL. + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages */ -static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, - int priority) +static noinline_for_stack unsigned long +shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) { - long mapped_ratio; - long distress; - long swap_tendency; - long imbalance; - int reclaim_mapped = 0; - int prev_priority; + LIST_HEAD(page_list); + unsigned long nr_scanned; + unsigned long nr_reclaimed = 0; + unsigned long nr_taken; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_unqueued_dirty = 0; + unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } - if (scan_global_lru(sc) && zone_is_near_oom(zone)) - return 1; - /* - * `distress' is a measure of how much trouble we're having - * reclaiming pages. 0 -> no problems. 100 -> great trouble. - */ - if (scan_global_lru(sc)) - prev_priority = zone->prev_priority; - else - prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); + lru_add_drain(); - distress = 100 >> min(prev_priority, priority); + if (!sc->may_unmap) + isolate_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + isolate_mode |= ISOLATE_CLEAN; - /* - * The point of this algorithm is to decide when to start - * reclaiming mapped memory instead of just pagecache. Work out - * how much memory - * is mapped. - */ - if (scan_global_lru(sc)) - mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; - else - mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); + spin_lock_irq(&zone->lru_lock); - /* - * Now decide how much we really want to unmap some pages. The - * mapped ratio is downgraded - just because there's a lot of - * mapped memory doesn't necessarily mean that page reclaim - * isn't succeeding. - * - * The distress ratio is important - we don't want to start - * going oom. - * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + &nr_scanned, sc, isolate_mode, lru); - /* - * If there's huge imbalance between active and inactive - * (think active 100 times larger than inactive) we should - * become more permissive, or the system will take too much - * cpu before it start swapping during memory pressure. - * Distress is about avoiding early-oom, this is about - * making swappiness graceful despite setting it to low - * values. - * - * Avoid div by zero with nr_inactive+1, and max resulting - * value is vm_total_pages. - */ - if (scan_global_lru(sc)) { - imbalance = zone_page_state(zone, NR_ACTIVE); - imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; - } else - imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + + if (global_reclaim(sc)) { + zone->pages_scanned += nr_scanned; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); + } + spin_unlock_irq(&zone->lru_lock); + + if (nr_taken == 0) + return 0; + + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); + + spin_lock_irq(&zone->lru_lock); + + reclaim_stat->recent_scanned[file] += nr_taken; + + if (global_reclaim(sc)) { + if (current_is_kswapd()) + __count_zone_vm_events(PGSTEAL_KSWAPD, zone, + nr_reclaimed); + else + __count_zone_vm_events(PGSTEAL_DIRECT, zone, + nr_reclaimed); + } + + putback_inactive_pages(lruvec, &page_list); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + + spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&page_list, true); /* - * Reduce the effect of imbalance if swappiness is low, - * this means for a swappiness very low, the imbalance - * must be much higher than 100 for this logic to make - * the difference. + * If reclaim is isolating dirty pages under writeback, it implies + * that the long-lived page allocation rate is exceeding the page + * laundering rate. Either the global limits are not being effective + * at throttling processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing device. The + * only option is to throttle from reclaim context which is not ideal + * as there is no guarantee the dirtying process is throttled in the + * same way balance_dirty_pages() manages. * - * Max temporary value is vm_total_pages*100. + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under pages flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. */ - imbalance *= (vm_swappiness + 1); - imbalance /= 100; + if (nr_writeback && nr_writeback == nr_taken) + zone_set_flag(zone, ZONE_WRITEBACK); /* - * If not much of the ram is mapped, makes the imbalance - * less relevant, it's high priority we refill the inactive - * list with mapped pages only in presence of high ratio of - * mapped pages. - * - * Max temporary value is vm_total_pages*100. + * memcg will stall in page writeback so only consider forcibly + * stalling for global reclaim */ - imbalance *= mapped_ratio; - imbalance /= 100; + if (global_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. + */ + if (nr_dirty && nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); - /* apply imbalance feedback to swap_tendency */ - swap_tendency += imbalance; + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing + * pages from reclaim context. + */ + if (nr_unqueued_dirty == nr_taken) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + + /* + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it implies + * that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_immediate && current_may_throttle()) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } /* - * Now use this metric to decide whether to start moving mapped - * memory onto the inactive list. + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly. */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - - return reclaim_mapped; + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + sc->priority, + trace_shrink_flags(file)); + return nr_reclaimed; } /* @@ -1056,178 +1623,707 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, * But we had to alter page->flags anyway. */ +static void move_active_pages_to_lru(struct lruvec *lruvec, + struct list_head *list, + struct list_head *pages_to_free, + enum lru_list lru) +{ + struct zone *zone = lruvec_zone(lruvec); + unsigned long pgmoved = 0; + struct page *page; + int nr_pages; + + while (!list_empty(list)) { + page = lru_to_page(list); + lruvec = mem_cgroup_page_lruvec(page, zone); + + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + list_move(&page->lru, &lruvec->lists[lru]); + pgmoved += nr_pages; + + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(page, lruvec, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, pages_to_free); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} -static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority) +static void shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc, + enum lru_list lru) { - unsigned long pgmoved; - int pgdeactivate = 0; - unsigned long pgscanned; + unsigned long nr_taken; + unsigned long nr_scanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ - LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ - LIST_HEAD(l_active); /* Pages to go onto the active_list */ + LIST_HEAD(l_active); + LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - int reclaim_mapped = 0; - - if (sc->may_swap) - reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + unsigned long nr_rotated = 0; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); lru_add_drain(); + + if (!sc->may_unmap) + isolate_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + isolate_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); - pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1); - /* - * zone->pages_scanned is used for detect zone's oom - * mem_cgroup remembers nr_scan by itself. - */ - if (scan_global_lru(sc)) - zone->pages_scanned += pgscanned; - __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, isolate_mode, lru); + if (global_reclaim(sc)) + zone->pages_scanned += nr_scanned; + + reclaim_stat->recent_scanned[file] += nr_taken; + + __count_zone_vm_events(PGREFILL, zone, nr_scanned); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->mem_cgroup)) { + + if (unlikely(!page_evictable(page))) { + putback_lru_page(page); + continue; + } + + if (unlikely(buffer_heads_over_limit)) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + + if (page_referenced(page, 0, sc->target_mem_cgroup, + &vm_flags)) { + nr_rotated += hpage_nr_pages(page); + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { list_add(&page->lru, &l_active); continue; } } + + ClearPageActive(page); /* we are de-activating */ list_add(&page->lru, &l_inactive); } - pagevec_init(&pvec, 1); - pgmoved = 0; + /* + * Move pages back to the lru list. + */ spin_lock_irq(&zone->lru_lock); - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); - - list_move(&page->lru, &zone->inactive_list); - mem_cgroup_move_lists(page, false); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); - spin_unlock_irq(&zone->lru_lock); - pgdeactivate += pgmoved; - pgmoved = 0; - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } + /* + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. + */ + reclaim_stat->recent_rotated[file] += nr_rotated; + + move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&zone->lru_lock); + + free_hot_cold_page_list(&l_hold, true); +} + +#ifdef CONFIG_SWAP +static int inactive_anon_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_ANON); + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + + if (inactive * zone->inactive_ratio < active) + return 1; + + return 0; +} + +/** + * inactive_anon_is_low - check if anonymous pages need to be deactivated + * @lruvec: LRU vector to check + * + * Returns true if the zone does not have enough inactive anon pages, + * meaning some active anon pages need to be deactivated. + */ +static int inactive_anon_is_low(struct lruvec *lruvec) +{ + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + + if (!mem_cgroup_disabled()) + return mem_cgroup_inactive_anon_is_low(lruvec); + + return inactive_anon_is_low_global(lruvec_zone(lruvec)); +} +#else +static inline int inactive_anon_is_low(struct lruvec *lruvec) +{ + return 0; +} +#endif + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @lruvec: LRU vector to check + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. + */ +static int inactive_file_is_low(struct lruvec *lruvec) +{ + unsigned long inactive; + unsigned long active; + + inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); + active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + + return active > inactive; +} + +static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +{ + if (is_file_lru(lru)) + return inactive_file_is_low(lruvec); + else + return inactive_anon_is_low(lruvec); +} + +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (inactive_list_is_low(lruvec, lru)) + shrink_active_list(nr_to_scan, lruvec, sc, lru); + return 0; } - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); - pgdeactivate += pgmoved; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - pagevec_strip(&pvec); - spin_lock_irq(&zone->lru_lock); + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined + * by looking at the fraction of the pages scanned we did rotate back + * onto the active list instead of evict. + * + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan + */ +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) +{ + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + u64 fraction[2]; + u64 denominator = 0; /* gcc */ + struct zone *zone = lruvec_zone(lruvec); + unsigned long anon_prio, file_prio; + enum scan_balance scan_balance; + unsigned long anon, file; + bool force_scan = false; + unsigned long ap, fp; + enum lru_list lru; + bool some_scanned; + int pass; + + /* + * If the zone or memcg is small, nr[l] can be 0. This + * results in no scanning on this priority and a potential + * priority drop. Global direct reclaim can go to the next + * zone and tends to have no problems. Global kswapd is for + * zone balancing and it needs to scan a minimum amount. When + * reclaiming for a memcg, a priority drop can cause high + * latencies, so it's better to scan a minimum amount there as + * well. + */ + if (current_is_kswapd() && !zone_reclaimable(zone)) + force_scan = true; + if (!global_reclaim(sc)) + force_scan = true; + + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + scan_balance = SCAN_FILE; + goto out; } - pgmoved = 0; - while (!list_empty(&l_active)) { - page = lru_to_page(&l_active); - prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - - list_move(&page->lru, &zone->active_list); - mem_cgroup_move_lists(page, true); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - pgmoved = 0; - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); + /* + * Global reclaim will swap to prevent OOM even with no + * swappiness, but memcg users want to use this knob to + * disable swapping for individual groups completely when + * using the memory controller's swap limit feature would be + * too expensive. + */ + if (!global_reclaim(sc) && !sc->swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && sc->swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (global_reclaim(sc)) { + unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + + if (unlikely(file + free <= high_wmark_pages(zone))) { + scan_balance = SCAN_ANON; + goto out; } } - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgdeactivate); + /* + * There is enough inactive page cache, do not reclaim + * anything from the anonymous working set right now. + */ + if (!inactive_file_is_low(lruvec)) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - anon_prio; + + /* + * OK, so we have swap space and a fair amount of page cache + * pages. We use the recently rotated / recently scanned + * ratios to determine how valuable each cache is. + * + * Because workloads change over time (and to avoid overflow) + * we keep these statistics as a floating average, which ends + * up weighing recent references more than old ones. + * + * anon in [0], file in [1] + */ + spin_lock_irq(&zone->lru_lock); + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { + reclaim_stat->recent_scanned[0] /= 2; + reclaim_stat->recent_rotated[0] /= 2; + } + + if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { + reclaim_stat->recent_scanned[1] /= 2; + reclaim_stat->recent_rotated[1] /= 2; + } + + /* + * The amount of pressure on anon vs file pages is inversely + * proportional to the fraction of recently scanned pages on + * each list that were recently referenced and in active use. + */ + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); + ap /= reclaim_stat->recent_rotated[0] + 1; + + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); + fp /= reclaim_stat->recent_rotated[1] + 1; spin_unlock_irq(&zone->lru_lock); - pagevec_release(&pvec); + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp + 1; +out: + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; + + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; + + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; + /* + * Skip the second pass and don't force_scan, + * if we found something to scan. + */ + some_scanned |= !!scan; + } + } } /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static unsigned long shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - unsigned long nr_active; - unsigned long nr_inactive; + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; + enum lru_list lru; unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; + bool scan_adjusted; + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; - if (scan_global_lru(sc)) { /* - * Add one to nr_to_scan just to make sure that the kernel - * will slowly sift through the active list. + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs are scanned + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. */ - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; - nr_active = zone->nr_scan_active; - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= sc->swap_cluster_max) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; - if (nr_active >= sc->swap_cluster_max) - zone->nr_scan_active = 0; - else - nr_active = 0; - } else { /* - * This reclaim occurs not because zone memory shortage but - * because memory controller hits its limit. - * Then, don't modify zone reclaim related data. + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. */ - nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, - zone, priority); + if (!nr_file || !nr_anon) + break; - nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, - zone, priority); - } + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; - while (nr_active || nr_inactive) { - if (nr_active) { - nr_to_scan = min(nr_active, - (unsigned long)sc->swap_cluster_max); - nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc, priority); - } + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); - if (nr_inactive) { - nr_to_scan = min(nr_inactive, - (unsigned long)sc->swap_cluster_max); - nr_inactive -= nr_to_scan; - nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, - sc); - } + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); throttle_vm_writeout(sc->gfp_mask); - return nr_reclaimed; +} + +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) +{ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + sc->priority < DEF_PRIORITY - 2)) + return true; + + return false; +} + +/* + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_zone() that it will have enough free pages to succeed. + * It will give up earlier than that if there is difficulty reclaiming pages. + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!in_reclaim_compaction(sc)) + return false; + + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + +static void shrink_zone(struct zone *zone, struct scan_control *sc) +{ + unsigned long nr_reclaimed, nr_scanned; + + do { + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = sc->priority, + }; + struct mem_cgroup *memcg; + + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct lruvec *lruvec; + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + + sc->swappiness = mem_cgroup_swappiness(memcg); + shrink_lruvec(lruvec, sc); + + /* + * Direct reclaim and kswapd have to scan all memory + * cgroups to fulfill the overall scan target for the + * zone. + * + * Limit reclaim, on the other hand, only cares about + * nr_to_reclaim pages to be reclaimed and it will + * retry with decreasing priority if one round over the + * whole hierarchy is not sufficient. + */ + if (!global_reclaim(sc) && + sc->nr_reclaimed >= sc->nr_to_reclaim) { + mem_cgroup_iter_break(root, memcg); + break; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); + + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)); +} + +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* Do not consider compaction for orders reclaim is meant to satisfy */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page + */ + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone, sc->order)) + return watermark_ok; + + /* If compaction is not ready to start, keep reclaiming */ + if (!compaction_suitable(zone, sc->order)) + return false; + + return watermark_ok; } /* @@ -1235,59 +2331,143 @@ static unsigned long shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. - * - * Returns the number of reclaimed pages. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should consider retrying the allocation instead of + * further reclaim. */ -static unsigned long shrink_zones(int priority, struct zone **zones, - struct scan_control *sc) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { - unsigned long nr_reclaimed = 0; - int i; + struct zoneref *z; + struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; + unsigned long lru_pages = 0; + bool aborted_reclaim = false; + struct reclaim_state *reclaim_state = current->reclaim_state; + gfp_t orig_mask; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + /* + * If the number of buffer_heads in the machine exceeds the maximum + * allowed level, force direct reclaim to scan the highmem zone as + * highmem pages could be pinning lowmem pages storing buffer_heads + */ + orig_mask = sc->gfp_mask; + if (buffer_heads_over_limit) + sc->gfp_mask |= __GFP_HIGHMEM; - sc->all_unreclaimable = 1; - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; + nodes_clear(shrink.nodes_to_scan); + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) continue; /* * Take care memory controller reclaiming has small influence * to global LRU. */ - if (scan_global_lru(sc)) { + if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - note_zone_scanning_priority(zone, priority); - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - sc->all_unreclaimable = 0; - } else { + if (IS_ENABLED(CONFIG_COMPACTION)) { + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if ((zonelist_zone_idx(z) <= requested_highidx) + && compaction_ready(zone, sc)) { + aborted_reclaim = true; + continue; + } + } /* - * Ignore cpuset limitation here. We just want to reduce - * # of used pages by us regardless of memory shortage. + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. */ - sc->all_unreclaimable = 0; - mem_cgroup_note_reclaim_priority(sc->mem_cgroup, - priority); + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; + /* need some check for avoid more shrink_zone() */ } - nr_reclaimed += shrink_zone(priority, zone, sc); + shrink_zone(zone, sc); } - return nr_reclaimed; + /* + * Don't shrink slabs when reclaiming memory from over limit cgroups + * but do shrink slab at least once when aborting reclaim for + * compaction to avoid unevenly scanning file/anon LRU pages over slab + * pages. + */ + if (global_reclaim(sc)) { + shrink_slab(&shrink, sc->nr_scanned, lru_pages); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + + return aborted_reclaim; +} + +/* All zones in zonelist are unreclaimable? */ +static bool all_unreclaimable(struct zonelist *zonelist, + struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!populated_zone(zone)) + continue; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + if (zone_reclaimable(zone)) + return false; + } + + return true; } - + /* * This is the main entry point to direct page reclaim. * @@ -1296,60 +2476,42 @@ static unsigned long shrink_zones(int priority, struct zone **zones, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. + * + * returns: 0, if no pages reclaimed + * else, the number of pages reclaimed */ -static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, +static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { - int priority; - int ret = 0; unsigned long total_scanned = 0; - unsigned long nr_reclaimed = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long lru_pages = 0; - int i; + unsigned long writeback_threshold; + bool aborted_reclaim; + + delayacct_freepages_start(); - if (scan_global_lru(sc)) + if (global_reclaim(sc)) count_vm_event(ALLOCSTALL); - /* - * mem_cgroup will not do shrink_slab. - */ - if (scan_global_lru(sc)) { - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; + do { + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); + sc->nr_scanned = 0; + aborted_reclaim = shrink_zones(zonelist, sc); - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); - } - } + total_scanned += sc->nr_scanned; + if (sc->nr_reclaimed >= sc->nr_to_reclaim) + goto out; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { - sc->nr_scanned = 0; - if (!priority) - disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zones, sc); /* - * Don't shrink slabs when reclaiming memory from - * over limit cgroups + * If we're getting trouble reclaiming, start doing + * writepage even in laptop mode. */ - if (scan_global_lru(sc)) { - shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); - if (reclaim_state) { - nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } - total_scanned += sc->nr_scanned; - if (nr_reclaimed >= sc->swap_cluster_max) { - ret = 1; - goto out; - } + if (sc->priority < DEF_PRIORITY - 2) + sc->may_writepage = 1; /* * Try to write back as many pages as we just scanned. This @@ -1358,91 +2520,492 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc->swap_cluster_max + - sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, + WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } + } while (--sc->priority >= 0 && !aborted_reclaim); - /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); - } - /* top priority shrink_caches still had more to do? don't OOM, then */ - if (!sc->all_unreclaimable && scan_global_lru(sc)) - ret = 1; out: + delayacct_freepages_end(); + + if (sc->nr_reclaimed) + return sc->nr_reclaimed; + /* - * Now that we've scanned all the zones at this priority level, note - * that level within the zone so that the next thread which performs - * scanning of this zone will immediately start out at this priority - * level. This affects only the decision whether or not to bring - * mapped pages onto the inactive list. + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. Thus bypassing all_unreclaimable + * check. */ - if (priority < 0) - priority = 0; + if (oom_killer_disabled) + return 0; - if (scan_global_lru(sc)) { - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; + /* Aborted reclaim to try compaction? don't OOM, then */ + if (aborted_reclaim) + return 1; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; + /* top priority shrink_zones still had more to do? don't OOM, then */ + if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + return 1; - zone->prev_priority = priority; - } - } else - mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + return 0; +} - return ret; +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +{ + struct zone *zone; + unsigned long pfmemalloc_reserve = 0; + unsigned long free_pages = 0; + int i; + bool wmark_ok; + + for (i = 0; i <= ZONE_NORMAL; i++) { + zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + + pfmemalloc_reserve += min_wmark_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + + wmark_ok = free_pages > pfmemalloc_reserve / 2; + + /* kswapd must be awake if processes are being throttled */ + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { + pgdat->classzone_idx = min(pgdat->classzone_idx, + (enum zone_type)ZONE_NORMAL); + wake_up_interruptible(&pgdat->kswapd_wait); + } + + return wmark_ok; +} + +/* + * Throttle direct reclaimers if backing storage is backed by the network + * and the PFMEMALLOC reserve for the preferred node is getting dangerously + * depleted. kswapd will continue to make progress and wake the processes + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. + */ +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + nodemask_t *nodemask) +{ + struct zoneref *z; + struct zone *zone; + pg_data_t *pgdat = NULL; + + /* + * Kernel threads should not be throttled as they may be indirectly + * responsible for cleaning pages necessary for reclaim to make forward + * progress. kjournald for example may enter direct reclaim while + * committing a transaction where throttling it could forcing other + * processes to block on log_wait_commit(). + */ + if (current->flags & PF_KTHREAD) + goto out; + + /* + * If a fatal signal is pending, this process should not throttle. + * It should return quickly so it can exit and free its memory + */ + if (fatal_signal_pending(current)) + goto out; + + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) + goto out; + + /* Account for the throttling */ + count_vm_event(PGSCAN_DIRECT_THROTTLE); + + /* + * If the caller cannot enter the filesystem, it's possible that it + * is due to the caller holding an FS lock or performing a journal + * transaction in the case of a filesystem like ext[3|4]. In this case, + * it is not safe to block on pfmemalloc_wait as kswapd could be + * blocked waiting on the same lock. Instead, throttle for up to a + * second before continuing. + */ + if (!(gfp_mask & __GFP_FS)) { + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat), HZ); + + goto check_pending; + } + + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat)); + +check_pending: + if (fatal_signal_pending(current)) + return true; + +out: + return false; } -unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask, nodemask_t *nodemask) { + unsigned long nr_reclaimed; struct scan_control sc = { - .gfp_mask = gfp_mask, + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .may_unmap = 1, .may_swap = 1, - .swappiness = vm_swappiness, .order = order, - .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, + .priority = DEF_PRIORITY, + .target_mem_cgroup = NULL, + .nodemask = nodemask, }; - return do_try_to_free_pages(zones, gfp_mask, &sc); + /* + * Do not enter reclaim if fatal signal was delivered while throttled. + * 1 is returned so that the page allocator does not OOM kill at this + * point. + */ + if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + return 1; + + trace_mm_vmscan_direct_reclaim_begin(order, + sc.may_writepage, + gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG -unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, - gfp_t gfp_mask) +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, + gfp_t gfp_mask, bool noswap, + struct zone *zone, + unsigned long *nr_scanned) { struct scan_control sc = { - .gfp_mask = gfp_mask, + .nr_scanned = 0, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, - .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .may_unmap = 1, + .may_swap = !noswap, .order = 0, - .mem_cgroup = mem_cont, - .isolate_pages = mem_cgroup_isolate_pages, + .priority = 0, + .swappiness = mem_cgroup_swappiness(memcg), + .target_mem_cgroup = memcg, }; - struct zone **zones; - int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; - if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) - return 1; - return 0; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, + sc.may_writepage, + sc.gfp_mask); + + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_lruvec(lruvec, &sc); + + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + + *nr_scanned = sc.nr_scanned; + return sc.nr_reclaimed; +} + +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + gfp_t gfp_mask, + bool noswap) +{ + struct zonelist *zonelist; + unsigned long nr_reclaimed; + int nid; + struct scan_control sc = { + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .order = 0, + .priority = DEF_PRIORITY, + .target_mem_cgroup = memcg, + .nodemask = NULL, /* we don't care the placement */ + .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + }; + + /* + * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't + * take care of from where we get pages. So the node where we start the + * scan does not need to be the current node. + */ + nid = mem_cgroup_select_victim_node(memcg); + + zonelist = NODE_DATA(nid)->node_zonelists; + + trace_mm_vmscan_memcg_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #endif +static void age_active_anon(struct zone *zone, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + + if (!total_swap_pages) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); +} + +static bool zone_balanced(struct zone *zone, int order, + unsigned long balance_gap, int classzone_idx) +{ + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + + balance_gap, classzone_idx, 0)) + return false; + + if (IS_ENABLED(CONFIG_COMPACTION) && order && + !compaction_suitable(zone, order)) + return false; + + return true; +} + +/* + * pgdat_balanced() is used when checking if a node is balanced. + * + * For order-0, all zones must be balanced! + * + * For high-order allocations only zones that meet watermarks and are in a + * zone allowed by the callers classzone_idx are added to balanced_pages. The + * total of balanced pages must be at least 25% of the zones allowed by + * classzone_idx for the node to be considered balanced. Forcing all zones to + * be balanced for high orders can cause excessive reclaim when there are + * imbalanced zones. + * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * percentage of the middle zones. For example, on 32-bit x86, highmem + * would need to be at least 256M for it to be balance a whole node. + * Similarly, on x86-64 the Normal zone would need to be at least 1G + * to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +{ + unsigned long managed_pages = 0; + unsigned long balanced_pages = 0; + int i; + + /* Check the watermark levels */ + for (i = 0; i <= classzone_idx; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + managed_pages += zone->managed_pages; + + /* + * A special case here: + * + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well! + */ + if (!zone_reclaimable(zone)) { + balanced_pages += zone->managed_pages; + continue; + } + + if (zone_balanced(zone, order, 0, i)) + balanced_pages += zone->managed_pages; + else if (!order) + return false; + } + + if (order) + return balanced_pages >= (managed_pages >> 2); + else + return true; +} + +/* + * Prepare kswapd for sleeping. This verifies that there are no processes + * waiting in throttle_direct_reclaim() and that watermarks have been met. + * + * Returns true if kswapd is ready to sleep + */ +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) +{ + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return false; + + /* + * There is a potential race between when kswapd checks its watermarks + * and a process gets throttled. There is also a potential race if + * processes get throttled, kswapd wakes, a large process exits therby + * balancing the zones that causes kswapd to miss a wakeup. If kswapd + * is going to sleep, no process should be sleeping on pfmemalloc_wait + * so wake them now if necessary. If necessary, processes will wake + * kswapd and get throttled again + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait)) { + wake_up(&pgdat->pfmemalloc_wait); + return false; + } + + return pgdat_balanced(pgdat, order, classzone_idx); +} + +/* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, + struct scan_control *sc, + unsigned long lru_pages, + unsigned long *nr_attempted) +{ + int testorder = sc->order; + unsigned long balance_gap; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + bool lowmem_pressure; + + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order) != + COMPACT_SKIPPED) + testorder = 0; + + /* + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. + */ + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + + shrink_zone(zone, sc); + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + + reclaim_state->reclaimed_slab = 0; + shrink_slab(&shrink, sc->nr_scanned, lru_pages); + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + + zone_clear_flag(zone, ZONE_WRITEBACK); + + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (zone_reclaimable(zone) && + zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } + + return sc->nr_scanned >= sc->nr_to_reclaim; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -1453,53 +3016,37 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int *classzone_idx) { - int all_zones_ok; - int priority; int i; - unsigned long total_scanned; - unsigned long nr_reclaimed; - struct reclaim_state *reclaim_state = current->reclaim_state; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .may_writepage = !laptop_mode, .order = order, - .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, + .target_mem_cgroup = NULL, }; - /* - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. - */ - int temp_priority[MAX_NR_ZONES]; - -loop_again: - total_scanned = 0; - nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; - - for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + do { unsigned long lru_pages = 0; + unsigned long nr_attempted = 0; + bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); - /* The swap token gets in the way of swapout... */ - if (!priority) - disable_swap_token(); - - all_zones_ok = 1; + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -1511,27 +3058,71 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. + */ + age_active_anon(zone, &sc); + + /* + * If the number of buffer_heads in the machine + * exceeds the maximum allowed level and this node + * has a highmem zone, force kswapd to reclaim from + * it to relieve lowmem pressure. + */ + if (buffer_heads_over_limit && is_highmem_idx(i)) { end_zone = i; break; } + + if (!zone_balanced(zone, order, 0, 0)) { + end_zone = i; + break; + } else { + /* + * If balanced, clear the dirty and congested + * flags + */ + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } } + if (i < 0) goto out; for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + if (!populated_zone(zone)) + continue; + + lru_pages += zone_reclaimable_pages(zone); + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. + */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * @@ -1542,92 +3133,145 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab; if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) - all_zones_ok = 0; - temp_priority[i] = priority; sc.nr_scanned = 0; - note_zone_scanning_priority(zone, priority); + + nr_soft_scanned = 0; /* - * We put equal pressure on every zone, unless one - * zone has way too many pages free already. + * Call soft limit reclaim before calling shrink_zone. */ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) - nr_reclaimed += shrink_zone(priority, zone, &sc); - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, - lru_pages); - nr_reclaimed += reclaim_state->reclaimed_slab; - total_scanned += sc.nr_scanned; - if (zone_is_all_unreclaimable(zone)) - continue; - if (nr_slab == 0 && zone->pages_scanned >= - (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE)) * 6) - zone_set_flag(zone, - ZONE_ALL_UNRECLAIMABLE); + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* - * If we've done a decent amount of scanning and - * the reclaim ratio is low, start doing writepage - * even in laptop mode + * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > nr_reclaimed + nr_reclaimed / 2) - sc.may_writepage = 1; + if (kswapd_shrink_zone(zone, end_zone, &sc, + lru_pages, &nr_attempted)) + raise_priority = false; } - if (all_zones_ok) - break; /* kswapd: all done */ + /* - * OK, kswapd is getting into trouble. Take a nap, then take - * another pass across the zones. + * If the low watermark is met there is no need for processes + * to be throttled on pfmemalloc_wait as they should not be + * able to safely make forward progress. Wake them */ - if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + if (waitqueue_active(&pgdat->pfmemalloc_wait) && + pfmemalloc_watermark_ok(pgdat)) + wake_up(&pgdat->pfmemalloc_wait); /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (nr_reclaimed >= SWAP_CLUSTER_MAX) + if (order && sc.nr_reclaimed >= 2UL << order) + order = sc.order = 0; + + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) break; - } + + /* + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted + */ + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) + compact_pgdat(pgdat, order); + + /* + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages + */ + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1 && + !pgdat_balanced(pgdat, order, *classzone_idx)); + out: /* - * Note within each zone the priority level at which this zone was - * brought into a happy state. So that the next thread which scans this - * zone will start out at that priority level. + * Return the order we were reclaiming at so prepare_kswapd_sleep() + * makes a decision on the order we were last reclaiming at. However, + * if another caller entered the allocator slow path while kswapd + * was awake, order will remain at the higher level */ - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; + *classzone_idx = end_zone; + return order; +} - zone->prev_priority = temp_priority[i]; - } - if (!all_zones_ok) { - cond_resched(); +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; - try_to_freeze(); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - goto loop_again; + /* Try to sleep for a short interval */ + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } - return nr_reclaimed; + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + + if (!kthread_should_stop()) + schedule(); + + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); } /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1640,18 +3284,22 @@ out: */ static int kswapd(void *p) { - unsigned long order; + unsigned long order, new_order; + unsigned balanced_order; + int classzone_idx, new_classzone_idx; + int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - cpumask_t cpumask; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - cpumask = node_to_cpumask(pgdat->node_id); - if (!cpus_empty(cpumask)) - set_cpus_allowed(tsk, cpumask); + lockdep_set_current_reclaim_state(GFP_KERNEL); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; /* @@ -1669,228 +3317,153 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = 0; + order = new_order = 0; + balanced_order = 0; + classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; + balanced_classzone_idx = classzone_idx; for ( ; ; ) { - unsigned long new_order; + bool ret; - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - new_order = pgdat->kswapd_max_order; - pgdat->kswapd_max_order = 0; - if (order < new_order) { + /* + * If the last balance_pgdat was unsuccessful it's unlikely a + * new request of a similar or harder type will succeed soon + * so consider going to sleep on the basis we reclaimed at + */ + if (balanced_classzone_idx >= new_classzone_idx && + balanced_order == new_order) { + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; + } + + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { - if (!freezing(current)) - schedule(); - + kswapd_try_to_sleep(pgdat, balanced_order, + balanced_classzone_idx); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; + new_order = order; + new_classzone_idx = classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; } - finish_wait(&pgdat->kswapd_wait, &wait); - if (!try_to_freeze()) { - /* We can speed up thawing tasks if we don't call - * balance_pgdat after returning from the refrigerator - */ - balance_pgdat(pgdat, order); + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (!ret) { + trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); + balanced_classzone_idx = classzone_idx; + balanced_order = balance_pgdat(pgdat, order, + &balanced_classzone_idx); } } + + tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); + current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + return 0; } /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) - return; - if (pgdat->kswapd_max_order < order) - pgdat->kswapd_max_order = order; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; + if (pgdat->kswapd_max_order < order) { + pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - wake_up_interruptible(&pgdat->kswapd_wait); -} - -#ifdef CONFIG_PM -/* - * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority, and returns the - * number of reclaimed pages - * - * For pass > 3 we also try to shrink the LRU lists that contain a few pages - */ -static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, - int pass, struct scan_control *sc) -{ - struct zone *zone; - unsigned long nr_to_scan, ret = 0; - - for_each_zone(zone) { - - if (!populated_zone(zone)) - continue; - - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) - continue; - - /* For pass = 0 we don't shrink the active list */ - if (pass > 0) { - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; - if (zone->nr_scan_active >= nr_pages || pass > 3) { - zone->nr_scan_active = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_ACTIVE)); - shrink_active_list(nr_to_scan, zone, sc, prio); - } - } - - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; - if (zone->nr_scan_inactive >= nr_pages || pass > 3) { - zone->nr_scan_inactive = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_INACTIVE)); - ret += shrink_inactive_list(nr_to_scan, zone, sc); - if (ret >= nr_pages) - return ret; - } - } - - return ret; -} + if (zone_balanced(zone, order, 0, 0)) + return; -static unsigned long count_lru_pages(void) -{ - return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); + wake_up_interruptible(&pgdat->kswapd_wait); } +#ifdef CONFIG_HIBERNATION /* - * Try to free `nr_pages' of memory, system-wide, and return the number of + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_pages) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - unsigned long lru_pages, nr_slab; - unsigned long ret = 0; - int pass; struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_KERNEL, - .may_swap = 0, - .swap_cluster_max = nr_pages, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .may_swap = 1, + .may_unmap = 1, .may_writepage = 1, - .swappiness = vm_swappiness, - .isolate_pages = isolate_pages_global, + .nr_to_reclaim = nr_to_reclaim, + .hibernation_mode = 1, + .order = 0, + .priority = DEF_PRIORITY, }; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct task_struct *p = current; + unsigned long nr_reclaimed; - current->reclaim_state = &reclaim_state; - - lru_pages = count_lru_pages(); - nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); - /* If slab caches are huge, it's better to hit them first */ - while (nr_slab >= lru_pages) { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); - if (!reclaim_state.reclaimed_slab) - break; - - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) - goto out; - - nr_slab -= reclaim_state.reclaimed_slab; - } - - /* - * We try to shrink LRUs in 5 passes: - * 0 = Reclaim from inactive_list only - * 1 = Reclaim from active list but don't reclaim mapped - * 2 = 2nd pass of type 1 - * 3 = Reclaim mapped (normal reclaim) - * 4 = 2nd pass of type 3 - */ - for (pass = 0; pass < 5; pass++) { - int prio; - - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) { - sc.may_swap = 1; - sc.swappiness = 100; - } - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - ret; + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(sc.gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; - sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (ret >= nr_pages) - goto out; - - reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, - count_lru_pages()); - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) - goto out; - - if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); - } - } + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - /* - * If ret = 0, we could not shrink LRUs, but there may be something - * in slab caches - */ - if (!ret) { - do { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); - ret += reclaim_state.reclaimed_slab; - } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); - } - -out: - current->reclaim_state = NULL; + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; - return ret; + return nr_reclaimed; } -#endif +#endif /* CONFIG_HIBERNATION */ /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - pg_data_t *pgdat; - cpumask_t mask; int nid; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_HIGH_MEMORY) { - pgdat = NODE_DATA(nid); - mask = node_to_cpumask(pgdat->node_id); - if (any_online_cpu(mask) != NR_CPUS) + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ - set_cpus_allowed(pgdat->kswapd, mask); + set_cpus_allowed_ptr(pgdat->kswapd, mask); } } return NOTIFY_OK; @@ -1912,18 +3485,33 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); - ret = -1; + pr_err("Failed to start kswapd on node %d\n", nid); + ret = PTR_ERR(pgdat->kswapd); + pgdat->kswapd = NULL; } return ret; } +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kswapd_stop(int nid) +{ + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + + if (kswapd) { + kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; + } +} + static int __init kswapd_init(void) { int nid; swap_setup(); - for_each_node_state(nid, N_HIGH_MEMORY) + for_each_node_state(nid, N_MEMORY) kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1941,7 +3529,7 @@ module_init(kswapd_init) int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 -#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ @@ -1964,6 +3552,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. */ @@ -1973,20 +3603,20 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int priority; - unsigned long nr_reclaimed = 0; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), - .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, - .isolate_pages = isolate_pages_global, + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .priority = ZONE_RECLAIM_PRIORITY, + }; + struct shrink_control shrink = { + .gfp_mask = sc.gfp_mask, }; - unsigned long slab_reclaimable; + unsigned long nr_slab_pages0, nr_slab_pages1; - disable_swap_token(); cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -1994,52 +3624,58 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * and RECLAIM_SWAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. */ - priority = ZONE_RECLAIM_PRIORITY; do { - note_zone_scanning_priority(zone, priority); - nr_reclaimed += shrink_zone(priority, zone, &sc); - priority--; - } while (priority >= 0 && nr_reclaimed < nr_pages); + shrink_zone(zone, &sc); + } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } - slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (slab_reclaimable > zone->min_slab_pages) { + nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages0 > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how * many pages were freed in this zone. So we take the current * number of slab pages and shake the slab until it is reduced * by the same nr_pages that we used for reclaiming unmapped * pages. - * - * Note that shrink_slab will free memory on all zones and may - * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) > - slab_reclaimable - nr_pages) - ; + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + for (;;) { + unsigned long lru_pages = zone_reclaimable_pages(zone); + + /* No reclaimable slab or very low memory pressure */ + if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) + break; + + /* Freed enough memory */ + nr_slab_pages1 = zone_page_state(zone, + NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) + break; + } /* * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. */ - nr_reclaimed += slab_reclaimable - - zone_page_state(zone, NR_SLAB_RECLAIMABLE); + nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 < nr_slab_pages0) + sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; } p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - return nr_reclaimed >= nr_pages; + lockdep_clear_current_reclaim_state(); + return sc.nr_reclaimed >= nr_pages; } int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) @@ -2057,20 +3693,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) - return 0; + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) + return ZONE_RECLAIM_FULL; - if (zone_is_all_unreclaimable(zone)) - return 0; + if (!zone_reclaimable(zone)) + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2080,13 +3714,151 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif + +/* + * page_evictable - test whether a page is evictable + * @page: the page to test + * + * Test whether page is evictable--i.e., should be placed on active/inactive + * lists vs unevictable list. + * + * Reasons page might not be evictable: + * (1) page's mapping marked unevictable + * (2) page is part of an mlocked VMA + * + */ +int page_evictable(struct page *page) +{ + return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); +} + +#ifdef CONFIG_SHMEM +/** + * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list + * @pages: array of pages to check + * @nr_pages: number of pages to check + * + * Checks pages for evictability and moves them to the appropriate lru list. + * + * This function is only used for SysV IPC SHM_UNLOCK. + */ +void check_move_unevictable_pages(struct page **pages, int nr_pages) +{ + struct lruvec *lruvec; + struct zone *zone = NULL; + int pgscanned = 0; + int pgrescued = 0; + int i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pages[i]; + struct zone *pagezone; + + pgscanned++; + pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + lruvec = mem_cgroup_page_lruvec(page, zone); + + if (!PageLRU(page) || !PageUnevictable(page)) + continue; + + if (page_evictable(page)) { + enum lru_list lru = page_lru_base_type(page); + + VM_BUG_ON_PAGE(PageActive(page), page); + ClearPageUnevictable(page); + del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); + add_page_to_lru_list(page, lruvec, lru); + pgrescued++; + } + } + + if (zone) { + __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); + __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); + spin_unlock_irq(&zone->lru_lock); + } +} +#endif /* CONFIG_SHMEM */ + +static void warn_scan_unevictable_pages(void) +{ + printk_once(KERN_WARNING + "%s: The scan_unevictable_pages sysctl/node-interface has been " + "disabled for lack of a legitimate use case. If you have " + "one, please send an email to linux-mm@kvack.org.\n", + current->comm); +} + +/* + * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of + * all nodes' unevictable lists for evictable pages + */ +unsigned long scan_unevictable_pages; + +int scan_unevictable_handler(struct ctl_table *table, int write, + void __user *buffer, + size_t *length, loff_t *ppos) +{ + warn_scan_unevictable_pages(); + proc_doulongvec_minmax(table, write, buffer, length, ppos); + scan_unevictable_pages = 0; + return 0; +} + +#ifdef CONFIG_NUMA +/* + * per node 'scan_unevictable_pages' attribute. On demand re-scan of + * a specified node's per zone unevictable lists for evictable pages. + */ + +static ssize_t read_scan_unevictable_node(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + warn_scan_unevictable_pages(); + return sprintf(buf, "0\n"); /* always zero; should fit... */ +} + +static ssize_t write_scan_unevictable_node(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + warn_scan_unevictable_pages(); + return 1; +} + + +static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, + read_scan_unevictable_node, + write_scan_unevictable_node); + +int scan_unevictable_register_node(struct node *node) +{ + return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); +} + +void scan_unevictable_unregister_node(struct node *node) +{ + device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); +} +#endif |
