Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c | 3056
1 file changed, 1910 insertions, 1146 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index d31d7ce52c0..0f16ffe8eb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@   *  Multiqueue VM started 5.8.00, Rik van Riel.   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/mm.h>  #include <linux/module.h>  #include <linux/gfp.h> @@ -19,6 +21,7 @@  #include <linux/pagemap.h>  #include <linux/init.h>  #include <linux/highmem.h> +#include <linux/vmpressure.h>  #include <linux/vmstat.h>  #include <linux/file.h>  #include <linux/writeback.h> @@ -26,12 +29,12 @@  #include <linux/buffer_head.h>	/* for try_to_release_page(),  					buffer_heads_over_limit */  #include <linux/mm_inline.h> -#include <linux/pagevec.h>  #include <linux/backing-dev.h>  #include <linux/rmap.h>  #include <linux/topology.h>  #include <linux/cpu.h>  #include <linux/cpuset.h> +#include <linux/compaction.h>  #include <linux/notifier.h>  #include <linux/rwsem.h>  #include <linux/delay.h> @@ -40,23 +43,21 @@  #include <linux/memcontrol.h>  #include <linux/delayacct.h>  #include <linux/sysctl.h> +#include <linux/oom.h> +#include <linux/prefetch.h> +#include <linux/printk.h>  #include <asm/tlbflush.h>  #include <asm/div64.h>  #include <linux/swapops.h> +#include <linux/balloon_compaction.h>  #include "internal.h"  #define CREATE_TRACE_POINTS  #include <trace/events/vmscan.h> -enum lumpy_mode { -	LUMPY_MODE_NONE, -	LUMPY_MODE_ASYNC, -	LUMPY_MODE_SYNC, -}; -  struct scan_control {  	/* Incremented by the number of inactive pages that were scanned */  	unsigned long nr_scanned; @@ -80,18 +81,19 @@ struct scan_control {  	/* Can pages be swapped as part of reclaim? */  	int may_swap; -	int swappiness; -  	int order; +	/* Scan (total_size >> priority) pages at once */ +	int priority; + +	/* anon vs. file LRUs scanning "ratio" */ +	int swappiness; +  	/* -	 * Intend to reclaim enough continuous memory rather than reclaim -	 * enough amount of memory. i.e, mode for high order allocation. +	 * The memory cgroup that hit its limit and as a result is the +	 * primary target of this reclaim invocation.  	 */ -	enum lumpy_mode lumpy_reclaim_mode; - -	/* Which cgroup do we reclaim from */ -	struct mem_cgroup *mem_cgroup; +	struct mem_cgroup *target_mem_cgroup;  	/*  	 * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -134,45 +136,76 @@ struct scan_control {   * From 0 .. 100.  Higher means more swappy.   
*/  int vm_swappiness = 60; -long vm_total_pages;	/* The total number of pages which the VM controls */ +unsigned long vm_total_pages;	/* The total number of pages which the VM controls */  static LIST_HEAD(shrinker_list);  static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scanning_global_lru(sc)	(!(sc)->mem_cgroup) +#ifdef CONFIG_MEMCG +static bool global_reclaim(struct scan_control *sc) +{ +	return !sc->target_mem_cgroup; +}  #else -#define scanning_global_lru(sc)	(1) +static bool global_reclaim(struct scan_control *sc) +{ +	return true; +}  #endif -static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, -						  struct scan_control *sc) +static unsigned long zone_reclaimable_pages(struct zone *zone)  { -	if (!scanning_global_lru(sc)) -		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); +	int nr; -	return &zone->reclaim_stat; +	nr = zone_page_state(zone, NR_ACTIVE_FILE) + +	     zone_page_state(zone, NR_INACTIVE_FILE); + +	if (get_nr_swap_pages() > 0) +		nr += zone_page_state(zone, NR_ACTIVE_ANON) + +		      zone_page_state(zone, NR_INACTIVE_ANON); + +	return nr;  } -static unsigned long zone_nr_lru_pages(struct zone *zone, -				struct scan_control *sc, enum lru_list lru) +bool zone_reclaimable(struct zone *zone)  { -	if (!scanning_global_lru(sc)) -		return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); - -	return zone_page_state(zone, NR_LRU_BASE + lru); +	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;  } +static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +{ +	if (!mem_cgroup_disabled()) +		return mem_cgroup_get_lru_size(lruvec, lru); + +	return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); +}  /* - * Add a shrinker callback to be called from the vm + * Add a shrinker callback to be called from the vm.   */ -void register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker)  { -	shrinker->nr = 0; +	size_t size = sizeof(*shrinker->nr_deferred); + +	/* +	 * If we only have one possible node in the system anyway, save +	 * ourselves the trouble and disable NUMA aware behavior. This way we +	 * will save memory and some small loop time later. +	 */ +	if (nr_node_ids == 1) +		shrinker->flags &= ~SHRINKER_NUMA_AWARE; + +	if (shrinker->flags & SHRINKER_NUMA_AWARE) +		size *= nr_node_ids; + +	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); +	if (!shrinker->nr_deferred) +		return -ENOMEM; +  	down_write(&shrinker_rwsem);  	list_add_tail(&shrinker->list, &shrinker_list);  	up_write(&shrinker_rwsem); +	return 0;  }  EXPORT_SYMBOL(register_shrinker); @@ -184,10 +217,123 @@ void unregister_shrinker(struct shrinker *shrinker)  	down_write(&shrinker_rwsem);  	list_del(&shrinker->list);  	up_write(&shrinker_rwsem); +	kfree(shrinker->nr_deferred);  }  EXPORT_SYMBOL(unregister_shrinker);  #define SHRINK_BATCH 128 + +static unsigned long +shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, +		 unsigned long nr_pages_scanned, unsigned long lru_pages) +{ +	unsigned long freed = 0; +	unsigned long long delta; +	long total_scan; +	long freeable; +	long nr; +	long new_nr; +	int nid = shrinkctl->nid; +	long batch_size = shrinker->batch ? shrinker->batch +					  : SHRINK_BATCH; + +	freeable = shrinker->count_objects(shrinker, shrinkctl); +	if (freeable == 0) +		return 0; + +	/* +	 * copy the current shrinker scan count into a local variable +	 * and zero it so that other concurrent shrinker invocations +	 * don't also do this scanning work. 
+	 */ +	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + +	total_scan = nr; +	delta = (4 * nr_pages_scanned) / shrinker->seeks; +	delta *= freeable; +	do_div(delta, lru_pages + 1); +	total_scan += delta; +	if (total_scan < 0) { +		printk(KERN_ERR +		"shrink_slab: %pF negative objects to delete nr=%ld\n", +		       shrinker->scan_objects, total_scan); +		total_scan = freeable; +	} + +	/* +	 * We need to avoid excessive windup on filesystem shrinkers +	 * due to large numbers of GFP_NOFS allocations causing the +	 * shrinkers to return -1 all the time. This results in a large +	 * nr being built up so when a shrink that can do some work +	 * comes along it empties the entire cache due to nr >>> +	 * freeable. This is bad for sustaining a working set in +	 * memory. +	 * +	 * Hence only allow the shrinker to scan the entire cache when +	 * a large delta change is calculated directly. +	 */ +	if (delta < freeable / 4) +		total_scan = min(total_scan, freeable / 2); + +	/* +	 * Avoid risking looping forever due to too large nr value: +	 * never try to free more than twice the estimate number of +	 * freeable entries. +	 */ +	if (total_scan > freeable * 2) +		total_scan = freeable * 2; + +	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, +				nr_pages_scanned, lru_pages, +				freeable, delta, total_scan); + +	/* +	 * Normally, we should not scan less than batch_size objects in one +	 * pass to avoid too frequent shrinker calls, but if the slab has less +	 * than batch_size objects in total and we are really tight on memory, +	 * we will try to reclaim all available objects, otherwise we can end +	 * up failing allocations although there are plenty of reclaimable +	 * objects spread over several slabs with usage less than the +	 * batch_size. +	 * +	 * We detect the "tight on memory" situations by looking at the total +	 * number of objects we want to scan (total_scan). If it is greater +	 * than the total number of objects on slab (freeable), we must be +	 * scanning at high prio and therefore should try to reclaim as much as +	 * possible. +	 */ +	while (total_scan >= batch_size || +	       total_scan >= freeable) { +		unsigned long ret; +		unsigned long nr_to_scan = min(batch_size, total_scan); + +		shrinkctl->nr_to_scan = nr_to_scan; +		ret = shrinker->scan_objects(shrinker, shrinkctl); +		if (ret == SHRINK_STOP) +			break; +		freed += ret; + +		count_vm_events(SLABS_SCANNED, nr_to_scan); +		total_scan -= nr_to_scan; + +		cond_resched(); +	} + +	/* +	 * move the unused scan count back into the shrinker in a +	 * manner that handles concurrent updates. If we exhausted the +	 * scan, there is no need to do an update. +	 */ +	if (total_scan > 0) +		new_nr = atomic_long_add_return(total_scan, +						&shrinker->nr_deferred[nid]); +	else +		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + +	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); +	return freed; +} +  /*   * Call the shrink functions to age shrinkable caches   * @@ -207,98 +353,46 @@ EXPORT_SYMBOL(unregister_shrinker);   *   * Returns the number of slab objects which we shrunk.   
*/ -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, -			unsigned long lru_pages) +unsigned long shrink_slab(struct shrink_control *shrinkctl, +			  unsigned long nr_pages_scanned, +			  unsigned long lru_pages)  {  	struct shrinker *shrinker; -	unsigned long ret = 0; +	unsigned long freed = 0; -	if (scanned == 0) -		scanned = SWAP_CLUSTER_MAX; - -	if (!down_read_trylock(&shrinker_rwsem)) -		return 1;	/* Assume we'll be able to shrink next time */ - -	list_for_each_entry(shrinker, &shrinker_list, list) { -		unsigned long long delta; -		unsigned long total_scan; -		unsigned long max_pass; - -		max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); -		delta = (4 * scanned) / shrinker->seeks; -		delta *= max_pass; -		do_div(delta, lru_pages + 1); -		shrinker->nr += delta; -		if (shrinker->nr < 0) { -			printk(KERN_ERR "shrink_slab: %pF negative objects to " -			       "delete nr=%ld\n", -			       shrinker->shrink, shrinker->nr); -			shrinker->nr = max_pass; -		} +	if (nr_pages_scanned == 0) +		nr_pages_scanned = SWAP_CLUSTER_MAX; +	if (!down_read_trylock(&shrinker_rwsem)) {  		/* -		 * Avoid risking looping forever due to too large nr value: -		 * never try to free more than twice the estimate number of -		 * freeable entries. +		 * If we would return 0, our callers would understand that we +		 * have nothing else to shrink and give up trying. By returning +		 * 1 we keep it going and assume we'll be able to shrink next +		 * time.  		 */ -		if (shrinker->nr > max_pass * 2) -			shrinker->nr = max_pass * 2; +		freed = 1; +		goto out; +	} -		total_scan = shrinker->nr; -		shrinker->nr = 0; +	list_for_each_entry(shrinker, &shrinker_list, list) { +		if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { +			shrinkctl->nid = 0; +			freed += shrink_slab_node(shrinkctl, shrinker, +					nr_pages_scanned, lru_pages); +			continue; +		} -		while (total_scan >= SHRINK_BATCH) { -			long this_scan = SHRINK_BATCH; -			int shrink_ret; -			int nr_before; +		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { +			if (node_online(shrinkctl->nid)) +				freed += shrink_slab_node(shrinkctl, shrinker, +						nr_pages_scanned, lru_pages); -			nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); -			shrink_ret = (*shrinker->shrink)(shrinker, this_scan, -								gfp_mask); -			if (shrink_ret == -1) -				break; -			if (shrink_ret < nr_before) -				ret += nr_before - shrink_ret; -			count_vm_events(SLABS_SCANNED, this_scan); -			total_scan -= this_scan; - -			cond_resched();  		} - -		shrinker->nr += total_scan;  	}  	up_read(&shrinker_rwsem); -	return ret; -} - -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, -				   bool sync) -{ -	enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; - -	/* -	 * Some reclaim have alredy been failed. No worth to try synchronous -	 * lumpy reclaim. -	 */ -	if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) -		return; - -	/* -	 * If we need a large contiguous chunk of memory, or have -	 * trouble getting a small set of contiguous pages, we -	 * will reclaim both active and inactive pages. 
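For reference, the shrinker rework above replaces the old ->shrink() callback with a count_objects()/scan_objects() pair, adds per-node deferred work (->nr_deferred, SHRINKER_NUMA_AWARE) and makes register_shrinker() able to fail. A minimal client of the new interface looks roughly like the sketch below; this is illustrative only, not part of the commit, and the demo_* names and the atomic counter standing in for a real object cache are invented for the example.

#include <linux/init.h>
#include <linux/shrinker.h>
#include <linux/atomic.h>

/* Hypothetical object cache, represented here only by a counter. */
static atomic_long_t demo_nr_objects;

static unsigned long demo_count(struct shrinker *shrink,
				struct shrink_control *sc)
{
	/* Cheap, lock-free estimate of freeable objects; 0 means "skip me". */
	return atomic_long_read(&demo_nr_objects);
}

static unsigned long demo_scan(struct shrinker *shrink,
			       struct shrink_control *sc)
{
	unsigned long freed = 0;

	/*
	 * Free up to sc->nr_to_scan objects and report how many were freed.
	 * If no progress is possible in this context (e.g. a lock cannot be
	 * taken), returning SHRINK_STOP tells shrink_slab_node() to stop
	 * calling us for this pass and defer the remaining work.
	 */
	while (freed < sc->nr_to_scan &&
	       atomic_long_add_unless(&demo_nr_objects, -1, 0))
		freed++;

	return freed ? freed : SHRINK_STOP;
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
	/* .flags = SHRINKER_NUMA_AWARE would make sc->nid meaningful. */
};

static int __init demo_cache_init(void)
{
	/* register_shrinker() now allocates ->nr_deferred and can fail. */
	return register_shrinker(&demo_shrinker);
}

On the calling side, shrink_slab() now takes a struct shrink_control; a caller that wants NUMA-aware shrinkers to visit every node would, per the code above, set it up along these lines (again only a sketch, with nr_pages_scanned and lru_pages being whatever the caller tracked):

	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};

	nodes_setall(shrink.nodes_to_scan);
	shrink_slab(&shrink, nr_pages_scanned, lru_pages);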
-	 */ -	if (sc->order > PAGE_ALLOC_COSTLY_ORDER) -		sc->lumpy_reclaim_mode = mode; -	else if (sc->order && priority < DEF_PRIORITY - 2) -		sc->lumpy_reclaim_mode = mode; -	else -		sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; -} - -static void disable_lumpy_reclaim_mode(struct scan_control *sc) -{ -	sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +out: +	cond_resched(); +	return freed;  }  static inline int is_page_cache_freeable(struct page *page) @@ -320,10 +414,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,  		return 1;  	if (bdi == current->backing_dev_info)  		return 1; - -	/* lumpy reclaim for hugepage often need a lot of write */ -	if (sc->order > PAGE_ALLOC_COSTLY_ORDER) -		return 1;  	return 0;  } @@ -342,7 +432,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,  static void handle_write_error(struct address_space *mapping,  				struct page *page, int error)  { -	lock_page_nosync(page); +	lock_page(page);  	if (page_mapping(page) == mapping)  		mapping_set_error(mapping, error);  	unlock_page(page); @@ -374,7 +464,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,  	 * stalls if we need to run get_block().  We could test  	 * PagePrivate for that.  	 * -	 * If this process is currently in __generic_file_aio_write() against +	 * If this process is currently in __generic_file_write_iter() against  	 * this page's queue, we can perform writeback even if that  	 * will block.  	 * @@ -393,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,  		if (page_has_private(page)) {  			if (try_to_free_buffers(page)) {  				ClearPageDirty(page); -				printk("%s: orphaned page\n", __func__); +				pr_info("%s: orphaned page\n", __func__);  				return PAGE_CLEAN;  			}  		} @@ -423,21 +513,11 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,  			return PAGE_ACTIVATE;  		} -		/* -		 * Wait on writeback if requested to. This happens when -		 * direct reclaiming a large contiguous area and the -		 * first attempt to free a range of pages fails. -		 */ -		if (PageWriteback(page) && -		    sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) -			wait_on_page_writeback(page); -  		if (!PageWriteback(page)) {  			/* synchronous write or broken a_ops? */  			ClearPageReclaim(page);  		} -		trace_mm_vmscan_writepage(page, -			trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); +		trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));  		inc_zone_page_state(page, NR_VMSCAN_WRITE);  		return PAGE_SUCCESS;  	} @@ -449,7 +529,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,   * Same as remove_mapping, but if the page is removed from the mapping, it   * gets returned with a refcount of 0.   */ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, +			    bool reclaimed)  {  	BUG_ON(!PageLocked(page));  	BUG_ON(mapping != page_mapping(page)); @@ -494,9 +575,29 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)  		spin_unlock_irq(&mapping->tree_lock);  		swapcache_free(swap, page);  	} else { -		__remove_from_page_cache(page); +		void (*freepage)(struct page *); +		void *shadow = NULL; + +		freepage = mapping->a_ops->freepage; +		/* +		 * Remember a shadow entry for reclaimed file cache in +		 * order to detect refaults, thus thrashing, later on. +		 * +		 * But don't store shadows in an address space that is +		 * already exiting.  
This is not just an optizimation, +		 * inode reclaim needs to empty out the radix tree or +		 * the nodes are lost.  Don't plant shadows behind its +		 * back. +		 */ +		if (reclaimed && page_is_file_cache(page) && +		    !mapping_exiting(mapping)) +			shadow = workingset_eviction(mapping, page); +		__delete_from_page_cache(page, shadow);  		spin_unlock_irq(&mapping->tree_lock);  		mem_cgroup_uncharge_cache_page(page); + +		if (freepage != NULL) +			freepage(page);  	}  	return 1; @@ -514,7 +615,7 @@ cannot_free:   */  int remove_mapping(struct address_space *mapping, struct page *page)  { -	if (__remove_mapping(mapping, page)) { +	if (__remove_mapping(mapping, page, false)) {  		/*  		 * Unfreezing the refcount with 1 rather than 2 effectively  		 * drops the pagecache ref for us without requiring another @@ -537,39 +638,39 @@ int remove_mapping(struct address_space *mapping, struct page *page)   */  void putback_lru_page(struct page *page)  { -	int lru; -	int active = !!TestClearPageActive(page); +	bool is_unevictable;  	int was_unevictable = PageUnevictable(page); -	VM_BUG_ON(PageLRU(page)); +	VM_BUG_ON_PAGE(PageLRU(page), page);  redo:  	ClearPageUnevictable(page); -	if (page_evictable(page, NULL)) { +	if (page_evictable(page)) {  		/*  		 * For evictable pages, we can use the cache.  		 * In event of a race, worst case is we end up with an  		 * unevictable page on [in]active list.  		 * We know how to handle that.  		 */ -		lru = active + page_lru_base_type(page); -		lru_cache_add_lru(page, lru); +		is_unevictable = false; +		lru_cache_add(page);  	} else {  		/*  		 * Put unevictable pages directly on zone's unevictable  		 * list.  		 */ -		lru = LRU_UNEVICTABLE; +		is_unevictable = true;  		add_page_to_unevictable_list(page);  		/* -		 * When racing with an mlock clearing (page is -		 * unlocked), make sure that if the other thread does -		 * not observe our setting of PG_lru and fails -		 * isolation, we see PG_mlocked cleared below and move +		 * When racing with an mlock or AS_UNEVICTABLE clearing +		 * (page is unlocked) make sure that if the other thread +		 * does not observe our setting of PG_lru and fails +		 * isolation/check_move_unevictable_pages, +		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move  		 * the page back to the evictable list.  		 * -		 * The other side is TestClearPageMlocked(). +		 * The other side is TestClearPageMlocked() or shmem_lock().  		 */  		smp_mb();  	} @@ -579,7 +680,7 @@ redo:  	 * page is on unevictable list, it never be freed. To avoid that,  	 * check after we added it to the list, again.  	 
*/ -	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { +	if (is_unevictable && page_evictable(page)) {  		if (!isolate_lru_page(page)) {  			put_page(page);  			goto redo; @@ -590,9 +691,9 @@ redo:  		 */  	} -	if (was_unevictable && lru != LRU_UNEVICTABLE) +	if (was_unevictable && !is_unevictable)  		count_vm_event(UNEVICTABLE_PGRESCUED); -	else if (!was_unevictable && lru == LRU_UNEVICTABLE) +	else if (!was_unevictable && is_unevictable)  		count_vm_event(UNEVICTABLE_PGCULLED);  	put_page(page);		/* drop ref from isolate */ @@ -611,13 +712,10 @@ static enum page_references page_check_references(struct page *page,  	int referenced_ptes, referenced_page;  	unsigned long vm_flags; -	referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); +	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, +					  &vm_flags);  	referenced_page = TestClearPageReferenced(page); -	/* Lumpy reclaim - ignore references */ -	if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) -		return PAGEREF_RECLAIM; -  	/*  	 * Mlock lost the isolation race with us.  Let try_to_unmap()  	 * move the page to the unevictable list. @@ -626,7 +724,7 @@ static enum page_references page_check_references(struct page *page,  		return PAGEREF_RECLAIM;  	if (referenced_ptes) { -		if (PageAnon(page)) +		if (PageSwapBacked(page))  			return PAGEREF_ACTIVATE;  		/*  		 * All mapped pages start out with page table @@ -644,7 +742,13 @@ static enum page_references page_check_references(struct page *page,  		 */  		SetPageReferenced(page); -		if (referenced_page) +		if (referenced_page || referenced_ptes > 1) +			return PAGEREF_ACTIVATE; + +		/* +		 * Activate file-backed executable pages after first usage. +		 */ +		if (vm_flags & VM_EXEC)  			return PAGEREF_ACTIVATE;  		return PAGEREF_KEEP; @@ -657,22 +761,33 @@ static enum page_references page_check_references(struct page *page,  	return PAGEREF_RECLAIM;  } -static noinline_for_stack void free_page_list(struct list_head *free_pages) +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, +				       bool *dirty, bool *writeback)  { -	struct pagevec freed_pvec; -	struct page *page, *tmp; +	struct address_space *mapping; -	pagevec_init(&freed_pvec, 1); - -	list_for_each_entry_safe(page, tmp, free_pages, lru) { -		list_del(&page->lru); -		if (!pagevec_add(&freed_pvec, page)) { -			__pagevec_free(&freed_pvec); -			pagevec_reinit(&freed_pvec); -		} +	/* +	 * Anonymous pages are not handled by flushers and must be written +	 * from reclaim context. 
Do not stall reclaim based on them +	 */ +	if (!page_is_file_cache(page)) { +		*dirty = false; +		*writeback = false; +		return;  	} -	pagevec_free(&freed_pvec); +	/* By default assume that the page flags are accurate */ +	*dirty = PageDirty(page); +	*writeback = PageWriteback(page); + +	/* Verify dirty/writeback state if the filesystem supports it */ +	if (!page_has_private(page)) +		return; + +	mapping = page_mapping(page); +	if (mapping && mapping->a_ops->is_dirty_writeback) +		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);  }  /* @@ -680,22 +795,34 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)   */  static unsigned long shrink_page_list(struct list_head *page_list,  				      struct zone *zone, -				      struct scan_control *sc) +				      struct scan_control *sc, +				      enum ttu_flags ttu_flags, +				      unsigned long *ret_nr_dirty, +				      unsigned long *ret_nr_unqueued_dirty, +				      unsigned long *ret_nr_congested, +				      unsigned long *ret_nr_writeback, +				      unsigned long *ret_nr_immediate, +				      bool force_reclaim)  {  	LIST_HEAD(ret_pages);  	LIST_HEAD(free_pages);  	int pgactivate = 0; +	unsigned long nr_unqueued_dirty = 0;  	unsigned long nr_dirty = 0;  	unsigned long nr_congested = 0;  	unsigned long nr_reclaimed = 0; +	unsigned long nr_writeback = 0; +	unsigned long nr_immediate = 0;  	cond_resched(); +	mem_cgroup_uncharge_start();  	while (!list_empty(page_list)) { -		enum page_references references;  		struct address_space *mapping;  		struct page *page;  		int may_enter_fs; +		enum page_references references = PAGEREF_RECLAIM_CLEAN; +		bool dirty, writeback;  		cond_resched(); @@ -705,12 +832,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,  		if (!trylock_page(page))  			goto keep; -		VM_BUG_ON(PageActive(page)); -		VM_BUG_ON(page_zone(page) != zone); +		VM_BUG_ON_PAGE(PageActive(page), page); +		VM_BUG_ON_PAGE(page_zone(page) != zone, page);  		sc->nr_scanned++; -		if (unlikely(!page_evictable(page, NULL))) +		if (unlikely(!page_evictable(page)))  			goto cull_mlocked;  		if (!sc->may_unmap && page_mapped(page)) @@ -723,25 +850,103 @@ static unsigned long shrink_page_list(struct list_head *page_list,  		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||  			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); +		/* +		 * The number of dirty pages determines if a zone is marked +		 * reclaim_congested which affects wait_iff_congested. kswapd +		 * will stall and start writing pages if the tail of the LRU +		 * is all dirty unqueued pages. +		 */ +		page_check_dirty_writeback(page, &dirty, &writeback); +		if (dirty || writeback) +			nr_dirty++; + +		if (dirty && !writeback) +			nr_unqueued_dirty++; + +		/* +		 * Treat this page as congested if the underlying BDI is or if +		 * pages are cycling through the LRU so quickly that the +		 * pages marked for immediate reclaim are making it to the +		 * end of the LRU a second time. +		 */ +		mapping = page_mapping(page); +		if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || +		    (writeback && PageReclaim(page))) +			nr_congested++; + +		/* +		 * If a page at the tail of the LRU is under writeback, there +		 * are three cases to consider. 
+		 * +		 * 1) If reclaim is encountering an excessive number of pages +		 *    under writeback and this page is both under writeback and +		 *    PageReclaim then it indicates that pages are being queued +		 *    for IO but are being recycled through the LRU before the +		 *    IO can complete. Waiting on the page itself risks an +		 *    indefinite stall if it is impossible to writeback the +		 *    page due to IO error or disconnected storage so instead +		 *    note that the LRU is being scanned too quickly and the +		 *    caller can stall after page list has been processed. +		 * +		 * 2) Global reclaim encounters a page, memcg encounters a +		 *    page that is not marked for immediate reclaim or +		 *    the caller does not have __GFP_IO. In this case mark +		 *    the page for immediate reclaim and continue scanning. +		 * +		 *    __GFP_IO is checked  because a loop driver thread might +		 *    enter reclaim, and deadlock if it waits on a page for +		 *    which it is needed to do the write (loop masks off +		 *    __GFP_IO|__GFP_FS for this reason); but more thought +		 *    would probably show more reasons. +		 * +		 *    Don't require __GFP_FS, since we're not going into the +		 *    FS, just waiting on its writeback completion. Worryingly, +		 *    ext4 gfs2 and xfs allocate pages with +		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing +		 *    may_enter_fs here is liable to OOM on them. +		 * +		 * 3) memcg encounters a page that is not already marked +		 *    PageReclaim. memcg does not have any dirty pages +		 *    throttling so we could easily OOM just because too many +		 *    pages are in writeback and there is nothing else to +		 *    reclaim. Wait for the writeback to complete. +		 */  		if (PageWriteback(page)) { -			/* -			 * Synchronous reclaim is performed in two passes, -			 * first an asynchronous pass over the list to -			 * start parallel writeback, and a second synchronous -			 * pass to wait for the IO to complete.  Wait here -			 * for any page for which writeback has already -			 * started. -			 */ -			if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && -			    may_enter_fs) +			/* Case 1 above */ +			if (current_is_kswapd() && +			    PageReclaim(page) && +			    zone_is_reclaim_writeback(zone)) { +				nr_immediate++; +				goto keep_locked; + +			/* Case 2 above */ +			} else if (global_reclaim(sc) || +			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { +				/* +				 * This is slightly racy - end_page_writeback() +				 * might have just cleared PageReclaim, then +				 * setting PageReclaim here end up interpreted +				 * as PageReadahead - but that does not matter +				 * enough to care.  What we do want is for this +				 * page to have PageReclaim set next time memcg +				 * reclaim reaches the tests above, so it will +				 * then wait_on_page_writeback() to avoid OOM; +				 * and it's also appropriate in global reclaim. 
+				 */ +				SetPageReclaim(page); +				nr_writeback++; + +				goto keep_locked; + +			/* Case 3 above */ +			} else {  				wait_on_page_writeback(page); -			else { -				unlock_page(page); -				goto keep_lumpy;  			}  		} -		references = page_check_references(page, sc); +		if (!force_reclaim) +			references = page_check_references(page, sc); +  		switch (references) {  		case PAGEREF_ACTIVATE:  			goto activate_locked; @@ -759,19 +964,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,  		if (PageAnon(page) && !PageSwapCache(page)) {  			if (!(sc->gfp_mask & __GFP_IO))  				goto keep_locked; -			if (!add_to_swap(page)) +			if (!add_to_swap(page, page_list))  				goto activate_locked;  			may_enter_fs = 1; -		} -		mapping = page_mapping(page); +			/* Adding to swap updated mapping */ +			mapping = page_mapping(page); +		}  		/*  		 * The page is mapped into the page tables of one or more  		 * processes. Try to unmap it here.  		 */  		if (page_mapped(page) && mapping) { -			switch (try_to_unmap(page, TTU_UNMAP)) { +			switch (try_to_unmap(page, ttu_flags)) {  			case SWAP_FAIL:  				goto activate_locked;  			case SWAP_AGAIN: @@ -784,7 +990,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,  		}  		if (PageDirty(page)) { -			nr_dirty++; +			/* +			 * Only kswapd can writeback filesystem pages to +			 * avoid risk of stack overflow but only writeback +			 * if many dirty pages have been encountered. +			 */ +			if (page_is_file_cache(page) && +					(!current_is_kswapd() || +					 !zone_is_reclaim_dirty(zone))) { +				/* +				 * Immediately reclaim when written back. +				 * Similar in principal to deactivate_page() +				 * except we already have the page isolated +				 * and know it's dirty +				 */ +				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); +				SetPageReclaim(page); + +				goto keep_locked; +			}  			if (references == PAGEREF_RECLAIM_CLEAN)  				goto keep_locked; @@ -796,13 +1020,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,  			/* Page is dirty, try to write it out here */  			switch (pageout(page, mapping, sc)) {  			case PAGE_KEEP: -				nr_congested++;  				goto keep_locked;  			case PAGE_ACTIVATE:  				goto activate_locked;  			case PAGE_SUCCESS:  				if (PageWriteback(page)) -					goto keep_lumpy; +					goto keep;  				if (PageDirty(page))  					goto keep; @@ -862,7 +1085,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,  			}  		} -		if (!mapping || !__remove_mapping(mapping, page)) +		if (!mapping || !__remove_mapping(mapping, page, true))  			goto keep_locked;  		/* @@ -888,41 +1111,63 @@ cull_mlocked:  			try_to_free_swap(page);  		unlock_page(page);  		putback_lru_page(page); -		disable_lumpy_reclaim_mode(sc);  		continue;  activate_locked:  		/* Not a candidate for swapping, so reclaim swap space. */  		if (PageSwapCache(page) && vm_swap_full())  			try_to_free_swap(page); -		VM_BUG_ON(PageActive(page)); +		VM_BUG_ON_PAGE(PageActive(page), page);  		SetPageActive(page);  		pgactivate++;  keep_locked:  		unlock_page(page);  keep: -		disable_lumpy_reclaim_mode(sc); -keep_lumpy:  		list_add(&page->lru, &ret_pages); -		VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); +		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);  	} -	/* -	 * Tag a zone as congested if all the dirty pages encountered were -	 * backed by a congested BDI. 
In this case, reclaimers should just -	 * back off and wait for congestion to clear because further reclaim -	 * will encounter the same problem -	 */ -	if (nr_dirty == nr_congested && nr_dirty != 0) -		zone_set_flag(zone, ZONE_CONGESTED); - -	free_page_list(&free_pages); +	free_hot_cold_page_list(&free_pages, true);  	list_splice(&ret_pages, page_list);  	count_vm_events(PGACTIVATE, pgactivate); +	mem_cgroup_uncharge_end(); +	*ret_nr_dirty += nr_dirty; +	*ret_nr_congested += nr_congested; +	*ret_nr_unqueued_dirty += nr_unqueued_dirty; +	*ret_nr_writeback += nr_writeback; +	*ret_nr_immediate += nr_immediate;  	return nr_reclaimed;  } +unsigned long reclaim_clean_pages_from_list(struct zone *zone, +					    struct list_head *page_list) +{ +	struct scan_control sc = { +		.gfp_mask = GFP_KERNEL, +		.priority = DEF_PRIORITY, +		.may_unmap = 1, +	}; +	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; +	struct page *page, *next; +	LIST_HEAD(clean_pages); + +	list_for_each_entry_safe(page, next, page_list, lru) { +		if (page_is_file_cache(page) && !PageDirty(page) && +		    !isolated_balloon_page(page)) { +			ClearPageActive(page); +			list_move(&page->lru, &clean_pages); +		} +	} + +	ret = shrink_page_list(&clean_pages, zone, &sc, +			TTU_UNMAP|TTU_IGNORE_ACCESS, +			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); +	list_splice(&clean_pages, page_list); +	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); +	return ret; +} +  /*   * Attempt to remove the specified page from its LRU.  Only take this page   * if it is of the appropriate PageActive status.  Pages which are being @@ -933,7 +1178,7 @@ keep_lumpy:   *   * returns 0 on success, -ve errno on failure.   */ -int __isolate_lru_page(struct page *page, int mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode)  {  	int ret = -EINVAL; @@ -941,26 +1186,48 @@ int __isolate_lru_page(struct page *page, int mode, int file)  	if (!PageLRU(page))  		return ret; -	/* -	 * When checking the active state, we need to be sure we are -	 * dealing with comparible boolean values.  Take the logical not -	 * of each. -	 */ -	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) +	/* Compaction should not handle unevictable pages but CMA can do so */ +	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))  		return ret; -	if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) -		return ret; +	ret = -EBUSY;  	/* -	 * When this function is being called for lumpy reclaim, we -	 * initially look into all LRU pages, active, inactive and -	 * unevictable; only give shrink_page_list evictable pages. +	 * To minimise LRU disruption, the caller can indicate that it only +	 * wants to isolate pages it will be able to operate on without +	 * blocking - clean pages for the most part. +	 * +	 * ISOLATE_CLEAN means that only clean pages should be isolated. 
This +	 * is used by reclaim when it is cannot write to backing storage +	 * +	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages +	 * that it is possible to migrate without blocking  	 */ -	if (PageUnevictable(page)) -		return ret; +	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { +		/* All the caller can do on PageWriteback is block */ +		if (PageWriteback(page)) +			return ret; -	ret = -EBUSY; +		if (PageDirty(page)) { +			struct address_space *mapping; + +			/* ISOLATE_CLEAN means only clean pages */ +			if (mode & ISOLATE_CLEAN) +				return ret; + +			/* +			 * Only pages without mappings or that have a +			 * ->migratepage callback are possible to migrate +			 * without blocking +			 */ +			mapping = page_mapping(page); +			if (mapping && !mapping->a_ops->migratepage) +				return ret; +		} +	} + +	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) +		return ret;  	if (likely(get_page_unless_zero(page))) {  		/* @@ -986,167 +1253,57 @@ int __isolate_lru_page(struct page *page, int mode, int file)   * Appropriate locks must be held before calling this function.   *   * @nr_to_scan:	The number of pages to look through on the list. - * @src:	The LRU list to pull pages off. + * @lruvec:	The LRU vector to pull pages from.   * @dst:	The temp list to put pages on to. - * @scanned:	The number of pages that were scanned. - * @order:	The caller's attempted allocation order + * @nr_scanned:	The number of pages that were scanned. + * @sc:		The scan_control struct for this reclaim session   * @mode:	One of the LRU isolation modes - * @file:	True [1] if isolating file [!anon] pages + * @lru:	LRU list id for isolating   *   * returns how many pages were moved onto *@dst.   */  static unsigned long isolate_lru_pages(unsigned long nr_to_scan, -		struct list_head *src, struct list_head *dst, -		unsigned long *scanned, int order, int mode, int file) +		struct lruvec *lruvec, struct list_head *dst, +		unsigned long *nr_scanned, struct scan_control *sc, +		isolate_mode_t mode, enum lru_list lru)  { +	struct list_head *src = &lruvec->lists[lru];  	unsigned long nr_taken = 0; -	unsigned long nr_lumpy_taken = 0; -	unsigned long nr_lumpy_dirty = 0; -	unsigned long nr_lumpy_failed = 0;  	unsigned long scan;  	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {  		struct page *page; -		unsigned long pfn; -		unsigned long end_pfn; -		unsigned long page_pfn; -		int zone_id; +		int nr_pages;  		page = lru_to_page(src);  		prefetchw_prev_lru_page(page, src, flags); -		VM_BUG_ON(!PageLRU(page)); +		VM_BUG_ON_PAGE(!PageLRU(page), page); -		switch (__isolate_lru_page(page, mode, file)) { +		switch (__isolate_lru_page(page, mode)) {  		case 0: +			nr_pages = hpage_nr_pages(page); +			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);  			list_move(&page->lru, dst); -			mem_cgroup_del_lru(page); -			nr_taken++; +			nr_taken += nr_pages;  			break;  		case -EBUSY:  			/* else it is being freed elsewhere */  			list_move(&page->lru, src); -			mem_cgroup_rotate_lru_list(page, page_lru(page));  			continue;  		default:  			BUG();  		} - -		if (!order) -			continue; - -		/* -		 * Attempt to take all pages in the order aligned region -		 * surrounding the tag page.  Only take those pages of -		 * the same active state as that tag page.  We may safely -		 * round the target page pfn down to the requested order -		 * as the mem_map is guarenteed valid out to MAX_ORDER, -		 * where that page is in a different zone we will detect -		 * it from its zone id and abort this block scan. 
-		 */ -		zone_id = page_zone_id(page); -		page_pfn = page_to_pfn(page); -		pfn = page_pfn & ~((1 << order) - 1); -		end_pfn = pfn + (1 << order); -		for (; pfn < end_pfn; pfn++) { -			struct page *cursor_page; - -			/* The target page is in the block, ignore it. */ -			if (unlikely(pfn == page_pfn)) -				continue; - -			/* Avoid holes within the zone. */ -			if (unlikely(!pfn_valid_within(pfn))) -				break; - -			cursor_page = pfn_to_page(pfn); - -			/* Check that we have not crossed a zone boundary. */ -			if (unlikely(page_zone_id(cursor_page) != zone_id)) -				break; - -			/* -			 * If we don't have enough swap space, reclaiming of -			 * anon page which don't already have a swap slot is -			 * pointless. -			 */ -			if (nr_swap_pages <= 0 && PageAnon(cursor_page) && -			    !PageSwapCache(cursor_page)) -				break; - -			if (__isolate_lru_page(cursor_page, mode, file) == 0) { -				list_move(&cursor_page->lru, dst); -				mem_cgroup_del_lru(cursor_page); -				nr_taken++; -				nr_lumpy_taken++; -				if (PageDirty(cursor_page)) -					nr_lumpy_dirty++; -				scan++; -			} else { -				/* the page is freed already. */ -				if (!page_count(cursor_page)) -					continue; -				break; -			} -		} - -		/* If we break out of the loop above, lumpy reclaim failed */ -		if (pfn < end_pfn) -			nr_lumpy_failed++;  	} -	*scanned = scan; - -	trace_mm_vmscan_lru_isolate(order, -			nr_to_scan, scan, -			nr_taken, -			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, -			mode); +	*nr_scanned = scan; +	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, +				    nr_taken, mode, is_file_lru(lru));  	return nr_taken;  } -static unsigned long isolate_pages_global(unsigned long nr, -					struct list_head *dst, -					unsigned long *scanned, int order, -					int mode, struct zone *z, -					int active, int file) -{ -	int lru = LRU_BASE; -	if (active) -		lru += LRU_ACTIVE; -	if (file) -		lru += LRU_FILE; -	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, -								mode, file); -} - -/* - * clear_active_flags() is a helper for shrink_active_list(), clearing - * any active bits from the pages in the list. - */ -static unsigned long clear_active_flags(struct list_head *page_list, -					unsigned int *count) -{ -	int nr_active = 0; -	int lru; -	struct page *page; - -	list_for_each_entry(page, page_list, lru) { -		lru = page_lru_base_type(page); -		if (PageActive(page)) { -			lru += LRU_ACTIVE; -			ClearPageActive(page); -			nr_active++; -		} -		if (count) -			count[lru]++; -	} - -	return nr_active; -} -  /**   * isolate_lru_page - tries to isolate a page from its LRU list   * @page: page to isolate from its LRU list @@ -1176,16 +1333,20 @@ int isolate_lru_page(struct page *page)  {  	int ret = -EBUSY; +	VM_BUG_ON_PAGE(!page_count(page), page); +  	if (PageLRU(page)) {  		struct zone *zone = page_zone(page); +		struct lruvec *lruvec;  		spin_lock_irq(&zone->lru_lock); -		if (PageLRU(page) && get_page_unless_zero(page)) { +		lruvec = mem_cgroup_page_lruvec(page, zone); +		if (PageLRU(page)) {  			int lru = page_lru(page); -			ret = 0; +			get_page(page);  			ClearPageLRU(page); - -			del_page_from_lru_list(zone, page, lru); +			del_page_from_lru_list(page, lruvec, lru); +			ret = 0;  		}  		spin_unlock_irq(&zone->lru_lock);  	} @@ -1193,7 +1354,11 @@ int isolate_lru_page(struct page *page)  }  /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get resheduled. 
When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM.   */  static int too_many_isolated(struct zone *zone, int file,  		struct scan_control *sc) @@ -1203,7 +1368,7 @@ static int too_many_isolated(struct zone *zone, int file,  	if (current_is_kswapd())  		return 0; -	if (!scanning_global_lru(sc)) +	if (!global_reclaim(sc))  		return 0;  	if (file) { @@ -1214,128 +1379,82 @@ static int too_many_isolated(struct zone *zone, int file,  		isolated = zone_page_state(zone, NR_ISOLATED_ANON);  	} +	/* +	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they +	 * won't get blocked by normal direct-reclaimers, forming a circular +	 * deadlock. +	 */ +	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) +		inactive >>= 3; +  	return isolated > inactive;  } -/* - * TODO: Try merging with migrations version of putback_lru_pages - */  static noinline_for_stack void -putback_lru_pages(struct zone *zone, struct scan_control *sc, -				unsigned long nr_anon, unsigned long nr_file, -				struct list_head *page_list) +putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)  { -	struct page *page; -	struct pagevec pvec; -	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - -	pagevec_init(&pvec, 1); +	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; +	struct zone *zone = lruvec_zone(lruvec); +	LIST_HEAD(pages_to_free);  	/*  	 * Put back any unfreeable pages.  	 */ -	spin_lock(&zone->lru_lock);  	while (!list_empty(page_list)) { +		struct page *page = lru_to_page(page_list);  		int lru; -		page = lru_to_page(page_list); -		VM_BUG_ON(PageLRU(page)); + +		VM_BUG_ON_PAGE(PageLRU(page), page);  		list_del(&page->lru); -		if (unlikely(!page_evictable(page, NULL))) { +		if (unlikely(!page_evictable(page))) {  			spin_unlock_irq(&zone->lru_lock);  			putback_lru_page(page);  			spin_lock_irq(&zone->lru_lock);  			continue;  		} + +		lruvec = mem_cgroup_page_lruvec(page, zone); +  		SetPageLRU(page);  		lru = page_lru(page); -		add_page_to_lru_list(zone, page, lru); +		add_page_to_lru_list(page, lruvec, lru); +  		if (is_active_lru(lru)) {  			int file = is_file_lru(lru); -			reclaim_stat->recent_rotated[file]++; +			int numpages = hpage_nr_pages(page); +			reclaim_stat->recent_rotated[file] += numpages;  		} -		if (!pagevec_add(&pvec, page)) { -			spin_unlock_irq(&zone->lru_lock); -			__pagevec_release(&pvec); -			spin_lock_irq(&zone->lru_lock); +		if (put_page_testzero(page)) { +			__ClearPageLRU(page); +			__ClearPageActive(page); +			del_page_from_lru_list(page, lruvec, lru); + +			if (unlikely(PageCompound(page))) { +				spin_unlock_irq(&zone->lru_lock); +				(*get_compound_page_dtor(page))(page); +				spin_lock_irq(&zone->lru_lock); +			} else +				list_add(&page->lru, &pages_to_free);  		}  	} -	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); -	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); - -	spin_unlock_irq(&zone->lru_lock); -	pagevec_release(&pvec); -} - -static noinline_for_stack void update_isolated_counts(struct zone *zone, -					struct scan_control *sc, -					unsigned long *nr_anon, -					unsigned long *nr_file, -					struct list_head *isolated_list) -{ -	unsigned long nr_active; -	unsigned int count[NR_LRU_LISTS] = { 0, }; -	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - -	nr_active = 
clear_active_flags(isolated_list, count); -	__count_vm_events(PGDEACTIVATE, nr_active); - -	__mod_zone_page_state(zone, NR_ACTIVE_FILE, -			      -count[LRU_ACTIVE_FILE]); -	__mod_zone_page_state(zone, NR_INACTIVE_FILE, -			      -count[LRU_INACTIVE_FILE]); -	__mod_zone_page_state(zone, NR_ACTIVE_ANON, -			      -count[LRU_ACTIVE_ANON]); -	__mod_zone_page_state(zone, NR_INACTIVE_ANON, -			      -count[LRU_INACTIVE_ANON]); -	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; -	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; -	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); -	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); - -	reclaim_stat->recent_scanned[0] += *nr_anon; -	reclaim_stat->recent_scanned[1] += *nr_file; +	/* +	 * To save our caller's stack, now use input list for pages to free. +	 */ +	list_splice(&pages_to_free, page_list);  }  /* - * Returns true if the caller should wait to clean dirty/writeback pages. - * - * If we are direct reclaiming for contiguous pages and we do not reclaim - * everything in the list, try again and wait for writeback IO to complete. - * This will stall high-order allocations noticeably. Only do that when really - * need to free the pages under high memory pressure. + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested.  In other cases it is safe to throttle.   */ -static inline bool should_reclaim_stall(unsigned long nr_taken, -					unsigned long nr_freed, -					int priority, -					struct scan_control *sc) +static int current_may_throttle(void)  { -	int lumpy_stall_priority; - -	/* kswapd should not stall on sync IO */ -	if (current_is_kswapd()) -		return false; - -	/* Only stall on lumpy reclaim */ -	if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) -		return false; - -	/* If we have relaimed everything on the isolated list, no stall */ -	if (nr_freed == nr_taken) -		return false; - -	/* -	 * For high-order allocations, there are two stall thresholds. -	 * High-cost allocations stall immediately where as lower -	 * order allocations such as stacks require the scanning -	 * priority to be much higher before stalling. 
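One easy-to-misread detail in the too_many_isolated() change a little above: the right shift is applied to callers whose gfp_mask contains the full GFP_IOFS (__GFP_IO | __GFP_FS), i.e. ordinary direct reclaimers, which tightens their limit to roughly an eighth of the inactive list. GFP_NOIO/GFP_NOFS callers keep the full limit and so are not blocked behind normal direct reclaimers that may depend on them, avoiding the circular deadlock the comment describes. A compact model of the check, as an illustrative sketch rather than kernel code:

#include <linux/gfp.h>

static bool too_many_isolated_model(unsigned long isolated,
				    unsigned long inactive, gfp_t gfp_mask)
{
	/* Ordinary reclaimers (able to do both IO and FS) throttle early. */
	if ((gfp_mask & GFP_IOFS) == GFP_IOFS)
		inactive >>= 3;

	return isolated > inactive;
}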
-	 */ -	if (sc->order > PAGE_ALLOC_COSTLY_ORDER) -		lumpy_stall_priority = DEF_PRIORITY; -	else -		lumpy_stall_priority = DEF_PRIORITY / 3; - -	return priority <= lumpy_stall_priority; +	return !(current->flags & PF_LESS_THROTTLE) || +		current->backing_dev_info == NULL || +		bdi_write_congested(current->backing_dev_info);  }  /* @@ -1343,15 +1462,22 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,   * of reclaimed pages   */  static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, -			struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, +		     struct scan_control *sc, enum lru_list lru)  {  	LIST_HEAD(page_list);  	unsigned long nr_scanned;  	unsigned long nr_reclaimed = 0;  	unsigned long nr_taken; -	unsigned long nr_anon; -	unsigned long nr_file; +	unsigned long nr_dirty = 0; +	unsigned long nr_congested = 0; +	unsigned long nr_unqueued_dirty = 0; +	unsigned long nr_writeback = 0; +	unsigned long nr_immediate = 0; +	isolate_mode_t isolate_mode = 0; +	int file = is_file_lru(lru); +	struct zone *zone = lruvec_zone(lruvec); +	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;  	while (unlikely(too_many_isolated(zone, file, sc))) {  		congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1361,65 +1487,121 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,  			return SWAP_CLUSTER_MAX;  	} -	set_lumpy_reclaim_mode(priority, sc, false);  	lru_add_drain(); + +	if (!sc->may_unmap) +		isolate_mode |= ISOLATE_UNMAPPED; +	if (!sc->may_writepage) +		isolate_mode |= ISOLATE_CLEAN; +  	spin_lock_irq(&zone->lru_lock); -	if (scanning_global_lru(sc)) { -		nr_taken = isolate_pages_global(nr_to_scan, -			&page_list, &nr_scanned, sc->order, -			sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? -					ISOLATE_INACTIVE : ISOLATE_BOTH, -			zone, 0, file); +	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, +				     &nr_scanned, sc, isolate_mode, lru); + +	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); +	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + +	if (global_reclaim(sc)) {  		zone->pages_scanned += nr_scanned;  		if (current_is_kswapd()) -			__count_zone_vm_events(PGSCAN_KSWAPD, zone, -					       nr_scanned); +			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);  		else -			__count_zone_vm_events(PGSCAN_DIRECT, zone, -					       nr_scanned); -	} else { -		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, -			&page_list, &nr_scanned, sc->order, -			sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? -					ISOLATE_INACTIVE : ISOLATE_BOTH, -			zone, sc->mem_cgroup, -			0, file); -		/* -		 * mem_cgroup_isolate_pages() keeps track of -		 * scanned pages on its own. 
-		 */ +			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);  	} +	spin_unlock_irq(&zone->lru_lock); -	if (nr_taken == 0) { -		spin_unlock_irq(&zone->lru_lock); +	if (nr_taken == 0)  		return 0; + +	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, +				&nr_dirty, &nr_unqueued_dirty, &nr_congested, +				&nr_writeback, &nr_immediate, +				false); + +	spin_lock_irq(&zone->lru_lock); + +	reclaim_stat->recent_scanned[file] += nr_taken; + +	if (global_reclaim(sc)) { +		if (current_is_kswapd()) +			__count_zone_vm_events(PGSTEAL_KSWAPD, zone, +					       nr_reclaimed); +		else +			__count_zone_vm_events(PGSTEAL_DIRECT, zone, +					       nr_reclaimed);  	} -	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); +	putback_inactive_pages(lruvec, &page_list); + +	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);  	spin_unlock_irq(&zone->lru_lock); -	nr_reclaimed = shrink_page_list(&page_list, zone, sc); +	free_hot_cold_page_list(&page_list, true); -	/* Check if we should syncronously wait for writeback */ -	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { -		set_lumpy_reclaim_mode(priority, sc, true); -		nr_reclaimed += shrink_page_list(&page_list, zone, sc); -	} +	/* +	 * If reclaim is isolating dirty pages under writeback, it implies +	 * that the long-lived page allocation rate is exceeding the page +	 * laundering rate. Either the global limits are not being effective +	 * at throttling processes due to the page distribution throughout +	 * zones or there is heavy usage of a slow backing device. The +	 * only option is to throttle from reclaim context which is not ideal +	 * as there is no guarantee the dirtying process is throttled in the +	 * same way balance_dirty_pages() manages. +	 * +	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number +	 * of pages under pages flagged for immediate reclaim and stall if any +	 * are encountered in the nr_immediate check below. +	 */ +	if (nr_writeback && nr_writeback == nr_taken) +		zone_set_flag(zone, ZONE_WRITEBACK); -	local_irq_disable(); -	if (current_is_kswapd()) -		__count_vm_events(KSWAPD_STEAL, nr_reclaimed); -	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); +	/* +	 * memcg will stall in page writeback so only consider forcibly +	 * stalling for global reclaim +	 */ +	if (global_reclaim(sc)) { +		/* +		 * Tag a zone as congested if all the dirty pages scanned were +		 * backed by a congested BDI and wait_iff_congested will stall. +		 */ +		if (nr_dirty && nr_dirty == nr_congested) +			zone_set_flag(zone, ZONE_CONGESTED); -	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); +		/* +		 * If dirty pages are scanned that are not queued for IO, it +		 * implies that flushers are not keeping up. In this case, flag +		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing +		 * pages from reclaim context. +		 */ +		if (nr_unqueued_dirty == nr_taken) +			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + +		/* +		 * If kswapd scans pages marked marked for immediate +		 * reclaim and under writeback (nr_immediate), it implies +		 * that pages are cycling through the LRU faster than +		 * they are written so also forcibly stall. +		 */ +		if (nr_immediate && current_may_throttle()) +			congestion_wait(BLK_RW_ASYNC, HZ/10); +	} + +	/* +	 * Stall direct reclaim for IO completions if underlying BDIs or zone +	 * is congested. Allow kswapd to continue until it starts encountering +	 * unqueued dirty pages or cycling through the LRU too quickly. 
+	 */ +	if (!sc->hibernation_mode && !current_is_kswapd() && +	    current_may_throttle()) +		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);  	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,  		zone_idx(zone),  		nr_scanned, nr_reclaimed, -		priority, -		trace_shrink_flags(file, sc->lumpy_reclaim_mode)); +		sc->priority, +		trace_shrink_flags(file));  	return nr_reclaimed;  } @@ -1441,32 +1623,39 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,   * But we had to alter page->flags anyway.   */ -static void move_active_pages_to_lru(struct zone *zone, +static void move_active_pages_to_lru(struct lruvec *lruvec,  				     struct list_head *list, +				     struct list_head *pages_to_free,  				     enum lru_list lru)  { +	struct zone *zone = lruvec_zone(lruvec);  	unsigned long pgmoved = 0; -	struct pagevec pvec;  	struct page *page; - -	pagevec_init(&pvec, 1); +	int nr_pages;  	while (!list_empty(list)) {  		page = lru_to_page(list); +		lruvec = mem_cgroup_page_lruvec(page, zone); -		VM_BUG_ON(PageLRU(page)); +		VM_BUG_ON_PAGE(PageLRU(page), page);  		SetPageLRU(page); -		list_move(&page->lru, &zone->lru[lru].list); -		mem_cgroup_add_lru_list(page, lru); -		pgmoved++; +		nr_pages = hpage_nr_pages(page); +		mem_cgroup_update_lru_size(lruvec, lru, nr_pages); +		list_move(&page->lru, &lruvec->lists[lru]); +		pgmoved += nr_pages; -		if (!pagevec_add(&pvec, page) || list_empty(list)) { -			spin_unlock_irq(&zone->lru_lock); -			if (buffer_heads_over_limit) -				pagevec_strip(&pvec); -			__pagevec_release(&pvec); -			spin_lock_irq(&zone->lru_lock); +		if (put_page_testzero(page)) { +			__ClearPageLRU(page); +			__ClearPageActive(page); +			del_page_from_lru_list(page, lruvec, lru); + +			if (unlikely(PageCompound(page))) { +				spin_unlock_irq(&zone->lru_lock); +				(*get_compound_page_dtor(page))(page); +				spin_lock_irq(&zone->lru_lock); +			} else +				list_add(&page->lru, pages_to_free);  		}  	}  	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -1474,45 +1663,42 @@ static void move_active_pages_to_lru(struct zone *zone,  		__count_vm_events(PGDEACTIVATE, pgmoved);  } -static void shrink_active_list(unsigned long nr_pages, struct zone *zone, -			struct scan_control *sc, int priority, int file) +static void shrink_active_list(unsigned long nr_to_scan, +			       struct lruvec *lruvec, +			       struct scan_control *sc, +			       enum lru_list lru)  {  	unsigned long nr_taken; -	unsigned long pgscanned; +	unsigned long nr_scanned;  	unsigned long vm_flags;  	LIST_HEAD(l_hold);	/* The pages which were snipped off */  	LIST_HEAD(l_active);  	LIST_HEAD(l_inactive);  	struct page *page; -	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); +	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;  	unsigned long nr_rotated = 0; +	isolate_mode_t isolate_mode = 0; +	int file = is_file_lru(lru); +	struct zone *zone = lruvec_zone(lruvec);  	lru_add_drain(); + +	if (!sc->may_unmap) +		isolate_mode |= ISOLATE_UNMAPPED; +	if (!sc->may_writepage) +		isolate_mode |= ISOLATE_CLEAN; +  	spin_lock_irq(&zone->lru_lock); -	if (scanning_global_lru(sc)) { -		nr_taken = isolate_pages_global(nr_pages, &l_hold, -						&pgscanned, sc->order, -						ISOLATE_ACTIVE, zone, -						1, file); -		zone->pages_scanned += pgscanned; -	} else { -		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, -						&pgscanned, sc->order, -						ISOLATE_ACTIVE, zone, -						sc->mem_cgroup, 1, file); -		/* -		 * mem_cgroup_isolate_pages() keeps track of -		 * scanned 
pages on its own. -		 */ -	} + +	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, +				     &nr_scanned, sc, isolate_mode, lru); +	if (global_reclaim(sc)) +		zone->pages_scanned += nr_scanned;  	reclaim_stat->recent_scanned[file] += nr_taken; -	__count_zone_vm_events(PGREFILL, zone, pgscanned); -	if (file) -		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); -	else -		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); +	__count_zone_vm_events(PGREFILL, zone, nr_scanned); +	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);  	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);  	spin_unlock_irq(&zone->lru_lock); @@ -1521,13 +1707,22 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,  		page = lru_to_page(&l_hold);  		list_del(&page->lru); -		if (unlikely(!page_evictable(page, NULL))) { +		if (unlikely(!page_evictable(page))) {  			putback_lru_page(page);  			continue;  		} -		if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { -			nr_rotated++; +		if (unlikely(buffer_heads_over_limit)) { +			if (page_has_private(page) && trylock_page(page)) { +				if (page_has_private(page)) +					try_to_release_page(page, 0); +				unlock_page(page); +			} +		} + +		if (page_referenced(page, 0, sc->target_mem_cgroup, +				    &vm_flags)) { +			nr_rotated += hpage_nr_pages(page);  			/*  			 * Identify referenced, file-backed active pages and  			 * give them one more trip around the active list. So @@ -1559,12 +1754,12 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,  	 */  	reclaim_stat->recent_rotated[file] += nr_rotated; -	move_active_pages_to_lru(zone, &l_active, -						LRU_ACTIVE + file * LRU_FILE); -	move_active_pages_to_lru(zone, &l_inactive, -						LRU_BASE   + file * LRU_FILE); +	move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); +	move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);  	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);  	spin_unlock_irq(&zone->lru_lock); + +	free_hot_cold_page_list(&l_hold, true);  }  #ifdef CONFIG_SWAP @@ -1583,16 +1778,13 @@ static int inactive_anon_is_low_global(struct zone *zone)  /**   * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @zone: zone to check - * @sc:   scan control of this context + * @lruvec: LRU vector to check   *   * Returns true if the zone does not have enough inactive anon pages,   * meaning some active anon pages need to be deactivated.   */ -static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_anon_is_low(struct lruvec *lruvec)  { -	int low; -  	/*  	 * If we don't have swap space, anonymous page deactivation  	 * is pointless. 
@@ -1600,34 +1792,21 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)  	if (!total_swap_pages)  		return 0; -	if (scanning_global_lru(sc)) -		low = inactive_anon_is_low_global(zone); -	else -		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); -	return low; +	if (!mem_cgroup_disabled()) +		return mem_cgroup_inactive_anon_is_low(lruvec); + +	return inactive_anon_is_low_global(lruvec_zone(lruvec));  }  #else -static inline int inactive_anon_is_low(struct zone *zone, -					struct scan_control *sc) +static inline int inactive_anon_is_low(struct lruvec *lruvec)  {  	return 0;  }  #endif -static int inactive_file_is_low_global(struct zone *zone) -{ -	unsigned long active, inactive; - -	active = zone_page_state(zone, NR_ACTIVE_FILE); -	inactive = zone_page_state(zone, NR_INACTIVE_FILE); - -	return (active > inactive); -} -  /**   * inactive_file_is_low - check if file pages need to be deactivated - * @zone: zone to check - * @sc:   scan control of this context + * @lruvec: LRU vector to check   *   * When the system is doing streaming IO, memory pressure here   * ensures that active file pages get deactivated, until more @@ -1639,59 +1818,43 @@ static int inactive_file_is_low_global(struct zone *zone)   * This uses a different ratio than the anonymous pages, because   * the page cache uses a use-once replacement algorithm.   */ -static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +static int inactive_file_is_low(struct lruvec *lruvec)  { -	int low; +	unsigned long inactive; +	unsigned long active; -	if (scanning_global_lru(sc)) -		low = inactive_file_is_low_global(zone); -	else -		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); -	return low; +	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); +	active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + +	return active > inactive;  } -static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, -				int file) +static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)  { -	if (file) -		return inactive_file_is_low(zone, sc); +	if (is_file_lru(lru)) +		return inactive_file_is_low(lruvec);  	else -		return inactive_anon_is_low(zone, sc); +		return inactive_anon_is_low(lruvec);  }  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, -	struct zone *zone, struct scan_control *sc, int priority) +				 struct lruvec *lruvec, struct scan_control *sc)  { -	int file = is_file_lru(lru); -  	if (is_active_lru(lru)) { -		if (inactive_list_is_low(zone, sc, file)) -		    shrink_active_list(nr_to_scan, zone, sc, priority, file); +		if (inactive_list_is_low(lruvec, lru)) +			shrink_active_list(nr_to_scan, lruvec, sc, lru);  		return 0;  	} -	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); +	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);  } -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. 
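The file-side ratio is just the comparison the comment below describes: because the page cache is managed use-once, the inactive file list is considered low as soon as the active file list outgrows it. A tiny standalone model with plain counters instead of lruvec sizes (the anon side uses a size-dependent ratio that is not shown here):

#include <stdbool.h>
#include <stdio.h>

/*
 * Use-once page cache: the inactive file list is "low" as soon as the
 * active file list outgrows it, so streaming IO keeps pushing old active
 * pages back toward eviction.
 */
static bool inactive_file_low(unsigned long active, unsigned long inactive)
{
	return active > inactive;
}

int main(void)
{
	printf("deactivate file pages: %s\n",
	       inactive_file_low(6000, 4000) ? "yes" : "no");
	return 0;
}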
- */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, -				       unsigned long *nr_saved_scan) -{ -	unsigned long nr; - -	*nr_saved_scan += nr_to_scan; -	nr = *nr_saved_scan; - -	if (nr >= SWAP_CLUSTER_MAX) -		*nr_saved_scan = 0; -	else -		nr = 0; - -	return nr; -} +enum scan_balance { +	SCAN_EQUAL, +	SCAN_FRACT, +	SCAN_ANON, +	SCAN_FILE, +};  /*   * Determine how aggressively the anon and file LRU lists should be @@ -1699,51 +1862,108 @@ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,   * by looking at the fraction of the pages scanned we did rotate back   * onto the active list instead of evict.   * - * nr[0] = anon pages to scan; nr[1] = file pages to scan + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan   */ -static void get_scan_count(struct zone *zone, struct scan_control *sc, -					unsigned long *nr, int priority) +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, +			   unsigned long *nr)  { -	unsigned long anon, file, free; +	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; +	u64 fraction[2]; +	u64 denominator = 0;	/* gcc */ +	struct zone *zone = lruvec_zone(lruvec);  	unsigned long anon_prio, file_prio; +	enum scan_balance scan_balance; +	unsigned long anon, file; +	bool force_scan = false;  	unsigned long ap, fp; -	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); -	u64 fraction[2], denominator; -	enum lru_list l; -	int noswap = 0; +	enum lru_list lru; +	bool some_scanned; +	int pass; + +	/* +	 * If the zone or memcg is small, nr[l] can be 0.  This +	 * results in no scanning on this priority and a potential +	 * priority drop.  Global direct reclaim can go to the next +	 * zone and tends to have no problems. Global kswapd is for +	 * zone balancing and it needs to scan a minimum amount. When +	 * reclaiming for a memcg, a priority drop can cause high +	 * latencies, so it's better to scan a minimum amount there as +	 * well. +	 */ +	if (current_is_kswapd() && !zone_reclaimable(zone)) +		force_scan = true; +	if (!global_reclaim(sc)) +		force_scan = true;  	/* If we have no swap space, do not bother scanning anon pages. */ -	if (!sc->may_swap || (nr_swap_pages <= 0)) { -		noswap = 1; -		fraction[0] = 0; -		fraction[1] = 1; -		denominator = 1; +	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { +		scan_balance = SCAN_FILE;  		goto out;  	} -	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + -		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); -	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + -		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); +	/* +	 * Global reclaim will swap to prevent OOM even with no +	 * swappiness, but memcg users want to use this knob to +	 * disable swapping for individual groups completely when +	 * using the memory controller's swap limit feature would be +	 * too expensive. +	 */ +	if (!global_reclaim(sc) && !sc->swappiness) { +		scan_balance = SCAN_FILE; +		goto out; +	} + +	/* +	 * Do not apply any pressure balancing cleverness when the +	 * system is close to OOM, scan both anon and file equally +	 * (unless the swappiness setting disagrees with swapping). 
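Taken together with the checks that follow in the rest of this hunk, these early exits form a short decision ladder that runs before any proportional math. A userspace sketch of that ladder, with the kernel predicates reduced to plain booleans (the field names are illustrative, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

struct balance_inputs {
	bool may_swap;		/* sc->may_swap and swap pages available     */
	bool global;		/* global reclaim vs. memcg limit reclaim    */
	int  swappiness;	/* 0..100, 0 disables swapping for a memcg   */
	int  priority;		/* 0 means "close to OOM"                    */
	bool low_file_cache;	/* file + free <= high watermark             */
	bool inactive_file_low;	/* active file list larger than inactive     */
};

static enum scan_balance pick_balance(const struct balance_inputs *in)
{
	if (!in->may_swap)
		return SCAN_FILE;		/* nothing to swap to        */
	if (!in->global && !in->swappiness)
		return SCAN_FILE;		/* memcg opted out of swap   */
	if (!in->priority && in->swappiness)
		return SCAN_EQUAL;		/* near OOM: hit both lists  */
	if (in->global && in->low_file_cache)
		return SCAN_ANON;		/* almost no page cache left */
	if (!in->inactive_file_low)
		return SCAN_FILE;		/* plenty of inactive cache  */
	return SCAN_FRACT;			/* proportional split        */
}

int main(void)
{
	static const char * const names[] = {
		"SCAN_EQUAL", "SCAN_FRACT", "SCAN_ANON", "SCAN_FILE"
	};
	struct balance_inputs in = {
		.may_swap = true, .global = true, .swappiness = 60,
		.priority = 12, .low_file_cache = false,
		.inactive_file_low = true,
	};

	printf("scan_balance = %s\n", names[pick_balance(&in)]);
	return 0;
}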
+	 */ +	if (!sc->priority && sc->swappiness) { +		scan_balance = SCAN_EQUAL; +		goto out; +	} + +	anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) + +		get_lru_size(lruvec, LRU_INACTIVE_ANON); +	file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) + +		get_lru_size(lruvec, LRU_INACTIVE_FILE); + +	/* +	 * Prevent the reclaimer from falling into the cache trap: as +	 * cache pages start out inactive, every cache fault will tip +	 * the scan balance towards the file LRU.  And as the file LRU +	 * shrinks, so does the window for rotation from references. +	 * This means we have a runaway feedback loop where a tiny +	 * thrashing file LRU becomes infinitely more attractive than +	 * anon pages.  Try to detect this based on file LRU size. +	 */ +	if (global_reclaim(sc)) { +		unsigned long free = zone_page_state(zone, NR_FREE_PAGES); -	if (scanning_global_lru(sc)) { -		free  = zone_page_state(zone, NR_FREE_PAGES); -		/* If we have very few page cache pages, -		   force-scan anon pages. */  		if (unlikely(file + free <= high_wmark_pages(zone))) { -			fraction[0] = 1; -			fraction[1] = 0; -			denominator = 1; +			scan_balance = SCAN_ANON;  			goto out;  		}  	}  	/* +	 * There is enough inactive page cache, do not reclaim +	 * anything from the anonymous working set right now. +	 */ +	if (!inactive_file_is_low(lruvec)) { +		scan_balance = SCAN_FILE; +		goto out; +	} + +	scan_balance = SCAN_FRACT; + +	/*  	 * With swappiness at 100, anonymous and file have the same priority.  	 * This scanning priority is essentially the inverse of IO cost.  	 */  	anon_prio = sc->swappiness; -	file_prio = 200 - sc->swappiness; +	file_prio = 200 - anon_prio;  	/*  	 * OK, so we have swap space and a fair amount of page cache @@ -1772,10 +1992,10 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,  	 * proportional to the fraction of recently scanned pages on  	 * each list that were recently referenced and in active use.  	 */ -	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); +	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);  	ap /= reclaim_stat->recent_rotated[0] + 1; -	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); +	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);  	fp /= reclaim_stat->recent_rotated[1] + 1;  	spin_unlock_irq(&zone->lru_lock); @@ -1783,70 +2003,329 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,  	fraction[1] = fp;  	denominator = ap + fp + 1;  out: -	for_each_evictable_lru(l) { -		int file = is_file_lru(l); -		unsigned long scan; - -		scan = zone_nr_lru_pages(zone, sc, l); -		if (priority || noswap) { -			scan >>= priority; -			scan = div64_u64(scan * fraction[file], denominator); +	some_scanned = false; +	/* Only use force_scan on second pass. */ +	for (pass = 0; !some_scanned && pass < 2; pass++) { +		for_each_evictable_lru(lru) { +			int file = is_file_lru(lru); +			unsigned long size; +			unsigned long scan; + +			size = get_lru_size(lruvec, lru); +			scan = size >> sc->priority; + +			if (!scan && pass && force_scan) +				scan = min(size, SWAP_CLUSTER_MAX); + +			switch (scan_balance) { +			case SCAN_EQUAL: +				/* Scan lists relative to size */ +				break; +			case SCAN_FRACT: +				/* +				 * Scan types proportional to swappiness and +				 * their relative recent reclaim efficiency. 
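The SCAN_FRACT arithmetic is worth seeing in one place: each type's swappiness-derived priority is scaled by how few of its recently scanned pages were rotated back, and the per-list target is the list size shifted down by the reclaim priority, multiplied by that fraction. A standalone sketch under those assumptions, simplified to one scan value per type rather than one per LRU list:

#include <stdint.h>
#include <stdio.h>

/*
 * Proportional anon/file split: the pressure on each type is its
 * swappiness-derived priority scaled by recent reclaim efficiency
 * (scanned pages that were not rotated back onto the active list).
 */
static void split_scan(unsigned long anon_size, unsigned long file_size,
		       int swappiness, int priority,
		       const unsigned long recent_scanned[2],
		       const unsigned long recent_rotated[2],
		       unsigned long nr_out[2])
{
	uint64_t anon_prio = swappiness;
	uint64_t file_prio = 200 - anon_prio;

	uint64_t ap = anon_prio * (recent_scanned[0] + 1) /
		      (recent_rotated[0] + 1);
	uint64_t fp = file_prio * (recent_scanned[1] + 1) /
		      (recent_rotated[1] + 1);
	uint64_t denom = ap + fp + 1;

	nr_out[0] = (anon_size >> priority) * ap / denom;	/* anon */
	nr_out[1] = (file_size >> priority) * fp / denom;	/* file */
}

int main(void)
{
	unsigned long scanned[2] = { 2000, 8000 };	/* anon, file       */
	unsigned long rotated[2] = { 1500, 500 };	/* anon rotates a lot */
	unsigned long nr[2];

	split_scan(1 << 20, 1 << 20, 60, 12, scanned, rotated, nr);
	printf("scan %lu anon pages, %lu file pages\n", nr[0], nr[1]);
	return 0;
}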
+				 */ +				scan = div64_u64(scan * fraction[file], +							denominator); +				break; +			case SCAN_FILE: +			case SCAN_ANON: +				/* Scan one type exclusively */ +				if ((scan_balance == SCAN_FILE) != file) +					scan = 0; +				break; +			default: +				/* Look ma, no brain */ +				BUG(); +			} +			nr[lru] = scan; +			/* +			 * Skip the second pass and don't force_scan, +			 * if we found something to scan. +			 */ +			some_scanned |= !!scan;  		} -		nr[l] = nr_scan_try_batch(scan, -					  &reclaim_stat->nr_saved_scan[l]);  	}  }  /*   * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.   */ -static void shrink_zone(int priority, struct zone *zone, -				struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)  {  	unsigned long nr[NR_LRU_LISTS]; +	unsigned long targets[NR_LRU_LISTS];  	unsigned long nr_to_scan; -	enum lru_list l; -	unsigned long nr_reclaimed = sc->nr_reclaimed; +	enum lru_list lru; +	unsigned long nr_reclaimed = 0;  	unsigned long nr_to_reclaim = sc->nr_to_reclaim; +	struct blk_plug plug; +	bool scan_adjusted; + +	get_scan_count(lruvec, sc, nr); -	get_scan_count(zone, sc, nr, priority); +	/* Record the original scan target for proportional adjustments later */ +	memcpy(targets, nr, sizeof(nr)); + +	/* +	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal +	 * event that can occur when there is little memory pressure e.g. +	 * multiple streaming readers/writers. Hence, we do not abort scanning +	 * when the requested number of pages are reclaimed when scanning at +	 * DEF_PRIORITY on the assumption that the fact we are direct +	 * reclaiming implies that kswapd is not keeping up and it is best to +	 * do a batch of work at once. For memcg reclaim one check is made to +	 * abort proportional reclaim if either the file or anon lru has already +	 * dropped to zero at the first pass. +	 */ +	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && +			 sc->priority == DEF_PRIORITY); +	blk_start_plug(&plug);  	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||  					nr[LRU_INACTIVE_FILE]) { -		for_each_evictable_lru(l) { -			if (nr[l]) { -				nr_to_scan = min_t(unsigned long, -						   nr[l], SWAP_CLUSTER_MAX); -				nr[l] -= nr_to_scan; - -				nr_reclaimed += shrink_list(l, nr_to_scan, -							    zone, sc, priority); +		unsigned long nr_anon, nr_file, percentage; +		unsigned long nr_scanned; + +		for_each_evictable_lru(lru) { +			if (nr[lru]) { +				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); +				nr[lru] -= nr_to_scan; + +				nr_reclaimed += shrink_list(lru, nr_to_scan, +							    lruvec, sc);  			}  		} + +		if (nr_reclaimed < nr_to_reclaim || scan_adjusted) +			continue; +  		/* -		 * On large memory systems, scan >> priority can become -		 * really large. This is fine for the starting priority; -		 * we want to put equal scanning pressure on each zone. -		 * However, if the VM has a harder time of freeing pages, -		 * with multiple processes reclaiming pages, the total -		 * freeing target can get unreasonably large. +		 * For kswapd and memcg, reclaim at least the number of pages +		 * requested. Ensure that the anon and file LRUs are scanned +		 * proportionally what was requested by get_scan_count(). We +		 * stop reclaiming one LRU and reduce the amount scanning +		 * proportional to the original scan target.  		 
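The proportional cut described here, and carried out in the rest of this hunk, can be followed with two small arrays: the smaller LRU type stops being scanned outright, and the larger type's remaining budget is shrunk to the same share of its original target that the smaller type had left. A userspace sketch (list indices and array layout are local to the example):

#include <stdio.h>

enum { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, NR_LISTS };

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/*
 * targets[] is the original scan target, nr[] what is still left to scan.
 * Zero out the smaller type and rescale the larger one so both end up
 * scanned in roughly the original proportion.
 */
static void rebalance(unsigned long targets[NR_LISTS], unsigned long nr[NR_LISTS])
{
	unsigned long nr_anon = nr[INACTIVE_ANON] + nr[ACTIVE_ANON];
	unsigned long nr_file = nr[INACTIVE_FILE] + nr[ACTIVE_FILE];
	unsigned long percentage;
	int stop, keep;

	if (!nr_anon || !nr_file)
		return;

	if (nr_file > nr_anon) {
		unsigned long t = targets[INACTIVE_ANON] + targets[ACTIVE_ANON] + 1;
		percentage = nr_anon * 100 / t;
		stop = INACTIVE_ANON;
		keep = INACTIVE_FILE;
	} else {
		unsigned long t = targets[INACTIVE_FILE] + targets[ACTIVE_FILE] + 1;
		percentage = nr_file * 100 / t;
		stop = INACTIVE_FILE;
		keep = INACTIVE_ANON;
	}

	/* Stop scanning the smaller type (both its inactive and active list). */
	nr[stop] = nr[stop + 1] = 0;

	/*
	 * Cut the larger type's remaining budget so the share of its original
	 * target still to be scanned matches the smaller type's share.
	 */
	for (int i = keep; i <= keep + 1; i++) {
		unsigned long already = targets[i] - nr[i];

		nr[i] = targets[i] * (100 - percentage) / 100;
		nr[i] -= min_ul(nr[i], already);
	}
}

int main(void)
{
	unsigned long targets[NR_LISTS] = { 100, 50, 1000, 400 };
	unsigned long nr[NR_LISTS]      = {  80, 40,  900, 360 };

	rebalance(targets, nr);
	printf("anon: %lu/%lu  file: %lu/%lu left to scan\n",
	       nr[INACTIVE_ANON], nr[ACTIVE_ANON],
	       nr[INACTIVE_FILE], nr[ACTIVE_FILE]);
	return 0;
}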
*/ -		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) +		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; +		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + +		/* +		 * It's just vindictive to attack the larger once the smaller +		 * has gone to zero.  And given the way we stop scanning the +		 * smaller below, this makes sure that we only make one nudge +		 * towards proportionality once we've got nr_to_reclaim. +		 */ +		if (!nr_file || !nr_anon)  			break; -	} -	sc->nr_reclaimed = nr_reclaimed; +		if (nr_file > nr_anon) { +			unsigned long scan_target = targets[LRU_INACTIVE_ANON] + +						targets[LRU_ACTIVE_ANON] + 1; +			lru = LRU_BASE; +			percentage = nr_anon * 100 / scan_target; +		} else { +			unsigned long scan_target = targets[LRU_INACTIVE_FILE] + +						targets[LRU_ACTIVE_FILE] + 1; +			lru = LRU_FILE; +			percentage = nr_file * 100 / scan_target; +		} + +		/* Stop scanning the smaller of the LRU */ +		nr[lru] = 0; +		nr[lru + LRU_ACTIVE] = 0; + +		/* +		 * Recalculate the other LRU scan count based on its original +		 * scan target and the percentage scanning already complete +		 */ +		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; +		nr_scanned = targets[lru] - nr[lru]; +		nr[lru] = targets[lru] * (100 - percentage) / 100; +		nr[lru] -= min(nr[lru], nr_scanned); + +		lru += LRU_ACTIVE; +		nr_scanned = targets[lru] - nr[lru]; +		nr[lru] = targets[lru] * (100 - percentage) / 100; +		nr[lru] -= min(nr[lru], nr_scanned); + +		scan_adjusted = true; +	} +	blk_finish_plug(&plug); +	sc->nr_reclaimed += nr_reclaimed;  	/*  	 * Even if we did not try to evict anon pages at all, we want to  	 * rebalance the anon lru active/inactive ratio.  	 */ -	if (inactive_anon_is_low(zone, sc)) -		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); +	if (inactive_anon_is_low(lruvec)) +		shrink_active_list(SWAP_CLUSTER_MAX, lruvec, +				   sc, LRU_ACTIVE_ANON);  	throttle_vm_writeout(sc->gfp_mask);  } +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) +{ +	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && +			(sc->order > PAGE_ALLOC_COSTLY_ORDER || +			 sc->priority < DEF_PRIORITY - 2)) +		return true; + +	return false; +} + +/* + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_zone() that it will have enough free pages to succeed. + * It will give up earlier than that if there is difficulty reclaiming pages. + */ +static inline bool should_continue_reclaim(struct zone *zone, +					unsigned long nr_reclaimed, +					unsigned long nr_scanned, +					struct scan_control *sc) +{ +	unsigned long pages_for_compaction; +	unsigned long inactive_lru_pages; + +	/* If not in reclaim/compaction mode, stop */ +	if (!in_reclaim_compaction(sc)) +		return false; + +	/* Consider stopping depending on scan and reclaim activity */ +	if (sc->gfp_mask & __GFP_REPEAT) { +		/* +		 * For __GFP_REPEAT allocations, stop reclaiming if the +		 * full LRU list has been scanned and we are still failing +		 * to reclaim pages. 
This full LRU scan is potentially +		 * expensive but a __GFP_REPEAT caller really wants to succeed +		 */ +		if (!nr_reclaimed && !nr_scanned) +			return false; +	} else { +		/* +		 * For non-__GFP_REPEAT allocations which can presumably +		 * fail without consequence, stop if we failed to reclaim +		 * any pages from the last SWAP_CLUSTER_MAX number of +		 * pages that were scanned. This will return to the +		 * caller faster at the risk reclaim/compaction and +		 * the resulting allocation attempt fails +		 */ +		if (!nr_reclaimed) +			return false; +	} + +	/* +	 * If we have not reclaimed enough pages for compaction and the +	 * inactive lists are large enough, continue reclaiming +	 */ +	pages_for_compaction = (2UL << sc->order); +	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); +	if (get_nr_swap_pages() > 0) +		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); +	if (sc->nr_reclaimed < pages_for_compaction && +			inactive_lru_pages > pages_for_compaction) +		return true; + +	/* If compaction would go ahead or the allocation would succeed, stop */ +	switch (compaction_suitable(zone, sc->order)) { +	case COMPACT_PARTIAL: +	case COMPACT_CONTINUE: +		return false; +	default: +		return true; +	} +} + +static void shrink_zone(struct zone *zone, struct scan_control *sc) +{ +	unsigned long nr_reclaimed, nr_scanned; + +	do { +		struct mem_cgroup *root = sc->target_mem_cgroup; +		struct mem_cgroup_reclaim_cookie reclaim = { +			.zone = zone, +			.priority = sc->priority, +		}; +		struct mem_cgroup *memcg; + +		nr_reclaimed = sc->nr_reclaimed; +		nr_scanned = sc->nr_scanned; + +		memcg = mem_cgroup_iter(root, NULL, &reclaim); +		do { +			struct lruvec *lruvec; + +			lruvec = mem_cgroup_zone_lruvec(zone, memcg); + +			sc->swappiness = mem_cgroup_swappiness(memcg); +			shrink_lruvec(lruvec, sc); + +			/* +			 * Direct reclaim and kswapd have to scan all memory +			 * cgroups to fulfill the overall scan target for the +			 * zone. +			 * +			 * Limit reclaim, on the other hand, only cares about +			 * nr_to_reclaim pages to be reclaimed and it will +			 * retry with decreasing priority if one round over the +			 * whole hierarchy is not sufficient. +			 */ +			if (!global_reclaim(sc) && +					sc->nr_reclaimed >= sc->nr_to_reclaim) { +				mem_cgroup_iter_break(root, memcg); +				break; +			} +			memcg = mem_cgroup_iter(root, memcg, &reclaim); +		} while (memcg); + +		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, +			   sc->nr_scanned - nr_scanned, +			   sc->nr_reclaimed - nr_reclaimed); + +	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, +					 sc->nr_scanned - nr_scanned, sc)); +} + +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ +	unsigned long balance_gap, watermark; +	bool watermark_ok; + +	/* Do not consider compaction for orders reclaim is meant to satisfy */ +	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) +		return false; + +	/* +	 * Compaction takes time to run and there are potentially other +	 * callers using the pages just freed. 
Continue reclaiming until +	 * there is a buffer of free pages available to give compaction +	 * a reasonable chance of completing and allocating the page +	 */ +	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( +			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); +	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); +	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + +	/* +	 * If compaction is deferred, reclaim up to a point where +	 * compaction will have a chance of success when re-enabled +	 */ +	if (compaction_deferred(zone, sc->order)) +		return watermark_ok; + +	/* If compaction is not ready to start, keep reclaiming */ +	if (!compaction_suitable(zone, sc->order)) +		return false; + +	return watermark_ok; +} +  /*   * This is the direct reclaim path, for page-allocating processes.  We only   * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1862,12 +2341,37 @@ static void shrink_zone(int priority, struct zone *zone,   *   * If a zone is deemed to be full of pinned pages then just give it a light   * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should consider retrying the allocation instead of + * further reclaim.   */ -static void shrink_zones(int priority, struct zonelist *zonelist, -					struct scan_control *sc) +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)  {  	struct zoneref *z;  	struct zone *zone; +	unsigned long nr_soft_reclaimed; +	unsigned long nr_soft_scanned; +	unsigned long lru_pages = 0; +	bool aborted_reclaim = false; +	struct reclaim_state *reclaim_state = current->reclaim_state; +	gfp_t orig_mask; +	struct shrink_control shrink = { +		.gfp_mask = sc->gfp_mask, +	}; +	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + +	/* +	 * If the number of buffer_heads in the machine exceeds the maximum +	 * allowed level, force direct reclaim to scan the highmem zone as +	 * highmem pages could be pinning lowmem pages storing buffer_heads +	 */ +	orig_mask = sc->gfp_mask; +	if (buffer_heads_over_limit) +		sc->gfp_mask |= __GFP_HIGHMEM; + +	nodes_clear(shrink.nodes_to_scan);  	for_each_zone_zonelist_nodemask(zone, z, zonelist,  					gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1877,33 +2381,79 @@ static void shrink_zones(int priority, struct zonelist *zonelist,  		 * Take care memory controller reclaiming has small influence  		 * to global LRU.  		 */ -		if (scanning_global_lru(sc)) { +		if (global_reclaim(sc)) {  			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))  				continue; -			if (zone->all_unreclaimable && priority != DEF_PRIORITY) + +			lru_pages += zone_reclaimable_pages(zone); +			node_set(zone_to_nid(zone), shrink.nodes_to_scan); + +			if (sc->priority != DEF_PRIORITY && +			    !zone_reclaimable(zone))  				continue;	/* Let kswapd poll it */ +			if (IS_ENABLED(CONFIG_COMPACTION)) { +				/* +				 * If we already have plenty of memory free for +				 * compaction in this zone, don't free any more. +				 * Even though compaction is invoked for any +				 * non-zero order, only frequent costly order +				 * reclamation is disruptive enough to become a +				 * noticeable problem, like transparent huge +				 * page allocations. 
+				 */ +				if ((zonelist_zone_idx(z) <= requested_highidx) +				    && compaction_ready(zone, sc)) { +					aborted_reclaim = true; +					continue; +				} +			} +			/* +			 * This steals pages from memory cgroups over softlimit +			 * and returns the number of reclaimed pages and +			 * scanned pages. This works for global memory pressure +			 * and balancing, not for a memcg's limit. +			 */ +			nr_soft_scanned = 0; +			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, +						sc->order, sc->gfp_mask, +						&nr_soft_scanned); +			sc->nr_reclaimed += nr_soft_reclaimed; +			sc->nr_scanned += nr_soft_scanned; +			/* need some check for avoid more shrink_zone() */  		} -		shrink_zone(priority, zone, sc); +		shrink_zone(zone, sc);  	} -} -static bool zone_reclaimable(struct zone *zone) -{ -	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +	/* +	 * Don't shrink slabs when reclaiming memory from over limit cgroups +	 * but do shrink slab at least once when aborting reclaim for +	 * compaction to avoid unevenly scanning file/anon LRU pages over slab +	 * pages. +	 */ +	if (global_reclaim(sc)) { +		shrink_slab(&shrink, sc->nr_scanned, lru_pages); +		if (reclaim_state) { +			sc->nr_reclaimed += reclaim_state->reclaimed_slab; +			reclaim_state->reclaimed_slab = 0; +		} +	} + +	/* +	 * Restore to original mask to avoid the impact on the caller if we +	 * promoted it to __GFP_HIGHMEM. +	 */ +	sc->gfp_mask = orig_mask; + +	return aborted_reclaim;  } -/* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. It can't handle OOM during hibernation. - * So let's check zone's unreclaimable in direct reclaim as well as kswapd. - */ +/* All zones in zonelist are unreclaimable? */  static bool all_unreclaimable(struct zonelist *zonelist,  		struct scan_control *sc)  {  	struct zoneref *z;  	struct zone *zone; -	bool all_unreclaimable = true;  	for_each_zone_zonelist_nodemask(zone, z, zonelist,  			gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1911,13 +2461,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,  			continue;  		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))  			continue; -		if (zone_reclaimable(zone)) { -			all_unreclaimable = false; -			break; -		} +		if (zone_reclaimable(zone)) +			return false;  	} -	return all_unreclaimable; +	return true;  }  /* @@ -1937,51 +2485,35 @@ static bool all_unreclaimable(struct zonelist *zonelist,   * 		else, the number of pages reclaimed   */  static unsigned long do_try_to_free_pages(struct zonelist *zonelist, -					struct scan_control *sc) +					  struct scan_control *sc)  { -	int priority;  	unsigned long total_scanned = 0; -	struct reclaim_state *reclaim_state = current->reclaim_state; -	struct zoneref *z; -	struct zone *zone;  	unsigned long writeback_threshold; +	bool aborted_reclaim; -	get_mems_allowed();  	delayacct_freepages_start(); -	if (scanning_global_lru(sc)) +	if (global_reclaim(sc))  		count_vm_event(ALLOCSTALL); -	for (priority = DEF_PRIORITY; priority >= 0; priority--) { +	do { +		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, +				sc->priority);  		sc->nr_scanned = 0; -		if (!priority) -			disable_swap_token(); -		shrink_zones(priority, zonelist, sc); -		/* -		 * Don't shrink slabs when reclaiming memory from -		 * over limit cgroups -		 */ -		if (scanning_global_lru(sc)) { -			unsigned long lru_pages = 0; -			for_each_zone_zonelist(zone, z, zonelist, -					gfp_zone(sc->gfp_mask)) { -				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) -					
continue; +		aborted_reclaim = shrink_zones(zonelist, sc); -				lru_pages += zone_reclaimable_pages(zone); -			} - -			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); -			if (reclaim_state) { -				sc->nr_reclaimed += reclaim_state->reclaimed_slab; -				reclaim_state->reclaimed_slab = 0; -			} -		}  		total_scanned += sc->nr_scanned;  		if (sc->nr_reclaimed >= sc->nr_to_reclaim)  			goto out;  		/* +		 * If we're getting trouble reclaiming, start doing +		 * writepage even in laptop mode. +		 */ +		if (sc->priority < DEF_PRIORITY - 2) +			sc->may_writepage = 1; + +		/*  		 * Try to write back as many pages as we just scanned.  This  		 * tends to cause slow streaming writers to write data to the  		 * disk smoothly, at the dirtying rate, which is nice.   But @@ -1990,51 +2522,187 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,  		 */  		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;  		if (total_scanned > writeback_threshold) { -			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); +			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, +						WB_REASON_TRY_TO_FREE_PAGES);  			sc->may_writepage = 1;  		} - -		/* Take a nap, wait for some writeback to complete */ -		if (!sc->hibernation_mode && sc->nr_scanned && -		    priority < DEF_PRIORITY - 2) { -			struct zone *preferred_zone; - -			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), -							NULL, &preferred_zone); -			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); -		} -	} +	} while (--sc->priority >= 0 && !aborted_reclaim);  out:  	delayacct_freepages_end(); -	put_mems_allowed();  	if (sc->nr_reclaimed)  		return sc->nr_reclaimed; +	/* +	 * As hibernation is going on, kswapd is freezed so that it can't mark +	 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable +	 * check. +	 */ +	if (oom_killer_disabled) +		return 0; + +	/* Aborted reclaim to try compaction? don't OOM, then */ +	if (aborted_reclaim) +		return 1; +  	/* top priority shrink_zones still had more to do? don't OOM, then */ -	if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) +	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))  		return 1;  	return 0;  } +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +{ +	struct zone *zone; +	unsigned long pfmemalloc_reserve = 0; +	unsigned long free_pages = 0; +	int i; +	bool wmark_ok; + +	for (i = 0; i <= ZONE_NORMAL; i++) { +		zone = &pgdat->node_zones[i]; +		if (!populated_zone(zone)) +			continue; + +		pfmemalloc_reserve += min_wmark_pages(zone); +		free_pages += zone_page_state(zone, NR_FREE_PAGES); +	} + +	/* If there are no reserves (unexpected config) then do not throttle */ +	if (!pfmemalloc_reserve) +		return true; + +	wmark_ok = free_pages > pfmemalloc_reserve / 2; + +	/* kswapd must be awake if processes are being throttled */ +	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { +		pgdat->classzone_idx = min(pgdat->classzone_idx, +						(enum zone_type)ZONE_NORMAL); +		wake_up_interruptible(&pgdat->kswapd_wait); +	} + +	return wmark_ok; +} + +/* + * Throttle direct reclaimers if backing storage is backed by the network + * and the PFMEMALLOC reserve for the preferred node is getting dangerously + * depleted. kswapd will continue to make progress and wake the processes + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. 
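The reserve check at the heart of this throttling logic is a short calculation: sum the min watermarks of the node's usable zones (ZONE_NORMAL and below) and require that more than half of that reserve is actually free. A userspace model of the check, with the zone data passed in as a plain array:

#include <stdbool.h>
#include <stdio.h>

struct node_zone {
	bool populated;
	unsigned long min_wmark;
	unsigned long free_pages;
};

/*
 * Throttling test for direct reclaimers: sum the min watermarks of the
 * node's usable zones and require that at least half of that reserve is
 * actually free before letting reclaimers run unthrottled.
 */
static bool pfmemalloc_ok(const struct node_zone *zones, int nr_zones)
{
	unsigned long reserve = 0, free = 0;

	for (int i = 0; i < nr_zones; i++) {
		if (!zones[i].populated)
			continue;
		reserve += zones[i].min_wmark;
		free += zones[i].free_pages;
	}

	if (!reserve)		/* no reserves configured: never throttle */
		return true;

	return free > reserve / 2;
}

int main(void)
{
	struct node_zone zones[] = {
		{ .populated = true, .min_wmark = 128,  .free_pages = 40  },
		{ .populated = true, .min_wmark = 1024, .free_pages = 300 },
	};

	printf("%s\n", pfmemalloc_ok(zones, 2) ? "run" : "throttle");
	return 0;
}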
+ */ +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, +					nodemask_t *nodemask) +{ +	struct zoneref *z; +	struct zone *zone; +	pg_data_t *pgdat = NULL; + +	/* +	 * Kernel threads should not be throttled as they may be indirectly +	 * responsible for cleaning pages necessary for reclaim to make forward +	 * progress. kjournald for example may enter direct reclaim while +	 * committing a transaction where throttling it could forcing other +	 * processes to block on log_wait_commit(). +	 */ +	if (current->flags & PF_KTHREAD) +		goto out; + +	/* +	 * If a fatal signal is pending, this process should not throttle. +	 * It should return quickly so it can exit and free its memory +	 */ +	if (fatal_signal_pending(current)) +		goto out; + +	/* +	 * Check if the pfmemalloc reserves are ok by finding the first node +	 * with a usable ZONE_NORMAL or lower zone. The expectation is that +	 * GFP_KERNEL will be required for allocating network buffers when +	 * swapping over the network so ZONE_HIGHMEM is unusable. +	 * +	 * Throttling is based on the first usable node and throttled processes +	 * wait on a queue until kswapd makes progress and wakes them. There +	 * is an affinity then between processes waking up and where reclaim +	 * progress has been made assuming the process wakes on the same node. +	 * More importantly, processes running on remote nodes will not compete +	 * for remote pfmemalloc reserves and processes on different nodes +	 * should make reasonable progress. +	 */ +	for_each_zone_zonelist_nodemask(zone, z, zonelist, +					gfp_mask, nodemask) { +		if (zone_idx(zone) > ZONE_NORMAL) +			continue; + +		/* Throttle based on the first usable node */ +		pgdat = zone->zone_pgdat; +		if (pfmemalloc_watermark_ok(pgdat)) +			goto out; +		break; +	} + +	/* If no zone was usable by the allocation flags then do not throttle */ +	if (!pgdat) +		goto out; + +	/* Account for the throttling */ +	count_vm_event(PGSCAN_DIRECT_THROTTLE); + +	/* +	 * If the caller cannot enter the filesystem, it's possible that it +	 * is due to the caller holding an FS lock or performing a journal +	 * transaction in the case of a filesystem like ext[3|4]. In this case, +	 * it is not safe to block on pfmemalloc_wait as kswapd could be +	 * blocked waiting on the same lock. Instead, throttle for up to a +	 * second before continuing. +	 */ +	if (!(gfp_mask & __GFP_FS)) { +		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, +			pfmemalloc_watermark_ok(pgdat), HZ); + +		goto check_pending; +	} + +	/* Throttle until kswapd wakes the process */ +	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, +		pfmemalloc_watermark_ok(pgdat)); + +check_pending: +	if (fatal_signal_pending(current)) +		return true; + +out: +	return false; +} +  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,  				gfp_t gfp_mask, nodemask_t *nodemask)  {  	unsigned long nr_reclaimed;  	struct scan_control sc = { -		.gfp_mask = gfp_mask, +		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),  		.may_writepage = !laptop_mode,  		.nr_to_reclaim = SWAP_CLUSTER_MAX,  		.may_unmap = 1,  		.may_swap = 1, -		.swappiness = vm_swappiness,  		.order = order, -		.mem_cgroup = NULL, +		.priority = DEF_PRIORITY, +		.target_mem_cgroup = NULL,  		.nodemask = nodemask,  	}; +	/* +	 * Do not enter reclaim if fatal signal was delivered while throttled. +	 * 1 is returned so that the page allocator does not OOM kill at this +	 * point. 
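The function above boils down to a small classification: kernel threads and tasks with a fatal signal pending are never throttled, callers that cannot enter the filesystem sleep for at most a second, and everyone else waits until kswapd refills the reserves. A sketch of that ladder with the inputs reduced to flags (names are illustrative, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

struct reclaimer {
	bool is_kthread;	/* PF_KTHREAD: never throttle kernel threads */
	bool fatal_signal;	/* should exit quickly instead of sleeping   */
	bool can_enter_fs;	/* __GFP_FS: safe to sleep indefinitely      */
};

enum throttle_action {
	NO_THROTTLE,		/* proceed straight to reclaim          */
	THROTTLE_TIMEOUT,	/* !__GFP_FS: sleep at most one second  */
	THROTTLE_UNTIL_KSWAPD,	/* sleep until kswapd refills reserves  */
};

/*
 * Decision ladder of the throttling path; wmark_ok stands for the reserve
 * test modelled above (pfmemalloc_ok()).
 */
static enum throttle_action classify(const struct reclaimer *r, bool wmark_ok)
{
	if (r->is_kthread || r->fatal_signal || wmark_ok)
		return NO_THROTTLE;
	return r->can_enter_fs ? THROTTLE_UNTIL_KSWAPD : THROTTLE_TIMEOUT;
}

int main(void)
{
	static const char * const what[] = {
		"no throttle", "throttle with timeout", "throttle until kswapd"
	};
	struct reclaimer r = { .is_kthread = false, .fatal_signal = false,
			       .can_enter_fs = false };

	printf("action = %s\n", what[classify(&r, /* wmark_ok */ false)]);
	return 0;
}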
+	 */ +	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) +		return 1; +  	trace_mm_vmscan_direct_reclaim_begin(order,  				sc.may_writepage,  				gfp_mask); @@ -2046,26 +2714,30 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,  	return nr_reclaimed;  } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,  						gfp_t gfp_mask, bool noswap, -						unsigned int swappiness, -						struct zone *zone) +						struct zone *zone, +						unsigned long *nr_scanned)  {  	struct scan_control sc = { +		.nr_scanned = 0,  		.nr_to_reclaim = SWAP_CLUSTER_MAX,  		.may_writepage = !laptop_mode,  		.may_unmap = 1,  		.may_swap = !noswap, -		.swappiness = swappiness,  		.order = 0, -		.mem_cgroup = mem, +		.priority = 0, +		.swappiness = mem_cgroup_swappiness(memcg), +		.target_mem_cgroup = memcg,  	}; +	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); +  	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |  			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); -	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, +	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,  						      sc.may_writepage,  						      sc.gfp_mask); @@ -2076,34 +2748,42 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,  	 * will pick up pages from other mem cgroup's as well. We hack  	 * the priority and make it zero.  	 */ -	shrink_zone(0, zone, &sc); +	shrink_lruvec(lruvec, &sc);  	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); +	*nr_scanned = sc.nr_scanned;  	return sc.nr_reclaimed;  } -unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,  					   gfp_t gfp_mask, -					   bool noswap, -					   unsigned int swappiness) +					   bool noswap)  {  	struct zonelist *zonelist;  	unsigned long nr_reclaimed; +	int nid;  	struct scan_control sc = {  		.may_writepage = !laptop_mode,  		.may_unmap = 1,  		.may_swap = !noswap,  		.nr_to_reclaim = SWAP_CLUSTER_MAX, -		.swappiness = swappiness,  		.order = 0, -		.mem_cgroup = mem_cont, +		.priority = DEF_PRIORITY, +		.target_mem_cgroup = memcg,  		.nodemask = NULL, /* we don't care the placement */ +		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | +				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),  	}; -	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | -			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); -	zonelist = NODE_DATA(numa_node_id())->node_zonelists; +	/* +	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't +	 * take care of from where we get pages. So the node where we start the +	 * scan does not need to be the current node. +	 */ +	nid = mem_cgroup_select_victim_node(memcg); + +	zonelist = NODE_DATA(nid)->node_zonelists;  	trace_mm_vmscan_memcg_reclaim_begin(0,  					    sc.may_writepage, @@ -2117,38 +2797,215 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,  }  #endif -/* is kswapd sleeping prematurely? 
*/ -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static void age_active_anon(struct zone *zone, struct scan_control *sc)  { -	int i; +	struct mem_cgroup *memcg; -	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */ -	if (remaining) -		return 1; +	if (!total_swap_pages) +		return; + +	memcg = mem_cgroup_iter(NULL, NULL, NULL); +	do { +		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + +		if (inactive_anon_is_low(lruvec)) +			shrink_active_list(SWAP_CLUSTER_MAX, lruvec, +					   sc, LRU_ACTIVE_ANON); + +		memcg = mem_cgroup_iter(NULL, memcg, NULL); +	} while (memcg); +} + +static bool zone_balanced(struct zone *zone, int order, +			  unsigned long balance_gap, int classzone_idx) +{ +	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + +				    balance_gap, classzone_idx, 0)) +		return false; + +	if (IS_ENABLED(CONFIG_COMPACTION) && order && +	    !compaction_suitable(zone, order)) +		return false; + +	return true; +} + +/* + * pgdat_balanced() is used when checking if a node is balanced. + * + * For order-0, all zones must be balanced! + * + * For high-order allocations only zones that meet watermarks and are in a + * zone allowed by the callers classzone_idx are added to balanced_pages. The + * total of balanced pages must be at least 25% of the zones allowed by + * classzone_idx for the node to be considered balanced. Forcing all zones to + * be balanced for high orders can cause excessive reclaim when there are + * imbalanced zones. + * The choice of 25% is due to + *   o a 16M DMA zone that is balanced will not balance a zone on any + *     reasonable sized machine + *   o On all other machines, the top zone must be at least a reasonable + *     percentage of the middle zones. For example, on 32-bit x86, highmem + *     would need to be at least 256M for it to be balance a whole node. + *     Similarly, on x86-64 the Normal zone would need to be at least 1G + *     to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +{ +	unsigned long managed_pages = 0; +	unsigned long balanced_pages = 0; +	int i; -	/* If after HZ/10, a zone is below the high mark, it's premature */ -	for (i = 0; i < pgdat->nr_zones; i++) { +	/* Check the watermark levels */ +	for (i = 0; i <= classzone_idx; i++) {  		struct zone *zone = pgdat->node_zones + i;  		if (!populated_zone(zone))  			continue; -		if (zone->all_unreclaimable) +		managed_pages += zone->managed_pages; + +		/* +		 * A special case here: +		 * +		 * balance_pgdat() skips over all_unreclaimable after +		 * DEF_PRIORITY. Effectively, it considers them balanced so +		 * they must be considered balanced here as well! +		 */ +		if (!zone_reclaimable(zone)) { +			balanced_pages += zone->managed_pages;  			continue; +		} -		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), -								0, 0)) -			return 1; +		if (zone_balanced(zone, order, 0, i)) +			balanced_pages += zone->managed_pages; +		else if (!order) +			return false;  	} -	return 0; +	if (order) +		return balanced_pages >= (managed_pages >> 2); +	else +		return true; +} + +/* + * Prepare kswapd for sleeping. This verifies that there are no processes + * waiting in throttle_direct_reclaim() and that watermarks have been met. 
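The 25% rule is compact enough to model directly: zones that are no longer reclaimable count as balanced, order-0 requires every populated zone to pass its watermark, and higher orders only need balanced zones covering a quarter of the node's managed pages. A standalone sketch under those assumptions, with zone state reduced to a few flags:

#include <stdbool.h>
#include <stdio.h>

struct zone_state {
	bool populated;
	bool reclaimable;	/* !zone_reclaimable() zones count as balanced */
	bool watermark_ok;	/* zone_balanced() at the requested order      */
	unsigned long managed_pages;
};

/*
 * Node-level balance check: for order-0 every populated zone must be
 * balanced; for higher orders it is enough that zones holding at least a
 * quarter of the node's managed pages are balanced.
 */
static bool node_balanced(const struct zone_state *z, int nr, int order)
{
	unsigned long managed = 0, balanced = 0;

	for (int i = 0; i < nr; i++) {
		if (!z[i].populated)
			continue;
		managed += z[i].managed_pages;

		if (!z[i].reclaimable || z[i].watermark_ok)
			balanced += z[i].managed_pages;
		else if (!order)
			return false;
	}

	return order ? balanced >= (managed >> 2) : true;
}

int main(void)
{
	struct zone_state zones[] = {
		{ true, true, true,  4096   },	/* small DMA zone, balanced */
		{ true, true, false, 200000 },	/* big Normal zone, not yet */
	};

	printf("order-3 balanced: %s\n",
	       node_balanced(zones, 2, 3) ? "yes" : "no");
	return 0;
}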
+ * + * Returns true if kswapd is ready to sleep + */ +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, +					int classzone_idx) +{ +	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */ +	if (remaining) +		return false; + +	/* +	 * There is a potential race between when kswapd checks its watermarks +	 * and a process gets throttled. There is also a potential race if +	 * processes get throttled, kswapd wakes, a large process exits therby +	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd +	 * is going to sleep, no process should be sleeping on pfmemalloc_wait +	 * so wake them now if necessary. If necessary, processes will wake +	 * kswapd and get throttled again +	 */ +	if (waitqueue_active(&pgdat->pfmemalloc_wait)) { +		wake_up(&pgdat->pfmemalloc_wait); +		return false; +	} + +	return pgdat_balanced(pgdat, order, classzone_idx); +} + +/* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_zone(struct zone *zone, +			       int classzone_idx, +			       struct scan_control *sc, +			       unsigned long lru_pages, +			       unsigned long *nr_attempted) +{ +	int testorder = sc->order; +	unsigned long balance_gap; +	struct reclaim_state *reclaim_state = current->reclaim_state; +	struct shrink_control shrink = { +		.gfp_mask = sc->gfp_mask, +	}; +	bool lowmem_pressure; + +	/* Reclaim above the high watermark. */ +	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + +	/* +	 * Kswapd reclaims only single pages with compaction enabled. Trying +	 * too hard to reclaim until contiguous free pages have become +	 * available can hurt performance by evicting too much useful data +	 * from memory. Do not reclaim more than needed for compaction. +	 */ +	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && +			compaction_suitable(zone, sc->order) != +				COMPACT_SKIPPED) +		testorder = 0; + +	/* +	 * We put equal pressure on every zone, unless one zone has way too +	 * many pages free already. The "too many pages" is defined as the +	 * high wmark plus a "gap" where the gap is either the low +	 * watermark or 1% of the zone, whichever is smaller. +	 */ +	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( +			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + +	/* +	 * If there is no low memory pressure or the zone is balanced then no +	 * reclaim is necessary +	 */ +	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); +	if (!lowmem_pressure && zone_balanced(zone, testorder, +						balance_gap, classzone_idx)) +		return true; + +	shrink_zone(zone, sc); +	nodes_clear(shrink.nodes_to_scan); +	node_set(zone_to_nid(zone), shrink.nodes_to_scan); + +	reclaim_state->reclaimed_slab = 0; +	shrink_slab(&shrink, sc->nr_scanned, lru_pages); +	sc->nr_reclaimed += reclaim_state->reclaimed_slab; + +	/* Account for the number of pages attempted to reclaim */ +	*nr_attempted += sc->nr_to_reclaim; + +	zone_clear_flag(zone, ZONE_WRITEBACK); + +	/* +	 * If a zone reaches its high watermark, consider it to be no longer +	 * congested. It's possible there are dirty pages backed by congested +	 * BDIs but as pressure is relieved, speculatively avoid congestion +	 * waits. 
+	 */ +	if (zone_reclaimable(zone) && +	    zone_balanced(zone, testorder, 0, classzone_idx)) { +		zone_clear_flag(zone, ZONE_CONGESTED); +		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); +	} + +	return sc->nr_scanned >= sc->nr_to_reclaim;  }  /*   * For kswapd, balance_pgdat() will work across all this node's zones until   * they are all at high_wmark_pages(zone).   * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at   *   * There is special handling here for zones which are full of pinned pages.   * This can happen if the pages are all mlocked, or if they are all used by @@ -2165,42 +3022,31 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)   * interoperates with the page allocator fallback scheme to ensure that aging   * of pages is balanced across the zones.   */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, +							int *classzone_idx)  { -	int all_zones_ok; -	int priority;  	int i; -	unsigned long total_scanned; -	struct reclaim_state *reclaim_state = current->reclaim_state; +	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */ +	unsigned long nr_soft_reclaimed; +	unsigned long nr_soft_scanned;  	struct scan_control sc = {  		.gfp_mask = GFP_KERNEL, +		.priority = DEF_PRIORITY,  		.may_unmap = 1,  		.may_swap = 1, -		/* -		 * kswapd doesn't want to be bailed out while reclaim. because -		 * we want to put equal scanning pressure on each zone. -		 */ -		.nr_to_reclaim = ULONG_MAX, -		.swappiness = vm_swappiness, +		.may_writepage = !laptop_mode,  		.order = order, -		.mem_cgroup = NULL, +		.target_mem_cgroup = NULL,  	}; -loop_again: -	total_scanned = 0; -	sc.nr_reclaimed = 0; -	sc.may_writepage = !laptop_mode;  	count_vm_event(PAGEOUTRUN); -	for (priority = DEF_PRIORITY; priority >= 0; priority--) { -		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */ +	do {  		unsigned long lru_pages = 0; -		int has_under_min_watermark_zone = 0; +		unsigned long nr_attempted = 0; +		bool raise_priority = true; +		bool pgdat_needs_compaction = (order > 0); -		/* The swap token gets in the way of swapout... */ -		if (!priority) -			disable_swap_token(); - -		all_zones_ok = 1; +		sc.nr_reclaimed = 0;  		/*  		 * Scan in the highmem->dma direction for the highest @@ -2212,33 +3058,71 @@ loop_again:  			if (!populated_zone(zone))  				continue; -			if (zone->all_unreclaimable && priority != DEF_PRIORITY) +			if (sc.priority != DEF_PRIORITY && +			    !zone_reclaimable(zone))  				continue;  			/*  			 * Do some background aging of the anon list, to give  			 * pages a chance to be referenced before reclaiming.  			 */ -			if (inactive_anon_is_low(zone, &sc)) -				shrink_active_list(SWAP_CLUSTER_MAX, zone, -							&sc, priority, 0); +			age_active_anon(zone, &sc); + +			/* +			 * If the number of buffer_heads in the machine +			 * exceeds the maximum allowed level and this node +			 * has a highmem zone, force kswapd to reclaim from +			 * it to relieve lowmem pressure. 
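The zone selection this comment describes, and the code just below implements, amounts to walking the node's zones from the top down and stopping at the first one that needs attention: any highmem zone while buffer heads are over their limit, or the first zone that fails its watermark check. A small model of that walk, with zone state reduced to booleans:

#include <stdbool.h>
#include <stdio.h>

struct pzone {
	bool populated;
	bool is_highmem;
	bool balanced;		/* zone_balanced() at the current order */
};

/*
 * Pick the "end_zone" for a kswapd pass: scan from the highest zone down
 * and stop at the first zone that either is unbalanced or is a highmem
 * zone while buffer heads are over their limit.  Returns -1 when the whole
 * node already looks balanced (the "nothing to do" exit).
 */
static int pick_end_zone(const struct pzone *zones, int nr_zones,
			 bool buffer_heads_over_limit)
{
	for (int i = nr_zones - 1; i >= 0; i--) {
		if (!zones[i].populated)
			continue;
		if (buffer_heads_over_limit && zones[i].is_highmem)
			return i;
		if (!zones[i].balanced)
			return i;
	}
	return -1;
}

int main(void)
{
	struct pzone zones[] = {
		{ true, false, true  },		/* DMA     */
		{ true, false, false },		/* Normal  */
		{ true, true,  true  },		/* HighMem */
	};

	printf("end_zone = %d\n", pick_end_zone(zones, 3, false));
	return 0;
}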
+			 */ +			if (buffer_heads_over_limit && is_highmem_idx(i)) { +				end_zone = i; +				break; +			} -			if (!zone_watermark_ok(zone, order, -					high_wmark_pages(zone), 0, 0)) { +			if (!zone_balanced(zone, order, 0, 0)) {  				end_zone = i;  				break; +			} else { +				/* +				 * If balanced, clear the dirty and congested +				 * flags +				 */ +				zone_clear_flag(zone, ZONE_CONGESTED); +				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);  			}  		} +  		if (i < 0)  			goto out;  		for (i = 0; i <= end_zone; i++) {  			struct zone *zone = pgdat->node_zones + i; +			if (!populated_zone(zone)) +				continue; +  			lru_pages += zone_reclaimable_pages(zone); + +			/* +			 * If any zone is currently balanced then kswapd will +			 * not call compaction as it is expected that the +			 * necessary pages are already available. +			 */ +			if (pgdat_needs_compaction && +					zone_watermark_ok(zone, order, +						low_wmark_pages(zone), +						*classzone_idx, 0)) +				pgdat_needs_compaction = false;  		}  		/* +		 * If we're getting trouble reclaiming, start doing writepage +		 * even in laptop mode. +		 */ +		if (sc.priority < DEF_PRIORITY - 2) +			sc.may_writepage = 1; + +		/*  		 * Now scan the zone in the dma->highmem direction, stopping  		 * at the last zone which needs scanning.  		 * @@ -2249,119 +3133,140 @@ loop_again:  		 */  		for (i = 0; i <= end_zone; i++) {  			struct zone *zone = pgdat->node_zones + i; -			int nr_slab;  			if (!populated_zone(zone))  				continue; -			if (zone->all_unreclaimable && priority != DEF_PRIORITY) +			if (sc.priority != DEF_PRIORITY && +			    !zone_reclaimable(zone))  				continue;  			sc.nr_scanned = 0; +			nr_soft_scanned = 0;  			/*  			 * Call soft limit reclaim before calling shrink_zone. -			 * For now we ignore the return value  			 */ -			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); +			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, +							order, sc.gfp_mask, +							&nr_soft_scanned); +			sc.nr_reclaimed += nr_soft_reclaimed;  			/* -			 * We put equal pressure on every zone, unless one -			 * zone has way too many pages free already. +			 * There should be no need to raise the scanning +			 * priority if enough pages are already being scanned +			 * that that high watermark would be met at 100% +			 * efficiency.  			 */ -			if (!zone_watermark_ok(zone, order, -					8*high_wmark_pages(zone), end_zone, 0)) -				shrink_zone(priority, zone, &sc); -			reclaim_state->reclaimed_slab = 0; -			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, -						lru_pages); -			sc.nr_reclaimed += reclaim_state->reclaimed_slab; -			total_scanned += sc.nr_scanned; -			if (zone->all_unreclaimable) -				continue; -			if (nr_slab == 0 && !zone_reclaimable(zone)) -				zone->all_unreclaimable = 1; -			/* -			 * If we've done a decent amount of scanning and -			 * the reclaim ratio is low, start doing writepage -			 * even in laptop mode -			 */ -			if (total_scanned > SWAP_CLUSTER_MAX * 2 && -			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) -				sc.may_writepage = 1; - -			if (!zone_watermark_ok(zone, order, -					high_wmark_pages(zone), end_zone, 0)) { -				all_zones_ok = 0; -				/* -				 * We are still under min water mark.  This -				 * means that we have a GFP_ATOMIC allocation -				 * failure risk. Hurry up! 
-				 */ -				if (!zone_watermark_ok(zone, order, -					    min_wmark_pages(zone), end_zone, 0)) -					has_under_min_watermark_zone = 1; -			} else { -				/* -				 * If a zone reaches its high watermark, -				 * consider it to be no longer congested. It's -				 * possible there are dirty pages backed by -				 * congested BDIs but as pressure is relieved, -				 * spectulatively avoid congestion waits -				 */ -				zone_clear_flag(zone, ZONE_CONGESTED); -			} - +			if (kswapd_shrink_zone(zone, end_zone, &sc, +					lru_pages, &nr_attempted)) +				raise_priority = false;  		} -		if (all_zones_ok) -			break;		/* kswapd: all done */ +  		/* -		 * OK, kswapd is getting into trouble.  Take a nap, then take -		 * another pass across the zones. +		 * If the low watermark is met there is no need for processes +		 * to be throttled on pfmemalloc_wait as they should not be +		 * able to safely make forward progress. Wake them  		 */ -		if (total_scanned && (priority < DEF_PRIORITY - 2)) { -			if (has_under_min_watermark_zone) -				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); -			else -				congestion_wait(BLK_RW_ASYNC, HZ/10); -		} +		if (waitqueue_active(&pgdat->pfmemalloc_wait) && +				pfmemalloc_watermark_ok(pgdat)) +			wake_up(&pgdat->pfmemalloc_wait);  		/* -		 * We do this so kswapd doesn't build up large priorities for -		 * example when it is freeing in parallel with allocators. It -		 * matches the direct reclaim path behaviour in terms of impact -		 * on zone->*_priority. +		 * Fragmentation may mean that the system cannot be rebalanced +		 * for high-order allocations in all zones. If twice the +		 * allocation size has been reclaimed and the zones are still +		 * not balanced then recheck the watermarks at order-0 to +		 * prevent kswapd reclaiming excessively. Assume that a +		 * process requested a high-order can direct reclaim/compact.  		 */ -		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) +		if (order && sc.nr_reclaimed >= 2UL << order) +			order = sc.order = 0; + +		/* Check if kswapd should be suspending */ +		if (try_to_freeze() || kthread_should_stop())  			break; -	} -out: -	if (!all_zones_ok) { -		cond_resched(); -		try_to_freeze(); +		/* +		 * Compact if necessary and kswapd is reclaiming at least the +		 * high watermark number of pages as requsted +		 */ +		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) +			compact_pgdat(pgdat, order);  		/* -		 * Fragmentation may mean that the system cannot be -		 * rebalanced for high-order allocations in all zones. -		 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, -		 * it means the zones have been fully scanned and are still -		 * not balanced. For high-order allocations, there is -		 * little point trying all over again as kswapd may -		 * infinite loop. -		 * -		 * Instead, recheck all watermarks at order-0 as they -		 * are the most important. If watermarks are ok, kswapd will go -		 * back to sleep. High-order users can still perform direct -		 * reclaim if they wish. +		 * Raise priority if scanning rate is too low or there was no +		 * progress in reclaiming pages  		 */ -		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) -			order = sc.order = 0; +		if (raise_priority || !sc.nr_reclaimed) +			sc.priority--; +	} while (sc.priority >= 1 && +		 !pgdat_balanced(pgdat, order, *classzone_idx)); + +out: +	/* +	 * Return the order we were reclaiming at so prepare_kswapd_sleep() +	 * makes a decision on the order we were last reclaiming at. 
However, +	 * if another caller entered the allocator slow path while kswapd +	 * was awake, order will remain at the higher level +	 */ +	*classzone_idx = end_zone; +	return order; +} + +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +{ +	long remaining = 0; +	DEFINE_WAIT(wait); + +	if (freezing(current) || kthread_should_stop()) +		return; + +	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); -		goto loop_again; +	/* Try to sleep for a short interval */ +	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { +		remaining = schedule_timeout(HZ/10); +		finish_wait(&pgdat->kswapd_wait, &wait); +		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);  	} -	return sc.nr_reclaimed; +	/* +	 * After a short sleep, check if it was a premature sleep. If not, then +	 * go fully to sleep until explicitly woken up. +	 */ +	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { +		trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + +		/* +		 * vmstat counters are not perfectly accurate and the estimated +		 * value for counters such as NR_FREE_PAGES can deviate from the +		 * true value by nr_online_cpus * threshold. To avoid the zone +		 * watermarks being breached while under pressure, we reduce the +		 * per-cpu vmstat threshold while kswapd is awake and restore +		 * them before going back to sleep. +		 */ +		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + +		/* +		 * Compaction records what page blocks it recently failed to +		 * isolate pages from and skips them in the future scanning. +		 * When kswapd is going to sleep, it is reasonable to assume +		 * that pages and compaction may succeed so reset the cache. +		 */ +		reset_isolation_suitable(pgdat); + +		if (!kthread_should_stop()) +			schedule(); + +		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); +	} else { +		if (remaining) +			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); +		else +			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); +	} +	finish_wait(&pgdat->kswapd_wait, &wait);  }  /* @@ -2379,10 +3284,13 @@ out:   */  static int kswapd(void *p)  { -	unsigned long order; +	unsigned long order, new_order; +	unsigned balanced_order; +	int classzone_idx, new_classzone_idx; +	int balanced_classzone_idx;  	pg_data_t *pgdat = (pg_data_t*)p;  	struct task_struct *tsk = current; -	DEFINE_WAIT(wait); +  	struct reclaim_state reclaim_state = {  		.reclaimed_slab = 0,  	}; @@ -2409,50 +3317,43 @@ static int kswapd(void *p)  	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;  	set_freezable(); -	order = 0; +	order = new_order = 0; +	balanced_order = 0; +	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; +	balanced_classzone_idx = classzone_idx;  	for ( ; ; ) { -		unsigned long new_order; -		int ret; +		bool ret; -		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); -		new_order = pgdat->kswapd_max_order; -		pgdat->kswapd_max_order = 0; -		if (order < new_order) { +		/* +		 * If the last balance_pgdat was unsuccessful it's unlikely a +		 * new request of a similar or harder type will succeed soon +		 * so consider going to sleep on the basis we reclaimed at +		 */ +		if (balanced_classzone_idx >= new_classzone_idx && +					balanced_order == new_order) { +			new_order = pgdat->kswapd_max_order; +			new_classzone_idx = pgdat->classzone_idx; +			pgdat->kswapd_max_order =  0; +			pgdat->classzone_idx = pgdat->nr_zones - 1; +		} + +		if (order < new_order || classzone_idx > new_classzone_idx) {  			/*  			 * Don't 
sleep if someone wants a larger 'order' -			 * allocation +			 * allocation or has tigher zone constraints  			 */  			order = new_order; +			classzone_idx = new_classzone_idx;  		} else { -			if (!freezing(current) && !kthread_should_stop()) { -				long remaining = 0; - -				/* Try to sleep for a short interval */ -				if (!sleeping_prematurely(pgdat, order, remaining)) { -					remaining = schedule_timeout(HZ/10); -					finish_wait(&pgdat->kswapd_wait, &wait); -					prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); -				} - -				/* -				 * After a short sleep, check if it was a -				 * premature sleep. If not, then go fully -				 * to sleep until explicitly woken up -				 */ -				if (!sleeping_prematurely(pgdat, order, remaining)) { -					trace_mm_vmscan_kswapd_sleep(pgdat->node_id); -					schedule(); -				} else { -					if (remaining) -						count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); -					else -						count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); -				} -			} - +			kswapd_try_to_sleep(pgdat, balanced_order, +						balanced_classzone_idx);  			order = pgdat->kswapd_max_order; +			classzone_idx = pgdat->classzone_idx; +			new_order = order; +			new_classzone_idx = classzone_idx; +			pgdat->kswapd_max_order = 0; +			pgdat->classzone_idx = pgdat->nr_zones - 1;  		} -		finish_wait(&pgdat->kswapd_wait, &wait);  		ret = try_to_freeze();  		if (kthread_should_stop()) @@ -2464,68 +3365,43 @@ static int kswapd(void *p)  		 */  		if (!ret) {  			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); -			balance_pgdat(pgdat, order); +			balanced_classzone_idx = classzone_idx; +			balanced_order = balance_pgdat(pgdat, order, +						&balanced_classzone_idx);  		}  	} + +	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); +	current->reclaim_state = NULL; +	lockdep_clear_current_reclaim_state(); +  	return 0;  }  /*   * A zone is low on free memory, so wake its kswapd task to service it.   */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)  {  	pg_data_t *pgdat;  	if (!populated_zone(zone))  		return; -	pgdat = zone->zone_pgdat; -	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) -		return; -	if (pgdat->kswapd_max_order < order) -		pgdat->kswapd_max_order = order; -	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);  	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))  		return; +	pgdat = zone->zone_pgdat; +	if (pgdat->kswapd_max_order < order) { +		pgdat->kswapd_max_order = order; +		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); +	}  	if (!waitqueue_active(&pgdat->kswapd_wait))  		return; -	wake_up_interruptible(&pgdat->kswapd_wait); -} - -/* - * The reclaimable count would be mostly accurate. 
- * The less reclaimable pages may be - * - mlocked pages, which will be moved to unevictable list when encountered - * - mapped pages, which may require several travels to be reclaimed - * - dirty pages, which is not "instantly" reclaimable - */ -unsigned long global_reclaimable_pages(void) -{ -	int nr; - -	nr = global_page_state(NR_ACTIVE_FILE) + -	     global_page_state(NR_INACTIVE_FILE); - -	if (nr_swap_pages > 0) -		nr += global_page_state(NR_ACTIVE_ANON) + -		      global_page_state(NR_INACTIVE_ANON); - -	return nr; -} - -unsigned long zone_reclaimable_pages(struct zone *zone) -{ -	int nr; - -	nr = zone_page_state(zone, NR_ACTIVE_FILE) + -	     zone_page_state(zone, NR_INACTIVE_FILE); - -	if (nr_swap_pages > 0) -		nr += zone_page_state(zone, NR_ACTIVE_ANON) + -		      zone_page_state(zone, NR_INACTIVE_ANON); +	if (zone_balanced(zone, order, 0, 0)) +		return; -	return nr; +	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); +	wake_up_interruptible(&pgdat->kswapd_wait);  }  #ifdef CONFIG_HIBERNATION @@ -2547,10 +3423,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)  		.may_writepage = 1,  		.nr_to_reclaim = nr_to_reclaim,  		.hibernation_mode = 1, -		.swappiness = vm_swappiness,  		.order = 0, +		.priority = DEF_PRIORITY,  	}; -	struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); +	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);  	struct task_struct *p = current;  	unsigned long nr_reclaimed; @@ -2573,13 +3449,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)     not required for correctness.  So if the last cpu in a node goes     away, we get changed to run anywhere: as the first one comes back,     restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, -				  unsigned long action, void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, +			void *hcpu)  {  	int nid;  	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { -		for_each_node_state(nid, N_HIGH_MEMORY) { +		for_each_node_state(nid, N_MEMORY) {  			pg_data_t *pgdat = NODE_DATA(nid);  			const struct cpumask *mask; @@ -2609,21 +3485,25 @@ int kswapd_run(int nid)  	if (IS_ERR(pgdat->kswapd)) {  		/* failure at boot is fatal */  		BUG_ON(system_state == SYSTEM_BOOTING); -		printk("Failed to start kswapd on node %d\n",nid); -		ret = -1; +		pr_err("Failed to start kswapd on node %d\n", nid); +		ret = PTR_ERR(pgdat->kswapd); +		pgdat->kswapd = NULL;  	}  	return ret;  }  /* - * Called by memory hotplug when all memory in a node is offlined. + * Called by memory hotplug when all memory in a node is offlined.  Caller must + * hold mem_hotplug_begin/end().   
*/  void kswapd_stop(int nid)  {  	struct task_struct *kswapd = NODE_DATA(nid)->kswapd; -	if (kswapd) +	if (kswapd) {  		kthread_stop(kswapd); +		NODE_DATA(nid)->kswapd = NULL; +	}  }  static int __init kswapd_init(void) @@ -2631,7 +3511,7 @@ static int __init kswapd_init(void)  	int nid;  	swap_setup(); -	for_each_node_state(nid, N_HIGH_MEMORY) +	for_each_node_state(nid, N_MEMORY)   		kswapd_run(nid);  	hotcpu_notifier(cpu_callback, 0);  	return 0; @@ -2723,16 +3603,17 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)  	const unsigned long nr_pages = 1 << order;  	struct task_struct *p = current;  	struct reclaim_state reclaim_state; -	int priority;  	struct scan_control sc = {  		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),  		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),  		.may_swap = 1, -		.nr_to_reclaim = max_t(unsigned long, nr_pages, -				       SWAP_CLUSTER_MAX), -		.gfp_mask = gfp_mask, -		.swappiness = vm_swappiness, +		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), +		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),  		.order = order, +		.priority = ZONE_RECLAIM_PRIORITY, +	}; +	struct shrink_control shrink = { +		.gfp_mask = sc.gfp_mask,  	};  	unsigned long nr_slab_pages0, nr_slab_pages1; @@ -2752,11 +3633,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)  		 * Free memory by calling shrink zone with increasing  		 * priorities until we have enough memory freed.  		 */ -		priority = ZONE_RECLAIM_PRIORITY;  		do { -			shrink_zone(priority, zone, &sc); -			priority--; -		} while (priority >= 0 && sc.nr_reclaimed < nr_pages); +			shrink_zone(zone, &sc); +		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);  	}  	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); @@ -2767,15 +3646,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)  		 * number of slab pages and shake the slab until it is reduced  		 * by the same nr_pages that we used for reclaiming unmapped  		 * pages. -		 * -		 * Note that shrink_slab will free memory on all zones and may -		 * take a long time.  		 */ +		nodes_clear(shrink.nodes_to_scan); +		node_set(zone_to_nid(zone), shrink.nodes_to_scan);  		for (;;) {  			unsigned long lru_pages = zone_reclaimable_pages(zone);  			/* No reclaimable slab or very low memory pressure */ -			if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) +			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))  				break;  			/* Freed enough memory */ @@ -2819,7 +3697,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)  	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)  		return ZONE_RECLAIM_FULL; -	if (zone->all_unreclaimable) +	if (!zone_reclaimable(zone))  		return ZONE_RECLAIM_FULL;  	/* @@ -2854,181 +3732,81 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)  /*   * page_evictable - test whether a page is evictable   * @page: the page to test - * @vma: the VMA in which the page is or will be mapped, may be NULL   *   * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list.  The vma argument is !NULL when called from the - * fault path to determine how to instantate a new page. + * lists vs unevictable list.   
*   * Reasons page might not be evictable:   * (1) page's mapping marked unevictable   * (2) page is part of an mlocked VMA   *   */ -int page_evictable(struct page *page, struct vm_area_struct *vma) +int page_evictable(struct page *page)  { - -	if (mapping_unevictable(page_mapping(page))) -		return 0; - -	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) -		return 0; - -	return 1; +	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);  } +#ifdef CONFIG_SHMEM  /** - * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list - * @page: page to check evictability and move to appropriate lru list - * @zone: zone page is in + * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list + * @pages:	array of pages to check + * @nr_pages:	number of pages to check   * - * Checks a page for evictability and moves the page to the appropriate - * zone lru list. + * Checks pages for evictability and moves them to the appropriate lru list.   * - * Restrictions: zone->lru_lock must be held, page must be on LRU and must - * have PageUnevictable set. + * This function is only used for SysV IPC SHM_UNLOCK.   */ -static void check_move_unevictable_page(struct page *page, struct zone *zone) +void check_move_unevictable_pages(struct page **pages, int nr_pages)  { -	VM_BUG_ON(PageActive(page)); - -retry: -	ClearPageUnevictable(page); -	if (page_evictable(page, NULL)) { -		enum lru_list l = page_lru_base_type(page); - -		__dec_zone_state(zone, NR_UNEVICTABLE); -		list_move(&page->lru, &zone->lru[l].list); -		mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); -		__inc_zone_state(zone, NR_INACTIVE_ANON + l); -		__count_vm_event(UNEVICTABLE_PGRESCUED); -	} else { -		/* -		 * rotate unevictable list -		 */ -		SetPageUnevictable(page); -		list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); -		mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); -		if (page_evictable(page, NULL)) -			goto retry; -	} -} - -/** - * scan_mapping_unevictable_pages - scan an address space for evictable pages - * @mapping: struct address_space to scan for evictable pages - * - * Scan all pages in mapping.  Check unevictable pages for - * evictability and move them to the appropriate zone lru list. 
- */ -void scan_mapping_unevictable_pages(struct address_space *mapping) -{ -	pgoff_t next = 0; -	pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> -			 PAGE_CACHE_SHIFT; -	struct zone *zone; -	struct pagevec pvec; - -	if (mapping->nrpages == 0) -		return; - -	pagevec_init(&pvec, 0); -	while (next < end && -		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { -		int i; -		int pg_scanned = 0; +	struct lruvec *lruvec; +	struct zone *zone = NULL; +	int pgscanned = 0; +	int pgrescued = 0; +	int i; -		zone = NULL; +	for (i = 0; i < nr_pages; i++) { +		struct page *page = pages[i]; +		struct zone *pagezone; -		for (i = 0; i < pagevec_count(&pvec); i++) { -			struct page *page = pvec.pages[i]; -			pgoff_t page_index = page->index; -			struct zone *pagezone = page_zone(page); +		pgscanned++; +		pagezone = page_zone(page); +		if (pagezone != zone) { +			if (zone) +				spin_unlock_irq(&zone->lru_lock); +			zone = pagezone; +			spin_lock_irq(&zone->lru_lock); +		} +		lruvec = mem_cgroup_page_lruvec(page, zone); -			pg_scanned++; -			if (page_index > next) -				next = page_index; -			next++; +		if (!PageLRU(page) || !PageUnevictable(page)) +			continue; -			if (pagezone != zone) { -				if (zone) -					spin_unlock_irq(&zone->lru_lock); -				zone = pagezone; -				spin_lock_irq(&zone->lru_lock); -			} +		if (page_evictable(page)) { +			enum lru_list lru = page_lru_base_type(page); -			if (PageLRU(page) && PageUnevictable(page)) -				check_move_unevictable_page(page, zone); +			VM_BUG_ON_PAGE(PageActive(page), page); +			ClearPageUnevictable(page); +			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); +			add_page_to_lru_list(page, lruvec, lru); +			pgrescued++;  		} -		if (zone) -			spin_unlock_irq(&zone->lru_lock); -		pagevec_release(&pvec); - -		count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);  	} -} - -/** - * scan_zone_unevictable_pages - check unevictable list for evictable pages - * @zone - zone of which to scan the unevictable list - * - * Scan @zone's unevictable LRU lists to check for pages that have become - * evictable.  Move those that have to @zone's inactive list where they - * become candidates for reclaim, unless shrink_inactive_zone() decides - * to reactivate them.  Pages that are still unevictable are rotated - * back onto @zone's unevictable list. 
- */ -#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -static void scan_zone_unevictable_pages(struct zone *zone) -{ -	struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; -	unsigned long scan; -	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); - -	while (nr_to_scan > 0) { -		unsigned long batch_size = min(nr_to_scan, -						SCAN_UNEVICTABLE_BATCH_SIZE); - -		spin_lock_irq(&zone->lru_lock); -		for (scan = 0;  scan < batch_size; scan++) { -			struct page *page = lru_to_page(l_unevictable); - -			if (!trylock_page(page)) -				continue; - -			prefetchw_prev_lru_page(page, l_unevictable, flags); - -			if (likely(PageLRU(page) && PageUnevictable(page))) -				check_move_unevictable_page(page, zone); - -			unlock_page(page); -		} +	if (zone) { +		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); +		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);  		spin_unlock_irq(&zone->lru_lock); - -		nr_to_scan -= batch_size;  	}  } +#endif /* CONFIG_SHMEM */ - -/** - * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages - * - * A really big hammer:  scan all zones' unevictable LRU lists to check for - * pages that have become evictable.  Move those back to the zones' - * inactive list where they become candidates for reclaim. - * This occurs when, e.g., we have unswappable pages on the unevictable lists, - * and we add swap to the system.  As such, it runs in the context of a task - * that has possibly/probably made some previously unevictable pages - * evictable. - */ -static void scan_all_zones_unevictable_pages(void) +static void warn_scan_unevictable_pages(void)  { -	struct zone *zone; - -	for_each_zone(zone) { -		scan_zone_unevictable_pages(zone); -	} +	printk_once(KERN_WARNING +		    "%s: The scan_unevictable_pages sysctl/node-interface has been " +		    "disabled for lack of a legitimate use case.  If you have " +		    "one, please send an email to linux-mm@kvack.org.\n", +		    current->comm);  }  /* @@ -3041,11 +3819,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,  			   void __user *buffer,  			   size_t *length, loff_t *ppos)  { +	warn_scan_unevictable_pages();  	proc_doulongvec_minmax(table, write, buffer, length, ppos); - -	if (write && *(unsigned long *)table->data) -		scan_all_zones_unevictable_pages(); -  	scan_unevictable_pages = 0;  	return 0;  } @@ -3056,45 +3831,34 @@ int scan_unevictable_handler(struct ctl_table *table, int write,   * a specified node's per zone unevictable lists for evictable pages.   */ -static ssize_t read_scan_unevictable_node(struct sys_device *dev, -					  struct sysdev_attribute *attr, +static ssize_t read_scan_unevictable_node(struct device *dev, +					  struct device_attribute *attr,  					  char *buf)  { +	warn_scan_unevictable_pages();  	return sprintf(buf, "0\n");	/* always zero; should fit... 
*/
 }

-static ssize_t write_scan_unevictable_node(struct sys_device *dev,
-					   struct sysdev_attribute *attr,
+static ssize_t write_scan_unevictable_node(struct device *dev,
+					   struct device_attribute *attr,
 					const char *buf, size_t count)
 {
-	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
-	struct zone *zone;
-	unsigned long res;
-	unsigned long req = strict_strtoul(buf, 10, &res);
-
-	if (!req)
-		return 1;	/* zero is no-op */
-
-	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!populated_zone(zone))
-			continue;
-		scan_zone_unevictable_pages(zone);
-	}
+	warn_scan_unevictable_pages();
 	return 1;
 }

-static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
 			read_scan_unevictable_node,
 			write_scan_unevictable_node);

 int scan_unevictable_register_node(struct node *node)
 {
-	return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+	return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
 }

 void scan_unevictable_unregister_node(struct node *node)
 {
-	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+	device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
 }
 #endif
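
Note on the unevictable rework above: the per-page check_move_unevictable_page()/scan_mapping_unevictable_pages() walk is replaced by a batch interface, check_move_unevictable_pages(pages, nr_pages), which the diff notes is only used for SysV IPC SHM_UNLOCK and which takes zone->lru_lock once per zone change rather than once per page. The sketch below is illustrative only and is not part of the diff: it shows how a SHM_UNLOCK-style caller might feed the new interface one pagevec at a time. The wrapper name unlock_mapping_pages_sketch() and the loop structure are assumptions for illustration; only check_move_unevictable_pages() itself comes from the patch.

/* Illustrative sketch only -- not part of the mm/vmscan.c diff. */
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/swap.h>

static void unlock_mapping_pages_sketch(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t next = 0;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		/* Remember where to continue on the next lookup. */
		next = pvec.pages[pagevec_count(&pvec) - 1]->index + 1;

		/*
		 * One call handles the whole batch: pages that are on the
		 * LRU, marked unevictable and now evictable again are moved
		 * to the appropriate LRU list, with zone->lru_lock taken
		 * only when the zone changes between pages.
		 */
		check_move_unevictable_pages(pvec.pages, pagevec_count(&pvec));

		pagevec_release(&pvec);
		cond_resched();
	}
}

Because the batch routine acquires zone->lru_lock internally and accounts UNEVICTABLE_PGSCANNED/UNEVICTABLE_PGRESCUED once per batch, a caller along these lines only needs to hold page references (taken here by pagevec_lookup()) and drop them with pagevec_release().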
