Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 543
1 file changed, 304 insertions(+), 239 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ed1b775bdc..0f16ffe8eb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@   *  Multiqueue VM started 5.8.00, Rik van Riel.   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/mm.h>  #include <linux/module.h>  #include <linux/gfp.h> @@ -43,11 +45,13 @@  #include <linux/sysctl.h>  #include <linux/oom.h>  #include <linux/prefetch.h> +#include <linux/printk.h>  #include <asm/tlbflush.h>  #include <asm/div64.h>  #include <linux/swapops.h> +#include <linux/balloon_compaction.h>  #include "internal.h" @@ -82,6 +86,9 @@ struct scan_control {  	/* Scan (total_size >> priority) pages at once */  	int priority; +	/* anon vs. file LRUs scanning "ratio" */ +	int swappiness; +  	/*  	 * The memory cgroup that hit its limit and as a result is the  	 * primary target of this reclaim invocation. @@ -139,26 +146,14 @@ static bool global_reclaim(struct scan_control *sc)  {  	return !sc->target_mem_cgroup;  } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ -	struct mem_cgroup *root = sc->target_mem_cgroup; -	return !mem_cgroup_disabled() && -		mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; -}  #else  static bool global_reclaim(struct scan_control *sc)  {  	return true;  } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ -	return false; -}  #endif -unsigned long zone_reclaimable_pages(struct zone *zone) +static unsigned long zone_reclaimable_pages(struct zone *zone)  {  	int nr; @@ -222,6 +217,7 @@ void unregister_shrinker(struct shrinker *shrinker)  	down_write(&shrinker_rwsem);  	list_del(&shrinker->list);  	up_write(&shrinker_rwsem); +	kfree(shrinker->nr_deferred);  }  EXPORT_SYMBOL(unregister_shrinker); @@ -234,15 +230,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,  	unsigned long freed = 0;  	unsigned long long delta;  	long total_scan; -	long max_pass; +	long freeable;  	long nr;  	long new_nr;  	int nid = shrinkctl->nid;  	long batch_size = shrinker->batch ? shrinker->batch  					  : SHRINK_BATCH; -	max_pass = shrinker->count_objects(shrinker, shrinkctl); -	if (max_pass == 0) +	freeable = shrinker->count_objects(shrinker, shrinkctl); +	if (freeable == 0)  		return 0;  	/* @@ -254,14 +250,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,  	total_scan = nr;  	delta = (4 * nr_pages_scanned) / shrinker->seeks; -	delta *= max_pass; +	delta *= freeable;  	do_div(delta, lru_pages + 1);  	total_scan += delta;  	if (total_scan < 0) {  		printk(KERN_ERR  		"shrink_slab: %pF negative objects to delete nr=%ld\n",  		       shrinker->scan_objects, total_scan); -		total_scan = max_pass; +		total_scan = freeable;  	}  	/* @@ -270,38 +266,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,  	 * shrinkers to return -1 all the time. This results in a large  	 * nr being built up so when a shrink that can do some work  	 * comes along it empties the entire cache due to nr >>> -	 * max_pass.  This is bad for sustaining a working set in +	 * freeable. This is bad for sustaining a working set in  	 * memory.  	 *  	 * Hence only allow the shrinker to scan the entire cache when  	 * a large delta change is calculated directly.  	 
*/ -	if (delta < max_pass / 4) -		total_scan = min(total_scan, max_pass / 2); +	if (delta < freeable / 4) +		total_scan = min(total_scan, freeable / 2);  	/*  	 * Avoid risking looping forever due to too large nr value:  	 * never try to free more than twice the estimate number of  	 * freeable entries.  	 */ -	if (total_scan > max_pass * 2) -		total_scan = max_pass * 2; +	if (total_scan > freeable * 2) +		total_scan = freeable * 2;  	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,  				nr_pages_scanned, lru_pages, -				max_pass, delta, total_scan); +				freeable, delta, total_scan); -	while (total_scan >= batch_size) { +	/* +	 * Normally, we should not scan less than batch_size objects in one +	 * pass to avoid too frequent shrinker calls, but if the slab has less +	 * than batch_size objects in total and we are really tight on memory, +	 * we will try to reclaim all available objects, otherwise we can end +	 * up failing allocations although there are plenty of reclaimable +	 * objects spread over several slabs with usage less than the +	 * batch_size. +	 * +	 * We detect the "tight on memory" situations by looking at the total +	 * number of objects we want to scan (total_scan). If it is greater +	 * than the total number of objects on slab (freeable), we must be +	 * scanning at high prio and therefore should try to reclaim as much as +	 * possible. +	 */ +	while (total_scan >= batch_size || +	       total_scan >= freeable) {  		unsigned long ret; +		unsigned long nr_to_scan = min(batch_size, total_scan); -		shrinkctl->nr_to_scan = batch_size; +		shrinkctl->nr_to_scan = nr_to_scan;  		ret = shrinker->scan_objects(shrinker, shrinkctl);  		if (ret == SHRINK_STOP)  			break;  		freed += ret; -		count_vm_events(SLABS_SCANNED, batch_size); -		total_scan -= batch_size; +		count_vm_events(SLABS_SCANNED, nr_to_scan); +		total_scan -= nr_to_scan;  		cond_resched();  	} @@ -317,7 +330,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,  	else  		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); -	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); +	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);  	return freed;  } @@ -362,16 +375,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,  	}  	list_for_each_entry(shrinker, &shrinker_list, list) { -		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { -			if (!node_online(shrinkctl->nid)) -				continue; - -			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && -			    (shrinkctl->nid != 0)) -				break; - +		if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { +			shrinkctl->nid = 0;  			freed += shrink_slab_node(shrinkctl, shrinker, -				 nr_pages_scanned, lru_pages); +					nr_pages_scanned, lru_pages); +			continue; +		} + +		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { +			if (node_online(shrinkctl->nid)) +				freed += shrink_slab_node(shrinkctl, shrinker, +						nr_pages_scanned, lru_pages);  		}  	} @@ -450,7 +464,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,  	 * stalls if we need to run get_block().  We could test  	 * PagePrivate for that.  	 * -	 * If this process is currently in __generic_file_aio_write() against +	 * If this process is currently in __generic_file_write_iter() against  	 * this page's queue, we can perform writeback even if that  	 * will block.  	 
* @@ -469,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,  		if (page_has_private(page)) {  			if (try_to_free_buffers(page)) {  				ClearPageDirty(page); -				printk("%s: orphaned page\n", __func__); +				pr_info("%s: orphaned page\n", __func__);  				return PAGE_CLEAN;  			}  		} @@ -515,7 +529,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,   * Same as remove_mapping, but if the page is removed from the mapping, it   * gets returned with a refcount of 0.   */ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, +			    bool reclaimed)  {  	BUG_ON(!PageLocked(page));  	BUG_ON(mapping != page_mapping(page)); @@ -561,10 +576,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)  		swapcache_free(swap, page);  	} else {  		void (*freepage)(struct page *); +		void *shadow = NULL;  		freepage = mapping->a_ops->freepage; - -		__delete_from_page_cache(page); +		/* +		 * Remember a shadow entry for reclaimed file cache in +		 * order to detect refaults, thus thrashing, later on. +		 * +		 * But don't store shadows in an address space that is +		 * already exiting.  This is not just an optizimation, +		 * inode reclaim needs to empty out the radix tree or +		 * the nodes are lost.  Don't plant shadows behind its +		 * back. +		 */ +		if (reclaimed && page_is_file_cache(page) && +		    !mapping_exiting(mapping)) +			shadow = workingset_eviction(mapping, page); +		__delete_from_page_cache(page, shadow);  		spin_unlock_irq(&mapping->tree_lock);  		mem_cgroup_uncharge_cache_page(page); @@ -587,7 +615,7 @@ cannot_free:   */  int remove_mapping(struct address_space *mapping, struct page *page)  { -	if (__remove_mapping(mapping, page)) { +	if (__remove_mapping(mapping, page, false)) {  		/*  		 * Unfreezing the refcount with 1 rather than 2 effectively  		 * drops the pagecache ref for us without requiring another @@ -613,7 +641,7 @@ void putback_lru_page(struct page *page)  	bool is_unevictable;  	int was_unevictable = PageUnevictable(page); -	VM_BUG_ON(PageLRU(page)); +	VM_BUG_ON_PAGE(PageLRU(page), page);  redo:  	ClearPageUnevictable(page); @@ -804,8 +832,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,  		if (!trylock_page(page))  			goto keep; -		VM_BUG_ON(PageActive(page)); -		VM_BUG_ON(page_zone(page) != zone); +		VM_BUG_ON_PAGE(PageActive(page), page); +		VM_BUG_ON_PAGE(page_zone(page) != zone, page);  		sc->nr_scanned++; @@ -1057,7 +1085,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,  			}  		} -		if (!mapping || !__remove_mapping(mapping, page)) +		if (!mapping || !__remove_mapping(mapping, page, true))  			goto keep_locked;  		/* @@ -1089,17 +1117,17 @@ activate_locked:  		/* Not a candidate for swapping, so reclaim swap space. 
*/  		if (PageSwapCache(page) && vm_swap_full())  			try_to_free_swap(page); -		VM_BUG_ON(PageActive(page)); +		VM_BUG_ON_PAGE(PageActive(page), page);  		SetPageActive(page);  		pgactivate++;  keep_locked:  		unlock_page(page);  keep:  		list_add(&page->lru, &ret_pages); -		VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); +		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);  	} -	free_hot_cold_page_list(&free_pages, 1); +	free_hot_cold_page_list(&free_pages, true);  	list_splice(&ret_pages, page_list);  	count_vm_events(PGACTIVATE, pgactivate); @@ -1125,7 +1153,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,  	LIST_HEAD(clean_pages);  	list_for_each_entry_safe(page, next, page_list, lru) { -		if (page_is_file_cache(page) && !PageDirty(page)) { +		if (page_is_file_cache(page) && !PageDirty(page) && +		    !isolated_balloon_page(page)) {  			ClearPageActive(page);  			list_move(&page->lru, &clean_pages);  		} @@ -1135,7 +1164,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,  			TTU_UNMAP|TTU_IGNORE_ACCESS,  			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);  	list_splice(&clean_pages, page_list); -	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); +	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);  	return ret;  } @@ -1249,7 +1278,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,  		page = lru_to_page(src);  		prefetchw_prev_lru_page(page, src, flags); -		VM_BUG_ON(!PageLRU(page)); +		VM_BUG_ON_PAGE(!PageLRU(page), page);  		switch (__isolate_lru_page(page, mode)) {  		case 0: @@ -1304,7 +1333,7 @@ int isolate_lru_page(struct page *page)  {  	int ret = -EBUSY; -	VM_BUG_ON(!page_count(page)); +	VM_BUG_ON_PAGE(!page_count(page), page);  	if (PageLRU(page)) {  		struct zone *zone = page_zone(page); @@ -1375,7 +1404,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)  		struct page *page = lru_to_page(page_list);  		int lru; -		VM_BUG_ON(PageLRU(page)); +		VM_BUG_ON_PAGE(PageLRU(page), page);  		list_del(&page->lru);  		if (unlikely(!page_evictable(page))) {  			spin_unlock_irq(&zone->lru_lock); @@ -1416,6 +1445,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)  }  /* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested.  In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ +	return !(current->flags & PF_LESS_THROTTLE) || +		current->backing_dev_info == NULL || +		bdi_write_congested(current->backing_dev_info); +} + +/*   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number   * of reclaimed pages   */ @@ -1496,7 +1538,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,  	spin_unlock_irq(&zone->lru_lock); -	free_hot_cold_page_list(&page_list, 1); +	free_hot_cold_page_list(&page_list, true);  	/*  	 * If reclaim is isolating dirty pages under writeback, it implies @@ -1531,19 +1573,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,  		 * If dirty pages are scanned that are not queued for IO, it  		 * implies that flushers are not keeping up. In this case, flag  		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing -		 * pages from reclaim context. It will forcibly stall in the -		 * next check. +		 * pages from reclaim context.  		 
*/  		if (nr_unqueued_dirty == nr_taken)  			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);  		/* -		 * In addition, if kswapd scans pages marked marked for -		 * immediate reclaim and under writeback (nr_immediate), it -		 * implies that pages are cycling through the LRU faster than +		 * If kswapd scans pages marked marked for immediate +		 * reclaim and under writeback (nr_immediate), it implies +		 * that pages are cycling through the LRU faster than  		 * they are written so also forcibly stall.  		 */ -		if (nr_unqueued_dirty == nr_taken || nr_immediate) +		if (nr_immediate && current_may_throttle())  			congestion_wait(BLK_RW_ASYNC, HZ/10);  	} @@ -1552,7 +1593,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,  	 * is congested. Allow kswapd to continue until it starts encountering  	 * unqueued dirty pages or cycling through the LRU too quickly.  	 */ -	if (!sc->hibernation_mode && !current_is_kswapd()) +	if (!sc->hibernation_mode && !current_is_kswapd() && +	    current_may_throttle())  		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);  	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, @@ -1595,7 +1637,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,  		page = lru_to_page(list);  		lruvec = mem_cgroup_page_lruvec(page, zone); -		VM_BUG_ON(PageLRU(page)); +		VM_BUG_ON_PAGE(PageLRU(page), page);  		SetPageLRU(page);  		nr_pages = hpage_nr_pages(page); @@ -1717,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan,  	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);  	spin_unlock_irq(&zone->lru_lock); -	free_hot_cold_page_list(&l_hold, 1); +	free_hot_cold_page_list(&l_hold, true);  }  #ifdef CONFIG_SWAP @@ -1807,13 +1849,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,  	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);  } -static int vmscan_swappiness(struct scan_control *sc) -{ -	if (global_reclaim(sc)) -		return vm_swappiness; -	return mem_cgroup_swappiness(sc->target_mem_cgroup); -} -  enum scan_balance {  	SCAN_EQUAL,  	SCAN_FRACT, @@ -1839,10 +1874,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,  	struct zone *zone = lruvec_zone(lruvec);  	unsigned long anon_prio, file_prio;  	enum scan_balance scan_balance; -	unsigned long anon, file, free; +	unsigned long anon, file;  	bool force_scan = false;  	unsigned long ap, fp;  	enum lru_list lru; +	bool some_scanned; +	int pass;  	/*  	 * If the zone or memcg is small, nr[l] can be 0.  This @@ -1872,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,  	 * using the memory controller's swap limit feature would be  	 * too expensive.  	 */ -	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { +	if (!global_reclaim(sc) && !sc->swappiness) {  		scan_balance = SCAN_FILE;  		goto out;  	} @@ -1882,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,  	 * system is close to OOM, scan both anon and file equally  	 * (unless the swappiness setting disagrees with swapping).  	 */ -	if (!sc->priority && vmscan_swappiness(sc)) { +	if (!sc->priority && sc->swappiness) {  		scan_balance = SCAN_EQUAL;  		goto out;  	} @@ -1893,13 +1930,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,  		get_lru_size(lruvec, LRU_INACTIVE_FILE);  	/* -	 * If it's foreseeable that reclaiming the file cache won't be -	 * enough to get the zone back into a desirable shape, we have -	 * to swap.  
Better start now and leave the - probably heavily -	 * thrashing - remaining file pages alone. +	 * Prevent the reclaimer from falling into the cache trap: as +	 * cache pages start out inactive, every cache fault will tip +	 * the scan balance towards the file LRU.  And as the file LRU +	 * shrinks, so does the window for rotation from references. +	 * This means we have a runaway feedback loop where a tiny +	 * thrashing file LRU becomes infinitely more attractive than +	 * anon pages.  Try to detect this based on file LRU size.  	 */  	if (global_reclaim(sc)) { -		free = zone_page_state(zone, NR_FREE_PAGES); +		unsigned long free = zone_page_state(zone, NR_FREE_PAGES); +  		if (unlikely(file + free <= high_wmark_pages(zone))) {  			scan_balance = SCAN_ANON;  			goto out; @@ -1921,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,  	 * With swappiness at 100, anonymous and file have the same priority.  	 * This scanning priority is essentially the inverse of IO cost.  	 */ -	anon_prio = vmscan_swappiness(sc); +	anon_prio = sc->swappiness;  	file_prio = 200 - anon_prio;  	/* @@ -1962,39 +2003,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,  	fraction[1] = fp;  	denominator = ap + fp + 1;  out: -	for_each_evictable_lru(lru) { -		int file = is_file_lru(lru); -		unsigned long size; -		unsigned long scan; +	some_scanned = false; +	/* Only use force_scan on second pass. */ +	for (pass = 0; !some_scanned && pass < 2; pass++) { +		for_each_evictable_lru(lru) { +			int file = is_file_lru(lru); +			unsigned long size; +			unsigned long scan; -		size = get_lru_size(lruvec, lru); -		scan = size >> sc->priority; +			size = get_lru_size(lruvec, lru); +			scan = size >> sc->priority; -		if (!scan && force_scan) -			scan = min(size, SWAP_CLUSTER_MAX); +			if (!scan && pass && force_scan) +				scan = min(size, SWAP_CLUSTER_MAX); -		switch (scan_balance) { -		case SCAN_EQUAL: -			/* Scan lists relative to size */ -			break; -		case SCAN_FRACT: +			switch (scan_balance) { +			case SCAN_EQUAL: +				/* Scan lists relative to size */ +				break; +			case SCAN_FRACT: +				/* +				 * Scan types proportional to swappiness and +				 * their relative recent reclaim efficiency. +				 */ +				scan = div64_u64(scan * fraction[file], +							denominator); +				break; +			case SCAN_FILE: +			case SCAN_ANON: +				/* Scan one type exclusively */ +				if ((scan_balance == SCAN_FILE) != file) +					scan = 0; +				break; +			default: +				/* Look ma, no brain */ +				BUG(); +			} +			nr[lru] = scan;  			/* -			 * Scan types proportional to swappiness and -			 * their relative recent reclaim efficiency. +			 * Skip the second pass and don't force_scan, +			 * if we found something to scan.  			 
*/ -			scan = div64_u64(scan * fraction[file], denominator); -			break; -		case SCAN_FILE: -		case SCAN_ANON: -			/* Scan one type exclusively */ -			if ((scan_balance == SCAN_FILE) != file) -				scan = 0; -			break; -		default: -			/* Look ma, no brain */ -			BUG(); +			some_scanned |= !!scan;  		} -		nr[lru] = scan;  	}  } @@ -2010,13 +2061,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)  	unsigned long nr_reclaimed = 0;  	unsigned long nr_to_reclaim = sc->nr_to_reclaim;  	struct blk_plug plug; -	bool scan_adjusted = false; +	bool scan_adjusted;  	get_scan_count(lruvec, sc, nr);  	/* Record the original scan target for proportional adjustments later */  	memcpy(targets, nr, sizeof(nr)); +	/* +	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal +	 * event that can occur when there is little memory pressure e.g. +	 * multiple streaming readers/writers. Hence, we do not abort scanning +	 * when the requested number of pages are reclaimed when scanning at +	 * DEF_PRIORITY on the assumption that the fact we are direct +	 * reclaiming implies that kswapd is not keeping up and it is best to +	 * do a batch of work at once. For memcg reclaim one check is made to +	 * abort proportional reclaim if either the file or anon lru has already +	 * dropped to zero at the first pass. +	 */ +	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && +			 sc->priority == DEF_PRIORITY); +  	blk_start_plug(&plug);  	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||  					nr[LRU_INACTIVE_FILE]) { @@ -2037,17 +2102,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)  			continue;  		/* -		 * For global direct reclaim, reclaim only the number of pages -		 * requested. Less care is taken to scan proportionally as it -		 * is more important to minimise direct reclaim stall latency -		 * than it is to properly age the LRU lists. -		 */ -		if (global_reclaim(sc) && !current_is_kswapd()) -			break; - -		/*  		 * For kswapd and memcg, reclaim at least the number of pages -		 * requested. Ensure that the anon and file LRUs shrink +		 * requested. Ensure that the anon and file LRUs are scanned  		 * proportionally what was requested by get_scan_count(). We  		 * stop reclaiming one LRU and reduce the amount scanning  		 * proportional to the original scan target. @@ -2055,6 +2111,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)  		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];  		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; +		/* +		 * It's just vindictive to attack the larger once the smaller +		 * has gone to zero.  And given the way we stop scanning the +		 * smaller below, this makes sure that we only make one nudge +		 * towards proportionality once we've got nr_to_reclaim. 
+		 */ +		if (!nr_file || !nr_anon) +			break; +  		if (nr_file > nr_anon) {  			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +  						targets[LRU_ACTIVE_ANON] + 1; @@ -2176,11 +2241,9 @@ static inline bool should_continue_reclaim(struct zone *zone,  	}  } -static int -__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) +static void shrink_zone(struct zone *zone, struct scan_control *sc)  {  	unsigned long nr_reclaimed, nr_scanned; -	int groups_scanned = 0;  	do {  		struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2188,19 +2251,18 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)  			.zone = zone,  			.priority = sc->priority,  		}; -		struct mem_cgroup *memcg = NULL; -		mem_cgroup_iter_filter filter = (soft_reclaim) ? -			mem_cgroup_soft_reclaim_eligible : NULL; +		struct mem_cgroup *memcg;  		nr_reclaimed = sc->nr_reclaimed;  		nr_scanned = sc->nr_scanned; -		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { +		memcg = mem_cgroup_iter(root, NULL, &reclaim); +		do {  			struct lruvec *lruvec; -			groups_scanned++;  			lruvec = mem_cgroup_zone_lruvec(zone, memcg); +			sc->swappiness = mem_cgroup_swappiness(memcg);  			shrink_lruvec(lruvec, sc);  			/* @@ -2218,7 +2280,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)  				mem_cgroup_iter_break(root, memcg);  				break;  			} -		} +			memcg = mem_cgroup_iter(root, memcg, &reclaim); +		} while (memcg);  		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,  			   sc->nr_scanned - nr_scanned, @@ -2226,37 +2289,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)  	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,  					 sc->nr_scanned - nr_scanned, sc)); - -	return groups_scanned; -} - - -static void shrink_zone(struct zone *zone, struct scan_control *sc) -{ -	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); -	unsigned long nr_scanned = sc->nr_scanned; -	int scanned_groups; - -	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); -	/* -	 * memcg iterator might race with other reclaimer or start from -	 * a incomplete tree walk so the tree walk in __shrink_zone -	 * might have missed groups that are above the soft limit. Try -	 * another loop to catch up with others. Do it just once to -	 * prevent from reclaim latencies when other reclaimers always -	 * preempt this one. 
-	 */ -	if (do_soft_reclaim && !scanned_groups) -		__shrink_zone(zone, sc, do_soft_reclaim); - -	/* -	 * No group is over the soft limit or those that are do not have -	 * pages in the zone we are reclaiming so we have to reclaim everybody -	 */ -	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { -		__shrink_zone(zone, sc, false); -		return; -	}  }  /* Returns true if compaction should go ahead for a high-order request */ @@ -2275,9 +2307,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)  	 * there is a buffer of free pages available to give compaction  	 * a reasonable chance of completing and allocating the page  	 */ -	balance_gap = min(low_wmark_pages(zone), -		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / -			KSWAPD_ZONE_BALANCE_GAP_RATIO); +	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( +			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));  	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);  	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); @@ -2320,16 +2351,28 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)  {  	struct zoneref *z;  	struct zone *zone; +	unsigned long nr_soft_reclaimed; +	unsigned long nr_soft_scanned; +	unsigned long lru_pages = 0;  	bool aborted_reclaim = false; +	struct reclaim_state *reclaim_state = current->reclaim_state; +	gfp_t orig_mask; +	struct shrink_control shrink = { +		.gfp_mask = sc->gfp_mask, +	}; +	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);  	/*  	 * If the number of buffer_heads in the machine exceeds the maximum  	 * allowed level, force direct reclaim to scan the highmem zone as  	 * highmem pages could be pinning lowmem pages storing buffer_heads  	 */ +	orig_mask = sc->gfp_mask;  	if (buffer_heads_over_limit)  		sc->gfp_mask |= __GFP_HIGHMEM; +	nodes_clear(shrink.nodes_to_scan); +  	for_each_zone_zonelist_nodemask(zone, z, zonelist,  					gfp_zone(sc->gfp_mask), sc->nodemask) {  		if (!populated_zone(zone)) @@ -2341,6 +2384,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)  		if (global_reclaim(sc)) {  			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))  				continue; + +			lru_pages += zone_reclaimable_pages(zone); +			node_set(zone_to_nid(zone), shrink.nodes_to_scan); +  			if (sc->priority != DEF_PRIORITY &&  			    !zone_reclaimable(zone))  				continue;	/* Let kswapd poll it */ @@ -2354,17 +2401,50 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)  				 * noticeable problem, like transparent huge  				 * page allocations.  				 */ -				if (compaction_ready(zone, sc)) { +				if ((zonelist_zone_idx(z) <= requested_highidx) +				    && compaction_ready(zone, sc)) {  					aborted_reclaim = true;  					continue;  				}  			} +			/* +			 * This steals pages from memory cgroups over softlimit +			 * and returns the number of reclaimed pages and +			 * scanned pages. This works for global memory pressure +			 * and balancing, not for a memcg's limit. 
+			 */ +			nr_soft_scanned = 0; +			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, +						sc->order, sc->gfp_mask, +						&nr_soft_scanned); +			sc->nr_reclaimed += nr_soft_reclaimed; +			sc->nr_scanned += nr_soft_scanned;  			/* need some check for avoid more shrink_zone() */  		}  		shrink_zone(zone, sc);  	} +	/* +	 * Don't shrink slabs when reclaiming memory from over limit cgroups +	 * but do shrink slab at least once when aborting reclaim for +	 * compaction to avoid unevenly scanning file/anon LRU pages over slab +	 * pages. +	 */ +	if (global_reclaim(sc)) { +		shrink_slab(&shrink, sc->nr_scanned, lru_pages); +		if (reclaim_state) { +			sc->nr_reclaimed += reclaim_state->reclaimed_slab; +			reclaim_state->reclaimed_slab = 0; +		} +	} + +	/* +	 * Restore to original mask to avoid the impact on the caller if we +	 * promoted it to __GFP_HIGHMEM. +	 */ +	sc->gfp_mask = orig_mask; +  	return aborted_reclaim;  } @@ -2405,13 +2485,9 @@ static bool all_unreclaimable(struct zonelist *zonelist,   * 		else, the number of pages reclaimed   */  static unsigned long do_try_to_free_pages(struct zonelist *zonelist, -					struct scan_control *sc, -					struct shrink_control *shrink) +					  struct scan_control *sc)  {  	unsigned long total_scanned = 0; -	struct reclaim_state *reclaim_state = current->reclaim_state; -	struct zoneref *z; -	struct zone *zone;  	unsigned long writeback_threshold;  	bool aborted_reclaim; @@ -2426,32 +2502,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,  		sc->nr_scanned = 0;  		aborted_reclaim = shrink_zones(zonelist, sc); -		/* -		 * Don't shrink slabs when reclaiming memory from over limit -		 * cgroups but do shrink slab at least once when aborting -		 * reclaim for compaction to avoid unevenly scanning file/anon -		 * LRU pages over slab pages. 
-		 */ -		if (global_reclaim(sc)) { -			unsigned long lru_pages = 0; - -			nodes_clear(shrink->nodes_to_scan); -			for_each_zone_zonelist(zone, z, zonelist, -					gfp_zone(sc->gfp_mask)) { -				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) -					continue; - -				lru_pages += zone_reclaimable_pages(zone); -				node_set(zone_to_nid(zone), -					 shrink->nodes_to_scan); -			} - -			shrink_slab(shrink, sc->nr_scanned, lru_pages); -			if (reclaim_state) { -				sc->nr_reclaimed += reclaim_state->reclaimed_slab; -				reclaim_state->reclaimed_slab = 0; -			} -		}  		total_scanned += sc->nr_scanned;  		if (sc->nr_reclaimed >= sc->nr_to_reclaim)  			goto out; @@ -2513,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)  	for (i = 0; i <= ZONE_NORMAL; i++) {  		zone = &pgdat->node_zones[i]; +		if (!populated_zone(zone)) +			continue; +  		pfmemalloc_reserve += min_wmark_pages(zone);  		free_pages += zone_page_state(zone, NR_FREE_PAGES);  	} +	/* If there are no reserves (unexpected config) then do not throttle */ +	if (!pfmemalloc_reserve) +		return true; +  	wmark_ok = free_pages > pfmemalloc_reserve / 2;  	/* kswapd must be awake if processes are being throttled */ @@ -2541,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)  static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,  					nodemask_t *nodemask)  { +	struct zoneref *z;  	struct zone *zone; -	int high_zoneidx = gfp_zone(gfp_mask); -	pg_data_t *pgdat; +	pg_data_t *pgdat = NULL;  	/*  	 * Kernel threads should not be throttled as they may be indirectly @@ -2562,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,  	if (fatal_signal_pending(current))  		goto out; -	/* Check if the pfmemalloc reserves are ok */ -	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); -	pgdat = zone->zone_pgdat; -	if (pfmemalloc_watermark_ok(pgdat)) +	/* +	 * Check if the pfmemalloc reserves are ok by finding the first node +	 * with a usable ZONE_NORMAL or lower zone. The expectation is that +	 * GFP_KERNEL will be required for allocating network buffers when +	 * swapping over the network so ZONE_HIGHMEM is unusable. +	 * +	 * Throttling is based on the first usable node and throttled processes +	 * wait on a queue until kswapd makes progress and wakes them. There +	 * is an affinity then between processes waking up and where reclaim +	 * progress has been made assuming the process wakes on the same node. +	 * More importantly, processes running on remote nodes will not compete +	 * for remote pfmemalloc reserves and processes on different nodes +	 * should make reasonable progress. +	 */ +	for_each_zone_zonelist_nodemask(zone, z, zonelist, +					gfp_mask, nodemask) { +		if (zone_idx(zone) > ZONE_NORMAL) +			continue; + +		/* Throttle based on the first usable node */ +		pgdat = zone->zone_pgdat; +		if (pfmemalloc_watermark_ok(pgdat)) +			goto out; +		break; +	} + +	/* If no zone was usable by the allocation flags then do not throttle */ +	if (!pgdat)  		goto out;  	/* Account for the throttling */ @@ -2613,9 +2694,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,  		.target_mem_cgroup = NULL,  		.nodemask = nodemask,  	}; -	struct shrink_control shrink = { -		.gfp_mask = sc.gfp_mask, -	};  	/*  	 * Do not enter reclaim if fatal signal was delivered while throttled. 
@@ -2629,7 +2707,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,  				sc.may_writepage,  				gfp_mask); -	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); +	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);  	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2651,6 +2729,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,  		.may_swap = !noswap,  		.order = 0,  		.priority = 0, +		.swappiness = mem_cgroup_swappiness(memcg),  		.target_mem_cgroup = memcg,  	};  	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); @@ -2696,9 +2775,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,  		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |  				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),  	}; -	struct shrink_control shrink = { -		.gfp_mask = sc.gfp_mask, -	};  	/*  	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't @@ -2713,7 +2789,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,  					    sc.may_writepage,  					    sc.gfp_mask); -	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); +	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);  	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2885,9 +2961,8 @@ static bool kswapd_shrink_zone(struct zone *zone,  	 * high wmark plus a "gap" where the gap is either the low  	 * watermark or 1% of the zone, whichever is smaller.  	 */ -	balance_gap = min(low_wmark_pages(zone), -		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / -		KSWAPD_ZONE_BALANCE_GAP_RATIO); +	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( +			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));  	/*  	 * If there is no low memory pressure or the zone is balanced then no @@ -2952,6 +3027,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,  {  	int i;  	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */ +	unsigned long nr_soft_reclaimed; +	unsigned long nr_soft_scanned;  	struct scan_control sc = {  		.gfp_mask = GFP_KERNEL,  		.priority = DEF_PRIORITY, @@ -3066,6 +3143,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,  			sc.nr_scanned = 0; +			nr_soft_scanned = 0; +			/* +			 * Call soft limit reclaim before calling shrink_zone. +			 */ +			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, +							order, sc.gfp_mask, +							&nr_soft_scanned); +			sc.nr_reclaimed += nr_soft_reclaimed; +  			/*  			 * There should be no need to raise the scanning  			 * priority if enough pages are already being scanned @@ -3285,7 +3371,10 @@ static int kswapd(void *p)  		}  	} +	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);  	current->reclaim_state = NULL; +	lockdep_clear_current_reclaim_state(); +  	return 0;  } @@ -3315,27 +3404,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)  	wake_up_interruptible(&pgdat->kswapd_wait);  } -/* - * The reclaimable count would be mostly accurate. 
- * The less reclaimable pages may be - * - mlocked pages, which will be moved to unevictable list when encountered - * - mapped pages, which may require several travels to be reclaimed - * - dirty pages, which is not "instantly" reclaimable - */ -unsigned long global_reclaimable_pages(void) -{ -	int nr; - -	nr = global_page_state(NR_ACTIVE_FILE) + -	     global_page_state(NR_INACTIVE_FILE); - -	if (get_nr_swap_pages() > 0) -		nr += global_page_state(NR_ACTIVE_ANON) + -		      global_page_state(NR_INACTIVE_ANON); - -	return nr; -} -  #ifdef CONFIG_HIBERNATION  /*   * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -3358,9 +3426,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)  		.order = 0,  		.priority = DEF_PRIORITY,  	}; -	struct shrink_control shrink = { -		.gfp_mask = sc.gfp_mask, -	};  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);  	struct task_struct *p = current;  	unsigned long nr_reclaimed; @@ -3370,7 +3435,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)  	reclaim_state.reclaimed_slab = 0;  	p->reclaim_state = &reclaim_state; -	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); +	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);  	p->reclaim_state = NULL;  	lockdep_clear_current_reclaim_state(); @@ -3429,7 +3494,7 @@ int kswapd_run(int nid)  /*   * Called by memory hotplug when all memory in a node is offlined.  Caller must - * hold lock_memory_hotplug(). + * hold mem_hotplug_begin/end().   */  void kswapd_stop(int nid)  { @@ -3719,7 +3784,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)  		if (page_evictable(page)) {  			enum lru_list lru = page_lru_base_type(page); -			VM_BUG_ON(PageActive(page)); +			VM_BUG_ON_PAGE(PageActive(page), page);  			ClearPageUnevictable(page);  			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);  			add_page_to_lru_list(page, lruvec, lru);  | 
