Diffstat (limited to 'mm/swap.c')
-rw-r--r--  mm/swap.c | 449
1 file changed, 309 insertions, 140 deletions
diff --git a/mm/swap.c b/mm/swap.c
index 759c3caf44b..9e8e3472248 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
 #include <linux/memcontrol.h>
 #include <linux/gfp.h>
 #include <linux/uio.h>
-#include <linux/hugetlb.h>
 
 #include "internal.h"
 
@@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page)
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		lruvec = mem_cgroup_page_lruvec(page, zone);
-		VM_BUG_ON(!PageLRU(page));
+		VM_BUG_ON_PAGE(!PageLRU(page), page);
 		__ClearPageLRU(page);
 		del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -68,7 +67,7 @@ static void __page_cache_release(struct page *page)
 static void __put_single_page(struct page *page)
 {
 	__page_cache_release(page);
-	free_hot_cold_page(page, 0);
+	free_hot_cold_page(page, false);
 }
 
 static void __put_compound_page(struct page *page)
@@ -80,101 +79,187 @@ static void __put_compound_page(struct page *page)
 	(*dtor)(page);
 }
 
-static void put_compound_page(struct page *page)
+/**
+ * Two special cases here: we could avoid taking compound_lock_irqsave
+ * and could skip the tail refcounting(in _mapcount).
+ *
+ * 1. Hugetlbfs page:
+ *
+ *    PageHeadHuge will remain true until the compound page
+ *    is released and enters the buddy allocator, and it could
+ *    not be split by __split_huge_page_refcount().
+ *
+ *    So if we see PageHeadHuge set, and we have the tail page pin,
+ *    then we could safely put head page.
+ *
+ * 2. Slab THP page:
+ *
+ *    PG_slab is cleared before the slab frees the head page, and
+ *    tail pin cannot be the last reference left on the head page,
+ *    because the slab code is free to reuse the compound page
+ *    after a kfree/kmem_cache_free without having to check if
+ *    there's any tail pin left.  In turn all tail pins must be always
+ *    released while the head is still pinned by the slab code
+ *    and so we know PG_slab will be still set too.
+ *
+ *    So if we see PageSlab set, and we have the tail page pin,
+ *    then we could safely put head page.
+ */
+static __always_inline
+void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
 {
 	/*
-	 * hugetlbfs pages cannot be split from under us.  If this is a
-	 * hugetlbfs page, check refcount on head page and release the page if
-	 * the refcount becomes zero.
+	 * If @page is a THP tail, we must read the tail page
+	 * flags after the head page flags. The
+	 * __split_huge_page_refcount side enforces write memory barriers
+	 * between clearing PageTail and before the head page
+	 * can be freed and reallocated.
 	 */
-	if (PageHuge(page)) {
-		page = compound_head(page);
+	smp_rmb();
+	if (likely(PageTail(page))) {
+		/*
+		 * __split_huge_page_refcount cannot race
+		 * here, see the comment above this function.
+		 */
+		VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+		VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
+		if (put_page_testzero(page_head)) {
+			/*
+			 * If this is the tail of a slab THP page,
+			 * the tail pin must not be the last reference
+			 * held on the page, because the PG_slab cannot
+			 * be cleared before all tail pins (which skips
+			 * the _mapcount tail refcounting) have been
+			 * released.
+			 *
+			 * If this is the tail of a hugetlbfs page,
+			 * the tail pin may be the last reference on
+			 * the page instead, because PageHeadHuge will
+			 * not go away until the compound page enters
+			 * the buddy allocator.
+			 */
+			VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
+			__put_compound_page(page_head);
+		}
+	} else
+		/*
+		 * __split_huge_page_refcount run before us,
+		 * @page was a THP tail. The split @page_head
+		 * has been freed and reallocated as slab or
+		 * hugetlbfs page of smaller order (only
+		 * possible if reallocated as slab on x86).
+		 */
 		if (put_page_testzero(page))
-			__put_compound_page(page);
-
-		return;
-	}
-
-	if (unlikely(PageTail(page))) {
-		/* __split_huge_page_refcount can run under us */
-		struct page *page_head = compound_trans_head(page);
+			__put_single_page(page);
+}
 
-		if (likely(page != page_head &&
-			   get_page_unless_zero(page_head))) {
-			unsigned long flags;
+static __always_inline
+void put_refcounted_compound_page(struct page *page_head, struct page *page)
+{
+	if (likely(page != page_head && get_page_unless_zero(page_head))) {
+		unsigned long flags;
 
-			/*
-			 * THP can not break up slab pages so avoid taking
-			 * compound_lock().  Slab performs non-atomic bit ops
-			 * on page->flags for better performance.  In particular
-			 * slab_unlock() in slub used to be a hot path.  It is
-			 * still hot on arches that do not support
-			 * this_cpu_cmpxchg_double().
-			 */
-			if (PageSlab(page_head)) {
-				if (PageTail(page)) {
-					if (put_page_testzero(page_head))
-						VM_BUG_ON(1);
-
-					atomic_dec(&page->_mapcount);
-					goto skip_lock_tail;
-				} else
-					goto skip_lock;
-			}
-			/*
-			 * page_head wasn't a dangling pointer but it
-			 * may not be a head page anymore by the time
-			 * we obtain the lock. That is ok as long as it
-			 * can't be freed from under us.
-			 */
-			flags = compound_lock_irqsave(page_head);
-			if (unlikely(!PageTail(page))) {
-				/* __split_huge_page_refcount run before us */
-				compound_unlock_irqrestore(page_head, flags);
-skip_lock:
-				if (put_page_testzero(page_head))
-					__put_single_page(page_head);
-out_put_single:
-				if (put_page_testzero(page))
-					__put_single_page(page);
-				return;
-			}
-			VM_BUG_ON(page_head != page->first_page);
-			/*
-			 * We can release the refcount taken by
-			 * get_page_unless_zero() now that
-			 * __split_huge_page_refcount() is blocked on
-			 * the compound_lock.
-			 */
-			if (put_page_testzero(page_head))
-				VM_BUG_ON(1);
-			/* __split_huge_page_refcount will wait now */
-			VM_BUG_ON(page_mapcount(page) <= 0);
-			atomic_dec(&page->_mapcount);
-			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
-			VM_BUG_ON(atomic_read(&page->_count) != 0);
+		/*
+		 * @page_head wasn't a dangling pointer but it may not
+		 * be a head page anymore by the time we obtain the
+		 * lock. That is ok as long as it can't be freed from
+		 * under us.
+		 */
+		flags = compound_lock_irqsave(page_head);
+		if (unlikely(!PageTail(page))) {
+			/* __split_huge_page_refcount run before us */
 			compound_unlock_irqrestore(page_head, flags);
-
-skip_lock_tail:
 			if (put_page_testzero(page_head)) {
+				/*
+				 * The @page_head may have been freed
+				 * and reallocated as a compound page
+				 * of smaller order and then freed
+				 * again.  All we know is that it
+				 * cannot have become: a THP page, a
+				 * compound page of higher order, a
+				 * tail page.  That is because we
+				 * still hold the refcount of the
+				 * split THP tail and page_head was
+				 * the THP head before the split.
+				 */
 				if (PageHead(page_head))
 					__put_compound_page(page_head);
 				else
 					__put_single_page(page_head);
 			}
-		} else {
-			/* page_head is a dangling pointer */
-			VM_BUG_ON(PageTail(page));
-			goto out_put_single;
+out_put_single:
+			if (put_page_testzero(page))
+				__put_single_page(page);
+			return;
 		}
-	} else if (put_page_testzero(page)) {
-		if (PageHead(page))
-			__put_compound_page(page);
-		else
-			__put_single_page(page);
+		VM_BUG_ON_PAGE(page_head != page->first_page, page);
+		/*
+		 * We can release the refcount taken by
+		 * get_page_unless_zero() now that
+		 * __split_huge_page_refcount() is blocked on the
+		 * compound_lock.
+		 */
+		if (put_page_testzero(page_head))
+			VM_BUG_ON_PAGE(1, page_head);
+		/* __split_huge_page_refcount will wait now */
+		VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
+		atomic_dec(&page->_mapcount);
+		VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
+		VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
+		compound_unlock_irqrestore(page_head, flags);
+
+		if (put_page_testzero(page_head)) {
+			if (PageHead(page_head))
+				__put_compound_page(page_head);
+			else
+				__put_single_page(page_head);
+		}
+	} else {
+		/* @page_head is a dangling pointer */
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		goto out_put_single;
 	}
 }
 
+static void put_compound_page(struct page *page)
+{
+	struct page *page_head;
+
+	/*
+	 * We see the PageCompound set and PageTail not set, so @page maybe:
+	 *  1. hugetlbfs head page, or
+	 *  2. THP head page.
+	 */
+	if (likely(!PageTail(page))) {
+		if (put_page_testzero(page)) {
+			/*
+			 * By the time all refcounts have been released
+			 * split_huge_page cannot run anymore from under us.
+			 */
+			if (PageHead(page))
+				__put_compound_page(page);
+			else
+				__put_single_page(page);
+		}
+		return;
+	}
+
+	/*
+	 * We see the PageCompound set and PageTail set, so @page maybe:
+	 *  1. a tail hugetlbfs page, or
+	 *  2. a tail THP page, or
+	 *  3. a split THP page.
+	 *
+	 *  Case 3 is possible, as we may race with
+	 *  __split_huge_page_refcount tearing down a THP page.
+	 */
+	page_head = compound_head_by_tail(page);
+	if (!__compound_tail_refcounted(page_head))
+		put_unrefcounted_compound_page(page_head, page);
+	else
+		put_refcounted_compound_page(page_head, page);
+}
+
 void put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
@@ -198,52 +283,54 @@ bool __get_page_tail(struct page *page)
 	 * proper PT lock that already serializes against
 	 * split_huge_page().
 	 */
-	bool got = false;
-	struct page *page_head;
-
-	/*
-	 * If this is a hugetlbfs page it cannot be split under us.  Simply
-	 * increment refcount for the head page.
-	 */
-	if (PageHuge(page)) {
-		page_head = compound_head(page);
-		atomic_inc(&page_head->_count);
-		got = true;
-	} else {
-		unsigned long flags;
-
-		page_head = compound_trans_head(page);
-		if (likely(page != page_head &&
-					get_page_unless_zero(page_head))) {
-
-			/* Ref to put_compound_page() comment. */
-			if (PageSlab(page_head)) {
-				if (likely(PageTail(page))) {
-					__get_page_tail_foll(page, false);
-					return true;
-				} else {
-					put_page(page_head);
-					return false;
-				}
-			}
-
+	unsigned long flags;
+	bool got;
+	struct page *page_head = compound_head(page);
+
+	/* Ref to put_compound_page() comment. */
+	if (!__compound_tail_refcounted(page_head)) {
+		smp_rmb();
+		if (likely(PageTail(page))) {
 			/*
-			 * page_head wasn't a dangling pointer but it
-			 * may not be a head page anymore by the time
-			 * we obtain the lock. That is ok as long as it
-			 * can't be freed from under us.
+			 * This is a hugetlbfs page or a slab
+			 * page. __split_huge_page_refcount
+			 * cannot race here.
 			 */
-			flags = compound_lock_irqsave(page_head);
-			/* here __split_huge_page_refcount won't run anymore */
-			if (likely(PageTail(page))) {
-				__get_page_tail_foll(page, false);
-				got = true;
-			}
-			compound_unlock_irqrestore(page_head, flags);
-			if (unlikely(!got))
-				put_page(page_head);
+			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+			__get_page_tail_foll(page, true);
+			return true;
+		} else {
+			/*
+			 * __split_huge_page_refcount run
+			 * before us, "page" was a THP
+			 * tail. The split page_head has been
+			 * freed and reallocated as slab or
+			 * hugetlbfs page of smaller order
+			 * (only possible if reallocated as
+			 * slab on x86).
+			 */
+			return false;
 		}
 	}
+
+	got = false;
+	if (likely(page != page_head && get_page_unless_zero(page_head))) {
+		/*
+		 * page_head wasn't a dangling pointer but it
+		 * may not be a head page anymore by the time
+		 * we obtain the lock. That is ok as long as it
+		 * can't be freed from under us.
+		 */
+		flags = compound_lock_irqsave(page_head);
+		/* here __split_huge_page_refcount won't run anymore */
+		if (likely(PageTail(page))) {
+			__get_page_tail_foll(page, false);
+			got = true;
+		}
+		compound_unlock_irqrestore(page_head, flags);
+		if (unlikely(!got))
+			put_page(page_head);
+	}
 	return got;
 }
 EXPORT_SYMBOL(__get_page_tail);
@@ -386,7 +473,7 @@ void rotate_reclaimable_page(struct page *page)
 
 		page_cache_get(page);
 		local_irq_save(flags);
-		pvec = &__get_cpu_var(lru_rotate_pvecs);
+		pvec = this_cpu_ptr(&lru_rotate_pvecs);
 		if (!pagevec_add(pvec, page))
 			pagevec_move_tail(pvec);
 		local_irq_restore(flags);
@@ -519,6 +606,8 @@ void mark_page_accessed(struct page *page)
 		else
 			__lru_cache_activate_page(page);
 		ClearPageReferenced(page);
+		if (page_is_file_cache(page))
+			workingset_activation(page);
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
 	}
@@ -526,12 +615,17 @@ void mark_page_accessed(struct page *page)
 EXPORT_SYMBOL(mark_page_accessed);
 
 /*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
- * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * Used to mark_page_accessed(page) that is not visible yet and when it is
+ * still safe to use non-atomic ops
  */
-void __lru_cache_add(struct page *page)
+void init_page_accessed(struct page *page)
+{
+	if (!PageReferenced(page))
+		__SetPageReferenced(page);
+}
+EXPORT_SYMBOL(init_page_accessed);
+
+static void __lru_cache_add(struct page *page)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 
@@ -541,16 +635,39 @@ void __lru_cache_add(struct page *page)
 	pagevec_add(pvec, page);
 	put_cpu_var(lru_add_pvec);
 }
-EXPORT_SYMBOL(__lru_cache_add);
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add_anon(struct page *page)
+{
+	if (PageActive(page))
+		ClearPageActive(page);
+	__lru_cache_add(page);
+}
+
+void lru_cache_add_file(struct page *page)
+{
+	if (PageActive(page))
+		ClearPageActive(page);
+	__lru_cache_add(page);
+}
+EXPORT_SYMBOL(lru_cache_add_file);
 
 /**
  * lru_cache_add - add a page to a page list
  * @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
+ * have the page added to the active list using mark_page_accessed().
  */
 void lru_cache_add(struct page *page)
 {
-	VM_BUG_ON(PageActive(page) && PageUnevictable(page));
-	VM_BUG_ON(PageLRU(page));
+	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+	VM_BUG_ON_PAGE(PageLRU(page), page);
 	__lru_cache_add(page);
 }
 
@@ -756,7 +873,7 @@ void lru_add_drain_all(void)
  * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
  * will free it.
  */
-void release_pages(struct page **pages, int nr, int cold)
+void release_pages(struct page **pages, int nr, bool cold)
 {
 	int i;
 	LIST_HEAD(pages_to_free);
@@ -791,13 +908,13 @@ void release_pages(struct page **pages, int nr, int cold)
 			}
 
 			lruvec = mem_cgroup_page_lruvec(page, zone);
-			VM_BUG_ON(!PageLRU(page));
+			VM_BUG_ON_PAGE(!PageLRU(page), page);
 			__ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		}
 
 		/* Clear Active bit in case of parallel mark_page_accessed */
-		ClearPageActive(page);
+		__ClearPageActive(page);
 
 		list_add(&page->lru, &pages_to_free);
 	}
@@ -833,9 +950,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 {
 	const int file = 0;
 
-	VM_BUG_ON(!PageHead(page));
-	VM_BUG_ON(PageCompound(page_tail));
-	VM_BUG_ON(PageLRU(page_tail));
+	VM_BUG_ON_PAGE(!PageHead(page), page);
+	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
+	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
 	VM_BUG_ON(NR_CPUS != 1 &&
 		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
 
@@ -874,7 +991,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 	int active = PageActive(page);
 	enum lru_list lru = page_lru(page);
 
-	VM_BUG_ON(PageLRU(page));
+	VM_BUG_ON_PAGE(PageLRU(page), page);
 
 	SetPageLRU(page);
 	add_page_to_lru_list(page, lruvec, lru);
@@ -893,6 +1010,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
 EXPORT_SYMBOL(__pagevec_lru_add);
 
 /**
+ * pagevec_lookup_entries - gang pagecache lookup
+ * @pvec:	Where the resulting entries are placed
+ * @mapping:	The address_space to search
+ * @start:	The starting entry index
+ * @nr_entries:	The maximum number of entries
+ * @indices:	The cache indices corresponding to the entries in @pvec
+ *
+ * pagevec_lookup_entries() will search for and return a group of up
+ * to @nr_entries pages and shadow entries in the mapping.  All
+ * entries are placed in @pvec.  pagevec_lookup_entries() takes a
+ * reference against actual pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous entries with
+ * ascending indexes.  There may be holes in the indices due to
+ * not-present entries.
+ *
+ * pagevec_lookup_entries() returns the number of entries which were
+ * found.
+ */
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+				struct address_space *mapping,
+				pgoff_t start, unsigned nr_pages,
+				pgoff_t *indices)
+{
+	pvec->nr = find_get_entries(mapping, start, nr_pages,
+				    pvec->pages, indices);
+	return pagevec_count(pvec);
+}
+
+/**
+ * pagevec_remove_exceptionals - pagevec exceptionals pruning
+ * @pvec:	The pagevec to prune
+ *
+ * pagevec_lookup_entries() fills both pages and exceptional radix
+ * tree entries into the pagevec.  This function prunes all
+ * exceptionals from @pvec without leaving holes, so that it can be
+ * passed on to page-only pagevec operations.
+ */
+void pagevec_remove_exceptionals(struct pagevec *pvec)
+{
+	int i, j;
+
+	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		if (!radix_tree_exceptional_entry(page))
+			pvec->pages[j++] = page;
+	}
+	pvec->nr = j;
+}
+
+/**
  * pagevec_lookup - gang pagecache lookup
  * @pvec:	Where the resulting pages are placed
  * @mapping:	The address_space to search
@@ -934,7 +1102,8 @@ void __init swap_setup(void)
 #ifdef CONFIG_SWAP
 	int i;
 
-	bdi_init(swapper_spaces[0].backing_dev_info);
+	if (bdi_init(swapper_spaces[0].backing_dev_info))
+		panic("Failed to init swap bdi");
 	for (i = 0; i < MAX_SWAPFILES; i++) {
 		spin_lock_init(&swapper_spaces[i].tree_lock);
 		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
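
Note on the new put_compound_page() dispatch: it keys off __compound_tail_refcounted(page_head), a helper that lives outside this file. Per the comment block added above put_unrefcounted_compound_page(), the only compound pages that skip the _mapcount tail refcounting are hugetlbfs and slab pages, so a predicate with that meaning reduces to the hedged sketch below (compound_tail_is_refcounted is a hypothetical name, not the in-tree symbol):

#include <linux/mm.h>
#include <linux/hugetlb.h>	/* PageHeadHuge() */

/*
 * Hedged sketch, not the in-tree helper: per the comment above
 * put_unrefcounted_compound_page(), only slab and hugetlbfs compound
 * pages skip tail refcounting, so "refcounted" means neither of them.
 */
static inline bool compound_tail_is_refcounted(struct page *page_head)
{
	return !PageSlab(page_head) && !PageHeadHuge(page_head);
}

Tails of unrefcounted pages go to put_unrefcounted_compound_page(), which per its comment needs only smp_rmb() and no compound_lock; everything else goes to put_refcounted_compound_page().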
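
The new init_page_accessed() is for pages not yet visible to other CPUs, where the non-atomic __SetPageReferenced() is still safe; once a page is published in the page cache only the atomic mark_page_accessed() may be used. A hedged sketch of a call site (grab_new_cache_page() is a hypothetical helper, not part of this patch):

#include <linux/pagemap.h>
#include <linux/swap.h>

static struct page *grab_new_cache_page(struct address_space *mapping,
					pgoff_t index, gfp_t gfp)
{
	struct page *page = __page_cache_alloc(gfp);

	if (!page)
		return NULL;

	/* page is still private to us: non-atomic PG_referenced is safe */
	init_page_accessed(page);

	if (add_to_page_cache_lru(page, mapping, index, gfp)) {
		/* insertion failed, drop the allocation reference */
		page_cache_release(page);
		return NULL;
	}
	/* from here on, only mark_page_accessed() may touch PG_referenced */
	return page;
}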
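
The pair pagevec_lookup_entries()/pagevec_remove_exceptionals() lets mapping walkers see shadow (exceptional) radix tree entries as well as real pages. A hedged usage sketch in the style of a truncate/invalidate loop; walk_mapping_entries() and the per-page work are hypothetical, only the two new helpers and the existing pagevec API are assumed:

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>

static void walk_mapping_entries(struct address_space *mapping)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	pgoff_t index = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_entries(&pvec, mapping, index,
				      PAGEVEC_SIZE, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* resume the walk after the last entry we saw */
			index = indices[i] + 1;

			/* shadow entries carry no struct page to touch */
			if (radix_tree_exceptional_entry(page))
				continue;

			/* ... operate on the real, referenced page ... */
		}
		/* prune shadow entries so page-only helpers are safe */
		pagevec_remove_exceptionals(&pvec);
		/* drops the references taken by the lookup */
		pagevec_release(&pvec);
	}
}

pagevec_remove_exceptionals() has to run before pagevec_release() here, since release walks pvec->pages expecting real pages only.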
