Diffstat (limited to 'mm/swap.c')
-rw-r--r--  mm/swap.c  |  449
1 file changed, 309 insertions(+), 140 deletions(-)
diff --git a/mm/swap.c b/mm/swap.c
index 759c3caf44b..9e8e3472248 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
-#include <linux/hugetlb.h>
#include "internal.h"
@@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page)
spin_lock_irqsave(&zone->lru_lock, flags);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
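Most of the changes in this diff convert bare VM_BUG_ON(cond) assertions into VM_BUG_ON_PAGE(cond, page), so the offending struct page is dumped before the kernel BUGs out. For reference, the macro lives in include/linux/mmdebug.h; the reconstruction below is an approximation from the same kernel era, not part of this patch:

/* Approximate reconstruction of include/linux/mmdebug.h, for reference only */
#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond)			BUG_ON(cond)
#define VM_BUG_ON_PAGE(cond, page)					\
	do {								\
		if (unlikely(cond)) {					\
			dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond) ")"); \
			BUG();						\
		}							\
	} while (0)
#else
#define VM_BUG_ON(cond)			BUILD_BUG_ON_INVALID(cond)
#define VM_BUG_ON_PAGE(cond, page)	VM_BUG_ON(cond)
#endif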
@@ -68,7 +67,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
}
static void __put_compound_page(struct page *page)
@@ -80,101 +79,187 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
-static void put_compound_page(struct page *page)
+/*
+ * Two special cases here: we can avoid taking compound_lock_irqsave
+ * and can skip the tail refcounting (in _mapcount).
+ *
+ * 1. Hugetlbfs page:
+ *
+ * PageHeadHuge will remain true until the compound page
+ * is released and enters the buddy allocator, and it cannot
+ * be split by __split_huge_page_refcount().
+ *
+ * So if we see PageHeadHuge set, and we have the tail page pin,
+ * then we can safely put the head page.
+ *
+ * 2. Slab THP page:
+ *
+ * PG_slab is cleared before the slab frees the head page, and
+ * the tail pin cannot be the last reference left on the head
+ * page, because the slab code is free to reuse the compound
+ * page after a kfree/kmem_cache_free without having to check
+ * if there's any tail pin left. In turn all tail pins must be
+ * released while the head is still pinned by the slab code,
+ * and so we know PG_slab will still be set too.
+ *
+ * So if we see PageSlab set, and we have the tail page pin,
+ * then we can safely put the head page.
+ */
+static __always_inline
+void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
{
/*
- * hugetlbfs pages cannot be split from under us. If this is a
- * hugetlbfs page, check refcount on head page and release the page if
- * the refcount becomes zero.
+ * If @page is a THP tail, we must read the tail page
+ * flags after the head page flags. The
+ * __split_huge_page_refcount side enforces write memory barriers
+ * between clearing PageTail and the point at which the head page
+ * can be freed and reallocated.
*/
- if (PageHuge(page)) {
- page = compound_head(page);
+ smp_rmb();
+ if (likely(PageTail(page))) {
+ /*
+ * __split_huge_page_refcount cannot race
+ * here, see the comment above this function.
+ */
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
+ if (put_page_testzero(page_head)) {
+ /*
+ * If this is the tail of a slab THP page,
+ * the tail pin must not be the last reference
+ * held on the page, because PG_slab cannot
+ * be cleared before all tail pins (which skip
+ * the _mapcount tail refcounting) have been
+ * released.
+ *
+ * If this is the tail of a hugetlbfs page,
+ * the tail pin may be the last reference on
+ * the page instead, because PageHeadHuge will
+ * not go away until the compound page enters
+ * the buddy allocator.
+ */
+ VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
+ __put_compound_page(page_head);
+ }
+ } else
+ /*
+ * __split_huge_page_refcount ran before us,
+ * @page was a THP tail. The split @page_head
+ * has been freed and reallocated as a slab or
+ * hugetlbfs page of smaller order (only
+ * possible if reallocated as slab on x86).
+ */
if (put_page_testzero(page))
- __put_compound_page(page);
-
- return;
- }
-
- if (unlikely(PageTail(page))) {
- /* __split_huge_page_refcount can run under us */
- struct page *page_head = compound_trans_head(page);
+ __put_single_page(page);
+}
- if (likely(page != page_head &&
- get_page_unless_zero(page_head))) {
- unsigned long flags;
+static __always_inline
+void put_refcounted_compound_page(struct page *page_head, struct page *page)
+{
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ unsigned long flags;
- /*
- * THP can not break up slab pages so avoid taking
- * compound_lock(). Slab performs non-atomic bit ops
- * on page->flags for better performance. In particular
- * slab_unlock() in slub used to be a hot path. It is
- * still hot on arches that do not support
- * this_cpu_cmpxchg_double().
- */
- if (PageSlab(page_head)) {
- if (PageTail(page)) {
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
-
- atomic_dec(&page->_mapcount);
- goto skip_lock_tail;
- } else
- goto skip_lock;
- }
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
-skip_lock:
- if (put_page_testzero(page_head))
- __put_single_page(page_head);
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON(page_head != page->first_page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on
- * the compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON(page_mapcount(page) <= 0);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
+ /*
+ * @page_head wasn't a dangling pointer but it may not
+ * be a head page anymore by the time we obtain the
+ * lock. That is ok as long as it can't be freed from
+ * under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
/* __split_huge_page_refcount ran before us */
compound_unlock_irqrestore(page_head, flags);
-
-skip_lock_tail:
if (put_page_testzero(page_head)) {
+ /*
+ * The @page_head may have been freed
+ * and reallocated as a compound page
+ * of smaller order and then freed
+ * again. All we know is that it
+ * cannot have become: a THP page, a
+ * compound page of higher order, a
+ * tail page. That is because we
+ * still hold the refcount of the
+ * split THP tail and page_head was
+ * the THP head before the split.
+ */
if (PageHead(page_head))
__put_compound_page(page_head);
else
__put_single_page(page_head);
}
- } else {
- /* page_head is a dangling pointer */
- VM_BUG_ON(PageTail(page));
- goto out_put_single;
+out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
}
- } else if (put_page_testzero(page)) {
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
+ VM_BUG_ON_PAGE(page_head != page->first_page, page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON_PAGE(1, page_head);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
+ atomic_dec(&page->_mapcount);
+ VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
+ compound_unlock_irqrestore(page_head, flags);
+
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* @page_head is a dangling pointer */
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ goto out_put_single;
}
}
+static void put_compound_page(struct page *page)
+{
+ struct page *page_head;
+
+ /*
+ * We see PageCompound set and PageTail not set, so @page may be:
+ * 1. hugetlbfs head page, or
+ * 2. THP head page.
+ */
+ if (likely(!PageTail(page))) {
+ if (put_page_testzero(page)) {
+ /*
+ * By the time all refcounts have been released
+ * split_huge_page cannot run anymore from under us.
+ */
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
+ }
+ return;
+ }
+
+ /*
+ * We see PageCompound set and PageTail set, so @page may be:
+ * 1. a tail hugetlbfs page, or
+ * 2. a tail THP page, or
+ * 3. a split THP page.
+ *
+ * Case 3 is possible, as we may race with
+ * __split_huge_page_refcount tearing down a THP page.
+ */
+ page_head = compound_head_by_tail(page);
+ if (!__compound_tail_refcounted(page_head))
+ put_unrefcounted_compound_page(page_head, page);
+ else
+ put_refcounted_compound_page(page_head, page);
+}
+
void put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
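The reorganized put_compound_page() above leans on two helpers from include/linux/mm.h that are not shown in this diff. As a rough, non-authoritative sketch (reconstructed from the same kernel era; the exact bodies may differ), they look like this:

/* Sketch of the helpers used by put_compound_page(); not part of this diff */
static inline bool __compound_tail_refcounted(struct page *page)
{
	/*
	 * Only genuine THP pages refcount their tails in _mapcount;
	 * slab and hugetlbfs compound pages do not.
	 */
	return !PageSlab(page) && !PageHeadHuge(page);
}

static inline struct page *compound_head_by_tail(struct page *tail)
{
	struct page *head = tail->first_page;

	/*
	 * page->first_page may be a dangling pointer to an old compound
	 * page, so recheck that @tail is still a tail page before
	 * trusting @head.
	 */
	smp_rmb();
	if (likely(PageTail(tail)))
		return head;
	return tail;
}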
@@ -198,52 +283,54 @@ bool __get_page_tail(struct page *page)
* proper PT lock that already serializes against
* split_huge_page().
*/
- bool got = false;
- struct page *page_head;
-
- /*
- * If this is a hugetlbfs page it cannot be split under us. Simply
- * increment refcount for the head page.
- */
- if (PageHuge(page)) {
- page_head = compound_head(page);
- atomic_inc(&page_head->_count);
- got = true;
- } else {
- unsigned long flags;
-
- page_head = compound_trans_head(page);
- if (likely(page != page_head &&
- get_page_unless_zero(page_head))) {
-
- /* Ref to put_compound_page() comment. */
- if (PageSlab(page_head)) {
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- return true;
- } else {
- put_page(page_head);
- return false;
- }
- }
-
+ unsigned long flags;
+ bool got;
+ struct page *page_head = compound_head(page);
+
+ /* See the comment in put_compound_page(). */
+ if (!__compound_tail_refcounted(page_head)) {
+ smp_rmb();
+ if (likely(PageTail(page))) {
/*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
+ * This is a hugetlbfs page or a slab
+ * page. __split_huge_page_refcount
+ * cannot race here.
*/
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
- }
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ __get_page_tail_foll(page, true);
+ return true;
+ } else {
+ /*
+ * __split_huge_page_refcount ran
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as a slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
+ return false;
}
}
+
+ got = false;
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ /* here __split_huge_page_refcount won't run anymore */
+ if (likely(PageTail(page))) {
+ __get_page_tail_foll(page, false);
+ got = true;
+ }
+ compound_unlock_irqrestore(page_head, flags);
+ if (unlikely(!got))
+ put_page(page_head);
+ }
return got;
}
EXPORT_SYMBOL(__get_page_tail);
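__get_page_tail() is only reached from the get_page() fast path when it sees a tail page. For context, here is a rough reconstruction of that caller from include/linux/mm.h of the same era (an approximation, not part of this patch):

static inline void get_page(struct page *page)
{
	if (unlikely(PageTail(page)))
		if (likely(__get_page_tail(page)))
			return;
	/*
	 * Getting a normal page or the head of a compound page
	 * requires an already elevated page->_count.
	 */
	VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
	atomic_inc(&page->_count);
}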
@@ -386,7 +473,7 @@ void rotate_reclaimable_page(struct page *page)
page_cache_get(page);
local_irq_save(flags);
- pvec = &__get_cpu_var(lru_rotate_pvecs);
+ pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page))
pagevec_move_tail(pvec);
local_irq_restore(flags);
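The hunk above follows the tree-wide move from __get_cpu_var() to this_cpu_ptr(). Below is a minimal, self-contained illustration of the idiom; the per-cpu variable and function are invented for the example:

#include <linux/percpu.h>
#include <linux/irqflags.h>

/* example_counter and bump_this_cpu_counter() are hypothetical */
static DEFINE_PER_CPU(unsigned long, example_counter);

static void bump_this_cpu_counter(void)
{
	unsigned long flags;
	unsigned long *cnt;

	local_irq_save(flags);
	/* this_cpu_ptr(&var) replaces the older &__get_cpu_var(var) */
	cnt = this_cpu_ptr(&example_counter);
	(*cnt)++;
	local_irq_restore(flags);
}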
@@ -519,6 +606,8 @@ void mark_page_accessed(struct page *page)
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
+ if (page_is_file_cache(page))
+ workingset_activation(page);
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
@@ -526,12 +615,17 @@ void mark_page_accessed(struct page *page)
EXPORT_SYMBOL(mark_page_accessed);
/*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
- * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * Used in place of mark_page_accessed() on a page that is not yet visible
+ * to anyone else, i.e. while it is still safe to use non-atomic ops.
*/
-void __lru_cache_add(struct page *page)
+void init_page_accessed(struct page *page)
+{
+ if (!PageReferenced(page))
+ __SetPageReferenced(page);
+}
+EXPORT_SYMBOL(init_page_accessed);
+
+static void __lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
@@ -541,16 +635,39 @@ void __lru_cache_add(struct page *page)
pagevec_add(pvec, page);
put_cpu_var(lru_add_pvec);
}
-EXPORT_SYMBOL(__lru_cache_add);
+
+/**
+ * lru_cache_add_anon - add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add_anon(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+
+void lru_cache_add_file(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+EXPORT_SYMBOL(lru_cache_add_file);
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives the caller of lru_cache_add() a chance to
+ * have the page added to the active list using mark_page_accessed().
*/
void lru_cache_add(struct page *page)
{
- VM_BUG_ON(PageActive(page) && PageUnevictable(page));
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
__lru_cache_add(page);
}
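To show how init_page_accessed() and the lru_cache_add_*() variants introduced above are meant to be combined, here is a hedged sketch of a typical caller; add_and_queue_page() and its call sequence are invented for illustration and are not part of this patch:

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/gfp.h>

/* add_and_queue_page() is a hypothetical helper, not kernel code */
static int add_and_queue_page(struct address_space *mapping,
			      struct page *page, pgoff_t offset)
{
	int err;

	/*
	 * The freshly allocated page is not visible to anyone else yet,
	 * so the non-atomic __SetPageReferenced() inside
	 * init_page_accessed() is still safe.
	 */
	init_page_accessed(page);

	err = add_to_page_cache(page, mapping, offset, GFP_KERNEL);
	if (err)
		return err;

	/* Queue the page for the LRU via the per-cpu pagevec. */
	lru_cache_add_file(page);

	/* The caller is expected to unlock_page() once the page is up to date. */
	return 0;
}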
@@ -756,7 +873,7 @@ void lru_add_drain_all(void)
* grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
* will free it.
*/
-void release_pages(struct page **pages, int nr, int cold)
+void release_pages(struct page **pages, int nr, bool cold)
{
int i;
LIST_HEAD(pages_to_free);
@@ -791,13 +908,13 @@ void release_pages(struct page **pages, int nr, int cold)
}
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
/* Clear Active bit in case of parallel mark_page_accessed */
- ClearPageActive(page);
+ __ClearPageActive(page);
list_add(&page->lru, &pages_to_free);
}
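Tying the release_pages() changes together: the cold hint is now a bool, and the Active bit is cleared with the non-atomic __ClearPageActive(), which is safe because the page has already been taken off the LRU and nobody else can reach it. A hedged usage fragment for the new signature follows; drop_cold_pages() is an invented wrapper, not part of this patch:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* drop_cold_pages() is hypothetical; it only shows the bool cold hint */
static void drop_cold_pages(struct page **pages, int nr)
{
	/* true: treat the pages as cache-cold when freeing them */
	release_pages(pages, nr, true);
}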
@@ -833,9 +950,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
{
const int file = 0;
- VM_BUG_ON(!PageHead(page));
- VM_BUG_ON(PageCompound(page_tail));
- VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page_tail), page);
+ VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
@@ -874,7 +991,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
int active = PageActive(page);
enum lru_list lru = page_lru(page);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, lru);
@@ -893,6 +1010,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_lru_add);
/**
+ * pagevec_lookup_entries - gang pagecache lookup
+ * @pvec: Where the resulting entries are placed
+ * @mapping: The address_space to search
+ * @start: The starting entry index
+ * @nr_entries: The maximum number of entries
+ * @indices: The cache indices corresponding to the entries in @pvec
+ *
+ * pagevec_lookup_entries() will search for and return a group of up
+ * to @nr_entries pages and shadow entries in the mapping. All
+ * entries are placed in @pvec. pagevec_lookup_entries() takes a
+ * reference against actual pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous entries with
+ * ascending indexes. There may be holes in the indices due to
+ * not-present entries.
+ *
+ * pagevec_lookup_entries() returns the number of entries which were
+ * found.
+ */
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+ struct address_space *mapping,
+ pgoff_t start, unsigned nr_entries,
+ pgoff_t *indices)
+{
+ pvec->nr = find_get_entries(mapping, start, nr_entries,
+ pvec->pages, indices);
+ return pagevec_count(pvec);
+}
+
+/**
+ * pagevec_remove_exceptionals - pagevec exceptionals pruning
+ * @pvec: The pagevec to prune
+ *
+ * pagevec_lookup_entries() fills both pages and exceptional radix
+ * tree entries into the pagevec. This function prunes all
+ * exceptionals from @pvec without leaving holes, so that it can be
+ * passed on to page-only pagevec operations.
+ */
+void pagevec_remove_exceptionals(struct pagevec *pvec)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
+}
+
+/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
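As a usage illustration for the two new pagevec helpers above, a caller (the truncate and invalidate paths are the intended users) would follow a pattern roughly like this; walk_mapping_pages() is a made-up name and the body is a hedged sketch, not code from this patch:

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

/* walk_mapping_pages() is hypothetical, shown only to illustrate usage */
static void walk_mapping_pages(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_entries(&pvec, mapping, index,
				      PAGEVEC_SIZE, indices)) {
		/* Remember where to continue before pruning the pagevec. */
		index = indices[pagevec_count(&pvec) - 1] + 1;

		/* Drop shadow (exceptional) entries, keep only real pages. */
		pagevec_remove_exceptionals(&pvec);

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* ... operate on the still-referenced @page ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}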
@@ -934,7 +1102,8 @@ void __init swap_setup(void)
#ifdef CONFIG_SWAP
int i;
- bdi_init(swapper_spaces[0].backing_dev_info);
+ if (bdi_init(swapper_spaces[0].backing_dev_info))
+ panic("Failed to init swap bdi");
for (i = 0; i < MAX_SWAPFILES; i++) {
spin_lock_init(&swapper_spaces[i].tree_lock);
INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);