aboutsummaryrefslogtreecommitdiff
path: root/mm/ksm.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/ksm.c')
-rw-r--r--mm/ksm.c1012
1 files changed, 700 insertions, 312 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
index 65ab5c7067d..346ddc9e4c0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,11 +33,22 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
+#include <linux/freezer.h>
+#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/tlbflush.h>
#include "internal.h"
+#ifdef CONFIG_NUMA
+#define NUMA(x) (x)
+#define DO_NUMA(x) do { (x); } while (0)
+#else
+#define NUMA(x) (0)
+#define DO_NUMA(x) do { } while (0)
+#endif
+
/*
* A few notes about the KSM scanning process,
* to make it easier to understand the data structures below:
@@ -76,6 +87,9 @@
* take 10 attempts to find a page in the unstable tree, once it is found,
* it is secured in the stable tree. (When we scan a new page, we first
* compare it against the stable tree, and then against the unstable tree.)
+ *
+ * If the merge_across_nodes tunable is unset, then KSM maintains multiple
+ * stable trees and multiple unstable trees: one of each for each NUMA node.
*/
/**
@@ -111,19 +125,32 @@ struct ksm_scan {
/**
* struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
* @hlist: hlist head of rmap_items using this ksm page
- * @kpfn: page frame number of this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
struct stable_node {
- struct rb_node node;
+ union {
+ struct rb_node node; /* when node of stable tree */
+ struct { /* when listed for migration */
+ struct list_head *head;
+ struct list_head list;
+ };
+ };
struct hlist_head hlist;
unsigned long kpfn;
+#ifdef CONFIG_NUMA
+ int nid;
+#endif
};
/**
* struct rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @nid: NUMA node id of unstable tree in which linked (may not match page)
* @mm: the memory structure this rmap_item is pointing into
* @address: the virtual address this rmap_item tracks (+ flags in low bits)
* @oldchecksum: previous checksum of the page at that virtual address
@@ -133,7 +160,12 @@ struct stable_node {
*/
struct rmap_item {
struct rmap_item *rmap_list;
- struct anon_vma *anon_vma; /* when stable */
+ union {
+ struct anon_vma *anon_vma; /* when stable */
+#ifdef CONFIG_NUMA
+ int nid; /* when node of unstable tree */
+#endif
+ };
struct mm_struct *mm;
unsigned long address; /* + low bits used for flags below */
unsigned int oldchecksum; /* when unstable */
@@ -151,12 +183,16 @@ struct rmap_item {
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
/* The stable and unstable tree heads */
-static struct rb_root root_stable_tree = RB_ROOT;
-static struct rb_root root_unstable_tree = RB_ROOT;
+static struct rb_root one_stable_tree[1] = { RB_ROOT };
+static struct rb_root one_unstable_tree[1] = { RB_ROOT };
+static struct rb_root *root_stable_tree = one_stable_tree;
+static struct rb_root *root_unstable_tree = one_unstable_tree;
+
+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
-#define MM_SLOTS_HASH_SHIFT 10
-#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
-static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
+#define MM_SLOTS_HASH_BITS 10
+static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -187,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
+#ifdef CONFIG_NUMA
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_across_nodes = 1;
+static int ksm_nr_node_ids = 1;
+#else
+#define ksm_merge_across_nodes 1U
+#define ksm_nr_node_ids 1
+#endif
+
#define KSM_RUN_STOP 0
#define KSM_RUN_MERGE 1
#define KSM_RUN_UNMERGE 2
-static unsigned int ksm_run = KSM_RUN_STOP;
+#define KSM_RUN_OFFLINE 4
+static unsigned long ksm_run = KSM_RUN_STOP;
+static void wait_while_offlining(void);
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
@@ -273,45 +320,20 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
- struct hlist_head *bucket;
- struct hlist_node *node;
+ struct mm_slot *slot;
+
+ hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
+ if (slot->mm == mm)
+ return slot;
- bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
- hlist_for_each_entry(mm_slot, node, bucket, link) {
- if (mm == mm_slot->mm)
- return mm_slot;
- }
return NULL;
}
static void insert_to_mm_slots_hash(struct mm_struct *mm,
struct mm_slot *mm_slot)
{
- struct hlist_head *bucket;
-
- bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
mm_slot->mm = mm;
- hlist_add_head(&mm_slot->link, bucket);
-}
-
-static inline int in_stable_tree(struct rmap_item *rmap_item)
-{
- return rmap_item->address & STABLE_FLAG;
-}
-
-static void hold_anon_vma(struct rmap_item *rmap_item,
- struct anon_vma *anon_vma)
-{
- rmap_item->anon_vma = anon_vma;
- get_anon_vma(anon_vma);
-}
-
-static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
-{
- struct anon_vma *anon_vma = rmap_item->anon_vma;
-
- drop_anon_vma(anon_vma);
+ hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}
/*
@@ -345,7 +367,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
- page = follow_page(vma, addr, FOLL_GET);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
@@ -386,6 +408,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
+static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ if (ksm_test_exit(mm))
+ return NULL;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ return NULL;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ return NULL;
+ return vma;
+}
+
static void break_cow(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -396,21 +432,29 @@ static void break_cow(struct rmap_item *rmap_item)
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto out;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- goto out;
- if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
- goto out;
- break_ksm(vma, addr);
-out:
+ vma = find_mergeable_vma(mm, addr);
+ if (vma)
+ break_ksm(vma, addr);
up_read(&mm->mmap_sem);
}
+static struct page *page_trans_compound_anon(struct page *page)
+{
+ if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+ /*
+ * head may actually be splitted and freed from under
+ * us but it's ok here.
+ */
+ if (PageAnon(head))
+ return head;
+ }
+ return NULL;
+}
+
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -419,18 +463,14 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
struct page *page;
down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto out;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- goto out;
- if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ vma = find_mergeable_vma(mm, addr);
+ if (!vma)
goto out;
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page)) {
+ if (PageAnon(page) || page_trans_compound_anon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -441,22 +481,36 @@ out: page = NULL;
return page;
}
+/*
+ * This helper is used for getting right index into array of tree roots.
+ * When merge_across_nodes knob is set to 1, there are only two rb-trees for
+ * stable and unstable pages from all nodes with roots in index 0. Otherwise,
+ * every node has its own stable and unstable tree.
+ */
+static inline int get_kpfn_nid(unsigned long kpfn)
+{
+ return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
+}
+
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
struct rmap_item *rmap_item;
- struct hlist_node *hlist;
- hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
if (rmap_item->hlist.next)
ksm_pages_sharing--;
else
ksm_pages_shared--;
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
- rb_erase(&stable_node->node, &root_stable_tree);
+ if (stable_node->head == &migrate_nodes)
+ list_del(&stable_node->list);
+ else
+ rb_erase(&stable_node->node,
+ root_stable_tree + NUMA(stable_node->nid));
free_stable_node(stable_node);
}
@@ -466,6 +520,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* In which case we can trust the content of the page, and it
* returns the gotten page; but if the page has now been zapped,
* remove the stale node from the stable tree and return NULL.
+ * But beware, the stable node's page might be being migrated.
*
* You would expect the stable_node to hold a reference to the ksm page.
* But if it increments the page's count, swapping out has to wait for
@@ -476,40 +531,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* pointing back to this stable node. This relies on freeing a PageAnon
* page to reset its page->mapping to NULL, and relies on no other use of
* a page to put something that might look like our key in page->mapping.
- *
- * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
- * but this is different - made simpler by ksm_thread_mutex being held, but
- * interesting for assuming that no other use of the struct page could ever
- * put our expected_mapping into page->mapping (or a field of the union which
- * coincides with page->mapping). The RCU calls are not for KSM at all, but
- * to keep the page_count protocol described with page_cache_get_speculative.
- *
- * Note: it is possible that get_ksm_page() will return NULL one moment,
- * then page the next, if the page is in between page_freeze_refs() and
- * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node)
+static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
struct page *page;
void *expected_mapping;
+ unsigned long kpfn;
- page = pfn_to_page(stable_node->kpfn);
expected_mapping = (void *)stable_node +
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
- rcu_read_lock();
- if (page->mapping != expected_mapping)
- goto stale;
- if (!get_page_unless_zero(page))
+again:
+ kpfn = ACCESS_ONCE(stable_node->kpfn);
+ page = pfn_to_page(kpfn);
+
+ /*
+ * page is computed from kpfn, so on most architectures reading
+ * page->mapping is naturally ordered after reading node->kpfn,
+ * but on Alpha we need to be more careful.
+ */
+ smp_read_barrier_depends();
+ if (ACCESS_ONCE(page->mapping) != expected_mapping)
goto stale;
- if (page->mapping != expected_mapping) {
+
+ /*
+ * We cannot do anything with the page while its refcount is 0.
+ * Usually 0 means free, or tail of a higher-order page: in which
+ * case this node is no longer referenced, and should be freed;
+ * however, it might mean that the page is under page_freeze_refs().
+ * The __remove_mapping() case is easy, again the node is now stale;
+ * but if page is swapcache in migrate_page_move_mapping(), it might
+ * still be our page, in which case it's essential to keep the node.
+ */
+ while (!get_page_unless_zero(page)) {
+ /*
+ * Another check for page->mapping != expected_mapping would
+ * work here too. We have chosen the !PageSwapCache test to
+ * optimize the common case, when the page is or is about to
+ * be freed: PageSwapCache is cleared (under spin_lock_irq)
+ * in the freeze_refs section of __remove_mapping(); but Anon
+ * page->mapping reset to NULL later, in free_pages_prepare().
+ */
+ if (!PageSwapCache(page))
+ goto stale;
+ cpu_relax();
+ }
+
+ if (ACCESS_ONCE(page->mapping) != expected_mapping) {
put_page(page);
goto stale;
}
- rcu_read_unlock();
+
+ if (lock_it) {
+ lock_page(page);
+ if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto stale;
+ }
+ }
return page;
+
stale:
- rcu_read_unlock();
+ /*
+ * We come here from above when page->mapping or !PageSwapCache
+ * suggests that the node is stale; but it might be under migration.
+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+ * before checking whether node->kpfn has been changed.
+ */
+ smp_rmb();
+ if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+ goto again;
remove_node_from_stable_tree(stable_node);
return NULL;
}
@@ -525,11 +617,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
struct page *page;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node);
+ page = get_ksm_page(stable_node, true);
if (!page)
goto out;
- lock_page(page);
hlist_del(&rmap_item->hlist);
unlock_page(page);
put_page(page);
@@ -539,7 +630,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
else
ksm_pages_shared--;
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -554,8 +645,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
BUG_ON(age > 1);
if (!age)
- rb_erase(&rmap_item->node, &root_unstable_tree);
-
+ rb_erase(&rmap_item->node,
+ root_unstable_tree + NUMA(rmap_item->nid));
ksm_pages_unshared--;
rmap_item->address &= PAGE_MASK;
}
@@ -575,7 +666,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
}
/*
- * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
* than check every pte of a given vma, the locking doesn't quite work for
* that - an rmap_item is assigned to the stable tree after inserting ksm
* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
@@ -608,6 +699,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
/*
* Only called through the sysfs control interface:
*/
+static int remove_stable_node(struct stable_node *stable_node)
+{
+ struct page *page;
+ int err;
+
+ page = get_ksm_page(stable_node, true);
+ if (!page) {
+ /*
+ * get_ksm_page did remove_node_from_stable_tree itself.
+ */
+ return 0;
+ }
+
+ if (WARN_ON_ONCE(page_mapped(page))) {
+ /*
+ * This should not happen: but if it does, just refuse to let
+ * merge_across_nodes be switched - there is no need to panic.
+ */
+ err = -EBUSY;
+ } else {
+ /*
+ * The stable node did not yet appear stale to get_ksm_page(),
+ * since that allows for an unmapped ksm page to be recognized
+ * right up until it is freed; but the node is safe to remove.
+ * This page might be in a pagevec waiting to be freed,
+ * or it might be PageSwapCache (perhaps under writeback),
+ * or it might have been removed from swapcache a moment ago.
+ */
+ set_page_stable_node(page, NULL);
+ remove_node_from_stable_tree(stable_node);
+ err = 0;
+ }
+
+ unlock_page(page);
+ put_page(page);
+ return err;
+}
+
+static int remove_all_stable_nodes(void)
+{
+ struct stable_node *stable_node;
+ struct list_head *this, *next;
+ int nid;
+ int err = 0;
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+ while (root_stable_tree[nid].rb_node) {
+ stable_node = rb_entry(root_stable_tree[nid].rb_node,
+ struct stable_node, node);
+ if (remove_stable_node(stable_node)) {
+ err = -EBUSY;
+ break; /* proceed to next nid */
+ }
+ cond_resched();
+ }
+ }
+ list_for_each_safe(this, next, &migrate_nodes) {
+ stable_node = list_entry(this, struct stable_node, list);
+ if (remove_stable_node(stable_node))
+ err = -EBUSY;
+ cond_resched();
+ }
+ return err;
+}
+
static int unmerge_and_remove_all_rmap_items(void)
{
struct mm_slot *mm_slot;
@@ -641,7 +797,7 @@ static int unmerge_and_remove_all_rmap_items(void)
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
struct mm_slot, mm_list);
if (ksm_test_exit(mm)) {
- hlist_del(&mm_slot->link);
+ hash_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
@@ -655,6 +811,8 @@ static int unmerge_and_remove_all_rmap_items(void)
}
}
+ /* Clean up stable nodes, but don't worry if some are still busy */
+ remove_all_stable_nodes();
ksm_scan.seqnr = 0;
return 0;
@@ -670,9 +828,9 @@ error:
static u32 calc_checksum(struct page *page)
{
u32 checksum;
- void *addr = kmap_atomic(page, KM_USER0);
+ void *addr = kmap_atomic(page);
checksum = jhash2(addr, PAGE_SIZE / 4, 17);
- kunmap_atomic(addr, KM_USER0);
+ kunmap_atomic(addr);
return checksum;
}
@@ -681,11 +839,11 @@ static int memcmp_pages(struct page *page1, struct page *page2)
char *addr1, *addr2;
int ret;
- addr1 = kmap_atomic(page1, KM_USER0);
- addr2 = kmap_atomic(page2, KM_USER1);
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2, KM_USER1);
- kunmap_atomic(addr1, KM_USER0);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
return ret;
}
@@ -703,14 +861,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
spinlock_t *ptl;
int swapped;
int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
+ BUG_ON(PageTransCompound(page));
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
- goto out;
+ goto out_mn;
if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
@@ -718,7 +884,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
swapped = PageSwapCache(page);
flush_cache_page(vma, addr, page_to_pfn(page));
/*
- * Ok this is tricky, when get_user_pages_fast() run it doesnt
+ * Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
* with the pagecount against the mapcount is racey and
* O_DIRECT can happen right after the check.
@@ -745,6 +911,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
out_unlock:
pte_unmap_unlock(ptep, ptl);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
@@ -762,34 +930,30 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
goto out;
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
-
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- goto out;
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
pte_unmap_unlock(ptep, ptl);
- goto out;
+ goto out_mn;
}
get_page(kpage);
@@ -800,14 +964,45 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
err = 0;
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
+static int page_trans_compound_anon_split(struct page *page)
+{
+ int ret = 0;
+ struct page *transhuge_head = page_trans_compound_anon(page);
+ if (transhuge_head) {
+ /* Get the reference on the head to split it. */
+ if (get_page_unless_zero(transhuge_head)) {
+ /*
+ * Recheck we got the reference while the head
+ * was still anonymous.
+ */
+ if (PageAnon(transhuge_head))
+ ret = split_huge_page(transhuge_head);
+ else
+ /*
+ * Retry later if split_huge_page run
+ * from under us.
+ */
+ ret = 1;
+ put_page(transhuge_head);
+ } else
+ /* Retry later if split_huge_page run from under us. */
+ ret = 1;
+ }
+ return ret;
+}
+
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -828,6 +1023,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+ goto out;
+ BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -899,8 +1097,12 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
if (err)
goto out;
+ /* Unstable nid is in union with stable anon_vma: remove first */
+ remove_rmap_item_from_tree(rmap_item);
+
/* Must get reference to anon_vma while still holding mmap_sem */
- hold_anon_vma(rmap_item, vma->anon_vma);
+ rmap_item->anon_vma = vma->anon_vma;
+ get_anon_vma(vma->anon_vma);
out:
up_read(&mm->mmap_sem);
return err;
@@ -948,42 +1150,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
*/
static struct page *stable_tree_search(struct page *page)
{
- struct rb_node *node = root_stable_tree.rb_node;
+ int nid;
+ struct rb_root *root;
+ struct rb_node **new;
+ struct rb_node *parent;
struct stable_node *stable_node;
+ struct stable_node *page_node;
- stable_node = page_stable_node(page);
- if (stable_node) { /* ksm page forked */
+ page_node = page_stable_node(page);
+ if (page_node && page_node->head != &migrate_nodes) {
+ /* ksm page forked */
get_page(page);
return page;
}
- while (node) {
+ nid = get_kpfn_nid(page_to_pfn(page));
+ root = root_stable_tree + nid;
+again:
+ new = &root->rb_node;
+ parent = NULL;
+
+ while (*new) {
struct page *tree_page;
int ret;
cond_resched();
- stable_node = rb_entry(node, struct stable_node, node);
- tree_page = get_ksm_page(stable_node);
+ stable_node = rb_entry(*new, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
return NULL;
ret = memcmp_pages(page, tree_page);
+ put_page(tree_page);
- if (ret < 0) {
- put_page(tree_page);
- node = node->rb_left;
- } else if (ret > 0) {
- put_page(tree_page);
- node = node->rb_right;
- } else
- return tree_page;
+ parent = *new;
+ if (ret < 0)
+ new = &parent->rb_left;
+ else if (ret > 0)
+ new = &parent->rb_right;
+ else {
+ /*
+ * Lock and unlock the stable_node's page (which
+ * might already have been migrated) so that page
+ * migration is sure to notice its raised count.
+ * It would be more elegant to return stable_node
+ * than kpage, but that involves more changes.
+ */
+ tree_page = get_ksm_page(stable_node, true);
+ if (tree_page) {
+ unlock_page(tree_page);
+ if (get_kpfn_nid(stable_node->kpfn) !=
+ NUMA(stable_node->nid)) {
+ put_page(tree_page);
+ goto replace;
+ }
+ return tree_page;
+ }
+ /*
+ * There is now a place for page_node, but the tree may
+ * have been rebalanced, so re-evaluate parent and new.
+ */
+ if (page_node)
+ goto again;
+ return NULL;
+ }
}
- return NULL;
+ if (!page_node)
+ return NULL;
+
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_link_node(&page_node->node, parent, new);
+ rb_insert_color(&page_node->node, root);
+ get_page(page);
+ return page;
+
+replace:
+ if (page_node) {
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_replace_node(&stable_node->node, &page_node->node, root);
+ get_page(page);
+ } else {
+ rb_erase(&stable_node->node, root);
+ page = NULL;
+ }
+ stable_node->head = &migrate_nodes;
+ list_add(&stable_node->list, stable_node->head);
+ return page;
}
/*
- * stable_tree_insert - insert rmap_item pointing to new ksm page
+ * stable_tree_insert - insert stable tree node pointing to new ksm page
* into the stable tree.
*
* This function returns the stable tree node just allocated on success,
@@ -991,17 +1250,25 @@ static struct page *stable_tree_search(struct page *page)
*/
static struct stable_node *stable_tree_insert(struct page *kpage)
{
- struct rb_node **new = &root_stable_tree.rb_node;
+ int nid;
+ unsigned long kpfn;
+ struct rb_root *root;
+ struct rb_node **new;
struct rb_node *parent = NULL;
struct stable_node *stable_node;
+ kpfn = page_to_pfn(kpage);
+ nid = get_kpfn_nid(kpfn);
+ root = root_stable_tree + nid;
+ new = &root->rb_node;
+
while (*new) {
struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node);
+ tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
return NULL;
@@ -1027,13 +1294,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
if (!stable_node)
return NULL;
- rb_link_node(&stable_node->node, parent, new);
- rb_insert_color(&stable_node->node, &root_stable_tree);
-
INIT_HLIST_HEAD(&stable_node->hlist);
-
- stable_node->kpfn = page_to_pfn(kpage);
+ stable_node->kpfn = kpfn;
set_page_stable_node(kpage, stable_node);
+ DO_NUMA(stable_node->nid = nid);
+ rb_link_node(&stable_node->node, parent, new);
+ rb_insert_color(&stable_node->node, root);
return stable_node;
}
@@ -1056,10 +1322,15 @@ static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
struct page *page,
struct page **tree_pagep)
-
{
- struct rb_node **new = &root_unstable_tree.rb_node;
+ struct rb_node **new;
+ struct rb_root *root;
struct rb_node *parent = NULL;
+ int nid;
+
+ nid = get_kpfn_nid(page_to_pfn(page));
+ root = root_unstable_tree + nid;
+ new = &root->rb_node;
while (*new) {
struct rmap_item *tree_rmap_item;
@@ -1089,6 +1360,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
} else if (ret > 0) {
put_page(tree_page);
new = &parent->rb_right;
+ } else if (!ksm_merge_across_nodes &&
+ page_to_nid(tree_page) != nid) {
+ /*
+ * If tree_page has been migrated to another NUMA node,
+ * it will be flushed out and put in the right unstable
+ * tree next time: only merge with it when across_nodes.
+ */
+ put_page(tree_page);
+ return NULL;
} else {
*tree_pagep = tree_page;
return tree_rmap_item;
@@ -1097,8 +1377,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
rmap_item->address |= UNSTABLE_FLAG;
rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+ DO_NUMA(rmap_item->nid = nid);
rb_link_node(&rmap_item->node, parent, new);
- rb_insert_color(&rmap_item->node, &root_unstable_tree);
+ rb_insert_color(&rmap_item->node, root);
ksm_pages_unshared++;
return NULL;
@@ -1140,10 +1421,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
unsigned int checksum;
int err;
- remove_rmap_item_from_tree(rmap_item);
+ stable_node = page_stable_node(page);
+ if (stable_node) {
+ if (stable_node->head != &migrate_nodes &&
+ get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
+ rb_erase(&stable_node->node,
+ root_stable_tree + NUMA(stable_node->nid));
+ stable_node->head = &migrate_nodes;
+ list_add(&stable_node->list, stable_node->head);
+ }
+ if (stable_node->head != &migrate_nodes &&
+ rmap_item->head == stable_node)
+ return;
+ }
/* We first start with searching the page inside the stable tree */
kpage = stable_tree_search(page);
+ if (kpage == page && rmap_item->head == stable_node) {
+ put_page(kpage);
+ return;
+ }
+
+ remove_rmap_item_from_tree(rmap_item);
+
if (kpage) {
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
@@ -1177,14 +1477,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
kpage = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
put_page(tree_page);
- /*
- * As soon as we merge this page, we want to remove the
- * rmap_item of the page we have merged with from the unstable
- * tree, and insert it instead as new node in the stable tree.
- */
if (kpage) {
- remove_rmap_item_from_tree(tree_rmap_item);
-
+ /*
+ * The pages were successfully merged: insert new
+ * node in the stable tree and add both rmap_items.
+ */
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
@@ -1241,18 +1538,59 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
struct mm_slot *slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
+ int nid;
if (list_empty(&ksm_mm_head.mm_list))
return NULL;
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
- root_unstable_tree = RB_ROOT;
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
+ /*
+ * Whereas stale stable_nodes on the stable_tree itself
+ * get pruned in the regular course of stable_tree_search(),
+ * those moved out to the migrate_nodes list can accumulate:
+ * so prune them once before each full scan.
+ */
+ if (!ksm_merge_across_nodes) {
+ struct stable_node *stable_node;
+ struct list_head *this, *next;
+ struct page *page;
+
+ list_for_each_safe(this, next, &migrate_nodes) {
+ stable_node = list_entry(this,
+ struct stable_node, list);
+ page = get_ksm_page(stable_node, false);
+ if (page)
+ put_page(page);
+ cond_resched();
+ }
+ }
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++)
+ root_unstable_tree[nid] = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
spin_unlock(&ksm_mmlist_lock);
+ /*
+ * Although we tested list_empty() above, a racing __ksm_exit
+ * of the last mm on the list may have removed it since then.
+ */
+ if (slot == &ksm_mm_head)
+ return NULL;
next_mm:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
@@ -1277,7 +1615,13 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
+ if (IS_ERR_OR_NULL(*page)) {
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ continue;
+ }
+ if (PageAnon(*page) ||
+ page_trans_compound_anon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1635,7 @@ next_mm:
up_read(&mm->mmap_sem);
return rmap_item;
}
- if (!IS_ERR_OR_NULL(*page))
- put_page(*page);
+ put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
@@ -1321,7 +1664,7 @@ next_mm:
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_sem then protects against race with MADV_MERGEABLE).
*/
- hlist_del(&slot->link);
+ hash_del(&slot->link);
list_del(&slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
@@ -1352,13 +1695,12 @@ static void ksm_do_scan(unsigned int scan_npages)
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
- while (scan_npages--) {
+ while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
return;
- if (!PageKsm(page) || !in_stable_tree(rmap_item))
- cmp_and_merge_page(page, rmap_item);
+ cmp_and_merge_page(page, rmap_item);
put_page(page);
}
}
@@ -1370,19 +1712,23 @@ static int ksmd_should_run(void)
static int ksm_scan_thread(void *nothing)
{
+ set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
if (ksmd_should_run())
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
+ try_to_freeze();
+
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
- wait_event_interruptible(ksm_thread_wait,
+ wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
@@ -1402,10 +1748,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
*/
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
+ VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
return 0; /* just ignore the advice */
+#ifdef VM_SAO
+ if (*vm_flags & VM_SAO)
+ return 0;
+#endif
+
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
err = __ksm_enter(mm);
if (err)
@@ -1447,11 +1797,19 @@ int __ksm_enter(struct mm_struct *mm)
spin_lock(&ksm_mmlist_lock);
insert_to_mm_slots_hash(mm, mm_slot);
/*
- * Insert just behind the scanning cursor, to let the area settle
+ * When KSM_RUN_MERGE (or KSM_RUN_STOP),
+ * insert just behind the scanning cursor, to let the area settle
* down a little; when fork is followed by immediate exec, we don't
* want ksmd to waste time setting up and tearing down an rmap_list.
+ *
+ * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
+ * scanning cursor, otherwise KSM pages in newly forked mms will be
+ * missed: then we might as well insert at the end of the list.
*/
- list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ if (ksm_run & KSM_RUN_UNMERGE)
+ list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ else
+ list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1481,7 +1839,7 @@ void __ksm_exit(struct mm_struct *mm)
mm_slot = get_mm_slot(mm);
if (mm_slot && ksm_scan.mm_slot != mm_slot) {
if (!mm_slot->rmap_list) {
- hlist_del(&mm_slot->link);
+ hash_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
easy_to_free = 1;
} else {
@@ -1501,158 +1859,64 @@ void __ksm_exit(struct mm_struct *mm)
}
}
-struct page *ksm_does_need_to_copy(struct page *page,
+struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ struct anon_vma *anon_vma = page_anon_vma(page);
struct page *new_page;
+ if (PageKsm(page)) {
+ if (page_stable_node(page) &&
+ !(ksm_run & KSM_RUN_UNMERGE))
+ return page; /* no need to copy it */
+ } else if (!anon_vma) {
+ return page; /* no need to copy it */
+ } else if (anon_vma->root == vma->anon_vma->root &&
+ page->index == linear_page_index(vma, address)) {
+ return page; /* still no need to copy it */
+ }
+ if (!PageUptodate(page))
+ return page; /* let do_swap_page report the error */
+
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- SetPageSwapBacked(new_page);
__set_page_locked(new_page);
-
- if (page_evictable(new_page, vma))
- lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
- else
- add_page_to_unevictable_list(new_page);
}
return new_page;
}
-int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
- unsigned long *vm_flags)
-{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
- struct hlist_node *hlist;
- unsigned int mapcount = page_mapcount(page);
- int referenced = 0;
- int search_new_forks = 0;
-
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return 0;
-again:
- hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
- continue;
-
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
-
- referenced += page_referenced_one(page, vma,
- rmap_item->address, &mapcount, vm_flags);
- if (!search_new_forks || !mapcount)
- break;
- }
- anon_vma_unlock(anon_vma);
- if (!mapcount)
- goto out;
- }
- if (!search_new_forks++)
- goto again;
-out:
- return referenced;
-}
-
-int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
- struct hlist_node *hlist;
struct rmap_item *rmap_item;
int ret = SWAP_AGAIN;
int search_new_forks = 0;
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return SWAP_FAIL;
-again:
- hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
- continue;
-
- ret = try_to_unmap_one(page, vma,
- rmap_item->address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page)) {
- anon_vma_unlock(anon_vma);
- goto out;
- }
- }
- anon_vma_unlock(anon_vma);
- }
- if (!search_new_forks++)
- goto again;
-out:
- return ret;
-}
-
-#ifdef CONFIG_MIGRATION
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
-{
- struct stable_node *stable_node;
- struct hlist_node *hlist;
- struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
- int search_new_forks = 0;
+ VM_BUG_ON_PAGE(!PageKsm(page), page);
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
+ /*
+ * Rely on the page lock to protect against concurrent modifications
+ * to that page's node of the stable tree.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
stable_node = page_stable_node(page);
if (!stable_node)
return ret;
again:
- hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1666,13 +1930,21 @@ again:
if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
continue;
- ret = rmap_one(page, vma, rmap_item->address, arg);
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma,
+ rmap_item->address, rwc->arg);
if (ret != SWAP_AGAIN) {
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
+ goto out;
+ }
+ if (rwc->done && rwc->done(page)) {
+ anon_vma_unlock_read(anon_vma);
goto out;
}
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1680,71 +1952,128 @@ out:
return ret;
}
+#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
struct stable_node *stable_node;
- VM_BUG_ON(!PageLocked(oldpage));
- VM_BUG_ON(!PageLocked(newpage));
- VM_BUG_ON(newpage->mapping != oldpage->mapping);
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
stable_node = page_stable_node(newpage);
if (stable_node) {
- VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+ VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
stable_node->kpfn = page_to_pfn(newpage);
+ /*
+ * newpage->mapping was set in advance; now we need smp_wmb()
+ * to make sure that the new stable_node->kpfn is visible
+ * to get_ksm_page() before it can see that oldpage->mapping
+ * has gone stale (or that PageSwapCache has been cleared).
+ */
+ smp_wmb();
+ set_page_stable_node(oldpage, NULL);
}
}
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_MEMORY_HOTREMOVE
-static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
- unsigned long end_pfn)
+static int just_wait(void *word)
{
- struct rb_node *node;
+ schedule();
+ return 0;
+}
- for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
- struct stable_node *stable_node;
+static void wait_while_offlining(void)
+{
+ while (ksm_run & KSM_RUN_OFFLINE) {
+ mutex_unlock(&ksm_thread_mutex);
+ wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
+ just_wait, TASK_UNINTERRUPTIBLE);
+ mutex_lock(&ksm_thread_mutex);
+ }
+}
- stable_node = rb_entry(node, struct stable_node, node);
+static void ksm_check_stable_tree(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct stable_node *stable_node;
+ struct list_head *this, *next;
+ struct rb_node *node;
+ int nid;
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+ node = rb_first(root_stable_tree + nid);
+ while (node) {
+ stable_node = rb_entry(node, struct stable_node, node);
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn) {
+ /*
+ * Don't get_ksm_page, page has already gone:
+ * which is why we keep kpfn instead of page*
+ */
+ remove_node_from_stable_tree(stable_node);
+ node = rb_first(root_stable_tree + nid);
+ } else
+ node = rb_next(node);
+ cond_resched();
+ }
+ }
+ list_for_each_safe(this, next, &migrate_nodes) {
+ stable_node = list_entry(this, struct stable_node, list);
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn)
- return stable_node;
+ remove_node_from_stable_tree(stable_node);
+ cond_resched();
}
- return NULL;
}
static int ksm_memory_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
struct memory_notify *mn = arg;
- struct stable_node *stable_node;
switch (action) {
case MEM_GOING_OFFLINE:
/*
- * Keep it very simple for now: just lock out ksmd and
- * MADV_UNMERGEABLE while any memory is going offline.
+ * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
+ * and remove_all_stable_nodes() while memory is going offline:
+ * it is unsafe for them to touch the stable tree at this time.
+ * But unmerge_ksm_pages(), rmap lookups and other entry points
+ * which do not need the ksm_thread_mutex are all safe.
*/
mutex_lock(&ksm_thread_mutex);
+ ksm_run |= KSM_RUN_OFFLINE;
+ mutex_unlock(&ksm_thread_mutex);
break;
case MEM_OFFLINE:
/*
* Most of the work is done by page migration; but there might
* be a few stable_nodes left over, still pointing to struct
- * pages which have been offlined: prune those from the tree.
+ * pages which have been offlined: prune those from the tree,
+ * otherwise get_ksm_page() might later try to access a
+ * non-existent struct page.
*/
- while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
- mn->start_pfn + mn->nr_pages)) != NULL)
- remove_node_from_stable_tree(stable_node);
+ ksm_check_stable_tree(mn->start_pfn,
+ mn->start_pfn + mn->nr_pages);
/* fallthrough */
case MEM_CANCEL_OFFLINE:
+ mutex_lock(&ksm_thread_mutex);
+ ksm_run &= ~KSM_RUN_OFFLINE;
mutex_unlock(&ksm_thread_mutex);
+
+ smp_mb(); /* wake_up_bit advises this */
+ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
break;
}
return NOTIFY_OK;
}
+#else
+static void wait_while_offlining(void)
+{
+}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_SYSFS
@@ -1771,7 +2100,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
unsigned long msecs;
int err;
- err = strict_strtoul(buf, 10, &msecs);
+ err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
@@ -1794,7 +2123,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
int err;
unsigned long nr_pages;
- err = strict_strtoul(buf, 10, &nr_pages);
+ err = kstrtoul(buf, 10, &nr_pages);
if (err || nr_pages > UINT_MAX)
return -EINVAL;
@@ -1807,7 +2136,7 @@ KSM_ATTR(pages_to_scan);
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", ksm_run);
+ return sprintf(buf, "%lu\n", ksm_run);
}
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1816,7 +2145,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
int err;
unsigned long flags;
- err = strict_strtoul(buf, 10, &flags);
+ err = kstrtoul(buf, 10, &flags);
if (err || flags > UINT_MAX)
return -EINVAL;
if (flags > KSM_RUN_UNMERGE)
@@ -1830,12 +2159,13 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
*/
mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
- current->flags |= PF_OOM_ORIGIN;
+ set_current_oom_origin();
err = unmerge_and_remove_all_rmap_items();
- current->flags &= ~PF_OOM_ORIGIN;
+ clear_current_oom_origin();
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;
@@ -1851,6 +2181,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
}
KSM_ATTR(run);
+#ifdef CONFIG_NUMA
+static ssize_t merge_across_nodes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+}
+
+static ssize_t merge_across_nodes_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long knob;
+
+ err = kstrtoul(buf, 10, &knob);
+ if (err)
+ return err;
+ if (knob > 1)
+ return -EINVAL;
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_merge_across_nodes != knob) {
+ if (ksm_pages_shared || remove_all_stable_nodes())
+ err = -EBUSY;
+ else if (root_stable_tree == one_stable_tree) {
+ struct rb_root *buf;
+ /*
+ * This is the first time that we switch away from the
+ * default of merging across nodes: must now allocate
+ * a buffer to hold as many roots as may be needed.
+ * Allocate stable and unstable together:
+ * MAXSMP NODES_SHIFT 10 will use 16kB.
+ */
+ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
+ GFP_KERNEL);
+ /* Let us assume that RB_ROOT is NULL is zero */
+ if (!buf)
+ err = -ENOMEM;
+ else {
+ root_stable_tree = buf;
+ root_unstable_tree = buf + nr_node_ids;
+ /* Stable tree is empty but not the unstable */
+ root_unstable_tree[0] = one_unstable_tree[0];
+ }
+ }
+ if (!err) {
+ ksm_merge_across_nodes = knob;
+ ksm_nr_node_ids = knob ? 1 : nr_node_ids;
+ }
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ return err ? err : count;
+}
+KSM_ATTR(merge_across_nodes);
+#endif
+
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -1905,6 +2293,9 @@ static struct attribute *ksm_attrs[] = {
&pages_unshared_attr.attr,
&pages_volatile_attr.attr,
&full_scans_attr.attr,
+#ifdef CONFIG_NUMA
+ &merge_across_nodes_attr.attr,
+#endif
NULL,
};
@@ -1943,10 +2334,7 @@ static int __init ksm_init(void)
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_MEMORY_HOTREMOVE
- /*
- * Choose a high priority since the callback takes ksm_thread_mutex:
- * later callbacks could only be taking locks which nest within that.
- */
+ /* There is no significance to this priority 100 */
hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
return 0;
@@ -1956,4 +2344,4 @@ out_free:
out:
return err;
}
-module_init(ksm_init)
+subsys_initcall(ksm_init);