Diffstat (limited to 'mm/ksm.c')
-rw-r--r--  mm/ksm.c  1012
1 file changed, 700 insertions(+), 312 deletions(-)
@@ -33,11 +33,22 @@  #include <linux/mmu_notifier.h>  #include <linux/swap.h>  #include <linux/ksm.h> -#include <linux/hash.h> +#include <linux/hashtable.h> +#include <linux/freezer.h> +#include <linux/oom.h> +#include <linux/numa.h>  #include <asm/tlbflush.h>  #include "internal.h" +#ifdef CONFIG_NUMA +#define NUMA(x)		(x) +#define DO_NUMA(x)	do { (x); } while (0) +#else +#define NUMA(x)		(0) +#define DO_NUMA(x)	do { } while (0) +#endif +  /*   * A few notes about the KSM scanning process,   * to make it easier to understand the data structures below: @@ -76,6 +87,9 @@   *    take 10 attempts to find a page in the unstable tree, once it is found,   *    it is secured in the stable tree.  (When we scan a new page, we first   *    compare it against the stable tree, and then against the unstable tree.) + * + * If the merge_across_nodes tunable is unset, then KSM maintains multiple + * stable trees and multiple unstable trees: one of each for each NUMA node.   */  /** @@ -111,19 +125,32 @@ struct ksm_scan {  /**   * struct stable_node - node of the stable rbtree   * @node: rb node of this ksm page in the stable tree + * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @list: linked into migrate_nodes, pending placement in the proper node tree   * @hlist: hlist head of rmap_items using this ksm page - * @kpfn: page frame number of this ksm page + * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @nid: NUMA node id of stable tree in which linked (may not match kpfn)   */  struct stable_node { -	struct rb_node node; +	union { +		struct rb_node node;	/* when node of stable tree */ +		struct {		/* when listed for migration */ +			struct list_head *head; +			struct list_head list; +		}; +	};  	struct hlist_head hlist;  	unsigned long kpfn; +#ifdef CONFIG_NUMA +	int nid; +#endif  };  /**   * struct rmap_item - reverse mapping item for virtual addresses   * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list   * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree + * @nid: NUMA node id of unstable tree in which linked (may not match page)   * @mm: the memory structure this rmap_item is pointing into   * @address: the virtual address this rmap_item tracks (+ flags in low bits)   * @oldchecksum: previous checksum of the page at that virtual address @@ -133,7 +160,12 @@ struct stable_node {   */  struct rmap_item {  	struct rmap_item *rmap_list; -	struct anon_vma *anon_vma;	/* when stable */ +	union { +		struct anon_vma *anon_vma;	/* when stable */ +#ifdef CONFIG_NUMA +		int nid;		/* when node of unstable tree */ +#endif +	};  	struct mm_struct *mm;  	unsigned long address;		/* + low bits used for flags below */  	unsigned int oldchecksum;	/* when unstable */ @@ -151,12 +183,16 @@ struct rmap_item {  #define STABLE_FLAG	0x200	/* is listed from the stable tree */  /* The stable and unstable tree heads */ -static struct rb_root root_stable_tree = RB_ROOT; -static struct rb_root root_unstable_tree = RB_ROOT; +static struct rb_root one_stable_tree[1] = { RB_ROOT }; +static struct rb_root one_unstable_tree[1] = { RB_ROOT }; +static struct rb_root *root_stable_tree = one_stable_tree; +static struct rb_root *root_unstable_tree = one_unstable_tree; + +/* Recently migrated nodes of stable tree, pending proper placement */ +static LIST_HEAD(migrate_nodes); -#define MM_SLOTS_HASH_SHIFT 10 -#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) -static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; +#define 
MM_SLOTS_HASH_BITS 10 +static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);  static struct mm_slot ksm_mm_head = {  	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), @@ -187,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;  /* Milliseconds ksmd should sleep between batches */  static unsigned int ksm_thread_sleep_millisecs = 20; +#ifdef CONFIG_NUMA +/* Zeroed when merging across nodes is not allowed */ +static unsigned int ksm_merge_across_nodes = 1; +static int ksm_nr_node_ids = 1; +#else +#define ksm_merge_across_nodes	1U +#define ksm_nr_node_ids		1 +#endif +  #define KSM_RUN_STOP	0  #define KSM_RUN_MERGE	1  #define KSM_RUN_UNMERGE	2 -static unsigned int ksm_run = KSM_RUN_STOP; +#define KSM_RUN_OFFLINE	4 +static unsigned long ksm_run = KSM_RUN_STOP; +static void wait_while_offlining(void);  static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);  static DEFINE_MUTEX(ksm_thread_mutex); @@ -273,45 +320,20 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)  static struct mm_slot *get_mm_slot(struct mm_struct *mm)  { -	struct mm_slot *mm_slot; -	struct hlist_head *bucket; -	struct hlist_node *node; +	struct mm_slot *slot; + +	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) +		if (slot->mm == mm) +			return slot; -	bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; -	hlist_for_each_entry(mm_slot, node, bucket, link) { -		if (mm == mm_slot->mm) -			return mm_slot; -	}  	return NULL;  }  static void insert_to_mm_slots_hash(struct mm_struct *mm,  				    struct mm_slot *mm_slot)  { -	struct hlist_head *bucket; - -	bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];  	mm_slot->mm = mm; -	hlist_add_head(&mm_slot->link, bucket); -} - -static inline int in_stable_tree(struct rmap_item *rmap_item) -{ -	return rmap_item->address & STABLE_FLAG; -} - -static void hold_anon_vma(struct rmap_item *rmap_item, -			  struct anon_vma *anon_vma) -{ -	rmap_item->anon_vma = anon_vma; -	get_anon_vma(anon_vma); -} - -static void ksm_drop_anon_vma(struct rmap_item *rmap_item) -{ -	struct anon_vma *anon_vma = rmap_item->anon_vma; - -	drop_anon_vma(anon_vma); +	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);  }  /* @@ -345,7 +367,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)  	do {  		cond_resched(); -		page = follow_page(vma, addr, FOLL_GET); +		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);  		if (IS_ERR_OR_NULL(page))  			break;  		if (PageKsm(page)) @@ -386,6 +408,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)  	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;  } +static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, +		unsigned long addr) +{ +	struct vm_area_struct *vma; +	if (ksm_test_exit(mm)) +		return NULL; +	vma = find_vma(mm, addr); +	if (!vma || vma->vm_start > addr) +		return NULL; +	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) +		return NULL; +	return vma; +} +  static void break_cow(struct rmap_item *rmap_item)  {  	struct mm_struct *mm = rmap_item->mm; @@ -396,21 +432,29 @@ static void break_cow(struct rmap_item *rmap_item)  	 * It is not an accident that whenever we want to break COW  	 * to undo, we also need to drop a reference to the anon_vma.  	 
*/ -	ksm_drop_anon_vma(rmap_item); +	put_anon_vma(rmap_item->anon_vma);  	down_read(&mm->mmap_sem); -	if (ksm_test_exit(mm)) -		goto out; -	vma = find_vma(mm, addr); -	if (!vma || vma->vm_start > addr) -		goto out; -	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) -		goto out; -	break_ksm(vma, addr); -out: +	vma = find_mergeable_vma(mm, addr); +	if (vma) +		break_ksm(vma, addr);  	up_read(&mm->mmap_sem);  } +static struct page *page_trans_compound_anon(struct page *page) +{ +	if (PageTransCompound(page)) { +		struct page *head = compound_head(page); +		/* +		 * head may actually be splitted and freed from under +		 * us but it's ok here. +		 */ +		if (PageAnon(head)) +			return head; +	} +	return NULL; +} +  static struct page *get_mergeable_page(struct rmap_item *rmap_item)  {  	struct mm_struct *mm = rmap_item->mm; @@ -419,18 +463,14 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)  	struct page *page;  	down_read(&mm->mmap_sem); -	if (ksm_test_exit(mm)) -		goto out; -	vma = find_vma(mm, addr); -	if (!vma || vma->vm_start > addr) -		goto out; -	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) +	vma = find_mergeable_vma(mm, addr); +	if (!vma)  		goto out;  	page = follow_page(vma, addr, FOLL_GET);  	if (IS_ERR_OR_NULL(page))  		goto out; -	if (PageAnon(page)) { +	if (PageAnon(page) || page_trans_compound_anon(page)) {  		flush_anon_page(vma, page, addr);  		flush_dcache_page(page);  	} else { @@ -441,22 +481,36 @@ out:		page = NULL;  	return page;  } +/* + * This helper is used for getting right index into array of tree roots. + * When merge_across_nodes knob is set to 1, there are only two rb-trees for + * stable and unstable pages from all nodes with roots in index 0. Otherwise, + * every node has its own stable and unstable tree. + */ +static inline int get_kpfn_nid(unsigned long kpfn) +{ +	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); +} +  static void remove_node_from_stable_tree(struct stable_node *stable_node)  {  	struct rmap_item *rmap_item; -	struct hlist_node *hlist; -	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { +	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {  		if (rmap_item->hlist.next)  			ksm_pages_sharing--;  		else  			ksm_pages_shared--; -		ksm_drop_anon_vma(rmap_item); +		put_anon_vma(rmap_item->anon_vma);  		rmap_item->address &= PAGE_MASK;  		cond_resched();  	} -	rb_erase(&stable_node->node, &root_stable_tree); +	if (stable_node->head == &migrate_nodes) +		list_del(&stable_node->list); +	else +		rb_erase(&stable_node->node, +			 root_stable_tree + NUMA(stable_node->nid));  	free_stable_node(stable_node);  } @@ -466,6 +520,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)   * In which case we can trust the content of the page, and it   * returns the gotten page; but if the page has now been zapped,   * remove the stale node from the stable tree and return NULL. + * But beware, the stable node's page might be being migrated.   *   * You would expect the stable_node to hold a reference to the ksm page.   * But if it increments the page's count, swapping out has to wait for @@ -476,40 +531,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)   * pointing back to this stable node.  This relies on freeing a PageAnon   * page to reset its page->mapping to NULL, and relies on no other use of   * a page to put something that might look like our key in page->mapping. 
- * - * include/linux/pagemap.h page_cache_get_speculative() is a good reference, - * but this is different - made simpler by ksm_thread_mutex being held, but - * interesting for assuming that no other use of the struct page could ever - * put our expected_mapping into page->mapping (or a field of the union which - * coincides with page->mapping).  The RCU calls are not for KSM at all, but - * to keep the page_count protocol described with page_cache_get_speculative. - * - * Note: it is possible that get_ksm_page() will return NULL one moment, - * then page the next, if the page is in between page_freeze_refs() and - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page   * is on its way to being freed; but it is an anomaly to bear in mind.   */ -static struct page *get_ksm_page(struct stable_node *stable_node) +static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)  {  	struct page *page;  	void *expected_mapping; +	unsigned long kpfn; -	page = pfn_to_page(stable_node->kpfn);  	expected_mapping = (void *)stable_node +  				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); -	rcu_read_lock(); -	if (page->mapping != expected_mapping) -		goto stale; -	if (!get_page_unless_zero(page)) +again: +	kpfn = ACCESS_ONCE(stable_node->kpfn); +	page = pfn_to_page(kpfn); + +	/* +	 * page is computed from kpfn, so on most architectures reading +	 * page->mapping is naturally ordered after reading node->kpfn, +	 * but on Alpha we need to be more careful. +	 */ +	smp_read_barrier_depends(); +	if (ACCESS_ONCE(page->mapping) != expected_mapping)  		goto stale; -	if (page->mapping != expected_mapping) { + +	/* +	 * We cannot do anything with the page while its refcount is 0. +	 * Usually 0 means free, or tail of a higher-order page: in which +	 * case this node is no longer referenced, and should be freed; +	 * however, it might mean that the page is under page_freeze_refs(). +	 * The __remove_mapping() case is easy, again the node is now stale; +	 * but if page is swapcache in migrate_page_move_mapping(), it might +	 * still be our page, in which case it's essential to keep the node. +	 */ +	while (!get_page_unless_zero(page)) { +		/* +		 * Another check for page->mapping != expected_mapping would +		 * work here too.  We have chosen the !PageSwapCache test to +		 * optimize the common case, when the page is or is about to +		 * be freed: PageSwapCache is cleared (under spin_lock_irq) +		 * in the freeze_refs section of __remove_mapping(); but Anon +		 * page->mapping reset to NULL later, in free_pages_prepare(). +		 */ +		if (!PageSwapCache(page)) +			goto stale; +		cpu_relax(); +	} + +	if (ACCESS_ONCE(page->mapping) != expected_mapping) {  		put_page(page);  		goto stale;  	} -	rcu_read_unlock(); + +	if (lock_it) { +		lock_page(page); +		if (ACCESS_ONCE(page->mapping) != expected_mapping) { +			unlock_page(page); +			put_page(page); +			goto stale; +		} +	}  	return page; +  stale: -	rcu_read_unlock(); +	/* +	 * We come here from above when page->mapping or !PageSwapCache +	 * suggests that the node is stale; but it might be under migration. +	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), +	 * before checking whether node->kpfn has been changed. 
+	 */ +	smp_rmb(); +	if (ACCESS_ONCE(stable_node->kpfn) != kpfn) +		goto again;  	remove_node_from_stable_tree(stable_node);  	return NULL;  } @@ -525,11 +617,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)  		struct page *page;  		stable_node = rmap_item->head; -		page = get_ksm_page(stable_node); +		page = get_ksm_page(stable_node, true);  		if (!page)  			goto out; -		lock_page(page);  		hlist_del(&rmap_item->hlist);  		unlock_page(page);  		put_page(page); @@ -539,7 +630,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)  		else  			ksm_pages_shared--; -		ksm_drop_anon_vma(rmap_item); +		put_anon_vma(rmap_item->anon_vma);  		rmap_item->address &= PAGE_MASK;  	} else if (rmap_item->address & UNSTABLE_FLAG) { @@ -554,8 +645,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)  		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);  		BUG_ON(age > 1);  		if (!age) -			rb_erase(&rmap_item->node, &root_unstable_tree); - +			rb_erase(&rmap_item->node, +				 root_unstable_tree + NUMA(rmap_item->nid));  		ksm_pages_unshared--;  		rmap_item->address &= PAGE_MASK;  	} @@ -575,7 +666,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,  }  /* - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * Though it's very tempting to unmerge rmap_items from stable tree rather   * than check every pte of a given vma, the locking doesn't quite work for   * that - an rmap_item is assigned to the stable tree after inserting ksm   * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing @@ -608,6 +699,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,  /*   * Only called through the sysfs control interface:   */ +static int remove_stable_node(struct stable_node *stable_node) +{ +	struct page *page; +	int err; + +	page = get_ksm_page(stable_node, true); +	if (!page) { +		/* +		 * get_ksm_page did remove_node_from_stable_tree itself. +		 */ +		return 0; +	} + +	if (WARN_ON_ONCE(page_mapped(page))) { +		/* +		 * This should not happen: but if it does, just refuse to let +		 * merge_across_nodes be switched - there is no need to panic. +		 */ +		err = -EBUSY; +	} else { +		/* +		 * The stable node did not yet appear stale to get_ksm_page(), +		 * since that allows for an unmapped ksm page to be recognized +		 * right up until it is freed; but the node is safe to remove. +		 * This page might be in a pagevec waiting to be freed, +		 * or it might be PageSwapCache (perhaps under writeback), +		 * or it might have been removed from swapcache a moment ago. 
+		 */ +		set_page_stable_node(page, NULL); +		remove_node_from_stable_tree(stable_node); +		err = 0; +	} + +	unlock_page(page); +	put_page(page); +	return err; +} + +static int remove_all_stable_nodes(void) +{ +	struct stable_node *stable_node; +	struct list_head *this, *next; +	int nid; +	int err = 0; + +	for (nid = 0; nid < ksm_nr_node_ids; nid++) { +		while (root_stable_tree[nid].rb_node) { +			stable_node = rb_entry(root_stable_tree[nid].rb_node, +						struct stable_node, node); +			if (remove_stable_node(stable_node)) { +				err = -EBUSY; +				break;	/* proceed to next nid */ +			} +			cond_resched(); +		} +	} +	list_for_each_safe(this, next, &migrate_nodes) { +		stable_node = list_entry(this, struct stable_node, list); +		if (remove_stable_node(stable_node)) +			err = -EBUSY; +		cond_resched(); +	} +	return err; +} +  static int unmerge_and_remove_all_rmap_items(void)  {  	struct mm_slot *mm_slot; @@ -641,7 +797,7 @@ static int unmerge_and_remove_all_rmap_items(void)  		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,  						struct mm_slot, mm_list);  		if (ksm_test_exit(mm)) { -			hlist_del(&mm_slot->link); +			hash_del(&mm_slot->link);  			list_del(&mm_slot->mm_list);  			spin_unlock(&ksm_mmlist_lock); @@ -655,6 +811,8 @@ static int unmerge_and_remove_all_rmap_items(void)  		}  	} +	/* Clean up stable nodes, but don't worry if some are still busy */ +	remove_all_stable_nodes();  	ksm_scan.seqnr = 0;  	return 0; @@ -670,9 +828,9 @@ error:  static u32 calc_checksum(struct page *page)  {  	u32 checksum; -	void *addr = kmap_atomic(page, KM_USER0); +	void *addr = kmap_atomic(page);  	checksum = jhash2(addr, PAGE_SIZE / 4, 17); -	kunmap_atomic(addr, KM_USER0); +	kunmap_atomic(addr);  	return checksum;  } @@ -681,11 +839,11 @@ static int memcmp_pages(struct page *page1, struct page *page2)  	char *addr1, *addr2;  	int ret; -	addr1 = kmap_atomic(page1, KM_USER0); -	addr2 = kmap_atomic(page2, KM_USER1); +	addr1 = kmap_atomic(page1); +	addr2 = kmap_atomic(page2);  	ret = memcmp(addr1, addr2, PAGE_SIZE); -	kunmap_atomic(addr2, KM_USER1); -	kunmap_atomic(addr1, KM_USER0); +	kunmap_atomic(addr2); +	kunmap_atomic(addr1);  	return ret;  } @@ -703,14 +861,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,  	spinlock_t *ptl;  	int swapped;  	int err = -EFAULT; +	unsigned long mmun_start;	/* For mmu_notifiers */ +	unsigned long mmun_end;		/* For mmu_notifiers */  	addr = page_address_in_vma(page, vma);  	if (addr == -EFAULT)  		goto out; +	BUG_ON(PageTransCompound(page)); + +	mmun_start = addr; +	mmun_end   = addr + PAGE_SIZE; +	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); +  	ptep = page_check_address(page, mm, addr, &ptl, 0);  	if (!ptep) -		goto out; +		goto out_mn;  	if (pte_write(*ptep) || pte_dirty(*ptep)) {  		pte_t entry; @@ -718,7 +884,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,  		swapped = PageSwapCache(page);  		flush_cache_page(vma, addr, page_to_pfn(page));  		/* -		 * Ok this is tricky, when get_user_pages_fast() run it doesnt +		 * Ok this is tricky, when get_user_pages_fast() run it doesn't  		 * take any lock, therefore the check that we are going to make  		 * with the pagecount against the mapcount is racey and  		 * O_DIRECT can happen right after the check. 
@@ -745,6 +911,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,  out_unlock:  	pte_unmap_unlock(ptep, ptl); +out_mn: +	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);  out:  	return err;  } @@ -762,34 +930,30 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,  			struct page *kpage, pte_t orig_pte)  {  	struct mm_struct *mm = vma->vm_mm; -	pgd_t *pgd; -	pud_t *pud;  	pmd_t *pmd;  	pte_t *ptep;  	spinlock_t *ptl;  	unsigned long addr;  	int err = -EFAULT; +	unsigned long mmun_start;	/* For mmu_notifiers */ +	unsigned long mmun_end;		/* For mmu_notifiers */  	addr = page_address_in_vma(page, vma);  	if (addr == -EFAULT)  		goto out; -	pgd = pgd_offset(mm, addr); -	if (!pgd_present(*pgd)) +	pmd = mm_find_pmd(mm, addr); +	if (!pmd)  		goto out; -	pud = pud_offset(pgd, addr); -	if (!pud_present(*pud)) -		goto out; - -	pmd = pmd_offset(pud, addr); -	if (!pmd_present(*pmd)) -		goto out; +	mmun_start = addr; +	mmun_end   = addr + PAGE_SIZE; +	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);  	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);  	if (!pte_same(*ptep, orig_pte)) {  		pte_unmap_unlock(ptep, ptl); -		goto out; +		goto out_mn;  	}  	get_page(kpage); @@ -800,14 +964,45 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,  	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));  	page_remove_rmap(page); +	if (!page_mapped(page)) +		try_to_free_swap(page);  	put_page(page);  	pte_unmap_unlock(ptep, ptl);  	err = 0; +out_mn: +	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);  out:  	return err;  } +static int page_trans_compound_anon_split(struct page *page) +{ +	int ret = 0; +	struct page *transhuge_head = page_trans_compound_anon(page); +	if (transhuge_head) { +		/* Get the reference on the head to split it. */ +		if (get_page_unless_zero(transhuge_head)) { +			/* +			 * Recheck we got the reference while the head +			 * was still anonymous. +			 */ +			if (PageAnon(transhuge_head)) +				ret = split_huge_page(transhuge_head); +			else +				/* +				 * Retry later if split_huge_page run +				 * from under us. +				 */ +				ret = 1; +			put_page(transhuge_head); +		} else +			/* Retry later if split_huge_page run from under us. 
*/ +			ret = 1; +	} +	return ret; +} +  /*   * try_to_merge_one_page - take two pages and merge them into one   * @vma: the vma that holds the pte pointing to page @@ -828,6 +1023,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,  	if (!(vma->vm_flags & VM_MERGEABLE))  		goto out; +	if (PageTransCompound(page) && page_trans_compound_anon_split(page)) +		goto out; +	BUG_ON(PageTransCompound(page));  	if (!PageAnon(page))  		goto out; @@ -899,8 +1097,12 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,  	if (err)  		goto out; +	/* Unstable nid is in union with stable anon_vma: remove first */ +	remove_rmap_item_from_tree(rmap_item); +  	/* Must get reference to anon_vma while still holding mmap_sem */ -	hold_anon_vma(rmap_item, vma->anon_vma); +	rmap_item->anon_vma = vma->anon_vma; +	get_anon_vma(vma->anon_vma);  out:  	up_read(&mm->mmap_sem);  	return err; @@ -948,42 +1150,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,   */  static struct page *stable_tree_search(struct page *page)  { -	struct rb_node *node = root_stable_tree.rb_node; +	int nid; +	struct rb_root *root; +	struct rb_node **new; +	struct rb_node *parent;  	struct stable_node *stable_node; +	struct stable_node *page_node; -	stable_node = page_stable_node(page); -	if (stable_node) {			/* ksm page forked */ +	page_node = page_stable_node(page); +	if (page_node && page_node->head != &migrate_nodes) { +		/* ksm page forked */  		get_page(page);  		return page;  	} -	while (node) { +	nid = get_kpfn_nid(page_to_pfn(page)); +	root = root_stable_tree + nid; +again: +	new = &root->rb_node; +	parent = NULL; + +	while (*new) {  		struct page *tree_page;  		int ret;  		cond_resched(); -		stable_node = rb_entry(node, struct stable_node, node); -		tree_page = get_ksm_page(stable_node); +		stable_node = rb_entry(*new, struct stable_node, node); +		tree_page = get_ksm_page(stable_node, false);  		if (!tree_page)  			return NULL;  		ret = memcmp_pages(page, tree_page); +		put_page(tree_page); -		if (ret < 0) { -			put_page(tree_page); -			node = node->rb_left; -		} else if (ret > 0) { -			put_page(tree_page); -			node = node->rb_right; -		} else -			return tree_page; +		parent = *new; +		if (ret < 0) +			new = &parent->rb_left; +		else if (ret > 0) +			new = &parent->rb_right; +		else { +			/* +			 * Lock and unlock the stable_node's page (which +			 * might already have been migrated) so that page +			 * migration is sure to notice its raised count. +			 * It would be more elegant to return stable_node +			 * than kpage, but that involves more changes. +			 */ +			tree_page = get_ksm_page(stable_node, true); +			if (tree_page) { +				unlock_page(tree_page); +				if (get_kpfn_nid(stable_node->kpfn) != +						NUMA(stable_node->nid)) { +					put_page(tree_page); +					goto replace; +				} +				return tree_page; +			} +			/* +			 * There is now a place for page_node, but the tree may +			 * have been rebalanced, so re-evaluate parent and new. 
+			 */ +			if (page_node) +				goto again; +			return NULL; +		}  	} -	return NULL; +	if (!page_node) +		return NULL; + +	list_del(&page_node->list); +	DO_NUMA(page_node->nid = nid); +	rb_link_node(&page_node->node, parent, new); +	rb_insert_color(&page_node->node, root); +	get_page(page); +	return page; + +replace: +	if (page_node) { +		list_del(&page_node->list); +		DO_NUMA(page_node->nid = nid); +		rb_replace_node(&stable_node->node, &page_node->node, root); +		get_page(page); +	} else { +		rb_erase(&stable_node->node, root); +		page = NULL; +	} +	stable_node->head = &migrate_nodes; +	list_add(&stable_node->list, stable_node->head); +	return page;  }  /* - * stable_tree_insert - insert rmap_item pointing to new ksm page + * stable_tree_insert - insert stable tree node pointing to new ksm page   * into the stable tree.   *   * This function returns the stable tree node just allocated on success, @@ -991,17 +1250,25 @@ static struct page *stable_tree_search(struct page *page)   */  static struct stable_node *stable_tree_insert(struct page *kpage)  { -	struct rb_node **new = &root_stable_tree.rb_node; +	int nid; +	unsigned long kpfn; +	struct rb_root *root; +	struct rb_node **new;  	struct rb_node *parent = NULL;  	struct stable_node *stable_node; +	kpfn = page_to_pfn(kpage); +	nid = get_kpfn_nid(kpfn); +	root = root_stable_tree + nid; +	new = &root->rb_node; +  	while (*new) {  		struct page *tree_page;  		int ret;  		cond_resched();  		stable_node = rb_entry(*new, struct stable_node, node); -		tree_page = get_ksm_page(stable_node); +		tree_page = get_ksm_page(stable_node, false);  		if (!tree_page)  			return NULL; @@ -1027,13 +1294,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)  	if (!stable_node)  		return NULL; -	rb_link_node(&stable_node->node, parent, new); -	rb_insert_color(&stable_node->node, &root_stable_tree); -  	INIT_HLIST_HEAD(&stable_node->hlist); - -	stable_node->kpfn = page_to_pfn(kpage); +	stable_node->kpfn = kpfn;  	set_page_stable_node(kpage, stable_node); +	DO_NUMA(stable_node->nid = nid); +	rb_link_node(&stable_node->node, parent, new); +	rb_insert_color(&stable_node->node, root);  	return stable_node;  } @@ -1056,10 +1322,15 @@ static  struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,  					      struct page *page,  					      struct page **tree_pagep) -  { -	struct rb_node **new = &root_unstable_tree.rb_node; +	struct rb_node **new; +	struct rb_root *root;  	struct rb_node *parent = NULL; +	int nid; + +	nid = get_kpfn_nid(page_to_pfn(page)); +	root = root_unstable_tree + nid; +	new = &root->rb_node;  	while (*new) {  		struct rmap_item *tree_rmap_item; @@ -1089,6 +1360,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,  		} else if (ret > 0) {  			put_page(tree_page);  			new = &parent->rb_right; +		} else if (!ksm_merge_across_nodes && +			   page_to_nid(tree_page) != nid) { +			/* +			 * If tree_page has been migrated to another NUMA node, +			 * it will be flushed out and put in the right unstable +			 * tree next time: only merge with it when across_nodes. 
+			 */ +			put_page(tree_page); +			return NULL;  		} else {  			*tree_pagep = tree_page;  			return tree_rmap_item; @@ -1097,8 +1377,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,  	rmap_item->address |= UNSTABLE_FLAG;  	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); +	DO_NUMA(rmap_item->nid = nid);  	rb_link_node(&rmap_item->node, parent, new); -	rb_insert_color(&rmap_item->node, &root_unstable_tree); +	rb_insert_color(&rmap_item->node, root);  	ksm_pages_unshared++;  	return NULL; @@ -1140,10 +1421,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)  	unsigned int checksum;  	int err; -	remove_rmap_item_from_tree(rmap_item); +	stable_node = page_stable_node(page); +	if (stable_node) { +		if (stable_node->head != &migrate_nodes && +		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { +			rb_erase(&stable_node->node, +				 root_stable_tree + NUMA(stable_node->nid)); +			stable_node->head = &migrate_nodes; +			list_add(&stable_node->list, stable_node->head); +		} +		if (stable_node->head != &migrate_nodes && +		    rmap_item->head == stable_node) +			return; +	}  	/* We first start with searching the page inside the stable tree */  	kpage = stable_tree_search(page); +	if (kpage == page && rmap_item->head == stable_node) { +		put_page(kpage); +		return; +	} + +	remove_rmap_item_from_tree(rmap_item); +  	if (kpage) {  		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);  		if (!err) { @@ -1177,14 +1477,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)  		kpage = try_to_merge_two_pages(rmap_item, page,  						tree_rmap_item, tree_page);  		put_page(tree_page); -		/* -		 * As soon as we merge this page, we want to remove the -		 * rmap_item of the page we have merged with from the unstable -		 * tree, and insert it instead as new node in the stable tree. -		 */  		if (kpage) { -			remove_rmap_item_from_tree(tree_rmap_item); - +			/* +			 * The pages were successfully merged: insert new +			 * node in the stable tree and add both rmap_items. +			 */  			lock_page(kpage);  			stable_node = stable_tree_insert(kpage);  			if (stable_node) { @@ -1241,18 +1538,59 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)  	struct mm_slot *slot;  	struct vm_area_struct *vma;  	struct rmap_item *rmap_item; +	int nid;  	if (list_empty(&ksm_mm_head.mm_list))  		return NULL;  	slot = ksm_scan.mm_slot;  	if (slot == &ksm_mm_head) { -		root_unstable_tree = RB_ROOT; +		/* +		 * A number of pages can hang around indefinitely on per-cpu +		 * pagevecs, raised page count preventing write_protect_page +		 * from merging them.  Though it doesn't really matter much, +		 * it is puzzling to see some stuck in pages_volatile until +		 * other activity jostles them out, and they also prevented +		 * LTP's KSM test from succeeding deterministically; so drain +		 * them here (here rather than on entry to ksm_do_scan(), +		 * so we don't IPI too often when pages_to_scan is set low). +		 */ +		lru_add_drain_all(); + +		/* +		 * Whereas stale stable_nodes on the stable_tree itself +		 * get pruned in the regular course of stable_tree_search(), +		 * those moved out to the migrate_nodes list can accumulate: +		 * so prune them once before each full scan. 
+		 */ +		if (!ksm_merge_across_nodes) { +			struct stable_node *stable_node; +			struct list_head *this, *next; +			struct page *page; + +			list_for_each_safe(this, next, &migrate_nodes) { +				stable_node = list_entry(this, +						struct stable_node, list); +				page = get_ksm_page(stable_node, false); +				if (page) +					put_page(page); +				cond_resched(); +			} +		} + +		for (nid = 0; nid < ksm_nr_node_ids; nid++) +			root_unstable_tree[nid] = RB_ROOT;  		spin_lock(&ksm_mmlist_lock);  		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);  		ksm_scan.mm_slot = slot;  		spin_unlock(&ksm_mmlist_lock); +		/* +		 * Although we tested list_empty() above, a racing __ksm_exit +		 * of the last mm on the list may have removed it since then. +		 */ +		if (slot == &ksm_mm_head) +			return NULL;  next_mm:  		ksm_scan.address = 0;  		ksm_scan.rmap_list = &slot->rmap_list; @@ -1277,7 +1615,13 @@ next_mm:  			if (ksm_test_exit(mm))  				break;  			*page = follow_page(vma, ksm_scan.address, FOLL_GET); -			if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { +			if (IS_ERR_OR_NULL(*page)) { +				ksm_scan.address += PAGE_SIZE; +				cond_resched(); +				continue; +			} +			if (PageAnon(*page) || +			    page_trans_compound_anon(*page)) {  				flush_anon_page(vma, *page, ksm_scan.address);  				flush_dcache_page(*page);  				rmap_item = get_next_rmap_item(slot, @@ -1291,8 +1635,7 @@ next_mm:  				up_read(&mm->mmap_sem);  				return rmap_item;  			} -			if (!IS_ERR_OR_NULL(*page)) -				put_page(*page); +			put_page(*page);  			ksm_scan.address += PAGE_SIZE;  			cond_resched();  		} @@ -1321,7 +1664,7 @@ next_mm:  		 * or when all VM_MERGEABLE areas have been unmapped (and  		 * mmap_sem then protects against race with MADV_MERGEABLE).  		 */ -		hlist_del(&slot->link); +		hash_del(&slot->link);  		list_del(&slot->mm_list);  		spin_unlock(&ksm_mmlist_lock); @@ -1352,13 +1695,12 @@ static void ksm_do_scan(unsigned int scan_npages)  	struct rmap_item *rmap_item;  	struct page *uninitialized_var(page); -	while (scan_npages--) { +	while (scan_npages-- && likely(!freezing(current))) {  		cond_resched();  		rmap_item = scan_get_next_rmap_item(&page);  		if (!rmap_item)  			return; -		if (!PageKsm(page) || !in_stable_tree(rmap_item)) -			cmp_and_merge_page(page, rmap_item); +		cmp_and_merge_page(page, rmap_item);  		put_page(page);  	}  } @@ -1370,19 +1712,23 @@ static int ksmd_should_run(void)  static int ksm_scan_thread(void *nothing)  { +	set_freezable();  	set_user_nice(current, 5);  	while (!kthread_should_stop()) {  		mutex_lock(&ksm_thread_mutex); +		wait_while_offlining();  		if (ksmd_should_run())  			ksm_do_scan(ksm_thread_pages_to_scan);  		mutex_unlock(&ksm_thread_mutex); +		try_to_freeze(); +  		if (ksmd_should_run()) {  			schedule_timeout_interruptible(  				msecs_to_jiffies(ksm_thread_sleep_millisecs));  		} else { -			wait_event_interruptible(ksm_thread_wait, +			wait_event_freezable(ksm_thread_wait,  				ksmd_should_run() || kthread_should_stop());  		}  	} @@ -1402,10 +1748,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,  		 */  		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |  				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND | -				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE | -				 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) +				 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))  			return 0;		/* just ignore the advice */ +#ifdef VM_SAO +		if (*vm_flags & VM_SAO) +			return 0; +#endif +  		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {  			err = __ksm_enter(mm);  
			if (err) @@ -1447,11 +1797,19 @@ int __ksm_enter(struct mm_struct *mm)  	spin_lock(&ksm_mmlist_lock);  	insert_to_mm_slots_hash(mm, mm_slot);  	/* -	 * Insert just behind the scanning cursor, to let the area settle +	 * When KSM_RUN_MERGE (or KSM_RUN_STOP), +	 * insert just behind the scanning cursor, to let the area settle  	 * down a little; when fork is followed by immediate exec, we don't  	 * want ksmd to waste time setting up and tearing down an rmap_list. +	 * +	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its +	 * scanning cursor, otherwise KSM pages in newly forked mms will be +	 * missed: then we might as well insert at the end of the list.  	 */ -	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); +	if (ksm_run & KSM_RUN_UNMERGE) +		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); +	else +		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);  	spin_unlock(&ksm_mmlist_lock);  	set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -1481,7 +1839,7 @@ void __ksm_exit(struct mm_struct *mm)  	mm_slot = get_mm_slot(mm);  	if (mm_slot && ksm_scan.mm_slot != mm_slot) {  		if (!mm_slot->rmap_list) { -			hlist_del(&mm_slot->link); +			hash_del(&mm_slot->link);  			list_del(&mm_slot->mm_list);  			easy_to_free = 1;  		} else { @@ -1501,158 +1859,64 @@ void __ksm_exit(struct mm_struct *mm)  	}  } -struct page *ksm_does_need_to_copy(struct page *page, +struct page *ksm_might_need_to_copy(struct page *page,  			struct vm_area_struct *vma, unsigned long address)  { +	struct anon_vma *anon_vma = page_anon_vma(page);  	struct page *new_page; +	if (PageKsm(page)) { +		if (page_stable_node(page) && +		    !(ksm_run & KSM_RUN_UNMERGE)) +			return page;	/* no need to copy it */ +	} else if (!anon_vma) { +		return page;		/* no need to copy it */ +	} else if (anon_vma->root == vma->anon_vma->root && +		 page->index == linear_page_index(vma, address)) { +		return page;		/* still no need to copy it */ +	} +	if (!PageUptodate(page)) +		return page;		/* let do_swap_page report the error */ +  	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);  	if (new_page) {  		copy_user_highpage(new_page, page, address, vma);  		SetPageDirty(new_page);  		__SetPageUptodate(new_page); -		SetPageSwapBacked(new_page);  		__set_page_locked(new_page); - -		if (page_evictable(new_page, vma)) -			lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); -		else -			add_page_to_unevictable_list(new_page);  	}  	return new_page;  } -int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, -			unsigned long *vm_flags) -{ -	struct stable_node *stable_node; -	struct rmap_item *rmap_item; -	struct hlist_node *hlist; -	unsigned int mapcount = page_mapcount(page); -	int referenced = 0; -	int search_new_forks = 0; - -	VM_BUG_ON(!PageKsm(page)); -	VM_BUG_ON(!PageLocked(page)); - -	stable_node = page_stable_node(page); -	if (!stable_node) -		return 0; -again: -	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { -		struct anon_vma *anon_vma = rmap_item->anon_vma; -		struct anon_vma_chain *vmac; -		struct vm_area_struct *vma; - -		anon_vma_lock(anon_vma); -		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { -			vma = vmac->vma; -			if (rmap_item->address < vma->vm_start || -			    rmap_item->address >= vma->vm_end) -				continue; -			/* -			 * Initially we examine only the vma which covers this -			 * rmap_item; but later, if there is still work to do, -			 * we examine covering vmas in other mms: in case they -			 * were forked from the original since ksmd 
passed. -			 */ -			if ((rmap_item->mm == vma->vm_mm) == search_new_forks) -				continue; - -			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) -				continue; - -			referenced += page_referenced_one(page, vma, -				rmap_item->address, &mapcount, vm_flags); -			if (!search_new_forks || !mapcount) -				break; -		} -		anon_vma_unlock(anon_vma); -		if (!mapcount) -			goto out; -	} -	if (!search_new_forks++) -		goto again; -out: -	return referenced; -} - -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)  {  	struct stable_node *stable_node; -	struct hlist_node *hlist;  	struct rmap_item *rmap_item;  	int ret = SWAP_AGAIN;  	int search_new_forks = 0; -	VM_BUG_ON(!PageKsm(page)); -	VM_BUG_ON(!PageLocked(page)); - -	stable_node = page_stable_node(page); -	if (!stable_node) -		return SWAP_FAIL; -again: -	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { -		struct anon_vma *anon_vma = rmap_item->anon_vma; -		struct anon_vma_chain *vmac; -		struct vm_area_struct *vma; - -		anon_vma_lock(anon_vma); -		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { -			vma = vmac->vma; -			if (rmap_item->address < vma->vm_start || -			    rmap_item->address >= vma->vm_end) -				continue; -			/* -			 * Initially we examine only the vma which covers this -			 * rmap_item; but later, if there is still work to do, -			 * we examine covering vmas in other mms: in case they -			 * were forked from the original since ksmd passed. -			 */ -			if ((rmap_item->mm == vma->vm_mm) == search_new_forks) -				continue; - -			ret = try_to_unmap_one(page, vma, -					rmap_item->address, flags); -			if (ret != SWAP_AGAIN || !page_mapped(page)) { -				anon_vma_unlock(anon_vma); -				goto out; -			} -		} -		anon_vma_unlock(anon_vma); -	} -	if (!search_new_forks++) -		goto again; -out: -	return ret; -} - -#ifdef CONFIG_MIGRATION -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, -		  struct vm_area_struct *, unsigned long, void *), void *arg) -{ -	struct stable_node *stable_node; -	struct hlist_node *hlist; -	struct rmap_item *rmap_item; -	int ret = SWAP_AGAIN; -	int search_new_forks = 0; +	VM_BUG_ON_PAGE(!PageKsm(page), page); -	VM_BUG_ON(!PageKsm(page)); -	VM_BUG_ON(!PageLocked(page)); +	/* +	 * Rely on the page lock to protect against concurrent modifications +	 * to that page's node of the stable tree. 
+	 */ +	VM_BUG_ON_PAGE(!PageLocked(page), page);  	stable_node = page_stable_node(page);  	if (!stable_node)  		return ret;  again: -	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { +	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {  		struct anon_vma *anon_vma = rmap_item->anon_vma;  		struct anon_vma_chain *vmac;  		struct vm_area_struct *vma; -		anon_vma_lock(anon_vma); -		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { +		anon_vma_lock_read(anon_vma); +		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, +					       0, ULONG_MAX) {  			vma = vmac->vma;  			if (rmap_item->address < vma->vm_start ||  			    rmap_item->address >= vma->vm_end) @@ -1666,13 +1930,21 @@ again:  			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)  				continue; -			ret = rmap_one(page, vma, rmap_item->address, arg); +			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) +				continue; + +			ret = rwc->rmap_one(page, vma, +					rmap_item->address, rwc->arg);  			if (ret != SWAP_AGAIN) { -				anon_vma_unlock(anon_vma); +				anon_vma_unlock_read(anon_vma); +				goto out; +			} +			if (rwc->done && rwc->done(page)) { +				anon_vma_unlock_read(anon_vma);  				goto out;  			}  		} -		anon_vma_unlock(anon_vma); +		anon_vma_unlock_read(anon_vma);  	}  	if (!search_new_forks++)  		goto again; @@ -1680,71 +1952,128 @@ out:  	return ret;  } +#ifdef CONFIG_MIGRATION  void ksm_migrate_page(struct page *newpage, struct page *oldpage)  {  	struct stable_node *stable_node; -	VM_BUG_ON(!PageLocked(oldpage)); -	VM_BUG_ON(!PageLocked(newpage)); -	VM_BUG_ON(newpage->mapping != oldpage->mapping); +	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); +	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); +	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);  	stable_node = page_stable_node(newpage);  	if (stable_node) { -		VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); +		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);  		stable_node->kpfn = page_to_pfn(newpage); +		/* +		 * newpage->mapping was set in advance; now we need smp_wmb() +		 * to make sure that the new stable_node->kpfn is visible +		 * to get_ksm_page() before it can see that oldpage->mapping +		 * has gone stale (or that PageSwapCache has been cleared). 
+		 */ +		smp_wmb(); +		set_page_stable_node(oldpage, NULL);  	}  }  #endif /* CONFIG_MIGRATION */  #ifdef CONFIG_MEMORY_HOTREMOVE -static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, -						 unsigned long end_pfn) +static int just_wait(void *word)  { -	struct rb_node *node; +	schedule(); +	return 0; +} -	for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { -		struct stable_node *stable_node; +static void wait_while_offlining(void) +{ +	while (ksm_run & KSM_RUN_OFFLINE) { +		mutex_unlock(&ksm_thread_mutex); +		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), +				just_wait, TASK_UNINTERRUPTIBLE); +		mutex_lock(&ksm_thread_mutex); +	} +} -		stable_node = rb_entry(node, struct stable_node, node); +static void ksm_check_stable_tree(unsigned long start_pfn, +				  unsigned long end_pfn) +{ +	struct stable_node *stable_node; +	struct list_head *this, *next; +	struct rb_node *node; +	int nid; + +	for (nid = 0; nid < ksm_nr_node_ids; nid++) { +		node = rb_first(root_stable_tree + nid); +		while (node) { +			stable_node = rb_entry(node, struct stable_node, node); +			if (stable_node->kpfn >= start_pfn && +			    stable_node->kpfn < end_pfn) { +				/* +				 * Don't get_ksm_page, page has already gone: +				 * which is why we keep kpfn instead of page* +				 */ +				remove_node_from_stable_tree(stable_node); +				node = rb_first(root_stable_tree + nid); +			} else +				node = rb_next(node); +			cond_resched(); +		} +	} +	list_for_each_safe(this, next, &migrate_nodes) { +		stable_node = list_entry(this, struct stable_node, list);  		if (stable_node->kpfn >= start_pfn &&  		    stable_node->kpfn < end_pfn) -			return stable_node; +			remove_node_from_stable_tree(stable_node); +		cond_resched();  	} -	return NULL;  }  static int ksm_memory_callback(struct notifier_block *self,  			       unsigned long action, void *arg)  {  	struct memory_notify *mn = arg; -	struct stable_node *stable_node;  	switch (action) {  	case MEM_GOING_OFFLINE:  		/* -		 * Keep it very simple for now: just lock out ksmd and -		 * MADV_UNMERGEABLE while any memory is going offline. +		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() +		 * and remove_all_stable_nodes() while memory is going offline: +		 * it is unsafe for them to touch the stable tree at this time. +		 * But unmerge_ksm_pages(), rmap lookups and other entry points +		 * which do not need the ksm_thread_mutex are all safe.  		 */  		mutex_lock(&ksm_thread_mutex); +		ksm_run |= KSM_RUN_OFFLINE; +		mutex_unlock(&ksm_thread_mutex);  		break;  	case MEM_OFFLINE:  		/*  		 * Most of the work is done by page migration; but there might  		 * be a few stable_nodes left over, still pointing to struct -		 * pages which have been offlined: prune those from the tree. +		 * pages which have been offlined: prune those from the tree, +		 * otherwise get_ksm_page() might later try to access a +		 * non-existent struct page.  		 
*/ -		while ((stable_node = ksm_check_stable_tree(mn->start_pfn, -					mn->start_pfn + mn->nr_pages)) != NULL) -			remove_node_from_stable_tree(stable_node); +		ksm_check_stable_tree(mn->start_pfn, +				      mn->start_pfn + mn->nr_pages);  		/* fallthrough */  	case MEM_CANCEL_OFFLINE: +		mutex_lock(&ksm_thread_mutex); +		ksm_run &= ~KSM_RUN_OFFLINE;  		mutex_unlock(&ksm_thread_mutex); + +		smp_mb();	/* wake_up_bit advises this */ +		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));  		break;  	}  	return NOTIFY_OK;  } +#else +static void wait_while_offlining(void) +{ +}  #endif /* CONFIG_MEMORY_HOTREMOVE */  #ifdef CONFIG_SYSFS @@ -1771,7 +2100,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,  	unsigned long msecs;  	int err; -	err = strict_strtoul(buf, 10, &msecs); +	err = kstrtoul(buf, 10, &msecs);  	if (err || msecs > UINT_MAX)  		return -EINVAL; @@ -1794,7 +2123,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,  	int err;  	unsigned long nr_pages; -	err = strict_strtoul(buf, 10, &nr_pages); +	err = kstrtoul(buf, 10, &nr_pages);  	if (err || nr_pages > UINT_MAX)  		return -EINVAL; @@ -1807,7 +2136,7 @@ KSM_ATTR(pages_to_scan);  static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,  			char *buf)  { -	return sprintf(buf, "%u\n", ksm_run); +	return sprintf(buf, "%lu\n", ksm_run);  }  static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1816,7 +2145,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,  	int err;  	unsigned long flags; -	err = strict_strtoul(buf, 10, &flags); +	err = kstrtoul(buf, 10, &flags);  	if (err || flags > UINT_MAX)  		return -EINVAL;  	if (flags > KSM_RUN_UNMERGE) @@ -1830,12 +2159,13 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,  	 */  	mutex_lock(&ksm_thread_mutex); +	wait_while_offlining();  	if (ksm_run != flags) {  		ksm_run = flags;  		if (flags & KSM_RUN_UNMERGE) { -			current->flags |= PF_OOM_ORIGIN; +			set_current_oom_origin();  			err = unmerge_and_remove_all_rmap_items(); -			current->flags &= ~PF_OOM_ORIGIN; +			clear_current_oom_origin();  			if (err) {  				ksm_run = KSM_RUN_STOP;  				count = err; @@ -1851,6 +2181,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,  }  KSM_ATTR(run); +#ifdef CONFIG_NUMA +static ssize_t merge_across_nodes_show(struct kobject *kobj, +				struct kobj_attribute *attr, char *buf) +{ +	return sprintf(buf, "%u\n", ksm_merge_across_nodes); +} + +static ssize_t merge_across_nodes_store(struct kobject *kobj, +				   struct kobj_attribute *attr, +				   const char *buf, size_t count) +{ +	int err; +	unsigned long knob; + +	err = kstrtoul(buf, 10, &knob); +	if (err) +		return err; +	if (knob > 1) +		return -EINVAL; + +	mutex_lock(&ksm_thread_mutex); +	wait_while_offlining(); +	if (ksm_merge_across_nodes != knob) { +		if (ksm_pages_shared || remove_all_stable_nodes()) +			err = -EBUSY; +		else if (root_stable_tree == one_stable_tree) { +			struct rb_root *buf; +			/* +			 * This is the first time that we switch away from the +			 * default of merging across nodes: must now allocate +			 * a buffer to hold as many roots as may be needed. +			 * Allocate stable and unstable together: +			 * MAXSMP NODES_SHIFT 10 will use 16kB. 
+			 */ +			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), +				      GFP_KERNEL); +			/* Let us assume that RB_ROOT is NULL is zero */ +			if (!buf) +				err = -ENOMEM; +			else { +				root_stable_tree = buf; +				root_unstable_tree = buf + nr_node_ids; +				/* Stable tree is empty but not the unstable */ +				root_unstable_tree[0] = one_unstable_tree[0]; +			} +		} +		if (!err) { +			ksm_merge_across_nodes = knob; +			ksm_nr_node_ids = knob ? 1 : nr_node_ids; +		} +	} +	mutex_unlock(&ksm_thread_mutex); + +	return err ? err : count; +} +KSM_ATTR(merge_across_nodes); +#endif +  static ssize_t pages_shared_show(struct kobject *kobj,  				 struct kobj_attribute *attr, char *buf)  { @@ -1905,6 +2293,9 @@ static struct attribute *ksm_attrs[] = {  	&pages_unshared_attr.attr,  	&pages_volatile_attr.attr,  	&full_scans_attr.attr, +#ifdef CONFIG_NUMA +	&merge_across_nodes_attr.attr, +#endif  	NULL,  }; @@ -1943,10 +2334,7 @@ static int __init ksm_init(void)  #endif /* CONFIG_SYSFS */  #ifdef CONFIG_MEMORY_HOTREMOVE -	/* -	 * Choose a high priority since the callback takes ksm_thread_mutex: -	 * later callbacks could only be taking locks which nest within that. -	 */ +	/* There is no significance to this priority 100 */  	hotplug_memory_notifier(ksm_memory_callback, 100);  #endif  	return 0; @@ -1956,4 +2344,4 @@ out_free:  out:  	return err;  } -module_init(ksm_init) +subsys_initcall(ksm_init);  | 
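The core mechanism introduced above is the per-NUMA-node pair of rb-tree roots: get_kpfn_nid() collapses every page to index 0 while merge_across_nodes is set, and otherwise returns the page's node id, which then indexes the root_stable_tree[] / root_unstable_tree[] arrays. The stand-alone sketch below models only that indexing in user space; pfn_to_nid() and struct rb_root are stubbed out, so treat it as an illustration of the hunks above rather than kernel code.

/*
 * Illustrative, user-space model of the per-node tree-root selection
 * added by this patch.  pfn_to_nid() and the rbtree machinery are
 * stand-ins; only the indexing logic mirrors the diff.
 */
#include <stdio.h>

#define MAX_NUMNODES 4			/* stand-in for nr_node_ids */

struct rb_root { void *rb_node; };	/* minimal stand-in for the kernel type */

static struct rb_root one_stable_tree[1];
static struct rb_root *root_stable_tree = one_stable_tree;

static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;

/* Stub: pretend each group of 1000 pfns lives on a different node. */
static int pfn_to_nid(unsigned long kpfn)
{
	return (int)(kpfn / 1000) % MAX_NUMNODES;
}

/* Mirrors get_kpfn_nid() in the diff: a single tree when merging across nodes. */
static int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn);
}

int main(void)
{
	static struct rb_root per_node_roots[2 * MAX_NUMNODES];
	unsigned long kpfn = 2500;

	printf("merge_across_nodes=1 -> kpfn %lu uses index %d (root %p)\n",
	       kpfn, get_kpfn_nid(kpfn),
	       (void *)(root_stable_tree + get_kpfn_nid(kpfn)));

	/* Mirrors merge_across_nodes_store(): switch to per-node roots. */
	ksm_merge_across_nodes = 0;
	ksm_nr_node_ids = MAX_NUMNODES;
	root_stable_tree = per_node_roots;

	printf("merge_across_nodes=0 -> %d node trees, kpfn %lu uses index %d (root %p)\n",
	       ksm_nr_node_ids, kpfn, get_kpfn_nid(kpfn),
	       (void *)(root_stable_tree + get_kpfn_nid(kpfn)));
	return 0;
}

As in merge_across_nodes_store() above, flipping the knob off only takes effect once both root pointers have been re-aimed at a lazily allocated per-node array, and only after remove_all_stable_nodes() has emptied the existing trees.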

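The patch also replaces KSM's hand-rolled mm_slots hash buckets with the generic <linux/hashtable.h> helpers (DEFINE_HASHTABLE, hash_add, hash_for_each_possible). The sketch below imitates that lookup pattern in user space with a plain bucket array and a simplified hash function; it is a model of the idiom, not the kernel implementation.

/*
 * Stand-alone model of the mm_slots lookup that the patch converts from
 * open-coded hlist buckets to the <linux/hashtable.h> helpers.  The
 * DEFINE_HASHTABLE()/hash_add()/hash_for_each_possible() calls are
 * imitated with a plain bucket array so the example runs in user space.
 */
#include <stdio.h>
#include <stdint.h>

#define MM_SLOTS_HASH_BITS 10
#define NBUCKETS (1U << MM_SLOTS_HASH_BITS)

struct mm_struct { int dummy; };

struct mm_slot {
	struct mm_slot *next;		/* hlist link, simplified */
	struct mm_struct *mm;
};

static struct mm_slot *mm_slots_hash[NBUCKETS];

/* Crude stand-in for hashing the mm pointer (hash_ptr() in the kernel). */
static unsigned int hash_mm(struct mm_struct *mm)
{
	return (unsigned int)(((uintptr_t)mm >> 4) & (NBUCKETS - 1));
}

static void insert_to_mm_slots_hash(struct mm_struct *mm, struct mm_slot *slot)
{
	unsigned int b = hash_mm(mm);

	slot->mm = mm;
	slot->next = mm_slots_hash[b];	/* hash_add(): push onto the bucket */
	mm_slots_hash[b] = slot;
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	/* hash_for_each_possible(): walk only the bucket that mm hashes into */
	for (slot = mm_slots_hash[hash_mm(mm)]; slot; slot = slot->next)
		if (slot->mm == mm)
			return slot;
	return NULL;
}

int main(void)
{
	struct mm_struct mm;
	struct mm_slot slot;

	insert_to_mm_slots_hash(&mm, &slot);
	printf("found: %d\n", get_mm_slot(&mm) == &slot);
	return 0;
}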