Diffstat (limited to 'mm/huge_memory.c')
 -rw-r--r--  mm/huge_memory.c | 1886
 1 file changed, 1232 insertions(+), 654 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dbe99a5f207..33514d88fef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -5,6 +5,8 @@ * the COPYING file in the top-level directory. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/sched.h> #include <linux/highmem.h> @@ -12,21 +14,27 @@ #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> +#include <linux/shrinker.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/migrate.h> +#include <linux/hashtable.h> + #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" /* - * By default transparent hugepage support is enabled for all mappings - * and khugepaged scans all mappings. Defrag is only invoked by - * khugepaged hugepage allocations and by page faults inside - * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived - * allocations. + * By default transparent hugepage support is disabled in order that avoid + * to risk increase the memory footprint of applications without a guaranteed + * benefit. When transparent hugepage support is enabled, is for all mappings, + * and khugepaged scans all mappings. + * Defrag is invoked by khugepaged hugepage allocations and by page faults + * for all hugepage allocations. */ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS @@ -36,7 +44,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); /* default scan 8*512 pte (or vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; @@ -57,12 +66,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; static int khugepaged(void *none); -static int mm_slots_hash_init(void); static int khugepaged_slab_init(void); -static void khugepaged_slab_free(void); -#define MM_SLOTS_HASH_HEADS 1024 -static struct hlist_head *mm_slots_hash __read_mostly; +#define MM_SLOTS_HASH_BITS 10 +static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); + static struct kmem_cache *mm_slot_cache __read_mostly; /** @@ -89,7 +97,8 @@ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; -} khugepaged_scan = { +}; +static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; @@ -99,12 +108,8 @@ static int set_recommended_min_free_kbytes(void) struct zone *zone; int nr_zones = 0; unsigned long recommended_min; - extern int min_free_kbytes; - if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) && - !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags)) + if (!khugepaged_enabled()) return 0; for_each_populated_zone(zone) @@ -127,8 +132,14 @@ static int set_recommended_min_free_kbytes(void) (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); - if (recommended_min > min_free_kbytes) + if (recommended_min > min_free_kbytes) { + if (user_min_free_kbytes >= 0) + pr_info("raising min_free_kbytes from %d to %lu " + "to help transparent hugepage allocations\n", + min_free_kbytes, 
recommended_min); + min_free_kbytes = recommended_min; + } setup_per_zone_wmarks(); return 0; } @@ -138,34 +149,102 @@ static int start_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { - int wakeup; - if (unlikely(!mm_slot_cache || !mm_slots_hash)) { - err = -ENOMEM; - goto out; - } - mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (unlikely(IS_ERR(khugepaged_thread))) { - printk(KERN_ERR - "khugepaged: kthread_run(khugepaged) failed\n"); + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } - wakeup = !list_empty(&khugepaged_scan.mm_head); - mutex_unlock(&khugepaged_mutex); - if (wakeup) + + if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); set_recommended_min_free_kbytes(); - } else - /* wakeup to exit */ - wake_up_interruptible(&khugepaged_wait); -out: + } else if (khugepaged_thread) { + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } + return err; } +static atomic_t huge_zero_refcount; +static struct page *huge_zero_page __read_mostly; + +static inline bool is_huge_zero_page(struct page *page) +{ + return ACCESS_ONCE(huge_zero_page) == page; +} + +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_page(pmd_page(pmd)); +} + +static struct page *get_huge_zero_page(void) +{ + struct page *zero_page; +retry: + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) + return ACCESS_ONCE(huge_zero_page); + + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + HPAGE_PMD_ORDER); + if (!zero_page) { + count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); + return NULL; + } + count_vm_event(THP_ZERO_PAGE_ALLOC); + preempt_disable(); + if (cmpxchg(&huge_zero_page, NULL, zero_page)) { + preempt_enable(); + __free_page(zero_page); + goto retry; + } + + /* We take additional reference here. It will be put back by shrinker */ + atomic_set(&huge_zero_refcount, 2); + preempt_enable(); + return ACCESS_ONCE(huge_zero_page); +} + +static void put_huge_zero_page(void) +{ + /* + * Counter should never go to zero here. Only shrinker can put + * last reference. + */ + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); +} + +static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + /* we can free zero page only if last reference remains */ + return atomic_read(&huge_zero_refcount) == 1 ? 
HPAGE_PMD_NR : 0; +} + +static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { + struct page *zero_page = xchg(&huge_zero_page, NULL); + BUG_ON(zero_page == NULL); + __free_page(zero_page); + return HPAGE_PMD_NR; + } + + return 0; +} + +static struct shrinker huge_zero_page_shrinker = { + .count_objects = shrink_huge_zero_page_count, + .scan_objects = shrink_huge_zero_page_scan, + .seeks = DEFAULT_SEEKS, +}; + #ifdef CONFIG_SYSFS static ssize_t double_flag_show(struct kobject *kobj, @@ -223,18 +302,16 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err = start_khugepaged(); + int err; + + mutex_lock(&khugepaged_mutex); + err = start_khugepaged(); + mutex_unlock(&khugepaged_mutex); + if (err) ret = err; } - if (ret > 0 && - (test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) || - test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags))) - set_recommended_min_free_kbytes(); - return ret; } static struct kobj_attribute enabled_attr = @@ -244,24 +321,28 @@ static ssize_t single_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { - if (test_bit(flag, &transparent_hugepage_flags)) - return sprintf(buf, "[yes] no\n"); - else - return sprintf(buf, "yes [no]\n"); + return sprintf(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); } + static ssize_t single_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) { - if (!memcmp("yes", buf, - min(sizeof("yes")-1, count))) { + unsigned long value; + int ret; + + ret = kstrtoul(buf, 10, &value); + if (ret < 0) + return ret; + if (value > 1) + return -EINVAL; + + if (value) set_bit(flag, &transparent_hugepage_flags); - } else if (!memcmp("no", buf, - min(sizeof("no")-1, count))) { + else clear_bit(flag, &transparent_hugepage_flags); - } else - return -EINVAL; return count; } @@ -289,6 +370,20 @@ static ssize_t defrag_store(struct kobject *kobj, static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); +static ssize_t use_zero_page_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static ssize_t use_zero_page_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static struct kobj_attribute use_zero_page_attr = + __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -310,6 +405,7 @@ static struct kobj_attribute debug_cow_attr = static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, + &use_zero_page_attr.attr, #ifdef CONFIG_DEBUG_VM &debug_cow_attr.attr, #endif @@ -334,7 +430,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -361,7 +457,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, 
&msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -387,7 +483,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long pages; - err = strict_strtoul(buf, 10, &pages); + err = kstrtoul(buf, 10, &pages); if (err || !pages || pages > UINT_MAX) return -EINVAL; @@ -455,7 +551,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, int err; unsigned long max_ptes_none; - err = strict_strtoul(buf, 10, &max_ptes_none); + err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR-1) return -EINVAL; @@ -482,51 +578,74 @@ static struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; -#endif /* CONFIG_SYSFS */ -static int __init hugepage_init(void) +static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; -#ifdef CONFIG_SYSFS - static struct kobject *hugepage_kobj; -#endif - err = -EINVAL; - if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; - goto out; + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!*hugepage_kobj)) { + pr_err("failed to create transparent hugepage kobject\n"); + return -ENOMEM; } -#ifdef CONFIG_SYSFS - err = -ENOMEM; - hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); - if (unlikely(!hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed kobject create\n"); - goto out; + err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); + if (err) { + pr_err("failed to register transparent hugepage group\n"); + goto delete_obj; } - err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; + pr_err("failed to register transparent hugepage group\n"); + goto remove_hp_group; } - err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); - if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); - goto out; + return 0; + +remove_hp_group: + sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); +delete_obj: + kobject_put(*hugepage_kobj); + return err; +} + +static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); + sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); + kobject_put(hugepage_kobj); +} +#else +static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + return 0; +} + +static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ +} +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; + struct kobject *hugepage_kobj; + + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + return -EINVAL; } -#endif + + err = hugepage_init_sysfs(&hugepage_kobj); + if (err) + return err; err = khugepaged_slab_init(); if (err) goto out; - err = mm_slots_hash_init(); - if (err) { - khugepaged_slab_free(); - goto out; - } + register_shrinker(&huge_zero_page_shrinker); /* * By default disable transparent hugepages on smaller systems, @@ -538,12 +657,12 @@ static int __init hugepage_init(void) start_khugepaged(); - set_recommended_min_free_kbytes(); - + return 0; out: + hugepage_exit_sysfs(hugepage_kobj); return err; } -module_init(hugepage_init) +subsys_initcall(hugepage_init); static int __init setup_transparent_hugepage(char *str) { @@ -571,98 +690,98 @@ static int __init setup_transparent_hugepage(char *str) } out: if (!ret) - 
printk(KERN_WARNING - "transparent_hugepage= cannot parse, ignored\n"); + pr_warn("transparent_hugepage= cannot parse, ignored\n"); return ret; } __setup("transparent_hugepage=", setup_transparent_hugepage); -static void prepare_pmd_huge_pte(pgtable_t pgtable, - struct mm_struct *mm) -{ - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(&pgtable->lru); - else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; -} - -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); return pmd; } +static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) +{ + pmd_t entry; + entry = mk_pmd(page, prot); + entry = pmd_mkhuge(entry); + return entry; +} + static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *page) { - int ret = 0; pgtable_t pgtable; + spinlock_t *ptl; - VM_BUG_ON(!PageCompound(page)); + VM_BUG_ON_PAGE(!PageCompound(page), page); pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) { - mem_cgroup_uncharge_page(page); - put_page(page); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - } clear_huge_page(page, haddr, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. + */ __SetPageUptodate(page); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); } else { pmd_t entry; - entry = mk_pmd(page, vma->vm_page_prot); + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); - /* - * The spinlocking to take the lru_lock inside - * page_add_new_anon_rmap() acts as a full memory - * barrier to be sure clear_huge_page writes become - * visible after the set_pmd_at() write. - */ page_add_new_anon_rmap(page, vma, haddr); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - prepare_pmd_huge_pte(pgtable, mm); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - spin_unlock(&mm->page_table_lock); + atomic_long_inc(&mm->nr_ptes); + spin_unlock(ptl); } - return ret; + return 0; } -static inline gfp_t alloc_hugepage_gfpmask(int defrag) +static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr, int nd) + unsigned long haddr, int nd, + gfp_t extra_gfp) { - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), HPAGE_PMD_ORDER, vma, haddr, nd); } -#ifndef CONFIG_NUMA -static inline struct page *alloc_hugepage(int defrag) +/* Caller must hold page table lock. 
*/ +static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + struct page *zero_page) { - return alloc_pages(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER); + pmd_t entry; + if (!pmd_none(*pmd)) + return false; + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_wrprotect(entry); + entry = pmd_mkhuge(entry); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); + atomic_long_inc(&mm->nr_ptes); + return true; } -#endif int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -670,49 +789,65 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; - pte_t *pte; - if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + if (!(flags & FAULT_FLAG_WRITE) && + transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; + pgtable_t pgtable; + struct page *zero_page; + bool set; + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id()); - if (unlikely(!page)) - goto out; - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { - put_page(page); - goto out; + zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) { + pte_free(mm, pgtable); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + ptl = pmd_lock(mm, pmd); + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_page); + spin_unlock(ptl); + if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); } - - return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); - } -out: - /* - * Use __pte_alloc instead of pte_alloc_map, because we can't - * run pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) - return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). 
- */ - pte = pte_offset_map(pmd, address); - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + } + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr, numa_node_id(), 0); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { + mem_cgroup_uncharge_page(page); + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + + count_vm_event(THP_FAULT_ALLOC); + return 0; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { + spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -723,8 +858,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - spin_lock(&dst_mm->page_table_lock); - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -732,51 +868,108 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_free(dst_mm, pgtable); goto out_unlock; } + /* + * When page table lock is held, the huge zero pmd should not be + * under splitting since we don't split the page itself, only pmd to + * a page table. + */ + if (is_huge_zero_pmd(pmd)) { + struct page *zero_page; + bool set; + /* + * get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. 
+ */ + zero_page = get_huge_zero_page(); + set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + zero_page); + BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ + ret = 0; + goto out_unlock; + } + if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ goto out; } src_page = pmd_page(pmd); - VM_BUG_ON(!PageHead(src_page)); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); get_page(src_page); page_dup_rmap(src_page); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - prepare_pmd_huge_pte(pgtable, dst_mm); + atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); out: return ret; } -/* no "address" argument so destroys page coloring of some arch */ -pgtable_t get_pmd_huge_pte(struct mm_struct *mm) +void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) { - pgtable_t pgtable; + spinlock_t *ptl; + pmd_t entry; + unsigned long haddr; - assert_spin_locked(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; - /* FIFO */ - pgtable = mm->pmd_huge_pte; - if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, - struct page, lru); - list_del(&pgtable->lru); + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(ptl); +} + +/* + * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages + * during copy_user_huge_page()'s copy_page_rep(): in the case when + * the source page gets split and a tail freed before copy completes. + * Called under pmd_lock of checked pmd, so safe from splitting itself. 
+ */ +static void get_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + atomic_add(HPAGE_PMD_NR, &page->_count); + while (++page < endpage) + get_huge_page_tail(page); + } else { + get_page(page); + } +} + +static void put_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + while (page < endpage) + put_page(page++); + } else { + put_page(page); } - return pgtable; } static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, @@ -786,10 +979,13 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, GFP_KERNEL); @@ -799,10 +995,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | + __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_newpage_charge(pages[i], mm, + mem_cgroup_charge_anon(pages[i], mm, GFP_KERNEL))) { if (pages[i]) put_page(pages[i]); @@ -820,20 +1017,24 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, for (i = 0; i < HPAGE_PMD_NR; i++) { copy_user_highpage(pages[i], page + i, - haddr + PAGE_SHIFT*i, vma); + haddr + PAGE_SIZE * i, vma); __SetPageUptodate(pages[i]); cond_resched(); } - spin_lock(&mm->page_table_lock); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -848,11 +1049,12 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } kfree(pages); - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -861,7 +1063,8 @@ out: return ret; out_free_pages: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { mem_cgroup_uncharge_page(pages[i]); @@ -875,93 +1078,143 @@ out_free_pages: int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { + spinlock_t *ptl; int ret = 0; - struct page *page, *new_page; + struct page *page = NULL, *new_page; unsigned long haddr; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + ptl = pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); - spin_lock(&mm->page_table_lock); + haddr = address & HPAGE_PMD_MASK; + if (is_huge_zero_pmd(orig_pmd)) + goto alloc; + spin_lock(ptl); if 
(unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); - VM_BUG_ON(!PageCompound(page) || !PageHead(page)); - haddr = address & HPAGE_PMD_MASK; + VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); if (page_mapcount(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache_pmd(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } - get_page(page); - spin_unlock(&mm->page_table_lock); - + get_user_huge_page(page); + spin_unlock(ptl); +alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id()); + vma, haddr, numa_node_id(), 0); else new_page = NULL; if (unlikely(!new_page)) { - ret = do_huge_pmd_wp_page_fallback(mm, vma, address, - pmd, orig_pmd, page, haddr); - put_page(page); + if (!page) { + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; + } else { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) { + split_huge_page(page); + ret |= VM_FAULT_FALLBACK; + } + put_user_huge_page(page); + } + count_vm_event(THP_FAULT_FALLBACK); goto out; } - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { put_page(new_page); - put_page(page); - ret |= VM_FAULT_OOM; + if (page) { + split_huge_page(page); + put_user_huge_page(page); + } else + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; + count_vm_event(THP_FAULT_FALLBACK); goto out; } - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + count_vm_event(THP_FAULT_ALLOC); + + if (!page) + clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + else + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - spin_lock(&mm->page_table_lock); - put_page(page); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + spin_lock(ptl); + if (page) + put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { + spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); put_page(new_page); + goto out_mn; } else { pmd_t entry; - VM_BUG_ON(!PageHead(page)); - entry = mk_pmd(new_page, vma->vm_page_prot); + entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, entry); - page_remove_rmap(page); - put_page(page); + update_mmu_cache_pmd(vma, address, pmd); + if (!page) { + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + put_huge_zero_page(); + } else { + VM_BUG_ON_PAGE(!PageHead(page), page); + page_remove_rmap(page); + put_page(page); + } ret |= VM_FAULT_WRITE; } -out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; +out_unlock: + spin_unlock(ptl); + return ret; } -struct page *follow_trans_huge_pmd(struct mm_struct *mm, +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { + struct mm_struct *mm = vma->vm_mm; 
struct page *page = NULL; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; + /* Avoid dumping huge zero page */ + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) + return ERR_PTR(-EFAULT); + + /* Full NUMA hinting faults to serialise migration in fault paths */ + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto out; + page = pmd_page(*pmd); - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); if (flags & FOLL_TOUCH) { pmd_t _pmd; /* @@ -973,46 +1226,187 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, * young bit, instead of the current set_pmd_at. */ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); + } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if (page->mapping && trylock_page(page)) { + lru_add_drain(); + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } } page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON(!PageCompound(page)); + VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); out: return page; } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ + spinlock_t *ptl; + struct anon_vma *anon_vma = NULL; + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int page_nid = -1, this_nid = numa_node_id(); + int target_nid, last_cpupid = -1; + bool page_locked; + bool migrated = false; + int flags = 0; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + /* + * If there are potential migrations, wait for completion and retry + * without disrupting NUMA hinting information. Do not relock and + * check_same as the page may no longer be mapped. + */ + if (unlikely(pmd_trans_migrating(*pmdp))) { + spin_unlock(ptl); + wait_migrate_huge_page(vma->anon_vma, pmdp); + goto out; + } + + page = pmd_page(pmd); + BUG_ON(is_huge_zero_page(page)); + page_nid = page_to_nid(page); + last_cpupid = page_cpupid_last(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (page_nid == this_nid) { + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + flags |= TNF_FAULT_LOCAL; + } + + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pmd_write(pmd)) + flags |= TNF_NO_GROUP; + + /* + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ + page_locked = trylock_page(page); + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + /* If the page was locked, there are no parallel migrations */ + if (page_locked) + goto clear_pmdnuma; + } + + /* Migration could have started since the pmd_trans_migrating check */ + if (!page_locked) { + spin_unlock(ptl); + wait_on_page_locked(page); + page_nid = -1; + goto out; + } + + /* + * Page is misplaced. Page lock serialises migrations. 
Acquire anon_vma + * to serialises splits + */ + get_page(page); + spin_unlock(ptl); + anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PMD did not change while page_table_lock was released */ + spin_lock(ptl); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); + page_nid = -1; + goto out_unlock; + } + + /* Bail if we fail to protect against THP splits for any reason */ + if (unlikely(!anon_vma)) { + put_page(page); + page_nid = -1; + goto clear_pmdnuma; + } + + /* + * Migrate the THP to the requested node, returns with page unlocked + * and pmd_numa cleared. + */ + spin_unlock(ptl); + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, page, target_nid); + if (migrated) { + flags |= TNF_MIGRATED; + page_nid = target_nid; + } + + goto out; +clear_pmdnuma: + BUG_ON(!PageLocked(page)); + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); +out_unlock: + spin_unlock(ptl); + +out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + + if (page_nid != -1) + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + + return 0; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, - pmd_t *pmd) + pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - spin_lock(&tlb->mm->page_table_lock); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&tlb->mm->page_table_lock); - wait_split_huge_page(vma->anon_vma, - pmd); + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + struct page *page; + pgtable_t pgtable; + pmd_t orig_pmd; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ + orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); + if (is_huge_zero_pmd(orig_pmd)) { + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + put_huge_zero_page(); } else { - struct page *page; - pgtable_t pgtable; - pgtable = get_pmd_huge_pte(tlb->mm); - page = pmd_page(*pmd); - pmd_clear(pmd); + page = pmd_page(orig_pmd); page_remove_rmap(page); - VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON(!PageHead(page)); - spin_unlock(&tlb->mm->page_table_lock); + VM_BUG_ON_PAGE(!PageHead(page), page); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); tlb_remove_page(tlb, page); - pte_free(tlb->mm, pgtable); - ret = 1; } - } else - spin_unlock(&tlb->mm->page_table_lock); - + pte_free(tlb->mm, pgtable); + ret = 1; + } return ret; } @@ -1020,79 +1414,179 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - spin_lock(&vma->vm_mm->page_table_lock); - if (likely(pmd_trans_huge(*pmd))) { - ret = !pmd_trans_splitting(*pmd); - spin_unlock(&vma->vm_mm->page_table_lock); - if (unlikely(!ret)) - wait_split_huge_page(vma->anon_vma, pmd); - else { - /* - * All logical pages in the range are present - * if backed by a huge page. 
- */ - memset(vec, 1, (end - addr) >> PAGE_SHIFT); - } - } else - spin_unlock(&vma->vm_mm->page_table_lock); + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + spin_unlock(ptl); + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + ret = 1; + } return ret; } +int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, + unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + int ret = 0; + pmd_t pmd; + + struct mm_struct *mm = vma->vm_mm; + + if ((old_addr & ~HPAGE_PMD_MASK) || + (new_addr & ~HPAGE_PMD_MASK) || + old_end - old_addr < HPAGE_PMD_SIZE || + (new_vma->vm_flags & VM_NOHUGEPAGE)) + goto out; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have release it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) { + VM_BUG_ON(pmd_trans_huge(*new_pmd)); + goto out; + } + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); + if (ret == 1) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + VM_BUG_ON(!pmd_none(*new_pmd)); + + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + } +out: + return ret; +} + +/* + * Returns + * - 0 if PMD could not be locked + * - 1 if PMD was locked but protections unchange and TLB flush unnecessary + * - HPAGE_PMD_NR is protections changed and TLB flush necessary + */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - spin_lock(&mm->page_table_lock); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&mm->page_table_lock); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - pmd_t entry; - + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + pmd_t entry; + ret = 1; + if (!prot_numa) { entry = pmdp_get_and_clear(mm, addr, pmd); + if (pmd_numa(entry)) + entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); + ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); - flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); - ret = 1; + BUG_ON(pmd_write(entry)); + } else { + struct page *page = pmd_page(*pmd); + + /* + * Do not trap faults against the zero page. The + * read-only data is likely to be read-cached on the + * local CPU cache and it is less useful to know about + * local vs remote hits on the zero page. + */ + if (!is_huge_zero_page(page) && + !pmd_numa(*pmd)) { + pmdp_set_numa(mm, addr, pmd); + ret = HPAGE_PMD_NR; + } } - } else - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); + } return ret; } +/* + * Returns 1 if a given pmd maps a stable (not under splitting) thp. + * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. + * + * Note that if it returns 1, this routine returns without unlocking page + * table locks. 
So callers must unlock them. + */ +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) +{ + *ptl = pmd_lock(vma->vm_mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(*ptl); + wait_split_huge_page(vma->anon_vma, pmd); + return -1; + } else { + /* Thp mapped by 'pmd' is stable, so we can + * handle it as it is. */ + return 1; + } + } + spin_unlock(*ptl); + return 0; +} + +/* + * This function returns whether a given @page is mapped onto the @address + * in the virtual space of @mm. + * + * When it's true, this function returns *pmd with holding the page table lock + * and passing it back to the caller via @ptl. + * If it's false, returns NULL without holding the page table lock. + */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag) + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd, *ret = NULL; + pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) - goto out; + return NULL; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out; - + return NULL; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - goto out; - + return NULL; pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto out; + + *ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock; if (pmd_page(*pmd) != page) - goto out; + goto unlock; /* * split_vma() may create temporary aliased mappings. There is * no risk as long as all huge pmd are found and have their @@ -1102,14 +1596,15 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto out; + goto unlock; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - ret = pmd; + return pmd; } -out: - return ret; +unlock: + spin_unlock(*ptl); + return NULL; } static int __split_huge_page_splitting(struct page *page, @@ -1117,47 +1612,73 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; + /* For mmu_notifiers */ + const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; - spin_lock(&mm->page_table_lock); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->lock to + * and it won't wait on the anon_vma->root->rwsem to * serialize against split_huge_page*. 
*/ - pmdp_splitting_flush_notify(vma, address, pmd); + pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; } -static void __split_huge_page_refcount(struct page *page) +static void __split_huge_page_refcount(struct page *page, + struct list_head *list) { int i; - unsigned long head_index = page->index; struct zone *zone = page_zone(page); - int zonestat; + struct lruvec *lruvec; + int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + compound_lock(page); + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(page); - for (i = 1; i < HPAGE_PMD_NR; i++) { + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { struct page *page_tail = page + i; - /* tail_page->_count cannot change */ - atomic_sub(atomic_read(&page_tail->_count), &page->_count); - BUG_ON(page_count(page) <= 0); - atomic_add(page_mapcount(page) + 1, &page_tail->_count); - BUG_ON(atomic_read(&page_tail->_count) <= 0); + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). + */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ smp_mb(); @@ -1172,13 +1693,12 @@ static void __split_huge_page_refcount(struct page *page) ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_mlocked) | - (1L << PG_uptodate))); + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); - /* - * 1) clear PageTail before overwriting first_page - * 2) clear PageTail before clearing PageHead for VM_BUG_ON - */ + /* clear PageTail before overwriting first_page */ smp_wmb(); /* @@ -1195,35 +1715,25 @@ static void __split_huge_page_refcount(struct page *page) * status is achieved setting a reserved bit in the * pmd, not by clearing the present bit. 
*/ - BUG_ON(page_mapcount(page_tail)); page_tail->_mapcount = page->_mapcount; BUG_ON(page_tail->mapping); page_tail->mapping = page->mapping; - page_tail->index = ++head_index; + page_tail->index = page->index + i; + page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - mem_cgroup_split_huge_fixup(page, page_tail); - - lru_add_page_tail(zone, page, page_tail); + lru_add_page_tail(page, page_tail, lruvec, list); } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); - - /* - * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, - * so adjust those appropriately if this page is on the LRU. - */ - if (PageLRU(page)) { - zonestat = NR_LRU_BASE + page_lru(page); - __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); - } + __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); ClearPageCompound(page); compound_unlock(page); @@ -1254,20 +1764,20 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); - for (i = 0, haddr = address; i < HPAGE_PMD_NR; - i++, haddr += PAGE_SIZE) { + haddr = address; + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); entry = mk_pte(page + i, vma->vm_page_prot); @@ -1278,13 +1788,14 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); + if (pmd_numa(*pmd)) + entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); pte_unmap(pte); } - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ /* * Up to this point the pmd is present and huge and @@ -1312,33 +1823,32 @@ static int __split_huge_page_map(struct page *page, * SMP TLB and finally we write the non-huge version * of the pmd entry with pmd_populate. 
*/ - set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } -/* must be called with anon_vma->root->lock hold */ +/* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, - struct anon_vma *anon_vma) + struct anon_vma *anon_vma, + struct list_head *list) { int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); BUG_ON(PageTail(page)); mapcount = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount += __split_huge_page_splitting(page, vma, addr); } /* @@ -1351,64 +1861,91 @@ static void __split_huge_page(struct page *page, * the newly established pmd of the child later during the * walk, to be able to set it as pmd_trans_splitting too. */ - if (mapcount != page_mapcount(page)) - printk(KERN_ERR "mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); - BUG_ON(mapcount != page_mapcount(page)); + if (mapcount != page_mapcount(page)) { + pr_err("mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG(); + } - __split_huge_page_refcount(page); + __split_huge_page_refcount(page, list); mapcount2 = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount2 += __split_huge_page_map(page, vma, addr); } - if (mapcount != mapcount2) - printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); - BUG_ON(mapcount != mapcount2); + if (mapcount != mapcount2) { + pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG(); + } } -int split_huge_page(struct page *page) +/* + * Split a hugepage into normal pages. This doesn't change the position of head + * page. If @list is null, tail pages will be added to LRU list, otherwise, to + * @list. Both head page and tail pages will inherit mapping, flags, and so on + * from the hugepage. + * Return 0 if the hugepage is split successfully otherwise return 1. + */ +int split_huge_page_to_list(struct page *page, struct list_head *list) { struct anon_vma *anon_vma; int ret = 1; + BUG_ON(is_huge_zero_page(page)); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first we take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. 
+ */ + anon_vma = page_get_anon_vma(page); if (!anon_vma) goto out; + anon_vma_lock_write(anon_vma); + ret = 0; if (!PageCompound(page)) goto out_unlock; BUG_ON(!PageSwapBacked(page)); - __split_huge_page(page, anon_vma); + __split_huge_page(page, anon_vma, list); + count_vm_event(THP_SPLIT); BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); out: return ret; } +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) + int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_HUGEPAGE | - VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) + if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; @@ -1424,11 +1961,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_NOHUGEPAGE | - VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) + if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; @@ -1454,12 +1987,6 @@ static int __init khugepaged_slab_init(void) return 0; } -static void __init khugepaged_slab_free(void) -{ - kmem_cache_destroy(mm_slot_cache); - mm_slot_cache = NULL; -} - static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ @@ -1472,47 +1999,22 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) kmem_cache_free(mm_slot_cache, mm_slot); } -static int __init mm_slots_hash_init(void) -{ - mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), - GFP_KERNEL); - if (!mm_slots_hash) - return -ENOMEM; - return 0; -} - -#if 0 -static void __init mm_slots_hash_free(void) -{ - kfree(mm_slots_hash); - mm_slots_hash = NULL; -} -#endif - static struct mm_slot *get_mm_slot(struct mm_struct *mm) { struct mm_slot *mm_slot; - struct hlist_head *bucket; - struct hlist_node *node; - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; - hlist_for_each_entry(mm_slot, node, bucket, hash) { + hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) if (mm == mm_slot->mm) return mm_slot; - } + return NULL; } static void insert_to_mm_slots_hash(struct mm_struct *mm, struct mm_slot *mm_slot) { - struct hlist_head *bucket; - - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; mm_slot->mm = mm; - hlist_add_head(&mm_slot->hash, bucket); + hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); } static inline int khugepaged_test_exit(struct mm_struct *mm) @@ -1562,10 +2064,10 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) * page fault if needed. 
*/ return 0; - if (vma->vm_file || vma->vm_ops) + if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1581,18 +2083,17 @@ void __khugepaged_exit(struct mm_struct *mm) spin_lock(&khugepaged_mm_lock); mm_slot = get_mm_slot(mm); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hlist_del(&mm_slot->hash); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); free = 1; } + spin_unlock(&khugepaged_mm_lock); if (free) { - spin_unlock(&khugepaged_mm_lock); clear_bit(MMF_VM_HUGEPAGE, &mm->flags); free_mm_slot(mm_slot); mmdrop(mm); } else if (mm_slot) { - spin_unlock(&khugepaged_mm_lock); /* * This is required to serialize against * khugepaged_test_exit() (which is guaranteed to run @@ -1603,8 +2104,7 @@ void __khugepaged_exit(struct mm_struct *mm) */ down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); - } else - spin_unlock(&khugepaged_mm_lock); + } } static void release_pte_page(struct page *page) @@ -1624,82 +2124,66 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) } } -static void release_all_pte_pages(pte_t *pte) -{ - release_pte_pages(pte, pte + HPAGE_PMD_NR); -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { struct page *page; pte_t *_pte; - int referenced = 0, isolated = 0, none = 0; + int referenced = 0, none = 0; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval)) { if (++none <= khugepaged_max_ptes_none) continue; - else { - release_pte_pages(pte, _pte); + else goto out; - } } - if (!pte_present(pteval) || !pte_write(pteval)) { - release_pte_pages(pte, _pte); + if (!pte_present(pteval) || !pte_write(pteval)) goto out; - } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { - release_pte_pages(pte, _pte); + if (unlikely(!page)) goto out; - } - VM_BUG_ON(PageCompound(page)); - BUG_ON(!PageAnon(page)); - VM_BUG_ON(!PageSwapBacked(page)); + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) { - release_pte_pages(pte, _pte); + if (page_count(page) != 1) goto out; - } /* * We can do it before isolate_lru_page because the * page can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) { - release_pte_pages(pte, _pte); + if (!trylock_page(page)) goto out; - } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. 
*/ if (isolate_lru_page(page)) { unlock_page(page); - release_pte_pages(pte, _pte); goto out; } /* 0 stands for page_is_file_cache(page) == false */ inc_zone_page_state(page, NR_ISOLATED_ANON + 0); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); /* If there is no mapped pte young don't collapse the page */ if (pte_young(pteval) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } - if (unlikely(!referenced)) - release_all_pte_pages(pte); - else - isolated = 1; + if (likely(referenced)) + return 1; out: - return isolated; + release_pte_pages(pte, _pte); + return 0; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -1718,8 +2202,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON(page_mapcount(src_page) != 1); - VM_BUG_ON(page_count(src_page) != 2); + VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to @@ -1742,28 +2225,63 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) +static void khugepaged_alloc_sleep(void) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd, _pmd; - pte_t *pte; - pgtable_t pgtable; - struct page *new_page; - spinlock_t *ptl; - int isolated; - unsigned long hstart, hend; + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} - VM_BUG_ON(address & ~HPAGE_PMD_MASK); -#ifndef CONFIG_NUMA - VM_BUG_ON(!*hpage); - new_page = *hpage; -#else - VM_BUG_ON(*hpage); +static int khugepaged_node_load[MAX_NUMNODES]; + +#ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + *hpage = NULL; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + VM_BUG_ON_PAGE(*hpage, *hpage); /* * Allocate the page while the vma is still valid and under * the mmap_sem read mode so there is no memory allocation @@ -1774,22 +2292,115 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. 
*/ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node); - if (unlikely(!new_page)) { - up_read(&mm->mmap_sem); + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( + khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); + /* + * After allocating the hugepage, release the mmap_sem read lock in + * preparation for taking it in write mode. + */ + up_read(&mm->mmap_sem); + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); - return; + return NULL; } + + count_vm_event(THP_COLLAPSE_ALLOC); + return *hpage; +} +#else +static int khugepaged_find_target_node(void) +{ + return 0; +} + +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} + +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + up_read(&mm->mmap_sem); + VM_BUG_ON(!*hpage); + return *hpage; +} #endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { - up_read(&mm->mmap_sem); - put_page(new_page); + +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + VM_BUG_ON(vma->vm_flags & VM_NO_THP); + return true; +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *pmd_ptl, *pte_ptl; + int isolated; + unsigned long hstart, hend; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* release the mmap_sem read lock. 
*/ + new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + if (!new_page) return; - } - /* after allocating the hugepage upgrade to mmap_sem write mode */ - up_read(&mm->mmap_sem); + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) + return; /* * Prevent all access to pagetables with the exception of @@ -1801,61 +2412,53 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; vma = find_vma(mm, address); + if (!vma) + goto out; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + if (!hugepage_vma_check(vma)) goto out; - - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) - goto out; - if (is_vma_temporary_stack(vma)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - /* pmd can't go away or become huge under us */ - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) - goto out; - - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - ptl = pte_lockptr(mm, pmd); + pte_ptl = pte_lockptr(mm, pmd); - spin_lock(&mm->page_table_lock); /* probably unnecessary */ + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush_notify(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + _pmd = pmdp_clear_flush(vma, address, pmd); + spin_unlock(pmd_ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(ptl); + spin_unlock(pte_ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - set_pmd_at(mm, address, pmd, _pmd); - spin_unlock(&mm->page_table_lock); - anon_vma_unlock(vma->anon_vma); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + spin_unlock(pmd_ptl); + anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -1863,18 +2466,15 @@ static void collapse_huge_page(struct mm_struct *mm, * All pages are isolated and locked so anon_vma rmap * can't run anymore. 
*/ - anon_vma_unlock(vma->anon_vma); + anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - VM_BUG_ON(page_count(pgtable) != 1); - VM_BUG_ON(page_mapcount(pgtable) != 0); - _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - _pmd = pmd_mkhuge(_pmd); /* * spin_lock() below is not the equivalent of smp_wmb(), so @@ -1883,18 +2483,16 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, entry); - prepare_pmd_huge_pte(pgtable, mm); - mm->nr_ptes--; - spin_unlock(&mm->page_table_lock); + update_mmu_cache_pmd(vma, address, pmd); + spin_unlock(pmd_ptl); -#ifndef CONFIG_NUMA *hpage = NULL; -#endif + khugepaged_pages_collapsed++; out_up_write: up_write(&mm->mmap_sem); @@ -1902,9 +2500,6 @@ out_up_write: out: mem_cgroup_uncharge_page(new_page); -#ifdef CONFIG_NUMA - put_page(new_page); -#endif goto out_up_write; } @@ -1913,30 +2508,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, unsigned long address, struct page **hpage) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte, *_pte; int ret = 0, referenced = 0, none = 0; struct page *page; unsigned long _address; spinlock_t *ptl; - int node = -1; + int node = NUMA_NO_NODE; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -1953,13 +2539,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (unlikely(!page)) goto out_unmap; /* - * Chose the node of the first page. This could - * be more sophisticated and look at more pages, - * but isn't for now. + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. 
*/ - if (node == -1) - node = page_to_nid(page); - VM_BUG_ON(PageCompound(page)); + node = page_to_nid(page); + khugepaged_node_load[node]++; + VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; /* cannot use mapcount: can't collapse if there's a gup pin */ @@ -1973,9 +2560,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { + node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, vma, node); + } out: return ret; } @@ -1984,11 +2573,11 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_test_exit(mm)) { /* free mm_slot */ - hlist_del(&mm_slot->hash); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); /* @@ -2005,6 +2594,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot) static unsigned int khugepaged_scan_mm_slot(unsigned int pages, struct page **hpage) + __releases(&khugepaged_mm_lock) + __acquires(&khugepaged_mm_lock) { struct mm_slot *mm_slot; struct mm_struct *mm; @@ -2012,7 +2603,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; @@ -2040,22 +2631,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - - if ((!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) { - skip: + if (!hugepage_vma_check(vma)) { +skip: progress++; continue; } - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) - goto skip; - if (is_vma_temporary_stack(vma)) - goto skip; - - VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart >= hend) @@ -2129,29 +2709,23 @@ static int khugepaged_has_work(void) static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || - !khugepaged_enabled(); + kthread_should_stop(); } -static void khugepaged_do_scan(struct page **hpage) +static void khugepaged_do_scan(void) { + struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { - cond_resched(); - -#ifndef CONFIG_NUMA - if (!*hpage) { - *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) - break; - } -#else - if (IS_ERR(*hpage)) + if (!khugepaged_prealloc_page(&hpage, &wait)) break; -#endif + + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) break; @@ -2162,78 +2736,32 @@ static void khugepaged_do_scan(struct page **hpage) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - hpage); + &hpage); else progress = pages; spin_unlock(&khugepaged_mm_lock); } -} -static void khugepaged_alloc_sleep(void) -{ - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - 
khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); } -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(void) +static void khugepaged_wait_work(void) { - struct page *hpage; + try_to_freeze(); - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) - khugepaged_alloc_sleep(); - } while (unlikely(!hpage) && - likely(khugepaged_enabled())); - return hpage; -} -#endif - -static void khugepaged_loop(void) -{ - struct page *hpage; - -#ifdef CONFIG_NUMA - hpage = NULL; -#endif - while (likely(khugepaged_enabled())) { -#ifndef CONFIG_NUMA - hpage = khugepaged_alloc_hugepage(); - if (unlikely(!hpage)) - break; -#else - if (IS_ERR(hpage)) { - khugepaged_alloc_sleep(); - hpage = NULL; - } -#endif + if (khugepaged_has_work()) { + if (!khugepaged_scan_sleep_millisecs) + return; - khugepaged_do_scan(&hpage); -#ifndef CONFIG_NUMA - if (hpage) - put_page(hpage); -#endif - try_to_freeze(); - if (unlikely(kthread_should_stop())) - break; - if (khugepaged_has_work()) { - DEFINE_WAIT(wait); - if (!khugepaged_scan_sleep_millisecs) - continue; - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_scan_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); - } else if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, - khugepaged_wait_event()); + wait_event_freezable_timeout(khugepaged_wait, + kthread_should_stop(), + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + return; } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) @@ -2241,22 +2769,11 @@ static int khugepaged(void *none) struct mm_slot *mm_slot; set_freezable(); - set_user_nice(current, 19); - - /* serialize with start_khugepaged() */ - mutex_lock(&khugepaged_mutex); - - for (;;) { - mutex_unlock(&khugepaged_mutex); - VM_BUG_ON(khugepaged_thread != current); - khugepaged_loop(); - VM_BUG_ON(khugepaged_thread != current); + set_user_nice(current, MAX_NICE); - mutex_lock(&khugepaged_mutex); - if (!khugepaged_enabled()) - break; - if (unlikely(kthread_should_stop())) - break; + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); @@ -2265,31 +2782,92 @@ static int khugepaged(void *none) if (mm_slot) collect_mm_slot(mm_slot); spin_unlock(&khugepaged_mm_lock); + return 0; +} - khugepaged_thread = NULL; - mutex_unlock(&khugepaged_mutex); +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd) +{ + struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable; + pmd_t _pmd; + int i; - return 0; + pmdp_clear_flush(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + put_huge_zero_page(); } -void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd) { + spinlock_t *ptl; struct page *page; + struct mm_struct *mm = vma->vm_mm; 
+ unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); - spin_lock(&mm->page_table_lock); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; +again: + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + return; + } + if (is_huge_zero_pmd(*pmd)) { + __split_huge_zero_page_pmd(vma, haddr, pmd); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(!page_count(page), page); get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); put_page(page); - BUG_ON(pmd_trans_huge(*pmd)); + + /* + * We don't always have down_write of mmap_sem here: a racing + * do_huge_pmd_wp_page() might have copied-on-write to another + * huge page before our split_huge_page() got the anon_vma lock. + */ + if (unlikely(pmd_trans_huge(*pmd))) + goto again; +} + +void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, + pmd_t *pmd) +{ + struct vm_area_struct *vma; + + vma = find_vma(mm, address); + BUG_ON(vma == NULL); + split_huge_page_pmd(vma, address, pmd); } static void split_huge_page_address(struct mm_struct *mm, @@ -2316,7 +2894,7 @@ static void split_huge_page_address(struct mm_struct *mm, * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd_mm(mm, address, pmd); } void __vma_adjust_trans_huge(struct vm_area_struct *vma, |
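The mm_slot bookkeeping in this patch moves from an open-coded bucket array to the generic hashtable helpers (DEFINE_HASHTABLE, hash_add, hash_for_each_possible, hash_del), keyed on the mm_struct pointer, which also lets the hand-rolled bucket arithmetic and the mm_slots_hash_init()/mm_slots_hash_free() helpers go away. As a rough standalone analogue (plain userspace C, not the <linux/hashtable.h> API; struct mm, struct slot and the hash constant below are invented for illustration), the sketch shows the shape of that structure: a fixed power-of-two table of chained buckets, insertion at the head of one bucket, and a lookup that walks only the bucket the key hashes to.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define HASH_BITS 10			/* like MM_SLOTS_HASH_BITS */
#define HASH_SIZE (1u << HASH_BITS)

struct mm { int dummy; };		/* stand-in for struct mm_struct */

struct slot {				/* stand-in for struct mm_slot */
	struct mm *mm;
	struct slot *next;		/* bucket chain, like hlist_node */
};

static struct slot *buckets[HASH_SIZE];

static unsigned int hash_ptr(const void *p)
{
	/* multiplicative (Fibonacci) hash of the pointer value, similar in spirit to hash_long() */
	return (unsigned int)(((uintptr_t)p * 0x9E3779B97F4A7C15ull) >> (64 - HASH_BITS));
}

static void insert_slot(struct slot *s, struct mm *mm)
{
	unsigned int b = hash_ptr(mm);

	s->mm = mm;
	s->next = buckets[b];		/* push onto the head of one bucket */
	buckets[b] = s;
}

static struct slot *get_slot(struct mm *mm)
{
	struct slot *s;

	/* only the one candidate bucket is scanned, as with hash_for_each_possible() */
	for (s = buckets[hash_ptr(mm)]; s; s = s->next)
		if (s->mm == mm)
			return s;
	return NULL;
}

int main(void)
{
	struct mm m1, m2;
	struct slot s1, s2;

	insert_slot(&s1, &m1);
	insert_slot(&s2, &m2);
	printf("m1 -> %p, m2 -> %p\n", (void *)get_slot(&m1), (void *)get_slot(&m2));
	return 0;
}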
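The collapse paths in the patch repeatedly derive the collapsible window of a VMA with hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK and hend = vm_end & HPAGE_PMD_MASK, as in khugepaged_enter_vma_merge(), collapse_huge_page() and khugepaged_scan_mm_slot(). The following standalone sketch (plain userspace C, not kernel code; the 2 MiB size and the sample addresses are assumptions for illustration) shows the rounding at work: the start is rounded up, the end is rounded down, and a huge page can only be collapsed if something remains in between.

#include <stdio.h>

#define HPAGE_SIZE (2UL << 20)          /* assumed 2 MiB huge page */
#define HPAGE_MASK (~(HPAGE_SIZE - 1))  /* analogue of HPAGE_PMD_MASK */

int main(void)
{
	unsigned long vm_start = 0x00601000UL; /* example addresses only */
	unsigned long vm_end   = 0x00a00000UL;

	unsigned long hstart = (vm_start + ~HPAGE_MASK) & HPAGE_MASK; /* round start up */
	unsigned long hend   = vm_end & HPAGE_MASK;                   /* round end down */

	if (hstart < hend)
		printf("collapsible range: 0x%lx - 0x%lx\n", hstart, hend);
	else
		printf("VMA too small or misaligned for a huge page\n");
	return 0;
}

For the sample range this prints "collapsible range: 0x800000 - 0xa00000", i.e. exactly one aligned 2 MiB region fits even though the VMA itself starts and ends unaligned.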
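khugepaged_scan_pmd() in this patch tallies, per scanned PMD range, how many of the HPAGE_PMD_NR source pages live on each node in khugepaged_node_load[], and khugepaged_find_target_node() then picks the allocation node from that tally. The sketch below (standalone userspace C, not the kernel function; NR_NODES, the static last_target and the sample loads are invented for the example) mirrors that policy: take the first node with the maximum count, and when the previous pick ties with a later node, rotate forward so repeated ties spread allocations across nodes.

#include <stdio.h>

#define NR_NODES 4			/* stands in for MAX_NUMNODES */

static int last_target = -1;		/* stands in for NUMA_NO_NODE */

static int find_target_node(const int load[NR_NODES])
{
	int nid, target = 0, max_value = 0;

	/* first node with the highest hit count */
	for (nid = 0; nid < NR_NODES; nid++)
		if (load[nid] > max_value) {
			max_value = load[nid];
			target = nid;
		}

	/* rotate among equally loaded nodes across successive calls */
	if (target <= last_target)
		for (nid = last_target + 1; nid < NR_NODES; nid++)
			if (load[nid] == max_value) {
				target = nid;
				break;
			}

	last_target = target;
	return target;
}

int main(void)
{
	const int load[NR_NODES] = { 200, 200, 200, 0 };
	int i;

	/* three equally loaded nodes: successive calls return 0, 1, 2, 0, 1, 2 */
	for (i = 0; i < 6; i++)
		printf("scan %d -> node %d\n", i, find_target_node(load));
	return 0;
}

Keeping the rotation state in a static variable matches the patch's last_khugepaged_target_node; the real khugepaged_node_load[] array is zeroed before each PMD scan, which this sketch glosses over.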
