diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 15 | ||||
-rw-r--r-- | mm/huge_memory.c | 108 | ||||
-rw-r--r-- | mm/hugetlb.c | 10 | ||||
-rw-r--r-- | mm/internal.h | 7 | ||||
-rw-r--r-- | mm/ksm.c | 6 | ||||
-rw-r--r-- | mm/memcontrol.c | 7 | ||||
-rw-r--r-- | mm/memory-failure.c | 7 | ||||
-rw-r--r-- | mm/memory.c | 198 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 3 | ||||
-rw-r--r-- | mm/mempolicy.c | 283 | ||||
-rw-r--r-- | mm/migrate.c | 337 | ||||
-rw-r--r-- | mm/mmap.c | 10 | ||||
-rw-r--r-- | mm/mprotect.c | 135 | ||||
-rw-r--r-- | mm/mremap.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 10 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 9 | ||||
-rw-r--r-- | mm/rmap.c | 66 | ||||
-rw-r--r-- | mm/vmstat.c | 16 |
18 files changed, 1098 insertions, 131 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 12979121822..5ad7f4f4d6f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); + count_vm_events(COMPACTFREE_SCANNED, nr_scanned); + if (total_isolated) + count_vm_events(COMPACTISOLATED, total_isolated); + return total_isolated; } @@ -609,6 +613,10 @@ next_pageblock: trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); + if (nr_isolated) + count_vm_events(COMPACTISOLATED, nr_isolated); + return low_pfn; } @@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + MR_COMPACTION); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; - count_vm_event(COMPACTBLOCKS); - count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); - if (nr_remaining) - count_vm_events(COMPACTPAGEFAILED, nr_remaining); trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 827d9c81305..d7ee1691fd2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -19,6 +19,7 @@ #include <linux/freezer.h> #include <linux/mman.h> #include <linux/pagemap.h> +#include <linux/migrate.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -690,7 +691,7 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -848,7 +849,8 @@ out: * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) @@ -1287,6 +1289,81 @@ out: return page; } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int target_nid; + int current_nid = -1; + bool migrated; + bool page_locked = false; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + page = pmd_page(pmd); + get_page(page); + current_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + put_page(page); + goto clear_pmdnuma; + } + + /* Acquire the page lock to serialise THP migrations */ + spin_unlock(&mm->page_table_lock); + lock_page(page); + page_locked = true; + + /* Confirm the PTE did not while locked */ + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); + goto out_unlock; + } + spin_unlock(&mm->page_table_lock); + + /* Migrate the THP to the requested node */ + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, + page, target_nid); + if (migrated) + current_nid = target_nid; + else { + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + goto out_unlock; + } + goto clear_pmdnuma; + } + + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; + +clear_pmdnuma: + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); + if (page_locked) + unlock_page(page); + +out_unlock: + spin_unlock(&mm->page_table_lock); + if (current_nid != -1) + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1375,7 +1452,7 @@ out: } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); + if (!prot_numa) + entry = pmd_modify(entry, newprot); + else { + struct page *page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && + !pmd_numa(*pmd)) { + entry = pmd_mknuma(entry); + } + } BUG_ON(pmd_write(entry)); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); @@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->mutex to + * and it won't wait on the anon_vma->root->rwsem to * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); @@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; + page_xchg_last_nid(page_tail, page_last_nid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); @@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); + if (pmd_numa(*pmd)) + entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { @@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page) BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) goto out; ret = 0; @@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); out: return ret; } @@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 88e7293b96b..e5318c7793a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3016,7 +3016,7 @@ same_page: return i ? i : -EFAULT; } -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; @@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); + unsigned long pages = 0; BUG_ON(address >= end); flush_cache_range(vma, address, end); @@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (!ptep) continue; - if (huge_pmd_unshare(mm, &address, ptep)) + if (huge_pmd_unshare(mm, &address, ptep)) { + pages++; continue; + } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); + pages++; } } spin_unlock(&mm->page_table_lock); @@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + + return pages << h->order; } int hugetlb_reserve_pages(struct inode *inode, diff --git a/mm/internal.h b/mm/internal.h index 52d1fa95719..d597f94cc20 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; + int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); - __inc_zone_page_state(newpage, NR_MLOCK); + __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); local_irq_restore(flags); } } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); @@ -1624,7 +1624,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1678,7 +1678,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1731,7 +1731,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c055929c8c..bbfac5063ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; - VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; + if (PageTransHuge(page)) + nr_pages <<= compound_order(page); + pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. */ - __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); } /* remove redundant charge if migration failed*/ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 108c52fa60f..c6e4dd3e1c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma(av); + page_unlock_anon_vma_read(av); } /* @@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_MEMORY_FAILURE); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index db2e9e797a0..e6a3b933517 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@ #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> +#include <linux/migrate.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto no_page_table; if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(vma, address, pmd); @@ -1532,6 +1535,8 @@ split_fallthrough: pte = *ptep; if (!pte_present(pte)) goto no_page; + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + /* + * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault + * would be called on PROT_NONE ranges. We must never invoke + * handle_mm_fault on PROT_NONE ranges or the NUMA hinting + * page faults would unprotect the PROT_NONE ranges if + * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd + * bitflag. So to avoid that, don't set FOLL_NUMA if + * FOLL_FORCE is set. + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + i = 0; do { @@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int current_nid) +{ + get_page(page); + + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + + return mpol_misplaced(page, vma, addr); +} + +int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ + struct page *page = NULL; + spinlock_t *ptl; + int current_nid = -1; + int target_nid; + bool migrated = false; + + /* + * The "pte" at this point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. + * + * ptep_modify_prot_start is not called as this is clearing + * the _PAGE_NUMA bit and it is not really expected that there + * would be concurrent hardware modifications to the PTE. + */ + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*ptep, pte))) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + pte = pte_mknonnuma(pte); + set_pte_at(mm, addr, ptep, pte); + update_mmu_cache(vma, addr, ptep); + + page = vm_normal_page(vma, addr, pte); + if (!page) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + + current_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, current_nid); + pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { + /* + * Account for the fault against the current node if it not + * being replaced regardless of where the page is located. + */ + current_nid = numa_node_id(); + put_page(page); + goto out; + } + + /* Migrate to the requested node */ + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + current_nid = target_nid; + +out: + if (current_nid != -1) + task_numa_fault(current_nid, 1, migrated); + return 0; +} + +/* NUMA hinting page fault entry point for regular pmds */ +#ifdef CONFIG_NUMA_BALANCING +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd; + pte_t *pte, *orig_pte; + unsigned long _addr = addr & PMD_MASK; + unsigned long offset; + spinlock_t *ptl; + bool numa = false; + int local_nid = numa_node_id(); + + spin_lock(&mm->page_table_lock); + pmd = *pmdp; + if (pmd_numa(pmd)) { + set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); + numa = true; + } + spin_unlock(&mm->page_table_lock); + + if (!numa) + return 0; + + /* we're in a page fault so some vma must be in the range */ + BUG_ON(!vma); + BUG_ON(vma->vm_start >= _addr + PMD_SIZE); + offset = max(_addr, vma->vm_start) & ~PMD_MASK; + VM_BUG_ON(offset >= PMD_SIZE); + orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); + pte += offset >> PAGE_SHIFT; + for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { + pte_t pteval = *pte; + struct page *page; + int curr_nid = local_nid; + int target_nid; + bool migrated; + if (!pte_present(pteval)) + continue; + if (!pte_numa(pteval)) + continue; + if (addr >= vma->vm_end) { + vma = find_vma(mm, addr); + /* there's a pte present so there must be a vma */ + BUG_ON(!vma); + BUG_ON(addr < vma->vm_start); + } + if (pte_numa(pteval)) { + pteval = pte_mknonnuma(pteval); + set_pte_at(mm, addr, pte, pteval); + } + page = vm_normal_page(vma, addr, pteval); + if (unlikely(!page)) + continue; + /* only check non-shared pages */ + if (unlikely(page_mapcount(page) != 1)) + continue; + + /* + * Note that the NUMA fault is later accounted to either + * the node that is currently running or where the page is + * migrated to. + */ + curr_nid = local_nid; + target_nid = numa_migrate_prep(page, vma, addr, + page_to_nid(page)); + if (target_nid == -1) { + put_page(page); + continue; + } + + /* Migrate to the requested node */ + pte_unmap_unlock(pte, ptl); + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + curr_nid = target_nid; + task_numa_fault(curr_nid, 1, migrated); + + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); + } + pte_unmap_unlock(orig_pte, ptl); + + return 0; +} +#else +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } + if (pte_numa(entry)) + return do_numa_page(mm, vma, address, entry, pte, pmd); + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) @@ -3520,8 +3704,11 @@ retry: if (pmd_trans_huge(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - if (dirty && !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) { + if (pmd_numa(orig_pmd)) + return do_huge_pmd_numa_page(mm, vma, address, + orig_pmd, pmd); + + if (dirty && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); /* @@ -3536,16 +3723,21 @@ retry: huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); } + return 0; } } + if (pmd_numa(*pmd)) + return do_pmd_numa_page(mm, vma, address, pmd); + /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 518baa896e8..962e353aa86 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. */ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC); + true, MIGRATE_SYNC, + MR_MEMORY_HOTPLUG); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aaf54566cb6..d1b315e9862 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -90,6 +90,7 @@ #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -117,6 +118,26 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + int node; + + if (!pol) { + node = numa_node_id(); + if (node != -1) + pol = &preferred_node_policy[node]; + + /* preferred_node_policy is not initialised early in boot */ + if (!pol->mode) + pol = NULL; + } + + return pol; +} + static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; /* simply delete any existing policy */ + return NULL; } VM_BUG_ON(!nodes); @@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } + } else if (mode == MPOL_LOCAL) { + if (!nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. + */ +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + int nr_updated; + BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); + + nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + + return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ + /* * Check if all pages in a range are on a set of nodes. * If pagelist != NULL then isolate pages from the LRU and @@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - if (!is_vm_hugetlb_page(vma) && - ((flags & MPOL_MF_STRICT) || + + if (is_vm_hugetlb_page(vma)) + goto next; + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + goto next; + } + + if ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma)))) { - unsigned long endvma = vma->vm_end; + vma_migratable(vma))) { - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } +next: prev = vma; } return first; @@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - + err = PTR_ERR(vma); /* maybe ... */ + if (!IS_ERR(vma)) err = mbind_range(mm, start, end, new); + if (!err) { + int nr_failed = 0; + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND); if (nr_failed) putback_lru_pages(&pagelist); } - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); @@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = task->mempolicy; + struct mempolicy *pol = get_task_policy(task); if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -1956,7 +2028,7 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = get_task_policy(current); struct page *page; unsigned int cpuset_mems_cookie; @@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page - page to be checked + * @vma - vm area where page mapped + * @addr - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + * -1 - not misplaced, page is in the right node + * node - node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zone *zone; + int curnid = page_to_nid(page); + unsigned long pgoff; + int polnid = -1; + int ret = -1; + + BUG_ON(!vma); + + pol = get_vma_policy(current, vma, addr); + if (!(pol->flags & MPOL_F_MOF)) + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, vma, pgoff); + break; + + case MPOL_PREFERRED: + if (pol->flags & MPOL_F_LOCAL) + polnid = numa_node_id(); + else + polnid = pol->v.preferred_node; + break; + + case MPOL_BIND: + /* + * allows binding to multiple nodes. + * use current page if in policy nodemask, + * else select nearest allowed node, if any. + * If no allowed nodes, use current [!misplaced]. + */ + if (node_isset(curnid, pol->v.nodes)) + goto out; + (void)first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->v.nodes, &zone); + polnid = zone->node; + break; + + default: + BUG(); + } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) { + int last_nid; + + polnid = numa_node_id(); + + /* + * Multi-stage node selection is used in conjunction + * with a periodic migration fault to build a temporal + * task<->page relation. By using a two-stage filter we |