diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/filemap.c | 10 | ||||
-rw-r--r-- | mm/huge_memory.c | 201 | ||||
-rw-r--r-- | mm/hugetlb.c | 110 | ||||
-rw-r--r-- | mm/memcontrol.c | 10 | ||||
-rw-r--r-- | mm/memory-failure.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 42 | ||||
-rw-r--r-- | mm/mempolicy.c | 5 | ||||
-rw-r--r-- | mm/migrate.c | 14 | ||||
-rw-r--r-- | mm/mmap.c | 3 | ||||
-rw-r--r-- | mm/oom_kill.c | 6 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 16 | ||||
-rw-r--r-- | mm/rmap.c | 15 |
13 files changed, 263 insertions, 175 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 3f4ffda152b..de31af25620 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -218,9 +218,11 @@ config SPLIT_PTLOCK_CPUS int default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 - default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + boolean + # # support for memory balloon compaction config BALLOON_COMPACTION diff --git a/mm/filemap.c b/mm/filemap.c index ae4846ff484..b7749a92021 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1090,7 +1090,6 @@ static void shrink_readahead_size_eio(struct file *filp, * @filp: the file to read * @ppos: current file position * @desc: read_descriptor - * @actor: read method * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1099,7 +1098,7 @@ static void shrink_readahead_size_eio(struct file *filp, * of the logic when it comes to error handling etc. */ static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc, read_actor_t actor) + read_descriptor_t *desc) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1200,13 +1199,14 @@ page_ok: * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * - * The actor routine returns how many bytes were actually used.. + * The file_read_actor routine returns how many bytes were + * actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); + ret = file_read_actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; @@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (desc.count == 0) continue; desc.error = 0; - do_generic_file_read(filp, ppos, &desc, file_read_actor); + do_generic_file_read(filp, ppos, &desc); retval += desc.written; if (desc.error) { retval = retval ?: desc.error; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0556c6a4495..bccd5a628ea 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -710,6 +710,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct page *page) { pgtable_t pgtable; + spinlock_t *ptl; VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); @@ -724,9 +725,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); @@ -738,8 +739,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm->nr_ptes++; - spin_unlock(&mm->page_table_lock); + atomic_long_inc(&mm->nr_ptes); + spin_unlock(ptl); } return 0; @@ -759,6 +760,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, HPAGE_PMD_ORDER, vma, haddr, nd); } +/* Caller must hold page table lock. */ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -771,7 +773,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); return true; } @@ -790,6 +792,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; @@ -802,10 +805,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); @@ -838,6 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { + spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -848,8 +852,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - spin_lock(&dst_mm->page_table_lock); - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -858,7 +863,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } /* - * mm->page_table_lock is enough to be sure that huge zero pmd is not + * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table. */ @@ -879,8 +884,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, } if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ @@ -896,12 +901,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - dst_mm->nr_ptes++; + atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); out: return ret; } @@ -912,10 +917,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm, pmd_t *pmd, pmd_t orig_pmd, int dirty) { + spinlock_t *ptl; pmd_t entry; unsigned long haddr; - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto unlock; @@ -925,13 +931,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm, update_mmu_cache_pmd(vma, address, pmd); unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; struct page *page; @@ -958,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_page; @@ -985,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); @@ -995,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, out: return ret; out_free_page: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_page(page); put_page(page); @@ -1009,6 +1016,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1055,7 +1063,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); @@ -1081,7 +1089,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1092,7 +1100,7 @@ out: return ret; out_free_pages: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1107,17 +1115,19 @@ out_free_pages: int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { + spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + ptl = pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; @@ -1133,7 +1143,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) @@ -1180,11 +1190,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (page) put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); put_page(new_page); goto out_mn; @@ -1206,13 +1216,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return ret; } @@ -1224,7 +1234,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; @@ -1271,6 +1281,7 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + spinlock_t *ptl; struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; @@ -1280,7 +1291,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool migrated = false; int flags = 0; - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; @@ -1318,7 +1329,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * relock and check_same as the page may no longer be mapped. * As the fault is being retried, do not account for it. */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1326,13 +1337,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Page is misplaced, serialise migrations and parallel THP splits */ get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!page_locked) lock_page(page); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); @@ -1344,7 +1355,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (migrated) { @@ -1361,7 +1372,7 @@ clear_pmdnuma: update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out: if (anon_vma) @@ -1376,9 +1387,10 @@ out: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { struct page *page; pgtable_t pgtable; pmd_t orig_pmd; @@ -1392,8 +1404,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); put_huge_zero_page(); } else { page = pmd_page(orig_pmd); @@ -1401,8 +1413,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); tlb_remove_page(tlb, page); } pte_free(tlb->mm, pgtable); @@ -1415,14 +1427,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1435,6 +1448,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { + spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1455,12 +1469,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - ret = __pmd_trans_huge_lock(old_pmd, vma); + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); if (ret == 1) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - spin_unlock(&mm->page_table_lock); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); } out: return ret; @@ -1476,9 +1499,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; ret = 1; if (!prot_numa) { @@ -1507,7 +1531,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (ret == HPAGE_PMD_NR) set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); } return ret; @@ -1520,12 +1544,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { - spin_lock(&vma->vm_mm->page_table_lock); + *ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1534,27 +1559,37 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return 1; } } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); return 0; } +/* + * This function returns whether a given @page is mapped onto the @address + * in the virtual space of @mm. + * + * When it's true, this function returns *pmd with holding the page table lock + * and passing it back to the caller via @ptl. + * If it's false, returns NULL without holding the page table lock. + */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag) + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) { - pmd_t *pmd, *ret = NULL; + pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) - goto out; + return NULL; pmd = mm_find_pmd(mm, address); if (!pmd) - goto out; + return NULL; + *ptl = pmd_lock(mm, pmd); if (pmd_none(*pmd)) - goto out; + goto unlock; if (pmd_page(*pmd) != page) - goto out; + goto unlock; /* * split_vma() may create temporary aliased mappings. There is * no risk as long as all huge pmd are found and have their @@ -1564,14 +1599,15 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto out; + goto unlock; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - ret = pmd; + return pmd; } -out: - return ret; +unlock: + spin_unlock(*ptl); + return NULL; } static int __split_huge_page_splitting(struct page *page, @@ -1579,6 +1615,7 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; /* For mmu_notifiers */ @@ -1586,9 +1623,8 @@ static int __split_huge_page_splitting(struct page *page, const unsigned long mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order @@ -1599,8 +1635,8 @@ static int __split_huge_page_splitting(struct page *page, */ pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; @@ -1731,14 +1767,14 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -1793,8 +1829,8 @@ static int __split_huge_page_map(struct page *page, pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } @@ -2346,7 +2382,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte_t *pte; pgtable_t pgtable; struct page *new_page; - spinlock_t *ptl; + spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2389,12 +2425,12 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - ptl = pte_lockptr(mm, pmd); + pte_ptl = pte_lockptr(mm, pmd); mmun_start = address; mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); /* probably unnecessary */ + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow @@ -2402,16 +2438,16 @@ static void collapse_huge_page(struct mm_struct *mm, * to avoid the risk of CPU bugs in that area. */ _pmd = pmdp_clear_flush(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(ptl); + spin_unlock(pte_ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing @@ -2419,7 +2455,7 @@ static void collapse_huge_page(struct mm_struct *mm, * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2430,7 +2466,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -2445,13 +2481,13 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); *hpage = NULL; @@ -2780,6 +2816,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { + spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -2792,22 +2829,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_end = haddr + HPAGE_PMD_SIZE; again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0b7656e804d..7d57af21f49 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2376,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; @@ -2387,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (dst_pte == src_pte) continue; - spin_lock(&dst->page_table_lock); - spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); @@ -2398,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); } return 0; @@ -2442,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address; pte_t *ptep; pte_t pte; + spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -2455,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: - spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) - continue; + goto unlock; pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) - continue; + goto unlock; /* * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { huge_pte_clear(mm, address, ptep); - continue; + goto unlock; } page = pte_page(pte); @@ -2484,7 +2487,7 @@ again: */ if (ref_page) { if (page != ref_page) - continue; + goto unlock; /* * Mark the VMA as having unmapped its page so that @@ -2501,13 +2504,18 @@ again: page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (force_flush) { + spin_unlock(ptl); break; + } /* Bail out after unmapping reference page if supplied */ - if (ref_page) + if (ref_page) { + spin_unlock(ptl); break; + } +unlock: + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * mmu_gather ran out of room to batch pages, we break out of * the PTE lock to avoid doing the potential expensive TLB invalidate @@ -2613,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page) + struct page *pagecache_page, spinlock_t *ptl) { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; @@ -2647,8 +2655,8 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page_table_lock as buddy allocator may be called */ - spin_unlock(&mm->page_table_lock); + /* Drop page table lock as buddy allocator may be called */ + spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -2666,13 +2674,13 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(huge_pte_none(pte)); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* - * race occurs while re-acquiring page_table_lock, and - * our job is done. + * race occurs while re-acquiring page table + * lock, and our job is done. */ return 0; } @@ -2680,7 +2688,7 @@ retry_avoidcopy: } /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (err == -ENOMEM) return VM_FAULT_OOM; else @@ -2695,7 +2703,7 @@ retry_avoidcopy: page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return VM_FAULT_OOM; } @@ -2707,10 +2715,10 @@ retry_avoidcopy: mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* - * Retake the page_table_lock to check for racing updates + * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -2724,13 +2732,13 @@ retry_avoidcopy: /* Make the old page be freed below */ new_page = old_page; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return 0; } @@ -2778,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; struct address_space *mapping; pte_t new_pte; + spinlock_t *ptl; /* * Currently, we are forced to kill the process in the event the @@ -2864,7 +2873,8 @@ retry: goto backout_unlocked; } - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -2885,16 +2895,16 @@ retry: if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(page); out: return ret; backout: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); backout_unlocked: unlock_page(page); put_page(page); @@ -2906,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + spinlock_t *ptl; int ret; struct page *page = NULL; struct page *pagecache_page = NULL; @@ -2918,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(mm, ptep); + migration_entry_wait_huge(vma, mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -2974,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (page != pagecache_page) lock_page(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_page_table_lock; + goto out_ptl; if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page); - goto out_page_table_lock; + pagecache_page, ptl); + goto out_ptl; } entry = huge_pte_mkdirty(entry); } @@ -2993,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); -out_page_table_lock: - spin_unlock(&mm->page_table_lock); +out_ptl: + spin_unlock(ptl); if (pagecache_page) { unlock_page(pagecache_page); @@ -3020,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + spinlock_t *ptl = NULL; int absent; struct page *page; @@ -3030,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (pte) + ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); /* @@ -3043,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (absent && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); remainder = 0; break; } @@ -3062,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !huge_pte_write(huge_ptep_get(pte)))) { int ret; - spin_unlock(&mm->page_table_lock); + if (pte) + spin_unlock(ptl); ret = hugetlb_fault(mm, vma, vaddr, (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; @@ -3096,8 +3114,8 @@ same_page: */ goto same_page; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); *nr_pages = remainder; *position = vaddr; @@ -3118,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { + spinlock_t *ptl; ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { pages++; + spin_unlock(ptl); continue; } if (!huge_pte_none(huge_ptep_get(ptep))) { @@ -3134,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, set_huge_pte_at(mm, address, ptep, pte); pages++; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: @@ -3298,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) unsigned long saddr; pte_t *spte = NULL; pte_t *pte; + spinlock_t *ptl; if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); @@ -3320,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); + spin_lock(ptl); if (pud_none(*pud)) pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); @@ -3340,7 +3362,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with vma->vm_mm->page_table_lock held. + * called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3cd40b2d5d..f1a0ae6e11b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6605,10 +6605,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -6797,9 +6797,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -6816,7 +6816,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f9d78ec7831..b7c171602ba 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1269,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags) mf_cpu = &get_cpu_var(memory_failure_cpu); spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, &entry)) + if (kfifo_put(&mf_cpu->fifo, entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", diff --git a/mm/memory.c b/mm/memory.c index bf8665849a5..0409e8f43fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - tlb->mm->nr_ptes--; + atomic_long_dec(&tlb->mm->nr_ptes); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -550,6 +550,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address) { + spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); int wait_split_huge_page; if (!new) @@ -570,15 +571,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) wait_split_huge_page = 1; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (new) pte_free(mm, new); if (wait_split_huge_page) @@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, split_huge_page_pmd(vma, address, pmd); goto split_fallthrough; } - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(vma, address, pmd, flags); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; goto out; } } else - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* fall through */ } split_fallthrough: @@ -4269,3 +4270,28 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS +static struct kmem_cache *page_ptl_cachep; +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + +bool ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (!ptl) + return false; + page->ptl = ptl; + return true; +} + +void ptlock_free(struct page *page) +{ + kfree(page->ptl); +} +#endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4cc19f6ab6c..c4403cdf343 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, #ifdef CONFIG_HUGETLB_PAGE int nid; struct page *page; + spinlock_t *ptl; - spin_lock(&vma->vm_mm->page_table_lock); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); page = pte_page(huge_ptep_get((pte_t *)pmd)); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) isolate_huge_page(page, private); unlock: - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); #else BUG(); #endif diff --git a/mm/migrate.c b/mm/migrate.c index dfc8300ecbb..316e720a202 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, addr); if (!ptep) goto out; - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); } else { pmd = mm_find_pmd(mm, addr); if (!pmd) @@ -249,9 +249,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte) { - spinlock_t *ptl = &(mm)->page_table_lock; + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); __migration_entry_wait(mm, pte, ptl); } @@ -1666,6 +1667,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long address, struct page *page, int node) { + spinlock_t *ptl; unsigned long haddr = address & HPAGE_PMD_MASK; pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; @@ -1705,9 +1707,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -1752,7 +1754,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * before it's fully transferred to the new page. */ mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(new_page); unlock_page(page); diff --git a/mm/mmap.c b/mm/mmap.c index 5a6baddde15..834b2d785f1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2724,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(atomic_long_read(&mm->nr_ptes) > + (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6738c47f1f7..1e4a600a616 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes + + points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task->mm->nr_ptes, + atomic_long_read(&task->mm->nr_ptes), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3929a40bd6c..cbb38545d9d 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -151,14 +151,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -170,14 +170,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, struct page, lru); list_del(&pgtable->lru); } diff --git a/mm/rmap.c b/mm/rmap.c index fd3ee7a54a1..55c8b8dc9ff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (unlikely(PageHuge(page))) { pte = huge_pte_offset(mm, address); - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); goto check; } @@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int referenced = 0; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; - spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG); - if (!pmd) { - spin_unlock(&mm->page_table_lock); + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (!pmd) goto out; - } if (vma->vm_flags & VM_LOCKED) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; goto out; @@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } else { pte_t *pte; - spinlock_t *ptl; /* * rmap might return false positives; we must filter |