Diffstat (limited to 'arch/x86/mm/pgtable.c')
| -rw-r--r-- | arch/x86/mm/pgtable.c | 202 |
1 file changed, 166 insertions, 36 deletions
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e3599..6fb6927f9e7 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,4 +1,5 @@
 #include <linux/mm.h>
+#include <linux/gfp.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
@@ -6,6 +7,14 @@
 
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
+#ifdef CONFIG_HIGHPTE
+#define PGALLOC_USER_GFP __GFP_HIGHMEM
+#else
+#define PGALLOC_USER_GFP 0
+#endif
+
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
 	return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -15,16 +24,33 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
 	struct page *pte;
 
-#ifdef CONFIG_HIGHPTE
-	pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
-#else
-	pte = alloc_pages(PGALLOC_GFP, 0);
-#endif
-	if (pte)
-		pgtable_page_ctor(pte);
+	pte = alloc_pages(__userpte_alloc_gfp, 0);
+	if (!pte)
+		return NULL;
+	if (!pgtable_page_ctor(pte)) {
+		__free_page(pte);
+		return NULL;
+	}
 	return pte;
 }
 
+static int __init setup_userpte(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	/*
+	 * "userpte=nohigh" disables allocation of user pagetables in
+	 * high memory.
+	 */
+	if (strcmp(arg, "nohigh") == 0)
+		__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("userpte", setup_userpte);
+
 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
 	pgtable_page_dtor(pte);
@@ -35,8 +61,17 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 #if PAGETABLE_LEVELS > 2
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
+	struct page *page = virt_to_page(pmd);
 	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(pmd));
+	/*
+	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
+	 * entries need a full cr3 reload to flush.
+	 */
+#ifdef CONFIG_X86_PAE
+	tlb->need_flush_all = 1;
+#endif
+	pgtable_pmd_page_dtor(page);
+	tlb_remove_page(tlb, page);
 }
 
 #if PAGETABLE_LEVELS > 3
@@ -65,7 +100,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD	\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -76,27 +123,23 @@ static void pgd_ctor(pgd_t *pgd)
 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 				KERNEL_PGD_PTRS);
-		paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
-					 __pa(swapper_pg_dir) >> PAGE_SHIFT,
-					 KERNEL_PGD_BOUNDARY,
-					 KERNEL_PGD_PTRS);
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD)
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
 		pgd_list_add(pgd);
+	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
-
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -107,7 +150,7 @@ static void pgd_dtor(pgd_t *pgd)
  * against pageattr.c; it is the unique case in which a valid change
  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
  * vmalloc faults work because attached pagetables are never freed.
- * -- wli
+ * -- nyc
  */
 
 #ifdef CONFIG_X86_PAE
@@ -138,8 +181,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 	 * section 8.1: in PAE mode we explicitly have to flush the
 	 * TLB via cr3 if the top-level pgd is changed...
 	 */
-	if (mm == current->active_mm)
-		write_cr3(read_cr3());
+	flush_tlb_mm(mm);
 }
 
 #else /* !CONFIG_X86_PAE */
@@ -153,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])
 	int i;
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++)
-		if (pmds[i])
+		if (pmds[i]) {
+			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 			free_page((unsigned long)pmds[i]);
+		}
 }
 
 static int preallocate_pmds(pmd_t *pmds[])
@@ -164,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])
 	for(i = 0; i < PREALLOCATED_PMDS; i++) {
 		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
 
-		if (pmd == NULL)
+		if (!pmd)
+			failed = true;
+		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+			free_page((unsigned long)pmd);
+			pmd = NULL;
 			failed = true;
+		}
 		pmds[i] = pmd;
 	}
 
@@ -204,7 +253,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
 	pud_t *pud;
-	unsigned long addr;
 	int i;
 
 	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
@@ -212,8 +260,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 
 	pud = pud_offset(pgd, 0);
 
-	for (addr = i = 0; i < PREALLOCATED_PMDS;
-	     i++, pud++, addr += PUD_SIZE) {
+	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 		pmd_t *pmd = pmds[i];
 
 		if (i >= KERNEL_PGD_BOUNDARY)
@@ -228,7 +275,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
-	unsigned long flags;
 
 	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
@@ -248,12 +294,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
-	pgd_ctor(pgd);
+	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return pgd;
 
@@ -273,6 +319,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	free_page((unsigned long)pgd);
 }
 
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
 int ptep_set_access_flags(struct vm_area_struct *vma,
 			  unsigned long address, pte_t *ptep,
 			  pte_t entry, int dirty)
@@ -282,12 +335,35 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	if (changed && dirty) {
 		*ptep = entry;
 		pte_update_defer(vma->vm_mm, address, ptep);
-		flush_tlb_page(vma, address);
 	}
 
 	return changed;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp,
+			  pmd_t entry, int dirty)
+{
+	int changed = !pmd_same(*pmdp, entry);
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	if (changed && dirty) {
+		*pmdp = entry;
+		pmd_update_defer(vma->vm_mm, address, pmdp);
+		/*
+		 * We had a write-protection fault here and changed the pmd
+		 * to to more permissive. No need to flush the TLB for that,
+		 * #PF is architecturally guaranteed to do that and in the
+		 * worst-case we'll generate a spurious fault.
+		 */
+	}
+
+	return changed;
+}
+#endif
+
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pte_t *ptep)
 {
@@ -303,18 +379,72 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	return ret;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pmd_t *pmdp)
+{
+	int ret = 0;
+
+	if (pmd_young(*pmdp))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *)pmdp);
+
+	if (ret)
+		pmd_update(vma->vm_mm, addr, pmdp);
+
+	return ret;
+}
+#endif
+
 int ptep_clear_flush_young(struct vm_area_struct *vma,
 			   unsigned long address, pte_t *ptep)
 {
+	/*
+	 * On x86 CPUs, clearing the accessed bit without a TLB flush
+	 * doesn't cause data corruption. [ It could cause incorrect
+	 * page aging and the (mistaken) reclaim of hot pages, but the
+	 * chance of that should be relatively low. ]
+	 *
+	 * So as a performance optimization don't flush the TLB when
+	 * clearing the accessed bit, it will eventually be flushed by
+	 * a context switch or a VM operation anyway. [ In the rare
+	 * event of it not getting flushed for a long time the delay
+	 * shouldn't really matter because there's no real memory
+	 * pressure for swapout to react to. ]
+	 */
+	return ptep_test_and_clear_young(vma, address, ptep);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmdp)
+{
 	int young;
 
-	young = ptep_test_and_clear_young(vma, address, ptep);
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	young = pmdp_test_and_clear_young(vma, address, pmdp);
 	if (young)
-		flush_tlb_page(vma, address);
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 
 	return young;
 }
 
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp)
+{
+	int set;
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+				(unsigned long *)pmdp);
+	if (set) {
+		pmd_update(vma->vm_mm, address, pmdp);
+		/* need tlb flush only to serialize against gup-fast */
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	}
+}
+#endif
+
 /**
  * reserve_top_address - reserves a hole in the top of kernel address space
  * @reserve - size of hole to reserve
@@ -326,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)
 {
 #ifdef CONFIG_X86_32
 	BUG_ON(fixmaps_set > 0);
-	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
-	       (int)-reserve);
-	__FIXADDR_TOP = -reserve - PAGE_SIZE;
+	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
 #endif
 }
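For reference, below is a minimal userspace sketch of the userpte= handling introduced in the diff: it models how setup_userpte() clears __GFP_HIGHMEM out of __userpte_alloc_gfp when "userpte=nohigh" is passed on the kernel command line. The flag values, the userpte_alloc_gfp name, and the main() driver are illustrative stand-ins for this sketch, not the kernel's own definitions.

/*
 * Userspace model of the userpte= early_param added in this diff.
 * The bit values below are placeholders, not real gfp flags.
 */
#include <stdio.h>
#include <string.h>

#define PGALLOC_GFP      0x1   /* stands in for GFP_KERNEL | __GFP_NOTRACK | ... */
#define PGALLOC_USER_GFP 0x2   /* stands in for __GFP_HIGHMEM when CONFIG_HIGHPTE=y */

static unsigned int userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;

/* mirrors setup_userpte(): only "nohigh" is accepted, anything else is an error */
static int setup_userpte(const char *arg)
{
	if (!arg)
		return -1;
	if (strcmp(arg, "nohigh") == 0) {
		userpte_alloc_gfp &= ~PGALLOC_USER_GFP;
		return 0;
	}
	return -1;
}

int main(void)
{
	printf("default mask: %#x\n", userpte_alloc_gfp);
	if (setup_userpte("nohigh") == 0)
		printf("after userpte=nohigh: %#x (highmem bit cleared)\n",
		       userpte_alloc_gfp);
	return 0;
}

In the kernel itself the mask feeds pte_alloc_one(), which now calls alloc_pages(__userpte_alloc_gfp, 0), so booting with userpte=nohigh keeps user page-table pages out of highmem even when CONFIG_HIGHPTE is enabled.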
