Diffstat (limited to 'arch/x86/mm/pgtable.c')
-rw-r--r--	arch/x86/mm/pgtable.c	146
1 file changed, 122 insertions, 24 deletions
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc8..6fb6927f9e7 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	struct page *pte;
 
 	pte = alloc_pages(__userpte_alloc_gfp, 0);
-	if (pte)
-		pgtable_page_ctor(pte);
+	if (!pte)
+		return NULL;
+	if (!pgtable_page_ctor(pte)) {
+		__free_page(pte);
+		return NULL;
+	}
 	return pte;
 }
 
@@ -57,8 +61,17 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 #if PAGETABLE_LEVELS > 2
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
+	struct page *page = virt_to_page(pmd);
 	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(pmd));
+	/*
+	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
+	 * entries need a full cr3 reload to flush.
+	 */
+#ifdef CONFIG_X86_PAE
+	tlb->need_flush_all = 1;
+#endif
+	pgtable_pmd_page_dtor(page);
+	tlb_remove_page(tlb, page);
 }
 
 #if PAGETABLE_LEVELS > 3
@@ -121,14 +134,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 
 static void pgd_dtor(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
-
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -139,7 +150,7 @@ static void pgd_dtor(pgd_t *pgd)
  * against pageattr.c; it is the unique case in which a valid change
  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
  * vmalloc faults work because attached pagetables are never freed.
- * -- wli
+ * -- nyc
  */
 
 #ifdef CONFIG_X86_PAE
@@ -170,8 +181,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 	 * section 8.1: in PAE mode we explicitly have to flush the
 	 * TLB via cr3 if the top-level pgd is changed...
 	 */
-	if (mm == current->active_mm)
-		write_cr3(read_cr3());
+	flush_tlb_mm(mm);
 }
 
 #else	/* !CONFIG_X86_PAE */
@@ -185,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])
 	int i;
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++)
-		if (pmds[i])
+		if (pmds[i]) {
+			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 			free_page((unsigned long)pmds[i]);
+		}
 }
 
 static int preallocate_pmds(pmd_t *pmds[])
@@ -196,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++) {
 		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
-		if (pmd == NULL)
+		if (!pmd)
+			failed = true;
+		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+			free_page((unsigned long)pmd);
+			pmd = NULL;
 			failed = true;
+		}
 		pmds[i] = pmd;
 	}
 
@@ -236,7 +253,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 {
 	pud_t *pud;
-	unsigned long addr;
 	int i;
 
 	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
@@ -244,8 +260,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 
 	pud = pud_offset(pgd, 0);
 
- 	for (addr = i = 0; i < PREALLOCATED_PMDS;
-	     i++, pud++, addr += PUD_SIZE) {
+	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 		pmd_t *pmd = pmds[i];
 
 		if (i >= KERNEL_PGD_BOUNDARY)
@@ -260,7 +275,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
-	unsigned long flags;
 
 	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
@@ -280,12 +294,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return pgd;
 
@@ -305,6 +319,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	free_page((unsigned long)pgd);
 }
 
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
 int ptep_set_access_flags(struct vm_area_struct *vma,
 			  unsigned long address, pte_t *ptep,
 			  pte_t entry, int dirty)
@@ -314,12 +335,35 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	if (changed && dirty) {
 		*ptep = entry;
 		pte_update_defer(vma->vm_mm, address, ptep);
-		flush_tlb_page(vma, address);
 	}
 
 	return changed;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp,
+			  pmd_t entry, int dirty)
+{
+	int changed = !pmd_same(*pmdp, entry);
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	if (changed && dirty) {
+		*pmdp = entry;
+		pmd_update_defer(vma->vm_mm, address, pmdp);
+		/*
+		 * We had a write-protection fault here and changed the pmd
+		 * to to more permissive. No need to flush the TLB for that,
+		 * #PF is architecturally guaranteed to do that and in the
+		 * worst-case we'll generate a spurious fault.
+		 */
+	}
+
+	return changed;
+}
+#endif
+
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pte_t *ptep)
 {
@@ -335,18 +379,72 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	return ret;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pmd_t *pmdp)
+{
+	int ret = 0;
+
+	if (pmd_young(*pmdp))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *)pmdp);
+
+	if (ret)
+		pmd_update(vma->vm_mm, addr, pmdp);
+
+	return ret;
+}
+#endif
+
 int ptep_clear_flush_young(struct vm_area_struct *vma,
 			   unsigned long address, pte_t *ptep)
 {
+	/*
+	 * On x86 CPUs, clearing the accessed bit without a TLB flush
+	 * doesn't cause data corruption. [ It could cause incorrect
+	 * page aging and the (mistaken) reclaim of hot pages, but the
+	 * chance of that should be relatively low. ]
+	 *
+	 * So as a performance optimization don't flush the TLB when
+	 * clearing the accessed bit, it will eventually be flushed by
+	 * a context switch or a VM operation anyway. [ In the rare
+	 * event of it not getting flushed for a long time the delay
+	 * shouldn't really matter because there's no real memory
+	 * pressure for swapout to react to. ]
+	 */
+	return ptep_test_and_clear_young(vma, address, ptep);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmdp)
+{
 	int young;
 
-	young = ptep_test_and_clear_young(vma, address, ptep);
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	young = pmdp_test_and_clear_young(vma, address, pmdp);
 	if (young)
-		flush_tlb_page(vma, address);
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 
 	return young;
 }
 
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp)
+{
+	int set;
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+				(unsigned long *)pmdp);
+	if (set) {
+		pmd_update(vma->vm_mm, address, pmdp);
+		/* need tlb flush only to serialize against gup-fast */
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	}
+}
+#endif
+
 /**
  * reserve_top_address - reserves a hole in the top of kernel address space
  * @reserve - size of hole to reserve
@@ -358,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)
 {
 #ifdef CONFIG_X86_32
 	BUG_ON(fixmaps_set > 0);
-	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
-	       (int)-reserve);
-	__FIXADDR_TOP = -reserve - PAGE_SIZE;
+	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
 #endif
 }
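The pte_alloc_one() and preallocate_pmds() hunks above share one shape: allocate a page, run a constructor that can fail, and undo the allocation on failure so the caller only ever sees a fully constructed table or NULL. Below is a minimal userspace sketch of that rollback pattern; struct table, table_ctor() and alloc_table() are hypothetical stand-ins for illustration only, not kernel APIs.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Hypothetical stand-in for a page-table page plus its metadata. */
struct table {
	void *page;      /* stands in for the backing struct page          */
	bool  ctor_done; /* stands in for pgtable_page_ctor() bookkeeping  */
};

/* Constructor that may fail, by analogy with pgtable_page_ctor(). */
static bool table_ctor(struct table *t)
{
	t->ctor_done = true;
	return true;		/* return false here to exercise the rollback path */
}

static struct table *alloc_table(void)
{
	struct table *t = calloc(1, sizeof(*t));

	if (!t)
		return NULL;
	t->page = calloc(1, 4096);
	if (!t->page || !table_ctor(t)) {
		/* undo partial work, as the patch does with __free_page() on ctor failure */
		free(t->page);
		free(t);
		return NULL;
	}
	return t;
}

int main(void)
{
	struct table *t = alloc_table();

	printf("alloc_table: %s\n", t ? "fully constructed" : "failed, nothing leaked");
	if (t) {
		free(t->page);
		free(t);
	}
	return 0;
}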
