diff options
Diffstat (limited to 'mm/mprotect.c')
-rw-r--r--  mm/mprotect.c | 207
1 file changed, 165 insertions, 42 deletions
diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c513387309..c43d557941f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@  #include <linux/mmu_notifier.h>  #include <linux/migrate.h>  #include <linux/perf_event.h> +#include <linux/ksm.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h>  #include <asm/cacheflush.h> @@ -35,100 +36,224 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)  }  #endif -static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. + */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, +			unsigned long addr, int prot_numa, spinlock_t **ptl) +{ +	pte_t *pte; +	spinlock_t *pmdl; + +	/* !prot_numa is protected by mmap_sem held for write */ +	if (!prot_numa) +		return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + +	pmdl = pmd_lock(vma->vm_mm, pmd); +	if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { +		spin_unlock(pmdl); +		return NULL; +	} + +	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); +	spin_unlock(pmdl); +	return pte; +} + +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  { +	struct mm_struct *mm = vma->vm_mm;  	pte_t *pte, oldpte;  	spinlock_t *ptl; +	unsigned long pages = 0; + +	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); +	if (!pte) +		return 0; -	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);  	arch_enter_lazy_mmu_mode();  	do {  		oldpte = *pte;  		if (pte_present(oldpte)) {  			pte_t ptent; +			bool updated = false; -			ptent = ptep_modify_prot_start(mm, addr, 
pte); -			ptent = pte_modify(ptent, newprot); - -			/* -			 * Avoid taking write faults for pages we know to be -			 * dirty. -			 */ -			if (dirty_accountable && pte_dirty(ptent)) -				ptent = pte_mkwrite(ptent); - -			ptep_modify_prot_commit(mm, addr, pte, ptent); -		} else if (PAGE_MIGRATION && !pte_file(oldpte)) { +			if (!prot_numa) { +				ptent = ptep_modify_prot_start(mm, addr, pte); +				if (pte_numa(ptent)) +					ptent = pte_mknonnuma(ptent); +				ptent = pte_modify(ptent, newprot); +				/* +				 * Avoid taking write faults for pages we +				 * know to be dirty. +				 */ +				if (dirty_accountable && pte_dirty(ptent)) +					ptent = pte_mkwrite(ptent); +				ptep_modify_prot_commit(mm, addr, pte, ptent); +				updated = true; +			} else { +				struct page *page; + +				page = vm_normal_page(vma, addr, oldpte); +				if (page && !PageKsm(page)) { +					if (!pte_numa(oldpte)) { +						ptep_set_numa(mm, addr, pte); +						updated = true; +					} +				} +			} +			if (updated) +				pages++; +		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {  			swp_entry_t entry = pte_to_swp_entry(oldpte);  			if (is_write_migration_entry(entry)) { +				pte_t newpte;  				/*  				 * A protection check is difficult so  				 * just be safe and disable write  				 */  				make_migration_entry_read(&entry); -				set_pte_at(mm, addr, pte, -					swp_entry_to_pte(entry)); +				newpte = swp_entry_to_pte(entry); +				if (pte_swp_soft_dirty(oldpte)) +					newpte = pte_swp_mksoft_dirty(newpte); +				set_pte_at(mm, addr, pte, newpte); + +				pages++;  			}  		}  	} while (pte++, addr += PAGE_SIZE, addr != end);  	arch_leave_lazy_mmu_mode();  	pte_unmap_unlock(pte - 1, ptl); + +	return pages;  } -static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, -		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, +		pud_t *pud, unsigned long addr, unsigned long end, +		
pgprot_t newprot, int dirty_accountable, int prot_numa)  {  	pmd_t *pmd; +	struct mm_struct *mm = vma->vm_mm;  	unsigned long next; +	unsigned long pages = 0; +	unsigned long nr_huge_updates = 0; +	unsigned long mni_start = 0;  	pmd = pmd_offset(pud, addr);  	do { +		unsigned long this_pages; +  		next = pmd_addr_end(addr, end); -		if (pmd_none_or_clear_bad(pmd)) +		if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))  			continue; -		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + +		/* invoke the mmu notifier if the pmd is populated */ +		if (!mni_start) { +			mni_start = addr; +			mmu_notifier_invalidate_range_start(mm, mni_start, end); +		} + +		if (pmd_trans_huge(*pmd)) { +			if (next - addr != HPAGE_PMD_SIZE) +				split_huge_page_pmd(vma, addr, pmd); +			else { +				int nr_ptes = change_huge_pmd(vma, pmd, addr, +						newprot, prot_numa); + +				if (nr_ptes) { +					if (nr_ptes == HPAGE_PMD_NR) { +						pages += HPAGE_PMD_NR; +						nr_huge_updates++; +					} + +					/* huge pmd was handled */ +					continue; +				} +			} +			/* fall through, the trans huge pmd just split */ +		} +		this_pages = change_pte_range(vma, pmd, addr, next, newprot, +				 dirty_accountable, prot_numa); +		pages += this_pages;  	} while (pmd++, addr = next, addr != end); + +	if (mni_start) +		mmu_notifier_invalidate_range_end(mm, mni_start, end); + +	if (nr_huge_updates) +		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); +	return pages;  } -static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, -		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, +		pgd_t *pgd, unsigned long addr, unsigned long end, +		pgprot_t newprot, int dirty_accountable, int prot_numa)  {  	pud_t *pud;  	unsigned long next; +	unsigned long pages = 0;  	pud = pud_offset(pgd, addr);  	do {  		next = pud_addr_end(addr, end);  		if 
(pud_none_or_clear_bad(pud))  			continue; -		change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); +		pages += change_pmd_range(vma, pud, addr, next, newprot, +				 dirty_accountable, prot_numa);  	} while (pud++, addr = next, addr != end); + +	return pages;  } -static void change_protection(struct vm_area_struct *vma, +static unsigned long change_protection_range(struct vm_area_struct *vma,  		unsigned long addr, unsigned long end, pgprot_t newprot, -		int dirty_accountable) +		int dirty_accountable, int prot_numa)  {  	struct mm_struct *mm = vma->vm_mm;  	pgd_t *pgd;  	unsigned long next;  	unsigned long start = addr; +	unsigned long pages = 0;  	BUG_ON(addr >= end);  	pgd = pgd_offset(mm, addr);  	flush_cache_range(vma, addr, end); +	set_tlb_flush_pending(mm);  	do {  		next = pgd_addr_end(addr, end);  		if (pgd_none_or_clear_bad(pgd))  			continue; -		change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); +		pages += change_pud_range(vma, pgd, addr, next, newprot, +				 dirty_accountable, prot_numa);  	} while (pgd++, addr = next, addr != end); -	flush_tlb_range(vma, start, end); + +	/* Only flush the TLB if we actually modified any entries: */ +	if (pages) +		flush_tlb_range(vma, start, end); +	clear_tlb_flush_pending(mm); + +	return pages; +} + +unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, +		       unsigned long end, pgprot_t newprot, +		       int dirty_accountable, int prot_numa) +{ +	unsigned long pages; + +	if (is_vm_hugetlb_page(vma)) +		pages = hugetlb_change_protection(vma, start, end, newprot); +	else +		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + +	return pages;  }  int @@ -158,7 +283,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,  		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|  						VM_SHARED|VM_NORESERVE))) {  			charged = nrpages; -			if (security_vm_enough_memory(charged)) +			if 
(security_vm_enough_memory_mm(mm, charged))  				return -ENOMEM;  			newflags |= VM_ACCOUNT;  		} @@ -203,12 +328,9 @@ success:  		dirty_accountable = 1;  	} -	mmu_notifier_invalidate_range_start(mm, start, end); -	if (is_vm_hugetlb_page(vma)) -		hugetlb_change_protection(vma, start, end, vma->vm_page_prot); -	else -		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); -	mmu_notifier_invalidate_range_end(mm, start, end); +	change_protection(vma, start, end, vma->vm_page_prot, +			  dirty_accountable, 0); +  	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);  	vm_stat_account(mm, newflags, vma->vm_file, nrpages);  	perf_event_mmap(vma); @@ -252,10 +374,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,  	down_write(¤t->mm->mmap_sem); -	vma = find_vma_prev(current->mm, start, &prev); +	vma = find_vma(current->mm, start);  	error = -ENOMEM;  	if (!vma)  		goto out; +	prev = vma->vm_prev;  	if (unlikely(grows & PROT_GROWSDOWN)) {  		if (vma->vm_start >= end)  			goto out; @@ -263,8 +386,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,  		error = -EINVAL;  		if (!(vma->vm_flags & VM_GROWSDOWN))  			goto out; -	} -	else { +	} else {  		if (vma->vm_start > start)  			goto out;  		if (unlikely(grows & PROT_GROWSUP)) { @@ -280,9 +402,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,  	for (nstart = start ; ; ) {  		unsigned long newflags; -		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */ +		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */ -		newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); +		newflags = vm_flags; +		newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));  		/* newflags >> 4 shift VM_MAY% in place of VM_% */  		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {  | 
